1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54 package org.w3c.tidy;
55
56 import java.io.PrintWriter;
57 import java.util.List;
58 import java.util.Stack;
59 import java.util.Vector;
60
61
/**
 * Lexer for html parser.
 * <p>
 * Given a file stream fp it returns a sequence of tokens. GetToken(fp) gets the next token; UngetToken(fp) provides
 * one level of undo.
 * </p>
 * <p>
 * The tags include an attribute list:
 * </p>
 * <ul>
 * <li>a linked list of attribute/value nodes;</li>
 * <li>each node has 2 null-terminated strings;</li>
 * <li>entities are replaced in attribute values.</li>
 * </ul>
 * <p>
 * White space is compacted if not in preformatted mode: leading white space is discarded and subsequent white space
 * sequences are compacted to single space chars. If XmlTags is no then tag names and attribute names are folded to
 * lower case.
 * </p>
 * <p>
 * Not yet done: Doctype subset and marked sections.
 * </p>
 * @author Dave Raggett <a href="mailto:dsr@w3.org">dsr@w3.org </a>
 * @author Andy Quick <a href="mailto:ac.quick@sympatico.ca">ac.quick@sympatico.ca </a> (translation to Java)
 * @author Fabrizio Giustina
 * @version $Revision: 807 $ ($Author: fgiust $)
 */
77 public class Lexer
78 {
79
80 /**
81 * state: ignore whitespace.
82 */
83 public static final short IGNORE_WHITESPACE = 0;
84
85 /**
86 * state: mixed content.
87 */
88 public static final short MIXED_CONTENT = 1;
89
90 /**
91 * state: preformatted.
92 */
93 public static final short PREFORMATTED = 2;
94
95 /**
96 * state: ignore markup.
97 */
98 public static final short IGNORE_MARKUP = 3;
99
100 /**
101 * URI for XHTML 1.0 transitional DTD.
102 */
103 private static final String VOYAGER_LOOSE = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd";
104
105 /**
106 * URI for XHTML 1.0 strict DTD.
107 */
108 private static final String VOYAGER_STRICT = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd";
109
110 /**
111 * URI for XHTML 1.0 frameset DTD.
112 */
113 private static final String VOYAGER_FRAMESET = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd";
114
115 /**
116 * URI for XHTML 1.1.
117 */
118 private static final String VOYAGER_11 = "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd";
119
// NOTE: no constant for the XHTML Basic 1.0 URI is declared in this class.
123
124 /**
125 * xhtml namespace.
126 */
127 private static final String XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml";
128
129 /**
130 * lists all the known versions.
131 */
132 private static final Lexer.W3CVersionInfo[] W3CVERSION = {
133 new W3CVersionInfo("HTML 4.01", "XHTML 1.0 Strict", VOYAGER_STRICT, Dict.VERS_HTML40_STRICT),
134 new W3CVersionInfo("HTML 4.01 Transitional", "XHTML 1.0 Transitional", VOYAGER_LOOSE, Dict.VERS_HTML40_LOOSE),
135 new W3CVersionInfo("HTML 4.01 Frameset", "XHTML 1.0 Frameset", VOYAGER_FRAMESET, Dict.VERS_FRAMESET),
136 new W3CVersionInfo("HTML 4.0", "XHTML 1.0 Strict", VOYAGER_STRICT, Dict.VERS_HTML40_STRICT),
137 new W3CVersionInfo("HTML 4.0 Transitional", "XHTML 1.0 Transitional", VOYAGER_LOOSE, Dict.VERS_HTML40_LOOSE),
138 new W3CVersionInfo("HTML 4.0 Frameset", "XHTML 1.0 Frameset", VOYAGER_FRAMESET, Dict.VERS_FRAMESET),
139 new W3CVersionInfo("HTML 3.2", "XHTML 1.0 Transitional", VOYAGER_LOOSE, Dict.VERS_HTML32),
140 new W3CVersionInfo("HTML 3.2 Final", "XHTML 1.0 Transitional", VOYAGER_LOOSE, Dict.VERS_HTML32),
141 new W3CVersionInfo("HTML 3.2 Draft", "XHTML 1.0 Transitional", VOYAGER_LOOSE, Dict.VERS_HTML32),
142 new W3CVersionInfo("HTML 2.0", "XHTML 1.0 Strict", VOYAGER_STRICT, Dict.VERS_HTML20),
143 new W3CVersionInfo("HTML 4.01", "XHTML 1.1", VOYAGER_STRICT, Dict.VERS_XHTML11)};
144
145 /**
146 * getToken state: content.
147 */
148 private static final short LEX_CONTENT = 0;
149
150 /**
151 * getToken state: gt.
152 */
153 private static final short LEX_GT = 1;
154
155 /**
156 * getToken state: endtag.
157 */
158 private static final short LEX_ENDTAG = 2;
159
160 /**
161 * getToken state: start tag.
162 */
163 private static final short LEX_STARTTAG = 3;
164
165 /**
166 * getToken state: comment.
167 */
168 private static final short LEX_COMMENT = 4;
169
170 /**
171 * getToken state: doctype.
172 */
173 private static final short LEX_DOCTYPE = 5;
174
175 /**
176 * getToken state: procinstr.
177 */
178 private static final short LEX_PROCINSTR = 6;
179
180 /**
181 * getToken state: cdata.
182 */
183 private static final short LEX_CDATA = 8;
184
185 /**
186 * getToken state: section.
187 */
188 private static final short LEX_SECTION = 9;
189
190 /**
191 * getToken state: asp.
192 */
193 private static final short LEX_ASP = 10;
194
195 /**
196 * getToken state: jste.
197 */
198 private static final short LEX_JSTE = 11;
199
200 /**
201 * getToken state: php.
202 */
203 private static final short LEX_PHP = 12;
204
205 /**
206 * getToken state: xml declaration.
207 */
208 private static final short LEX_XMLDECL = 13;
209
210 /**
211 * file stream.
212 */
213 protected StreamIn in;
214
215 /**
216 * error output stream.
217 */
218 protected PrintWriter errout;
219
220 /**
221 * for accessibility errors.
222 */
223 protected short badAccess;
224
225 /**
226 * for bad style errors.
227 */
228 protected short badLayout;
229
230 /**
231 * for bad char encodings.
232 */
233 protected short badChars;
234
235 /**
236 * for mismatched/mispositioned form tags.
237 */
238 protected short badForm;
239
240 /**
241 * count of warnings in this document.
242 */
243 protected short warnings;
244
245 /**
246 * count of errors.
247 */
248 protected short errors;
249
250 /**
251 * lines seen.
252 */
253 protected int lines;
254
255 /**
256 * at start of current token.
257 */
258 protected int columns;
259
260 /**
261 * used to collapse contiguous white space.
262 */
263 protected boolean waswhite;
264
265 /**
266 * true after token has been pushed back.
267 */
268 protected boolean pushed;
269
270 /**
271 * when space is moved after end tag.
272 */
273 protected boolean insertspace;
274
275 /**
276 * Netscape compatibility.
277 */
278 protected boolean excludeBlocks;
279
280 /**
281 * true if moved out of table.
282 */
283 protected boolean exiled;
284
285 /**
286 * true if xmlns attribute on html element.
287 */
288 protected boolean isvoyager;
289
290 /**
291 * bit vector of HTML versions.
292 */
293 protected short versions;
294
295 /**
296 * version as given by doctype (if any).
297 */
298 protected int doctype;
299
300 /**
301 * set if html or PUBLIC is missing.
302 */
303 protected boolean badDoctype;
304
305 /**
306 * start of current node.
307 */
308 protected int txtstart;
309
310 /**
311 * end of current node.
312 */
313 protected int txtend;
314
315 /**
316 * state of lexer's finite state machine.
317 */
318 protected short state;
319
320 /**
321 * current node.
322 */
323 protected Node token;
324
325 /**
326 * Lexer character buffer parse tree nodes span onto this buffer which contains the concatenated text contents of
327 * all of the elements. Lexsize must be reset for each file. Byte buffer of UTF-8 chars.
328 */
329 protected byte[] lexbuf;
330
331 /**
332 * allocated.
333 */
334 protected int lexlength;
335
336 /**
337 * used.
338 */
339 protected int lexsize;
340
341 /**
342 * Inline stack for compatibility with Mosaic. For deferring text node.
343 */
344 protected Node inode;
345
346 /**
347 * for inferring inline tags.
348 */
349 protected int insert;
350
351 /**
352 * stack.
353 */
354 protected Stack istack;
355
356 /**
357 * start of frame.
358 */
359 protected int istackbase;
360
361 /**
362 * used for cleaning up presentation markup.
363 */
364 protected Style styles;
365
366 /**
367 * configuration.
368 */
369 protected Configuration configuration;
370
371 /**
372 * already seen end body tag?
373 */
374 protected boolean seenEndBody;
375
376 /**
377 * already seen end html tag?
378 */
379 protected boolean seenEndHtml;
380
381 /**
382 * report.
383 */
384 protected Report report;
385
386 /**
387 * Root node is saved here.
388 */
389 protected Node root;
390
391 /**
392 * node list.
393 */
394 private List nodeList;
395
/**
 * Builds a lexer over the given input. Line and column counters start at 1, the state machine starts in the content
 * state, all (including proprietary) versions are initially considered possible and the doctype version is unknown.
 * @param in StreamIn to read from
 * @param configuration configuration instance
 * @param report report instance, for reporting errors
 */
public Lexer(StreamIn in, Configuration configuration, Report report)
{
    // collaborators
    this.in = in;
    this.configuration = configuration;
    this.report = report;

    // position and state machine
    this.lines = 1;
    this.columns = 1;
    this.state = LEX_CONTENT;
    this.insert = -1;

    // version tracking
    this.versions = (Dict.VERS_ALL | Dict.VERS_PROPRIETARY);
    this.doctype = Dict.VERS_UNKNOWN;

    // containers
    this.istack = new Stack();
    this.nodeList = new Vector();
}
416
417 /**
418 * Creates a new node and add it to nodelist.
419 * @return Node
420 */
421 public Node newNode()
422 {
423 Node node = new Node();
424 this.nodeList.add(node);
425 return node;
426 }
427
428 /**
429 * Creates a new node and add it to nodelist.
430 * @param type node type: Node.ROOT_NODE | Node.DOCTYPE_TAG | Node.COMMENT_TAG | Node.PROC_INS_TAG | Node.TEXT_NODE |
431 * Node.START_TAG | Node.END_TAG | Node.START_END_TAG | Node.CDATA_TAG | Node.SECTION_TAG | Node. ASP_TAG |
432 * Node.JSTE_TAG | Node.PHP_TAG | Node.XML_DECL
433 * @param textarray array of bytes contained in the Node
434 * @param start start position
435 * @param end end position
436 * @return Node
437 */
438 public Node newNode(short type, byte[] textarray, int start, int end)
439 {
440 Node node = new Node(type, textarray, start, end);
441 this.nodeList.add(node);
442 return node;
443 }
444
445 /**
446 * Creates a new node and add it to nodelist.
447 * @param type node type: Node.ROOT_NODE | Node.DOCTYPE_TAG | Node.COMMENT_TAG | Node.PROC_INS_TAG | Node.TEXT_NODE |
448 * Node.START_TAG | Node.END_TAG | Node.START_END_TAG | Node.CDATA_TAG | Node.SECTION_TAG | Node. ASP_TAG |
449 * Node.JSTE_TAG | Node.PHP_TAG | Node.XML_DECL
450 * @param textarray array of bytes contained in the Node
451 * @param start start position
452 * @param end end position
453 * @param element tag name
454 * @return Node
455 */
456 public Node newNode(short type, byte[] textarray, int start, int end, String element)
457 {
458 Node node = new Node(type, textarray, start, end, element, this.configuration.tt);
459 this.nodeList.add(node);
460 return node;
461 }
462
463 /**
464 * Clones a node and add it to node list.
465 * @param node Node
466 * @return cloned Node
467 */
468 public Node cloneNode(Node node)
469 {
470 Node cnode = (Node) node.clone();
471 this.nodeList.add(cnode);
472 for (AttVal att = cnode.attributes; att != null; att = att.next)
473 {
474 if (att.asp != null)
475 {
476 this.nodeList.add(att.asp);
477 }
478 if (att.php != null)
479 {
480 this.nodeList.add(att.php);
481 }
482 }
483 return cnode;
484 }
485
486 /**
487 * Clones an attribute value and add eventual asp or php node to node list.
488 * @param attrs original AttVal
489 * @return cloned AttVal
490 */
491 public AttVal cloneAttributes(AttVal attrs)
492 {
493 AttVal cattrs = (AttVal) attrs.clone();
494 for (AttVal att = cattrs; att != null; att = att.next)
495 {
496 if (att.asp != null)
497 {
498 this.nodeList.add(att.asp);
499 }
500 if (att.php != null)
501 {
502 this.nodeList.add(att.php);
503 }
504 }
505 return cattrs;
506 }
507
508 /**
509 * Update <code>oldtextarray</code> in the current nodes.
510 * @param oldtextarray previous text array
511 * @param newtextarray new text array
512 */
513 protected void updateNodeTextArrays(byte[] oldtextarray, byte[] newtextarray)
514 {
515 Node node;
516 for (int i = 0; i < this.nodeList.size(); i++)
517 {
518 node = (Node) (this.nodeList.get(i));
519 if (node.textarray == oldtextarray)
520 {
521 node.textarray = newtextarray;
522 }
523 }
524 }
525
526 /**
527 * Adds a new line node. Used for creating preformatted text from Word2000.
528 * @return new line node
529 */
530 public Node newLineNode()
531 {
532 Node node = newNode();
533
534 node.textarray = this.lexbuf;
535 node.start = this.lexsize;
536 addCharToLexer('\n');
537 node.end = this.lexsize;
538 return node;
539 }
540
541 /**
542 * Has end of input stream been reached?
543 * @return <code>true</code> if end of input stream been reached
544 */
545 public boolean endOfInput()
546 {
547 return this.in.isEndOfStream();
548 }
549
550 /**
551 * Adds a byte to lexer buffer.
552 * @param c byte to add
553 */
554 public void addByte(int c)
555 {
556 if (this.lexsize + 1 >= this.lexlength)
557 {
558 while (this.lexsize + 1 >= this.lexlength)
559 {
560 if (this.lexlength == 0)
561 {
562 this.lexlength = 8192;
563 }
564 else
565 {
566 this.lexlength = this.lexlength * 2;
567 }
568 }
569
570 byte[] temp = this.lexbuf;
571 this.lexbuf = new byte[this.lexlength];
572 if (temp != null)
573 {
574 System.arraycopy(temp, 0, this.lexbuf, 0, temp.length);
575 updateNodeTextArrays(temp, this.lexbuf);
576 }
577 }
578
579 this.lexbuf[this.lexsize++] = (byte) c;
580 this.lexbuf[this.lexsize] = (byte) '\0';
581 }
582
583 /**
584 * Substitute the last char in buffer.
585 * @param c new char
586 */
587 public void changeChar(byte c)
588 {
589 if (this.lexsize > 0)
590 {
591 this.lexbuf[this.lexsize - 1] = c;
592 }
593 }
594
595 /**
596 * Store char c as UTF-8 encoded byte stream.
597 * @param c char to store
598 */
599 public void addCharToLexer(int c)
600 {
601
602
603
604 if ((this.configuration.xmlOut || this.configuration.xHTML)
605 && !((c >= 0x20 && c <= 0xD7FF)
606 || c == 0x9
607 || c == 0xA
608 || c == 0xD
609 || (c >= 0xE000 && c <= 0xFFFD)
610 || (c >= 0x10000 && c <= 0x10FFFF)))
611 {
612 return;
613 }
614
615 int i = 0;
616 int[] count = new int[]{0};
617 byte[] buf = new byte[10];
618
619 boolean err = EncodingUtils.encodeCharToUTF8Bytes(c, buf, null, count);
620 if (err)
621 {
622
623 buf[0] = (byte) 0xEF;
624 buf[1] = (byte) 0xBF;
625 buf[2] = (byte) 0xBD;
626 count[0] = 3;
627 }
628
629 for (i = 0; i < count[0]; i++)
630 {
631 addByte(buf[i]);
632 }
633
634 }
635
636 /**
637 * Adds a string to lexer buffer.
638 * @param str String to add
639 */
640 public void addStringToLexer(String str)
641 {
642 for (int i = 0; i < str.length(); i++)
643 {
644 addCharToLexer(str.charAt(i));
645 }
646 }
647
648 /**
649 * Parse an html entity.
650 * @param mode mode
651 */
652 public void parseEntity(short mode)
653 {
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680 int start;
681 boolean first = true;
682 boolean semicolon = false;
683 int c, ch, startcol;
684 String str;
685
686 start = this.lexsize - 1;
687 startcol = this.in.getCurcol() - 1;
688
689 while ((c = this.in.readChar()) != StreamIn.END_OF_STREAM)
690 {
691 if (c == ';')
692 {
693 semicolon = true;
694 break;
695 }
696
697 if (first && c == '#')
698 {
699
700 if (!this.configuration.ncr
701 || "BIG5".equals(this.configuration.getInCharEncodingName())
702 || "SHIFTJIS".equals(this.configuration.getInCharEncodingName()))
703 {
704 this.in.ungetChar(c);
705 return;
706 }
707
708
709 addCharToLexer(c);
710 first = false;
711 continue;
712 }
713
714 first = false;
715
716 if (TidyUtils.isNamechar((char) c))
717 {
718 addCharToLexer(c);
719 continue;
720 }
721
722
723 this.in.ungetChar(c);
724 break;
725 }
726
727 str = TidyUtils.getString(this.lexbuf, start, this.lexsize - start);
728
729 if ("&apos".equals(str) && !configuration.xmlOut && !this.isvoyager && !configuration.xHTML)
730 {
731 report.entityError(this, Report.APOS_UNDEFINED, str, 39);
732 }
733
734 ch = EntityTable.getDefaultEntityTable().entityCode(str);
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749 if (ch <= 0 || (ch >= 256 && c != ';'))
750 {
751
752 this.lines = this.in.getCurline();
753 this.columns = startcol;
754
755 if (this.lexsize > start + 1)
756 {
757 if (ch >= 128 && ch <= 159)
758 {
759
760 int c1 = 0;
761
762 if ("WIN1252".equals(configuration.replacementCharEncoding))
763 {
764 c1 = EncodingUtils.decodeWin1252(ch);
765 }
766 else if ("MACROMAN".equals(configuration.replacementCharEncoding))
767 {
768 c1 = EncodingUtils.decodeMacRoman(ch);
769 }
770
771
772
773 int replaceMode = c1 != 0 ? Report.REPLACED_CHAR : Report.DISCARDED_CHAR;
774
775 if (c != ';')
776 {
777 report.entityError(this, Report.MISSING_SEMICOLON_NCR, str, c);
778 }
779
780 report.encodingError(this, (short) (Report.INVALID_NCR | replaceMode), ch);
781
782 if (c1 != 0)
783 {
784
785 this.lexsize = start;
786 addCharToLexer(c1);
787 semicolon = false;
788 }
789 else
790 {
791
792 this.lexsize = start;
793 semicolon = false;
794 }
795
796 }
797 else
798 {
799 report.entityError(this, Report.UNKNOWN_ENTITY, str, ch);
800 }
801
802 if (semicolon)
803 {
804 addCharToLexer(';');
805 }
806 }
807 else
808 {
809
810 report.entityError(this, Report.UNESCAPED_AMPERSAND, str, ch);
811 }
812 }
813 else
814 {
815
816 if (c != ';')
817 {
818
819 this.lines = this.in.getCurline();
820 this.columns = startcol;
821 report.entityError(this, Report.MISSING_SEMICOLON, str, c);
822 }
823
824 this.lexsize = start;
825
826 if (ch == 160 && TidyUtils.toBoolean(mode & PREFORMATTED))
827 {
828 ch = ' ';
829 }
830
831 addCharToLexer(ch);
832
833 if (ch == '&' && !this.configuration.quoteAmpersand)
834 {
835 addCharToLexer('a');
836 addCharToLexer('m');
837 addCharToLexer('p');
838 addCharToLexer(';');
839 }
840 }
841 }
842
843 /**
844 * Parses a tag name.
845 * @return first char after the tag name
846 */
847 public char parseTagName()
848 {
849 int c;
850
851
852 c = this.lexbuf[this.txtstart];
853
854 if (!this.configuration.xmlTags && TidyUtils.isUpper((char) c))
855 {
856 c = TidyUtils.toLower((char) c);
857 this.lexbuf[this.txtstart] = (byte) c;
858 }
859
860 while ((c = this.in.readChar()) != StreamIn.END_OF_STREAM)
861 {
862 if (!TidyUtils.isNamechar((char) c))
863 {
864 break;
865 }
866
867
868 if (!this.configuration.xmlTags && TidyUtils.isUpper((char) c))
869 {
870 c = TidyUtils.toLower((char) c);
871 }
872
873 addCharToLexer(c);
874 }
875
876 this.txtend = this.lexsize;
877 return (char) c;
878 }
879
880 /**
881 * calls addCharToLexer for any char in the string.
882 * @param str input String
883 */
884 public void addStringLiteral(String str)
885 {
886 int len = str.length();
887 for (int i = 0; i < len; i++)
888 {
889 addCharToLexer(str.charAt(i));
890 }
891 }
892
893 /**
894 * calls addCharToLexer for any char in the string till len is reached.
895 * @param str input String
896 * @param len length of the substring to be added
897 */
898 void addStringLiteralLen(String str, int len)
899 {
900 int strlen = str.length();
901 if (strlen < len)
902 {
903 len = strlen;
904 }
905 for (int i = 0; i < len; i++)
906 {
907 addCharToLexer(str.charAt(i));
908 }
909 }
910
911 /**
912 * Choose what version to use for new doctype.
913 * @return html version constant
914 */
915 public short htmlVersion()
916 {
917 if (TidyUtils.toBoolean(versions & Dict.VERS_HTML20))
918 {
919 return Dict.VERS_HTML20;
920 }
921
922 if (!(this.configuration.xmlOut | this.configuration.xmlTags | this.isvoyager)
923 && TidyUtils.toBoolean(versions & Dict.VERS_HTML32))
924 {
925 return Dict.VERS_HTML32;
926 }
927 if (TidyUtils.toBoolean(versions & Dict.VERS_XHTML11))
928 {
929 return Dict.VERS_XHTML11;
930 }
931 if (TidyUtils.toBoolean(versions & Dict.VERS_HTML40_STRICT))
932 {
933 return Dict.VERS_HTML40_STRICT;
934 }
935
936 if (TidyUtils.toBoolean(versions & Dict.VERS_HTML40_LOOSE))
937 {
938 return Dict.VERS_HTML40_LOOSE;
939 }
940
941 if (TidyUtils.toBoolean(versions & Dict.VERS_FRAMESET))
942 {
943 return Dict.VERS_FRAMESET;
944 }
945
946 return Dict.VERS_UNKNOWN;
947 }
948
949 /**
950 * Choose what version to use for new doctype.
951 * @return html version name
952 */
953 public String htmlVersionName()
954 {
955 short guessed;
956 int j;
957
958 guessed = apparentVersion();
959
960 for (j = 0; j < W3CVERSION.length; ++j)
961 {
962 if (guessed == W3CVERSION[j].code)
963 {
964 if (this.isvoyager)
965 {
966 return W3CVERSION[j].voyagerName;
967 }
968
969 return W3CVERSION[j].name;
970 }
971 }
972
973 return null;
974 }
975
976 /**
977 * Add meta element for Tidy. If the meta tag is already present, update release date.
978 * @param root root node
979 * @return <code>true</code> if the tag has been added
980 */
981 public boolean addGenerator(Node root)
982 {
983 AttVal attval;
984 Node node;
985 Node head = root.findHEAD(this.configuration.tt);
986
987 if (head != null)
988 {
989 String meta = "HTML Tidy for Java (vers. " + Report.RELEASE_DATE_STRING + "), see www.w3.org";
990
991 for (node = head.content; node != null; node = node.next)
992 {
993 if (node.tag == this.configuration.tt.tagMeta)
994 {
995 attval = node.getAttrByName("name");
996
997 if (attval != null && attval.value != null && "generator".equalsIgnoreCase(attval.value))
998 {
999 attval = node.getAttrByName("content");
1000
1001 if (attval != null
1002 && attval.value != null
1003 && attval.value.length() >= 9
1004 && "HTML Tidy".equalsIgnoreCase(attval.value.substring(0, 9)))
1005 {
1006 attval.value = meta;
1007 return false;
1008 }
1009 }
1010 }
1011 }
1012
1013 node = this.inferredTag("meta");
1014 node.addAttribute("content", meta);
1015 node.addAttribute("name", "generator");
1016 head.insertNodeAtStart(node);
1017 return true;
1018 }
1019
1020 return false;
1021 }
1022
1023 /**
1024 * Check system keywords (keywords should be uppercase).
1025 * @param doctype doctype node
1026 * @return true if doctype keywords are all uppercase
1027 */
1028 public boolean checkDocTypeKeyWords(Node doctype)
1029 {
1030 int len = doctype.end - doctype.start;
1031 String s = TidyUtils.getString(this.lexbuf, doctype.start, len);
1032
1033 return !(TidyUtils.findBadSubString("SYSTEM", s, len)
1034 || TidyUtils.findBadSubString("PUBLIC", s, len)
1035 || TidyUtils.findBadSubString("//DTD", s, len)
1036 || TidyUtils.findBadSubString("//W3C", s, len) || TidyUtils.findBadSubString("//EN", s, len));
1037 }
1038
1039 /**
1040 * Examine DOCTYPE to identify version.
1041 * @param doctype doctype node
1042 * @return version code
1043 */
1044 public short findGivenVersion(Node doctype)
1045 {
1046 String p, s;
1047 int i, j;
1048 int len;
1049 String str1;
1050 String str2;
1051
1052
1053 str1 = TidyUtils.getString(this.lexbuf, doctype.start, 5);
1054 if (!"html ".equalsIgnoreCase(str1))
1055 {
1056 return 0;
1057 }
1058
1059 if (!checkDocTypeKeyWords(doctype))
1060 {
1061 report.warning(this, doctype, null, Report.DTYPE_NOT_UPPER_CASE);
1062 }
1063
1064
1065 str1 = TidyUtils.getString(this.lexbuf, doctype.start + 5, 7);
1066 if ("SYSTEM ".equalsIgnoreCase(str1))
1067 {
1068
1069 if (!str1.substring(0, 6).equals("SYSTEM"))
1070 {
1071 System.arraycopy(TidyUtils.getBytes("SYSTEM"), 0, this.lexbuf, doctype.start + 5, 6);
1072 }
1073 return 0;
1074 }
1075
1076 if ("PUBLIC ".equalsIgnoreCase(str1))
1077 {
1078 if (!str1.substring(0, 6).equals("PUBLIC"))
1079 {
1080 System.arraycopy(TidyUtils.getBytes("PUBLIC "), 0, this.lexbuf, doctype.start + 5, 6);
1081 }
1082 }
1083 else
1084 {
1085 this.badDoctype = true;
1086 }
1087
1088 for (i = doctype.start; i < doctype.end; ++i)
1089 {
1090 if (this.lexbuf[i] == (byte) '"')
1091 {
1092 str1 = TidyUtils.getString(this.lexbuf, i + 1, 12);
1093 str2 = TidyUtils.getString(this.lexbuf, i + 1, 13);
1094 if (str1.equals("-//W3C//DTD "))
1095 {
1096
1097 for (j = i + 13; j < doctype.end && this.lexbuf[j] != (byte) '/'; ++j)
1098 {
1099
1100 }
1101 len = j - i - 13;
1102 p = TidyUtils.getString(this.lexbuf, i + 13, len);
1103
1104 for (j = 1; j < W3CVERSION.length; ++j)
1105 {
1106 s = W3CVERSION[j].name;
1107 if (len == s.length() && s.equals(p))
1108 {
1109 return W3CVERSION[j].code;
1110 }
1111 }
1112
1113
1114 }
1115 else if (str2.equals("-//IETF//DTD "))
1116 {
1117
1118 for (j = i + 14; j < doctype.end && this.lexbuf[j] != (byte) '/'; ++j)
1119 {
1120
1121 }
1122 len = j - i - 14;
1123
1124 p = TidyUtils.getString(this.lexbuf, i + 14, len);
1125 s = W3CVERSION[0].name;
1126 if (len == s.length() && s.equals(p))
1127 {
1128 return W3CVERSION[0].code;
1129 }
1130
1131
1132 }
1133 break;
1134 }
1135 }
1136
1137 return 0;
1138 }
1139
1140 /**
1141 * Fix xhtml namespace.
1142 * @param root root Node
1143 * @param profile current profile
1144 */
1145 public void fixHTMLNameSpace(Node root, String profile)
1146 {
1147 Node node;
1148 AttVal attr;
1149
1150 node = root.content;
1151 while (node != null && node.tag != this.configuration.tt.tagHtml)
1152 {
1153 node = node.next;
1154 }
1155
1156 if (node != null)
1157 {
1158
1159 for (attr = node.attributes; attr != null; attr = attr.next)
1160 {
1161 if (attr.attribute.equals("xmlns"))
1162 {
1163 break;
1164 }
1165
1166 }
1167
1168 if (attr != null)
1169 {
1170 if (!attr.value.equals(profile))
1171 {
1172 report.warning(this, node, null, Report.INCONSISTENT_NAMESPACE);
1173 attr.value = profile;
1174 }
1175 }
1176 else
1177 {
1178 attr = new AttVal(node.attributes, null, '"', "xmlns", profile);
1179 attr.dict = AttributeTable.getDefaultAttributeTable().findAttribute(attr);
1180 node.attributes = attr;
1181 }
1182 }
1183 }
1184
1185 /**
1186 * Put DOCTYPE declaration between the <:?xml version "1.0" ... ?> declaration, if any, and the
1187 * <code>html</code> tag. Should also work for any comments, etc. that may precede the <code>html</code> tag.
1188 * @param root root node
1189 * @return new doctype node
1190 */
1191 Node newXhtmlDocTypeNode(Node root)
1192 {
1193 Node html = root.findHTML(this.configuration.tt);
1194 if (html == null)
1195 {
1196 return null;
1197 }
1198
1199 Node newdoctype = newNode();
1200 newdoctype.setType(Node.DOCTYPE_TAG);
1201 newdoctype.next = html;
1202 newdoctype.parent = root;
1203 newdoctype.prev = null;
1204
1205 if (html == root.content)
1206 {
1207
1208 root.content.prev = newdoctype;
1209 root.content = newdoctype;
1210 newdoctype.prev = null;
1211 }
1212 else
1213 {
1214
1215 newdoctype.prev = html.prev;
1216 newdoctype.prev.next = newdoctype;
1217 }
1218 html.prev = newdoctype;
1219 return newdoctype;
1220 }
1221
1222 /**
1223 * Adds a new xhtml doctype to the document.
1224 * @param root root node
1225 * @return <code>true</code> if a doctype has been added
1226 */
1227 public boolean setXHTMLDocType(Node root)
1228 {
1229 String fpi = " ";
1230 String sysid = "";
1231 String namespace = XHTML_NAMESPACE;
1232 String dtdsub = null;
1233 Node doctype;
1234 int dtdlen = 0;
1235
1236 doctype = root.findDocType();
1237
1238 fixHTMLNameSpace(root, namespace);
1239
1240 if (this.configuration.docTypeMode == Configuration.DOCTYPE_OMIT)
1241 {
1242 if (doctype != null)
1243 {
1244 Node.discardElement(doctype);
1245 }
1246 return true;
1247 }
1248
1249 if (this.configuration.docTypeMode == Configuration.DOCTYPE_AUTO)
1250 {
1251
1252 if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML40_STRICT))
1253 {
1254
1255 fpi = "-//W3C//DTD XHTML 1.0 Strict//EN";
1256 sysid = VOYAGER_STRICT;
1257 }
1258 else if (TidyUtils.toBoolean(this.versions & Dict.VERS_FRAMESET))
1259 {
1260
1261 fpi = "-//W3C//DTD XHTML 1.0 Frameset//EN";
1262 sysid = VOYAGER_FRAMESET;
1263 }
1264 else if (TidyUtils.toBoolean(this.versions & Dict.VERS_LOOSE))
1265 {
1266 fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN";
1267 sysid = VOYAGER_LOOSE;
1268 }
1269 else if (TidyUtils.toBoolean(this.versions & Dict.VERS_XHTML11))
1270 {
1271
1272 fpi = "-//W3C//DTD XHTML 1.1//EN";
1273 sysid = VOYAGER_11;
1274 }
1275 else
1276 {
1277
1278 fpi = null;
1279 sysid = "";
1280 if (doctype != null)
1281 {
1282 Node.discardElement(doctype);
1283 }
1284 }
1285 }
1286 else if (this.configuration.docTypeMode == Configuration.DOCTYPE_STRICT)
1287 {
1288 fpi = "-//W3C//DTD XHTML 1.0 Strict//EN";
1289 sysid = VOYAGER_STRICT;
1290 }
1291 else if (this.configuration.docTypeMode == Configuration.DOCTYPE_LOOSE)
1292 {
1293 fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN";
1294 sysid = VOYAGER_LOOSE;
1295 }
1296
1297 if (this.configuration.docTypeMode == Configuration.DOCTYPE_USER && this.configuration.docTypeStr != null)
1298 {
1299 fpi = this.configuration.docTypeStr;
1300 sysid = "";
1301 }
1302
1303 if (fpi == null)
1304 {
1305 return false;
1306 }
1307
1308 if (doctype != null)
1309 {
1310
1311 if (configuration.xHTML || configuration.xmlOut)
1312 {
1313
1314 int len = doctype.end - doctype.start + 1;
1315 String start = TidyUtils.getString(this.lexbuf, doctype.start, len);
1316
1317 int dtdbeg = start.indexOf('[');
1318 if (dtdbeg >= 0)
1319 {
1320 int dtdend = start.substring(dtdbeg).indexOf(']');
1321 if (dtdend >= 0)
1322 {
1323 dtdlen = dtdend + 1;
1324 dtdsub = start.substring(dtdbeg);
1325 }
1326 }
1327 }
1328 }
1329 else
1330 {
1331 if ((doctype = newXhtmlDocTypeNode(root)) == null)
1332 {
1333 return false;
1334 }
1335 }
1336
1337 this.txtstart = this.lexsize;
1338 this.txtend = this.lexsize;
1339
1340
1341 addStringLiteral("html PUBLIC ");
1342
1343
1344 if (fpi.charAt(0) == '"')
1345 {
1346 addStringLiteral(fpi);
1347 }
1348 else
1349 {
1350 addStringLiteral("\"");
1351 addStringLiteral(fpi);
1352 addStringLiteral("\"");
1353 }
1354
1355 if (this.configuration.wraplen != 0 && sysid.length() + 6 >= this.configuration.wraplen)
1356 {
1357 addStringLiteral("\n\"");
1358 }
1359 else
1360 {
1361
1362 addStringLiteral(" \"");
1363 }
1364
1365
1366 addStringLiteral(sysid);
1367 addStringLiteral("\"");
1368
1369 if (dtdlen > 0 && dtdsub != null)
1370 {
1371 addCharToLexer(' ');
1372 addStringLiteralLen(dtdsub, dtdlen);
1373 }
1374
1375 this.txtend = this.lexsize;
1376
1377 int length = this.txtend - this.txtstart;
1378 doctype.textarray = new byte[length];
1379
1380 System.arraycopy(this.lexbuf, this.txtstart, doctype.textarray, 0, length);
1381 doctype.start = 0;
1382 doctype.end = length;
1383
1384 return false;
1385 }
1386
1387 /**
1388 * Return the html version used in document.
1389 * @return version code
1390 */
1391 public short apparentVersion()
1392 {
1393 switch (this.doctype)
1394 {
1395 case Dict.VERS_UNKNOWN :
1396 return htmlVersion();
1397
1398 case Dict.VERS_HTML20 :
1399 if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML20))
1400 {
1401 return Dict.VERS_HTML20;
1402 }
1403
1404 break;
1405
1406 case Dict.VERS_HTML32 :
1407 if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML32))
1408 {
1409 return Dict.VERS_HTML32;
1410 }
1411
1412 break;
1413
1414 case Dict.VERS_HTML40_STRICT :
1415 if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML40_STRICT))
1416 {
1417 return Dict.VERS_HTML40_STRICT;
1418 }
1419
1420 break;
1421
1422 case Dict.VERS_HTML40_LOOSE :
1423 if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML40_LOOSE))
1424 {
1425 return Dict.VERS_HTML40_LOOSE;
1426 }
1427
1428 break;
1429
1430 case Dict.VERS_FRAMESET :
1431 if (TidyUtils.toBoolean(this.versions & Dict.VERS_FRAMESET))
1432 {
1433 return Dict.VERS_FRAMESET;
1434 }
1435
1436 break;
1437
1438 case Dict.VERS_XHTML11 :
1439 if (TidyUtils.toBoolean(this.versions & Dict.VERS_XHTML11))
1440 {
1441 return Dict.VERS_XHTML11;
1442 }
1443
1444 break;
1445 default :
1446
1447 break;
1448 }
1449
1450
1451
1452
1453
1454 this.lines = 1;
1455 this.columns = 1;
1456
1457 report.warning(this, null, null, Report.INCONSISTENT_VERSION);
1458 return this.htmlVersion();
1459 }
1460
1461 /**
1462 * Fixup doctype if missing.
1463 * @param root root node
1464 * @return <code>false</code> if current version has not been identified
1465 */
1466 public boolean fixDocType(Node root)
1467 {
1468 Node doctype;
1469 int guessed = Dict.VERS_HTML40_STRICT, i;
1470
1471 if (this.badDoctype)
1472 {
1473 report.warning(this, null, null, Report.MALFORMED_DOCTYPE);
1474 }
1475
1476 doctype = root.findDocType();
1477
1478 if (this.configuration.docTypeMode == Configuration.DOCTYPE_OMIT)
1479 {
1480 if (doctype != null)
1481 {
1482 Node.discardElement(doctype);
1483 }
1484 return true;
1485 }
1486
1487 if (this.configuration.xmlOut)
1488 {
1489 return true;
1490 }
1491
1492 if (this.configuration.docTypeMode == Configuration.DOCTYPE_STRICT)
1493 {
1494 Node.discardElement(doctype);
1495 doctype = null;
1496 guessed = Dict.VERS_HTML40_STRICT;
1497 }
1498 else if (this.configuration.docTypeMode == Configuration.DOCTYPE_LOOSE)
1499 {
1500 Node.discardElement(doctype);
1501 doctype = null;
1502 guessed = Dict.VERS_HTML40_LOOSE;
1503 }
1504 else if (this.configuration.docTypeMode == Configuration.DOCTYPE_AUTO)
1505 {
1506 if (doctype != null)
1507 {
1508 if (this.doctype == Dict.VERS_UNKNOWN)
1509 {
1510 return false;
1511 }
1512
1513 switch (this.doctype)
1514 {
1515 case Dict.VERS_UNKNOWN :
1516 return false;
1517
1518 case Dict.VERS_HTML20 :
1519 if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML20))
1520 {
1521 return true;
1522 }
1523
1524 break;
1525
1526 case Dict.VERS_HTML32 :
1527 if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML32))
1528 {
1529 return true;
1530 }
1531
1532 break;
1533
1534 case Dict.VERS_HTML40_STRICT :
1535 if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML40_STRICT))
1536 {
1537 return true;
1538 }
1539
1540 break;
1541
1542 case Dict.VERS_HTML40_LOOSE :
1543 if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML40_LOOSE))
1544 {
1545 return true;
1546 }
1547
1548 break;
1549
1550 case Dict.VERS_FRAMESET :
1551 if (TidyUtils.toBoolean(this.versions & Dict.VERS_FRAMESET))
1552 {
1553 return true;
1554 }
1555
1556 break;
1557
1558 case Dict.VERS_XHTML11 :
1559 if (TidyUtils.toBoolean(this.versions & Dict.VERS_XHTML11))
1560 {
1561 return true;
1562 }
1563
1564 break;
1565 default :
1566
1567 break;
1568 }
1569
1570
1571 }
1572
1573
1574 guessed = htmlVersion();
1575 }
1576
1577 if (guessed == Dict.VERS_UNKNOWN)
1578 {
1579 return false;
1580 }
1581
1582
1583 if (this.configuration.xmlOut || this.configuration.xmlTags || this.isvoyager)
1584 {
1585 if (doctype != null)
1586 {
1587 Node.discardElement(doctype);
1588 }
1589
1590 fixHTMLNameSpace(root, XHTML_NAMESPACE);
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604 }
1605
1606 if (doctype == null)
1607 {
1608 if ((doctype = newXhtmlDocTypeNode(root)) == null)
1609 {
1610 return false;
1611 }
1612 }
1613
1614 this.txtstart = this.lexsize;
1615 this.txtend = this.lexsize;
1616
1617
1618 addStringLiteral("html PUBLIC ");
1619
1620 if (this.configuration.docTypeMode == Configuration.DOCTYPE_USER
1621 && this.configuration.docTypeStr != null
1622 && this.configuration.docTypeStr.length() > 0)
1623 {
1624
1625 if (this.configuration.docTypeStr.charAt(0) == '"')
1626 {
1627 addStringLiteral(this.configuration.docTypeStr);
1628 }
1629 else
1630 {
1631 addStringLiteral("\"");
1632 addStringLiteral(this.configuration.docTypeStr);
1633 addStringLiteral("\"");
1634 }
1635 }
1636 else if (guessed == Dict.VERS_HTML20)
1637 {
1638 addStringLiteral("\"-//IETF//DTD HTML 2.0//EN\"");
1639 }
1640 else
1641 {
1642 addStringLiteral("\"-//W3C//DTD ");
1643
1644 for (i = 0; i < W3CVERSION.length; ++i)
1645 {
1646 if (guessed == W3CVERSION[i].code)
1647 {
1648 addStringLiteral(W3CVERSION[i].name);
1649 break;
1650 }
1651 }
1652
1653 addStringLiteral("//EN\"");
1654 }
1655
1656 this.txtend = this.lexsize;
1657
1658 int length = this.txtend - this.txtstart;
1659 doctype.textarray = new byte[length];
1660
1661 System.arraycopy(this.lexbuf, this.txtstart, doctype.textarray, 0, length);
1662 doctype.start = 0;
1663 doctype.end = length;
1664
1665 return true;
1666 }
1667
1668 /**
1669 * Ensure XML document starts with <code><?XML version="1.0"?></code>. Add encoding attribute if not using
1670 * ASCII or UTF-8 output.
1671 * @param root root node
1672 * @return always true
1673 */
1674 public boolean fixXmlDecl(Node root)
1675 {
1676 Node xml;
1677 AttVal version;
1678 AttVal encoding;
1679
1680 if (root.content != null && root.content.type == Node.XML_DECL)
1681 {
1682 xml = root.content;
1683 }
1684 else
1685 {
1686 xml = newNode(Node.XML_DECL, this.lexbuf, 0, 0);
1687 xml.next = root.content;
1688
1689 if (root.content != null)
1690 {
1691 root.content.prev = xml;
1692 xml.next = root.content;
1693 }
1694
1695 root.content = xml;
1696 }
1697
1698 version = xml.getAttrByName("version");
1699 encoding = xml.getAttrByName("encoding");
1700
1701
1702
1703 if (encoding == null && !"UTF8".equals(this.configuration.getOutCharEncodingName()))
1704 {
1705 if ("ISO8859_1".equals(this.configuration.getOutCharEncodingName()))
1706 {
1707 xml.addAttribute("encoding", "iso-8859-1");
1708 }
1709 if ("ISO2022".equals(this.configuration.getOutCharEncodingName()))
1710 {
1711 xml.addAttribute("encoding", "iso-2022");
1712 }
1713 }
1714
1715 if (version == null)
1716 {
1717 xml.addAttribute("version", "1.0");
1718 }
1719
1720 return true;
1721 }
1722
1723 /**
1724 * Generates and inserts a new node.
1725 * @param name tag name
1726 * @return generated node
1727 */
1728 public Node inferredTag(String name)
1729 {
1730 Node node;
1731
1732 node = newNode(Node.START_TAG, this.lexbuf, this.txtstart, this.txtend, name);
1733 node.implicit = true;
1734 return node;
1735 }
1736
1737 /**
1738 * Create a text node for the contents of a CDATA element like style or script which ends with </foo> for some
1739 * foo.
1740 * @param container container node
1741 * @return cdata node
1742 */
1743 public Node getCDATA(Node container)
1744 {
1745 int c, lastc, start, len, i;
1746 int qt = 0;
1747 int esc = 0;
1748 String str;
1749 boolean endtag = false;
1750 boolean begtag = false;
1751
1752 if (container.isJavaScript())
1753 {
1754 esc = '\\';
1755 }
1756
1757 this.lines = this.in.getCurline();
1758 this.columns = this.in.getCurcol();
1759 this.waswhite = false;
1760 this.txtstart = this.lexsize;
1761 this.txtend = this.lexsize;
1762
1763 lastc = '\0';
1764 start = -1;
1765
1766 while ((c = this.in.readChar()) != StreamIn.END_OF_STREAM)
1767 {
1768
1769 if (qt > 0)
1770 {
1771
1772
1773 if ((c == '\r' || c == '\n' || c == qt) && (!TidyUtils.toBoolean(esc) || lastc != esc))
1774 {
1775 qt = 0;
1776 }
1777 else if (c == '/' && lastc == '<')
1778 {
1779 start = this.lexsize + 1;
1780 }
1781
1782 else if (c == '>' && start >= 0)
1783 {
1784 len = this.lexsize - start;
1785
1786 this.lines = this.in.getCurline();
1787 this.columns = this.in.getCurcol() - 3;
1788
1789 report.warning(this, null, null, Report.BAD_CDATA_CONTENT);
1790
1791
1792 if (TidyUtils.toBoolean(esc))
1793 {
1794 for (i = this.lexsize; i > start - 1; --i)
1795 {
1796 this.lexbuf[i] = this.lexbuf[i - 1];
1797 }
1798
1799 this.lexbuf[start - 1] = (byte) esc;
1800 this.lexsize++;
1801 }
1802
1803 start = -1;
1804 }
1805 }
1806 else if (TidyUtils.isQuote(c) && (!TidyUtils.toBoolean(esc) || lastc != esc))
1807 {
1808 qt = c;
1809 }
1810 else if (c == '<')
1811 {
1812 start = this.lexsize + 1;
1813 endtag = false;
1814 begtag = true;
1815 }
1816 else if (c == '!' && lastc == '<')
1817 {
1818 start = -1;
1819 endtag = false;
1820 begtag = false;
1821 }
1822 else if (c == '/' && lastc == '<')
1823 {
1824 start = this.lexsize + 1;
1825 endtag = true;
1826 begtag = false;
1827 }
1828 else if (c == '>' && start >= 0)
1829 {
1830 int decr = 2;
1831
1832 if (endtag && ((len = this.lexsize - start) == container.element.length()))
1833 {
1834
1835 str = TidyUtils.getString(this.lexbuf, start, len);
1836 if (container.element.equalsIgnoreCase(str))
1837 {
1838 this.txtend = start - decr;
1839 this.lexsize = start - decr;
1840 break;
1841 }
1842 }
1843
1844
1845
1846 this.lines = this.in.getCurline();
1847 this.columns = this.in.getCurcol() - 3;
1848
1849 report.warning(this, null, null, Report.BAD_CDATA_CONTENT);
1850 if (begtag)
1851 {
1852 decr = 1;
1853 }
1854 this.txtend = start - decr;
1855 this.lexsize = start - decr;
1856 break;
1857 }
1858
1859 else if (c == '\r')
1860 {
1861 if (begtag || endtag)
1862 {
1863 continue;
1864 }
1865
1866 c = this.in.readChar();
1867
1868 if (c != '\n')
1869 {
1870 this.in.ungetChar(c);
1871 }
1872
1873 c = '\n';
1874
1875 }
1876 else if ((c == '\n' || c == '\t' || c == ' ') && (begtag || endtag))
1877 {
1878 continue;
1879 }
1880
1881 addCharToLexer(c);
1882 this.txtend = this.lexsize;
1883 lastc = c;
1884 }
1885
1886 if (c == StreamIn.END_OF_STREAM)
1887 {
1888 report.warning(this, container, null, Report.MISSING_ENDTAG_FOR);
1889 }
1890
1891 if (this.txtend > this.txtstart)
1892 {
1893 this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend);
1894 return this.token;
1895 }
1896
1897 return null;
1898 }
1899
1900 /**
1901 *
1902 *
1903 */
1904 public void ungetToken()
1905 {
1906 this.pushed = true;
1907 }
1908
1909 /**
1910 * Gets a token.
1911 * @param mode one of the following:
1912 * <ul>
1913 * <li><code>MixedContent</code>-- for elements which don't accept PCDATA</li>
1914 * <li><code>Preformatted</code>-- white spacepreserved as is</li>
1915 * <li><code>IgnoreMarkup</code>-- for CDATA elements such as script, style</li>
1916 * </ul>
1917 * @return next Node
1918 */
1919 public Node getToken(short mode)
1920 {
1921 int c = 0;
1922 int badcomment = 0;
1923
1924 boolean[] isempty = new boolean[1];
1925 boolean inDTDSubset = false;
1926 AttVal attributes = null;
1927
1928 if (this.pushed)
1929 {
1930
1931 if (this.token.type != Node.TEXT_NODE || (this.insert == -1 && this.inode == null))
1932 {
1933 this.pushed = false;
1934 return this.token;
1935 }
1936 }
1937
1938
1939 if (this.insert != -1 || this.inode != null)
1940 {
1941 return insertedToken();
1942 }
1943
1944 this.lines = this.in.getCurline();
1945 this.columns = this.in.getCurcol();
1946 this.waswhite = false;
1947
1948 this.txtstart = this.lexsize;
1949 this.txtend = this.lexsize;
1950
1951 while ((c = this.in.readChar()) != StreamIn.END_OF_STREAM)
1952 {
1953
1954
1955 if (this.insertspace && mode != IGNORE_WHITESPACE)
1956 {
1957 addCharToLexer(' ');
1958 }
1959 if (this.insertspace && (!TidyUtils.toBoolean(mode & IGNORE_WHITESPACE)))
1960 {
1961 this.waswhite = true;
1962 this.insertspace = false;
1963 }
1964
1965
1966 if (c == '\r')
1967 {
1968 c = this.in.readChar();
1969
1970 if (c != '\n')
1971 {
1972 this.in.ungetChar(c);
1973 }
1974
1975 c = '\n';
1976 }
1977
1978 addCharToLexer(c);
1979
1980 switch (this.state)
1981 {
1982 case LEX_CONTENT :
1983
1984
1985
1986
1987
1988 if (TidyUtils.isWhite((char) c) && (mode == IGNORE_WHITESPACE) && this.lexsize == this.txtstart + 1)
1989 {
1990 --this.lexsize;
1991 this.waswhite = false;
1992 this.lines = this.in.getCurline();
1993 this.columns = this.in.getCurcol();
1994 continue;
1995 }
1996
1997 if (c == '<')
1998 {
1999 this.state = LEX_GT;
2000 continue;
2001 }
2002
2003 if (TidyUtils.isWhite((char) c))
2004 {
2005
2006 if (this.waswhite)
2007 {
2008 if (mode != PREFORMATTED && mode != IGNORE_MARKUP)
2009 {
2010 --this.lexsize;
2011 this.lines = this.in.getCurline();
2012 this.columns = this.in.getCurcol();
2013 }
2014 }
2015 else
2016 {
2017
2018 this.waswhite = true;
2019
2020 if (mode != PREFORMATTED && mode != IGNORE_MARKUP && c != ' ')
2021 {
2022 changeChar((byte) ' ');
2023 }
2024 }
2025
2026 continue;
2027 }
2028 else if (c == '&' && mode != IGNORE_MARKUP)
2029 {
2030 parseEntity(mode);
2031 }
2032
2033
2034 if (mode == IGNORE_WHITESPACE)
2035 {
2036 mode = MIXED_CONTENT;
2037 }
2038
2039 this.waswhite = false;
2040 continue;
2041
2042 case LEX_GT :
2043
2044
2045
2046 if (c == '/')
2047 {
2048 c = this.in.readChar();
2049 if (c == StreamIn.END_OF_STREAM)
2050 {
2051 this.in.ungetChar(c);
2052 continue;
2053 }
2054
2055 addCharToLexer(c);
2056
2057 if (TidyUtils.isLetter((char) c))
2058 {
2059 this.lexsize -= 3;
2060 this.txtend = this.lexsize;
2061 this.in.ungetChar(c);
2062 this.state = LEX_ENDTAG;
2063 this.lexbuf[this.lexsize] = (byte) '\0';
2064
2065
2066
2067 this.columns -= 2;
2068
2069
2070 if (this.txtend > this.txtstart)
2071 {
2072
2073 if (mode == IGNORE_WHITESPACE && this.lexbuf[this.lexsize - 1] == (byte) ' ')
2074 {
2075 this.lexsize -= 1;
2076 this.txtend = this.lexsize;
2077 }
2078
2079 this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend);
2080 return this.token;
2081 }
2082
2083 continue;
2084 }
2085
2086
2087 this.waswhite = false;
2088 this.state = LEX_CONTENT;
2089 continue;
2090 }
2091
2092 if (mode == IGNORE_MARKUP)
2093 {
2094
2095 this.waswhite = false;
2096 this.state = LEX_CONTENT;
2097 continue;
2098 }
2099
2100
2101 if (c == '!')
2102 {
2103 c = this.in.readChar();
2104
2105 if (c == '-')
2106 {
2107 c = this.in.readChar();
2108
2109 if (c == '-')
2110 {
2111 this.state = LEX_COMMENT;
2112 this.lexsize -= 2;
2113 this.txtend = this.lexsize;
2114
2115
2116 if (this.txtend > this.txtstart)
2117 {
2118 this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend);
2119 return this.token;
2120 }
2121
2122 this.txtstart = this.lexsize;
2123 continue;
2124 }
2125
2126 report.warning(this, null, null, Report.MALFORMED_COMMENT);
2127 }
2128 else if (c == 'd' || c == 'D')
2129 {
2130 this.state = LEX_DOCTYPE;
2131 this.lexsize -= 2;
2132 this.txtend = this.lexsize;
2133 mode = IGNORE_WHITESPACE;
2134
2135
2136
2137 for (;;)
2138 {
2139 c = this.in.readChar();
2140
2141 if (c == StreamIn.END_OF_STREAM || c == '>')
2142 {
2143 this.in.ungetChar(c);
2144 break;
2145 }
2146
2147 if (!TidyUtils.isWhite((char) c))
2148 {
2149 continue;
2150 }
2151
2152
2153
2154 for (;;)
2155 {
2156 c = this.in.readChar();
2157
2158 if (c == StreamIn.END_OF_STREAM || c == '>')
2159 {
2160 this.in.ungetChar(c);
2161 break;
2162 }
2163
2164 if (TidyUtils.isWhite((char) c))
2165 {
2166 continue;
2167 }
2168
2169 this.in.ungetChar(c);
2170 break;
2171 }
2172
2173 break;
2174 }
2175
2176
2177 if (this.txtend > this.txtstart)
2178 {
2179 this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend);
2180 return this.token;
2181 }
2182
2183 this.txtstart = this.lexsize;
2184 continue;
2185 }
2186 else if (c == '[')
2187 {
2188
2189 this.lexsize -= 2;
2190 this.state = LEX_SECTION;
2191 this.txtend = this.lexsize;
2192
2193
2194 if (this.txtend > this.txtstart)
2195 {
2196 this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend);
2197 return this.token;
2198 }
2199
2200 this.txtstart = this.lexsize;
2201 continue;
2202 }
2203
2204
2205 while (true)
2206 {
2207 c = this.in.readChar();
2208 if (c == '>')
2209 {
2210 break;
2211 }
2212 if (c == -1)
2213 {
2214 this.in.ungetChar(c);
2215 break;
2216 }
2217 }
2218
2219 this.lexsize -= 2;
2220 this.lexbuf[this.lexsize] = (byte) '\0';
2221 this.state = LEX_CONTENT;
2222 continue;
2223 }
2224
2225
2226
2227 if (c == '?')
2228 {
2229 this.lexsize -= 2;
2230 this.state = LEX_PROCINSTR;
2231 this.txtend = this.lexsize;
2232
2233
2234 if (this.txtend > this.txtstart)
2235 {
2236 this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend);
2237 return this.token;
2238 }
2239
2240 this.txtstart = this.lexsize;
2241 continue;
2242 }
2243
2244
2245 if (c == '%')
2246 {
2247 this.lexsize -= 2;
2248 this.state = LEX_ASP;
2249 this.txtend = this.lexsize;
2250
2251
2252 if (this.txtend > this.txtstart)
2253 {
2254 this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend);
2255 return this.token;
2256 }
2257
2258 this.txtstart = this.lexsize;
2259 continue;
2260 }
2261
2262
2263 if (c == '#')
2264 {
2265 this.lexsize -= 2;
2266 this.state = LEX_JSTE;
2267 this.txtend = this.lexsize;
2268
2269
2270 if (this.txtend > this.txtstart)
2271 {
2272 this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend);
2273 return this.token;
2274 }
2275
2276 this.txtstart = this.lexsize;
2277 continue;
2278 }
2279
2280
2281 if (TidyUtils.isLetter((char) c))
2282 {
2283 this.in.ungetChar(c);
2284 this.lexsize -= 2;
2285 this.txtend = this.lexsize;
2286 this.state = LEX_STARTTAG;
2287
2288
2289 if (this.txtend > this.txtstart)
2290 {
2291 this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend);
2292 return this.token;
2293 }
2294
2295 continue;
2296 }
2297
2298
2299 this.state = LEX_CONTENT;
2300 this.waswhite = false;
2301 continue;
2302
2303 case LEX_ENDTAG :
2304
2305 this.txtstart = this.lexsize - 1;
2306
2307
2308
2309 this.columns -= 2;
2310
2311 c = parseTagName();
2312 this.token = newNode(Node.END_TAG,
2313 this.lexbuf,
2314 this.txtstart,
2315 this.txtend,
2316 TidyUtils.getString(this.lexbuf, this.txtstart, this.txtend - this.txtstart));
2317 this.lexsize = this.txtstart;
2318 this.txtend = this.txtstart;
2319
2320
2321 while (c != '>')
2322 {
2323 c = this.in.readChar();
2324
2325 if (c == StreamIn.END_OF_STREAM)
2326 {
2327 break;
2328 }
2329 }
2330
2331 if (c == StreamIn.END_OF_STREAM)
2332 {
2333 this.in.ungetChar(c);
2334 continue;
2335 }
2336
2337 this.state = LEX_CONTENT;
2338 this.waswhite = false;
2339 return this.token;
2340
2341 case LEX_STARTTAG :
2342
2343 this.txtstart = this.lexsize - 1;
2344 c = parseTagName();
2345 isempty[0] = false;
2346 attributes = null;
2347 this.token = newNode(
2348 (isempty[0] ? Node.START_END_TAG : Node.START_TAG),
2349 this.lexbuf,
2350 this.txtstart,
2351 this.txtend,
2352 TidyUtils.getString(this.lexbuf, this.txtstart, this.txtend - this.txtstart));
2353
2354
2355 if (c != '>')
2356 {
2357 if (c == '/')
2358 {
2359 this.in.ungetChar(c);
2360 }
2361
2362 attributes = parseAttrs(isempty);
2363 }
2364
2365 if (isempty[0])
2366 {
2367 this.token.type = Node.START_END_TAG;
2368 }
2369
2370 this.token.attributes = attributes;
2371 this.lexsize = this.txtstart;
2372 this.txtend = this.txtstart;
2373
2374
2375
2376
2377
2378
2379 if (
2380
2381 (mode != PREFORMATTED || preContent(this.token))
2382 && (this.token.expectsContent() || this.token.tag == this.configuration.tt.tagBr))
2383 {
2384
2385 c = this.in.readChar();
2386
2387 if (c == '\r')
2388 {
2389 c = this.in.readChar();
2390
2391 if (c != '\n')
2392 {
2393 this.in.ungetChar(c);
2394 }
2395 }
2396 else if (c != '\n' && c != '\f')
2397 {
2398 this.in.ungetChar(c);
2399 }
2400
2401 this.waswhite = true;
2402 }
2403 else
2404 {
2405 this.waswhite = false;
2406 }
2407
2408 this.state = LEX_CONTENT;
2409
2410 if (this.token.tag == null)
2411 {
2412 report.error(this, null, this.token, Report.UNKNOWN_ELEMENT);
2413 }
2414 else if (!this.configuration.xmlTags)
2415 {
2416 constrainVersion(this.token.tag.versions);
2417
2418 if (TidyUtils.toBoolean(this.token.tag.versions & Dict.VERS_PROPRIETARY))
2419 {
2420
2421 if (this.configuration.makeClean && (this.token.tag != this.configuration.tt.tagNobr &&
2422 this.token.tag != this.configuration.tt.tagWbr))
2423 {
2424 report.warning(this, null, this.token, Report.PROPRIETARY_ELEMENT);
2425 }
2426
2427 else if (!this.configuration.makeClean)
2428 {
2429 report.warning(this, null, this.token, Report.PROPRIETARY_ELEMENT);
2430 }
2431 }
2432
2433 if (this.token.tag.getChkattrs() != null)
2434 {
2435 this.token.tag.getChkattrs().check(this, this.token);
2436 }
2437 else
2438 {
2439 this.token.checkAttributes(this);
2440 }
2441
2442
2443 this.token.repairDuplicateAttributes(this);
2444
2445 }
2446
2447 return this.token;
2448
2449 case LEX_COMMENT :
2450
2451
2452 if (c != '-')
2453 {
2454 continue;
2455 }
2456
2457 c = this.in.readChar();
2458 addCharToLexer(c);
2459
2460 if (c != '-')
2461 {
2462 continue;
2463 }
2464
2465 end_comment : while (true)
2466 {
2467 c = this.in.readChar();
2468
2469 if (c == '>')
2470 {
2471 if (badcomment != 0)
2472 {
2473 report.warning(this, null, null, Report.MALFORMED_COMMENT);
2474 }
2475
2476 this.txtend = this.lexsize - 2;
2477 this.lexbuf[this.lexsize] = (byte) '\0';
2478 this.state = LEX_CONTENT;
2479 this.waswhite = false;
2480 this.token = newNode(Node.COMMENT_TAG, this.lexbuf, this.txtstart, this.txtend);
2481
2482
2483
2484 c = this.in.readChar();
2485
2486 if (c == '\r')
2487 {
2488 c = this.in.readChar();
2489
2490 if (c != '\n')
2491 {
2492 this.token.linebreak = true;
2493 }
2494 }
2495
2496 if (c == '\n')
2497 {
2498 this.token.linebreak = true;
2499 }
2500 else
2501 {
2502 this.in.ungetChar(c);
2503 }
2504
2505 return this.token;
2506 }
2507
2508
2509 if (badcomment == 0)
2510 {
2511 this.lines = this.in.getCurline();
2512 this.columns = this.in.getCurcol() - 3;
2513 }
2514
2515 badcomment++;
2516 if (this.configuration.fixComments)
2517 {
2518 this.lexbuf[this.lexsize - 2] = (byte) '=';
2519 }
2520
2521 addCharToLexer(c);
2522
2523
2524 if (c != '-')
2525 {
2526 break end_comment;
2527 }
2528
2529 }
2530
2531 this.lexbuf[this.lexsize - 2] = (byte) '=';
2532 continue;
2533
2534 case LEX_DOCTYPE :
2535
2536
2537 if (TidyUtils.isWhite((char) c))
2538 {
2539 if (this.waswhite)
2540 {
2541 this.lexsize -= 1;
2542 }
2543
2544 this.waswhite = true;
2545 }
2546 else
2547 {
2548 this.waswhite = false;
2549 }
2550
2551 if (inDTDSubset)
2552 {
2553 if (c == ']')
2554 {
2555 inDTDSubset = false;
2556 }
2557 }
2558 else if (c == '[')
2559 {
2560 inDTDSubset = true;
2561 }
2562 if (inDTDSubset || c != '>')
2563 {
2564 continue;
2565 }
2566
2567 this.lexsize -= 1;
2568 this.txtend = this.lexsize;
2569 this.lexbuf[this.lexsize] = (byte) '\0';
2570 this.state = LEX_CONTENT;
2571 this.waswhite = false;
2572 this.token = newNode(Node.DOCTYPE_TAG, this.lexbuf, this.txtstart, this.txtend);
2573
2574 this.doctype = findGivenVersion(this.token);
2575 return this.token;
2576
2577 case LEX_PROCINSTR :
2578
2579
2580
2581 if (this.lexsize - this.txtstart == 3)
2582 {
2583 if ((TidyUtils.getString(this.lexbuf, this.txtstart, 3)).equals("php"))
2584 {
2585 this.state = LEX_PHP;
2586 continue;
2587 }
2588 }
2589
2590 if (this.lexsize - this.txtstart == 4)
2591 {
2592 if ((TidyUtils.getString(this.lexbuf, this.txtstart, 3)).equals("xml")
2593 && TidyUtils.isWhite((char) this.lexbuf[this.txtstart + 3]))
2594 {
2595 this.state = LEX_XMLDECL;
2596 attributes = null;
2597 continue;
2598 }
2599 }
2600
2601 if (this.configuration.xmlPIs)
2602 {
2603 if (c != '?')
2604 {
2605 continue;
2606 }
2607
2608
2609 c = this.in.readChar();
2610
2611 if (c == StreamIn.END_OF_STREAM)
2612 {
2613 report.warning(this, null, null, Report.UNEXPECTED_END_OF_FILE);
2614 this.in.ungetChar(c);
2615 continue;
2616 }
2617
2618 addCharToLexer(c);
2619 }
2620
2621 if (c != '>')
2622 {
2623 continue;
2624 }
2625
2626 this.lexsize -= 1;
2627 this.txtend = this.lexsize;
2628 this.lexbuf[this.lexsize] = (byte) '\0';
2629 this.state = LEX_CONTENT;
2630 this.waswhite = false;
2631 this.token = newNode(Node.PROC_INS_TAG, this.lexbuf, this.txtstart, this.txtend);
2632 return this.token;
2633
2634 case LEX_ASP :
2635
2636 if (c != '%')
2637 {
2638 continue;
2639 }
2640
2641
2642 c = this.in.readChar();
2643
2644 if (c != '>')
2645 {
2646 this.in.ungetChar(c);
2647 continue;
2648 }
2649
2650 this.lexsize -= 1;
2651 this.txtend = this.lexsize;
2652 this.lexbuf[this.lexsize] = (byte) '\0';
2653 this.state = LEX_CONTENT;
2654 this.waswhite = false;
2655 this.token = newNode(Node.ASP_TAG, this.lexbuf, this.txtstart, this.txtend);
2656 return this.token;
2657
2658 case LEX_JSTE :
2659
2660 if (c != '#')
2661 {
2662 continue;
2663 }
2664
2665
2666 c = this.in.readChar();
2667
2668 if (c != '>')
2669 {
2670 this.in.ungetChar(c);
2671 continue;
2672 }
2673
2674 this.lexsize -= 1;
2675 this.txtend = this.lexsize;
2676 this.lexbuf[this.lexsize] = (byte) '\0';
2677 this.state = LEX_CONTENT;
2678 this.waswhite = false;
2679 this.token = newNode(Node.JSTE_TAG, this.lexbuf, this.txtstart, this.txtend);
2680 return this.token;
2681
2682 case LEX_PHP :
2683
2684 if (c != '?')
2685 {
2686 continue;
2687 }
2688
2689
2690 c = this.in.readChar();
2691
2692 if (c != '>')
2693 {
2694 this.in.ungetChar(c);
2695 continue;
2696 }
2697
2698 this.lexsize -= 1;
2699 this.txtend = this.lexsize;
2700 this.lexbuf[this.lexsize] = (byte) '\0';
2701 this.state = LEX_CONTENT;
2702 this.waswhite = false;
2703 this.token = newNode(Node.PHP_TAG, this.lexbuf, this.txtstart, this.txtend);
2704 return this.token;
2705
2706 case LEX_XMLDECL :
2707
2708 if (TidyUtils.isWhite((char) c) && c != '?')
2709 {
2710 continue;
2711 }
2712
2713
2714 if (c != '?')
2715 {
2716 String name;
2717 Node[] asp = new Node[1];
2718 Node[] php = new Node[1];
2719 AttVal av = new AttVal();
2720 int[] pdelim = new int[1];
2721 isempty[0] = false;
2722
2723 this.in.ungetChar(c);
2724
2725 name = this.parseAttribute(isempty, asp, php);
2726 av.attribute = name;
2727
2728 av.value = this.parseValue(name, true, isempty, pdelim);
2729 av.delim = pdelim[0];
2730 av.next = attributes;
2731
2732 attributes = av;
2733
2734 }
2735
2736
2737 c = this.in.readChar();
2738
2739 if (c != '>')
2740 {
2741 this.in.ungetChar(c);
2742 continue;
2743 }
2744 this.lexsize -= 1;
2745 this.txtend = this.txtstart;
2746 this.lexbuf[this.txtend] = '\0';
2747 this.state = LEX_CONTENT;
2748 this.waswhite = false;
2749 this.token = newNode(Node.XML_DECL, this.lexbuf, this.txtstart, this.txtend);
2750 this.token.attributes = attributes;
2751 return this.token;
2752
2753 case LEX_SECTION :
2754
2755 if (c == '[')
2756 {
2757 if (this.lexsize == (this.txtstart + 6)
2758 && (TidyUtils.getString(this.lexbuf, this.txtstart, 6)).equals("CDATA["))
2759 {
2760 this.state = LEX_CDATA;
2761 this.lexsize -= 6;
2762 continue;
2763 }
2764 }
2765
2766 if (c != ']')
2767 {
2768 continue;
2769 }
2770
2771
2772 c = this.in.readChar();
2773
2774 if (c != '>')
2775 {
2776 this.in.ungetChar(c);
2777 continue;
2778 }
2779
2780 this.lexsize -= 1;
2781 this.txtend = this.lexsize;
2782 this.lexbuf[this.lexsize] = (byte) '\0';
2783 this.state = LEX_CONTENT;
2784 this.waswhite = false;
2785 this.token = newNode(Node.SECTION_TAG, this.lexbuf, this.txtstart, this.txtend);
2786 return this.token;
2787
2788 case LEX_CDATA :
2789
2790 if (c != ']')
2791 {
2792 continue;
2793 }
2794
2795
2796 c = this.in.readChar();
2797
2798 if (c != ']')
2799 {
2800 this.in.ungetChar(c);
2801 continue;
2802 }
2803
2804
2805 c = this.in.readChar();
2806
2807 if (c != '>')
2808 {
2809 this.in.ungetChar(c);
2810 continue;
2811 }
2812
2813 this.lexsize -= 1;
2814 this.txtend = this.lexsize;
2815 this.lexbuf[this.lexsize] = (byte) '\0';
2816 this.state = LEX_CONTENT;
2817 this.waswhite = false;
2818 this.token = newNode(Node.CDATA_TAG, this.lexbuf, this.txtstart, this.txtend);
2819 return this.token;
2820
2821 default :
2822
2823 break;
2824 }
2825 }
2826
2827 if (this.state == LEX_CONTENT)
2828 {
2829 this.txtend = this.lexsize;
2830
2831 if (this.txtend > this.txtstart)
2832 {
2833 this.in.ungetChar(c);
2834
2835 if (this.lexbuf[this.lexsize - 1] == (byte) ' ')
2836 {
2837 this.lexsize -= 1;
2838 this.txtend = this.lexsize;
2839 }
2840
2841 this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend);
2842 return this.token;
2843 }
2844 }
2845 else if (this.state == LEX_COMMENT)
2846 {
2847 if (c == StreamIn.END_OF_STREAM)
2848 {
2849 report.warning(this, null, null, Report.MALFORMED_COMMENT);
2850 }
2851
2852 this.txtend = this.lexsize;
2853 this.lexbuf[this.lexsize] = (byte) '\0';
2854 this.state = LEX_CONTENT;
2855 this.waswhite = false;
2856 this.token = newNode(Node.COMMENT_TAG, this.lexbuf, this.txtstart, this.txtend);
2857 return this.token;
2858 }
2859
2860 return null;
2861 }
2862
2863 /**
2864 * parser for ASP within start tags Some people use ASP for to customize attributes Tidy isn't really well suited to
2865 * dealing with ASP This is a workaround for attributes, but won't deal with the case where the ASP is used to
2866 * tailor the attribute value. Here is an example of a work around for using ASP in attribute values:
2867 * <code>href='<%=rsSchool.Fields("ID").Value%>'</code> where the ASP that generates the attribute value is
2868 * masked from Tidy by the quotemarks.
2869 * @return parsed Node
2870 */
2871 public Node parseAsp()
2872 {
2873 int c;
2874 Node asp = null;
2875
2876 this.txtstart = this.lexsize;
2877
2878 while ((c = this.in.readChar()) != StreamIn.END_OF_STREAM)
2879 {
2880
2881 addCharToLexer(c);
2882
2883 if (c != '%')
2884 {
2885 continue;
2886 }
2887
2888 if ((c = this.in.readChar()) == StreamIn.END_OF_STREAM)
2889 {
2890 break;
2891 }
2892 addCharToLexer(c);
2893
2894 if (c == '>')
2895 {
2896 break;
2897 }
2898 }
2899
2900 this.lexsize -= 2;
2901 this.txtend = this.lexsize;
2902
2903 if (this.txtend > this.txtstart)
2904 {
2905 asp = newNode(Node.ASP_TAG, this.lexbuf, this.txtstart, this.txtend);
2906 }
2907
2908 this.txtstart = this.txtend;
2909 return asp;
2910 }
2911
2912 /**
2913 * PHP is like ASP but is based upon XML processing instructions, e.g. <code><?php ... ?></code>.
2914 * @return parsed Node
2915 */
2916 public Node parsePhp()
2917 {
2918 int c;
2919 Node php = null;
2920
2921 this.txtstart = this.lexsize;
2922
2923 while ((c = this.in.readChar()) != StreamIn.END_OF_STREAM)
2924 {
2925 addCharToLexer(c);
2926
2927 if (c != '?')
2928 {
2929 continue;
2930 }
2931
2932 if ((c = this.in.readChar()) == StreamIn.END_OF_STREAM)
2933 {
2934 break;
2935 }
2936 addCharToLexer(c);
2937
2938 if (c == '>')
2939 {
2940 break;
2941 }
2942 }
2943
2944 this.lexsize -= 2;
2945 this.txtend = this.lexsize;
2946
2947 if (this.txtend > this.txtstart)
2948 {
2949 php = newNode(Node.PHP_TAG, this.lexbuf, this.txtstart, this.txtend);
2950 }
2951
2952 this.txtstart = this.txtend;
2953 return php;
2954 }
2955
2956 /**
2957 * consumes the '>' terminating start tags.
2958 * @param isempty flag is passed as array so it can be modified
2959 * @param asp asp Node, passed as array so it can be modified
2960 * @param php php Node, passed as array so it can be modified
2961 * @return parsed attribute
2962 */
2963 public String parseAttribute(boolean[] isempty, Node[] asp, Node[] php)
2964 {
2965 int start = 0;
2966 String attr;
2967 int c = 0;
2968 int lastc = 0;
2969
2970 asp[0] = null;
2971 php[0] = null;
2972
2973
2974 for (;;)
2975 {
2976 c = this.in.readChar();
2977
2978 if (c == '/')
2979 {
2980 c = this.in.readChar();
2981
2982 if (c == '>')
2983 {
2984 isempty[0] = true;
2985 return null;
2986 }
2987
2988 this.in.ungetChar(c);
2989 c = '/';
2990 break;
2991 }
2992
2993 if (c == '>')
2994 {
2995 return null;
2996 }
2997
2998 if (c == '<')
2999 {
3000 c = this.in.readChar();
3001
3002 if (c == '%')
3003 {
3004 asp[0] = parseAsp();
3005 return null;
3006 }
3007 else if (c == '?')
3008 {
3009 php[0] = parsePhp();
3010 return null;
3011 }
3012
3013 this.in.ungetChar(c);
3014 if (this.state != LEX_XMLDECL)
3015 {
3016 this.in.ungetChar('<');
3017 }
3018 report.attrError(this, this.token, null, Report.UNEXPECTED_GT);
3019 return null;
3020 }
3021
3022 if (c == '=')
3023 {
3024 report.attrError(this, this.token, null, Report.UNEXPECTED_EQUALSIGN);
3025 continue;
3026 }
3027
3028 if (c == '"' || c == '\'')
3029 {
3030 report.attrError(this, this.token, null, Report.UNEXPECTED_QUOTEMARK);
3031 continue;
3032 }
3033
3034 if (c == StreamIn.END_OF_STREAM)
3035 {
3036 report.attrError(this, this.token, null, Report.UNEXPECTED_END_OF_FILE);
3037 this.in.ungetChar(c);
3038 return null;
3039 }
3040
3041 if (!TidyUtils.isWhite((char) c))
3042 {
3043 break;
3044 }
3045 }
3046
3047 start = this.lexsize;
3048 lastc = c;
3049
3050 for (;;)
3051 {
3052
3053 if (c == '=' || c == '>')
3054 {
3055 this.in.ungetChar(c);
3056 break;
3057 }
3058
3059 if (c == '<' || c == StreamIn.END_OF_STREAM)
3060 {
3061 this.in.ungetChar(c);
3062 break;
3063 }
3064 if (lastc == '-' && (c == '"' || c == '\''))
3065 {
3066 this.lexsize--;
3067 this.in.ungetChar(c);
3068 break;
3069 }
3070 if (TidyUtils.isWhite((char) c))
3071 {
3072 break;
3073 }
3074
3075
3076
3077
3078 if (!this.configuration.xmlTags && TidyUtils.isUpper((char) c))
3079 {
3080 c = TidyUtils.toLower((char) c);
3081 }
3082
3083
3084 addCharToLexer(c);
3085
3086 lastc = c;
3087 c = this.in.readChar();
3088 }
3089
3090
3091 int len = this.lexsize - start;
3092 attr = (len > 0 ? TidyUtils.getString(this.lexbuf, start, len) : null);
3093 this.lexsize = start;
3094
3095 return attr;
3096 }
3097
3098 /**
3099 * Invoked when < is seen in place of attribute value but terminates on whitespace if not ASP, PHP or Tango this
3100 * routine recognizes ' and " quoted strings.
3101 * @return delimiter
3102 */
3103 public int parseServerInstruction()
3104 {
3105 int c, delim = '"';
3106 boolean isrule = false;
3107
3108 c = this.in.readChar();
3109 addCharToLexer(c);
3110
3111
3112 if (c == '%' || c == '?' || c == '@')
3113 {
3114 isrule = true;
3115 }
3116
3117 for (;;)
3118 {
3119 c = this.in.readChar();
3120
3121 if (c == StreamIn.END_OF_STREAM)
3122 {
3123 break;
3124 }
3125
3126 if (c == '>')
3127 {
3128 if (isrule)
3129 {
3130 addCharToLexer(c);
3131 }
3132 else
3133 {
3134 this.in.ungetChar(c);
3135 }
3136
3137 break;
3138 }
3139
3140
3141
3142 if (!isrule)
3143 {
3144 if (TidyUtils.isWhite((char) c))
3145 {
3146 break;
3147 }
3148 }
3149
3150 addCharToLexer(c);
3151
3152 if (c == '"')
3153 {
3154 do
3155 {
3156 c = this.in.readChar();
3157
3158 if (endOfInput())
3159 {
3160 report.attrError(this, this.token, null, Report.UNEXPECTED_END_OF_FILE);
3161 this.in.ungetChar(c);
3162 return 0;
3163 }
3164 if (c == '>')
3165 {
3166 this.in.ungetChar(c);
3167 report.attrError(this, this.token, null, Report.UNEXPECTED_GT);
3168 return 0;
3169 }
3170
3171 addCharToLexer(c);
3172 }
3173 while (c != '"');
3174 delim = '\'';
3175 continue;
3176 }
3177
3178 if (c == '\'')
3179 {
3180 do
3181 {
3182 c = this.in.readChar();
3183
3184 if (endOfInput())
3185 {
3186 report.attrError(this, this.token, null, Report.UNEXPECTED_END_OF_FILE);
3187 this.in.ungetChar(c);
3188 return 0;
3189 }
3190 if (c == '>')
3191 {
3192 this.in.ungetChar(c);
3193 report.attrError(this, this.token, null, Report.UNEXPECTED_GT);
3194 return 0;
3195 }
3196
3197 addCharToLexer(c);
3198 }
3199 while (c != '\'');
3200 }
3201 }
3202
3203 return delim;
3204 }
3205
3206 /**
3207 * Parse an attribute value.
3208 * @param name attribute name
3209 * @param foldCase fold case?
3210 * @param isempty is attribute empty? Passed as an array reference to allow modification
3211 * @param pdelim delimiter, passed as an array reference to allow modification
3212 * @return parsed value
3213 */
3214 public String parseValue(String name, boolean foldCase, boolean[] isempty, int[] pdelim)
3215 {
3216
3217
3218
3219 int len = 0;
3220 int start;
3221 boolean seenGt = false;
3222 boolean munge = true;
3223 int c = 0;
3224 int lastc, delim, quotewarning;
3225 String value;
3226
3227 delim = 0;
3228 pdelim[0] = '"';
3229
3230
3231
3232
3233 if (this.configuration.literalAttribs)
3234 {
3235 munge = false;
3236 }
3237
3238
3239 while (true)
3240 {
3241 c = this.in.readChar();
3242
3243 if (c == StreamIn.END_OF_STREAM)
3244 {
3245 this.in.ungetChar(c);
3246 break;
3247 }
3248
3249 if (!TidyUtils.isWhite((char) c))
3250 {
3251 break;
3252 }
3253 }
3254
3255
3256
3257 if (c != '=' && c != '"' && c != '\'')
3258 {
3259 this.in.ungetChar(c);
3260 return null;
3261 }
3262
3263
3264
3265 while (true)
3266 {
3267 c = this.in.readChar();
3268
3269 if (c == StreamIn.END_OF_STREAM)
3270 {
3271 this.in.ungetChar(c);
3272 break;
3273 }
3274
3275 if (!TidyUtils.isWhite((char) c))
3276 {
3277 break;
3278 }
3279 }
3280
3281
3282
3283 if (c == '"' || c == '\'')
3284 {
3285 delim = c;
3286 }
3287 else if (c == '<')
3288 {
3289 start = this.lexsize;
3290 addCharToLexer(c);
3291 pdelim[0] = parseServerInstruction();
3292 len = this.lexsize - start;
3293 this.lexsize = start;
3294 return (len > 0 ? TidyUtils.getString(this.lexbuf, start, len) : null);
3295 }
3296 else
3297 {
3298 this.in.ungetChar(c);
3299 }
3300
3301
3302
3303 quotewarning = 0;
3304 start = this.lexsize;
3305 c = '\0';
3306
3307 while (true)
3308 {
3309 lastc = c;
3310 c = this.in.readChar();
3311
3312 if (c == StreamIn.END_OF_STREAM)
3313 {
3314 report.attrError(this, this.token, null, Report.UNEXPECTED_END_OF_FILE);
3315 this.in.ungetChar(c);
3316 break;
3317 }
3318
3319 if (delim == (char) 0)
3320 {
3321 if (c == '>')
3322 {
3323 this.in.ungetChar(c);
3324 break;
3325 }
3326
3327 if (c == '"' || c == '\'')
3328 {
3329 report.attrError(this, this.token, null, Report.UNEXPECTED_QUOTEMARK);
3330 break;
3331 }
3332
3333 if (c == '<')
3334 {
3335 this.in.ungetChar(c);
3336 c = '>';
3337 this.in.ungetChar(c);
3338 report.attrError(this, this.token, null, Report.UNEXPECTED_GT);
3339 break;
3340 }
3341
3342
3343
3344
3345
3346 if (c == '/')
3347 {
3348
3349 c = this.in.readChar();
3350
3351 if (c == '>' && !AttributeTable.getDefaultAttributeTable().isUrl(name))
3352 {
3353 isempty[0] = true;
3354 this.in.ungetChar(c);
3355 break;
3356 }
3357
3358
3359 this.in.ungetChar(c);
3360 c = '/';
3361 }
3362 }
3363 else
3364 {
3365
3366 if (c == delim)
3367 {
3368 break;
3369 }
3370
3371
3372
3373 if (c == '\r')
3374 {
3375 c = this.in.readChar();
3376 if (c != '\n')
3377 {
3378 this.in.ungetChar(c);
3379 }
3380
3381 c = '\n';
3382 }
3383
3384 if (c == '\n' || c == '<' || c == '>')
3385 {
3386 ++quotewarning;
3387 }
3388
3389 if (c == '>')
3390 {
3391 seenGt = true;
3392 }
3393 }
3394
3395 if (c == '&')
3396 {
3397
3398 if ("id".equalsIgnoreCase(name))
3399 {
3400 report.attrError(this, null, null, Report.ENTITY_IN_ID);
3401 continue;
3402 }
3403
3404 addCharToLexer(c);
3405 parseEntity((short) 0);
3406 continue;
3407
3408 }
3409
3410
3411
3412 if (c == '\\')
3413 {
3414 c = this.in.readChar();
3415
3416 if (c != '\n')
3417 {
3418 this.in.ungetChar(c);
3419 c = '\\';
3420 }
3421 }
3422
3423 if (TidyUtils.isWhite((char) c))
3424 {
3425 if (delim == (char) 0)
3426 {
3427 break;
3428 }
3429
3430 if (munge)
3431 {
3432
3433
3434 if (c == '\n' && AttributeTable.getDefaultAttributeTable().isUrl(name))
3435 {
3436
3437 report.attrError(this, this.token, null, Report.NEWLINE_IN_URI);
3438 continue;
3439 }
3440
3441 c = ' ';
3442
3443 if (lastc == ' ')
3444 {
3445 continue;
3446 }
3447 }
3448 }
3449 else if (foldCase && TidyUtils.isUpper((char) c))
3450 {
3451 c = TidyUtils.toLower((char) c);
3452 }
3453
3454 addCharToLexer(c);
3455 }
3456
3457 if (quotewarning > 10 && seenGt && munge)
3458 {
3459
3460
3461
3462
3463 if (!AttributeTable.getDefaultAttributeTable().isScript(name)
3464 && !(AttributeTable.getDefaultAttributeTable().isUrl(name) && "javascript:".equals(TidyUtils.getString(
3465 this.lexbuf,
3466 start,
3467 11)))
3468 && !"<xml ".equals(TidyUtils.getString(this.lexbuf, start, 5)))
3469
3470 {
3471 report.error(this, null, null, Report.SUSPECTED_MISSING_QUOTE);
3472 }
3473 }
3474
3475 len = this.lexsize - start;
3476 this.lexsize = start;
3477
3478 if (len > 0 || delim != 0)
3479 {
3480
3481
3482
3483
3484 if (munge && !TidyUtils.isInValuesIgnoreCase(new String[]{"alt", "title", "value", "prompt"}, name))
3485 {
3486 while (TidyUtils.isWhite((char) this.lexbuf[start + len - 1]))
3487 {
3488 --len;
3489 }
3490
3491 while (TidyUtils.isWhite((char) this.lexbuf[start]) && start < len)
3492 {
3493 ++start;
3494 --len;
3495 }
3496 }
3497
3498 value = TidyUtils.getString(this.lexbuf, start, len);
3499 }
3500 else
3501 {
3502 value = null;
3503 }
3504
3505
3506 if (delim != 0)
3507 {
3508 pdelim[0] = delim;
3509 }
3510 else
3511 {
3512 pdelim[0] = '"';
3513 }
3514
3515 return value;
3516 }
3517
3518 /**
3519 * Check if attr is a valid name.
3520 * @param attr String to check, must be non-null
3521 * @return <code>true</code> if attr is a valid name.
3522 */
3523 public static boolean isValidAttrName(String attr)
3524 {
3525 char c;
3526 int i;
3527
3528
3529 c = attr.charAt(0);
3530
3531 if (!TidyUtils.isLetter(c))
3532 {
3533 return false;
3534 }
3535
3536
3537 for (i = 1; i < attr.length(); i++)
3538 {
3539 c = attr.charAt(i);
3540
3541 if (TidyUtils.isNamechar(c))
3542 {
3543 continue;
3544 }
3545
3546 return false;
3547 }
3548
3549 return true;
3550 }
3551
3552 /**
3553 * In CSS1, selectors can contain only the characters A-Z, 0-9, and Unicode characters 161-255, plus dash (-); they
3554 * cannot start with a dash or a digit; they can also contain escaped characters and any Unicode character as a
3555 * numeric code (see next item). The backslash followed by at most four hexadecimal digits (0..9A..F) stands for the
3556 * Unicode character with that number. Any character except a hexadecimal digit can be escaped to remove its special
3557 * meaning, by putting a backslash in front.
3558 * @param buf css selector name
3559 * @return <code>true</code> if the given string is a valid css1 selector name
3560 */
3561 public static boolean isCSS1Selector(String buf)
3562 {
3563 if (buf == null)
3564 {
3565 return false;
3566 }
3567
3568
3569 boolean valid = true;
3570 int esclen = 0;
3571 char c;
3572 int pos;
3573
3574 for (pos = 0; valid && pos < buf.length(); ++pos)
3575 {
3576 c = buf.charAt(pos);
3577 if (c == '\\')
3578 {
3579 esclen = 1;
3580 }
3581 else if (Character.isDigit(c))
3582 {
3583
3584 if (esclen > 0)
3585 {
3586 valid = (++esclen < 6);
3587 }
3588 if (valid)
3589 {
3590 valid = (pos > 0 || esclen > 0);
3591 }
3592 }
3593 else
3594 {
3595 valid = (esclen > 0
3596 || (pos > 0 && c == '-')
3597 || Character.isLetter(c)
3598 || (c >= 161 && c <= 255));
3599 esclen = 0;
3600 }
3601 }
3602 return valid;
3603 }
3604
3605 /**
3606 * Parse tag attributes.
3607 * @param isempty is tag empty?
3608 * @return parsed attribute/value list
3609 */
3610 public AttVal parseAttrs(boolean[] isempty)
3611 {
3612 AttVal av, list;
3613 String attribute, value;
3614 int[] delim = new int[1];
3615 Node[] asp = new Node[1];
3616 Node[] php = new Node[1];
3617
3618 list = null;
3619
3620 while (!endOfInput())
3621 {
3622 attribute = parseAttribute(isempty, asp, php);
3623
3624 if (attribute == null)
3625 {
3626
3627 if (asp[0] != null)
3628 {
3629 av = new AttVal(list, null, asp[0], null, '\0', null, null);
3630 list = av;
3631 continue;
3632 }
3633
3634
3635 if (php[0] != null)
3636 {
3637 av = new AttVal(list, null, null, php[0], '\0', null, null);
3638 list = av;
3639 continue;
3640 }
3641
3642 break;
3643 }
3644
3645 value = parseValue(attribute, false, isempty, delim);
3646
3647 if (attribute != null && isValidAttrName(attribute))
3648 {
3649 av = new AttVal(list, null, null, null, delim[0], attribute, value);
3650 av.dict = AttributeTable.getDefaultAttributeTable().findAttribute(av);
3651 list = av;
3652 }
3653 else
3654 {
3655 av = new AttVal(null, null, null, null, 0, attribute, value);
3656
3657
3658 if (value != null)
3659 {
3660 report.attrError(this, this.token, av, Report.BAD_ATTRIBUTE_VALUE);
3661 }
3662 else if (TidyUtils.lastChar(attribute) == '"')
3663 {
3664 report.attrError(this, this.token, av, Report.MISSING_QUOTEMARK);
3665 }
3666 else
3667 {
3668 report.attrError(this, this.token, av, Report.UNKNOWN_ATTRIBUTE);
3669 }
3670 }
3671 }
3672
3673 return list;
3674 }
3675
3676 /**
3677 * Push a copy of an inline node onto stack but don't push if implicit or OBJECT or APPLET (implicit tags are ones
3678 * generated from the istack) One issue arises with pushing inlines when the tag is already pushed. For instance:
3679 * <code><p><em> text <p><em> more text</code> Shouldn't be mapped to
3680 * <code><p><em> text </em></p><p><em><em> more text </em></em></code>
3681 * @param node Node to be pushed
3682 */
3683 public void pushInline(Node node)
3684 {
3685 IStack is;
3686
3687 if (node.implicit)
3688 {
3689 return;
3690 }
3691
3692 if (node.tag == null)
3693 {
3694 return;
3695 }
3696
3697 if (!TidyUtils.toBoolean(node.tag.model & Dict.CM_INLINE))
3698 {
3699 return;
3700 }
3701
3702 if (TidyUtils.toBoolean(node.tag.model & Dict.CM_OBJECT))
3703 {
3704 return;
3705 }
3706
3707 if (node.tag != this.configuration.tt.tagFont && isPushed(node))
3708 {
3709 return;
3710 }
3711
3712
3713 is = new IStack();
3714 is.tag = node.tag;
3715 is.element = node.element;
3716 if (node.attributes != null)
3717 {
3718 is.attributes = cloneAttributes(node.attributes);
3719 }
3720 this.istack.push(is);
3721 }
3722
3723 /**
3724 * Pop a copy of an inline node from the stack.
3725 * @param node Node to be popped
3726 */
3727 public void popInline(Node node)
3728 {
3729 IStack is;
3730
3731 if (node != null)
3732 {
3733
3734 if (node.tag == null)
3735 {
3736 return;
3737 }
3738
3739 if (!TidyUtils.toBoolean(node.tag.model & Dict.CM_INLINE))
3740 {
3741 return;
3742 }
3743
3744 if (TidyUtils.toBoolean(node.tag.model & Dict.CM_OBJECT))
3745 {
3746 return;
3747 }
3748
3749
3750 if (node.tag == this.configuration.tt.tagA)
3751 {
3752
3753 while (this.istack.size() > 0)
3754 {
3755 is = (IStack) this.istack.pop();
3756 if (is.tag == this.configuration.tt.tagA)
3757 {
3758 break;
3759 }
3760 }
3761
3762 if (this.insert >= this.istack.size())
3763 {
3764 this.insert = -1;
3765 }
3766 return;
3767 }
3768 }
3769
3770 if (this.istack.size() > 0)
3771 {
3772 is = (IStack) this.istack.pop();
3773 if (this.insert >= this.istack.size())
3774 {
3775 this.insert = -1;
3776 }
3777 }
3778 }
3779
3780 /**
3781 * Is the node in the stack?
3782 * @param node Node
3783 * @return <code>true</code> is the node is found in the stack
3784 */
3785 public boolean isPushed(Node node)
3786 {
3787 int i;
3788 IStack is;
3789
3790 for (i = this.istack.size() - 1; i >= 0; --i)
3791 {
3792 is = (IStack) this.istack.elementAt(i);
3793 if (is.tag == node.tag)
3794 {
3795 return true;
3796 }
3797 }
3798
3799 return false;
3800 }
3801
3802 /**
3803 * This has the effect of inserting "missing" inline elements around the contents of blocklevel elements such as P,
3804 * TD, TH, DIV, PRE etc. This procedure is called at the start of ParseBlock. When the inline stack is not empty, as
3805 * will be the case in: <code><i><h1>italic heading</h1></i></code> which is then treated as
3806 * equivalent to <code><h1><i>italic heading</i></h1></code> This is implemented by setting the lexer
3807 * into a mode where it gets tokens from the inline stack rather than from the input stream.
3808 * @param node original node
3809 * @return stack size
3810 */
3811 public int inlineDup(Node node)
3812 {
3813 int n;
3814
3815 n = this.istack.size() - this.istackbase;
3816 if (n > 0)
3817 {
3818 this.insert = this.istackbase;
3819 this.inode = node;
3820 }
3821
3822 return n;
3823 }
3824
3825 /**
3826 * @return
3827 */
3828 public Node insertedToken()
3829 {
3830 Node node;
3831 IStack is;
3832 int n;
3833
3834
3835 if (this.insert == -1)
3836 {
3837 node = this.inode;
3838 this.inode = null;
3839 return node;
3840 }
3841
3842
3843 if (this.inode == null)
3844 {
3845 this.lines = this.in.getCurline();
3846 this.columns = this.in.getCurcol();
3847 }
3848
3849 node = newNode(Node.START_TAG, this.lexbuf, this.txtstart, this.txtend);
3850
3851
3852 node.implicit = true;
3853 is = (IStack) this.istack.elementAt(this.insert);
3854 node.element = is.element;
3855 node.tag = is.tag;
3856 if (is.attributes != null)
3857 {
3858 node.attributes = cloneAttributes(is.attributes);
3859 }
3860
3861
3862 n = this.insert;
3863
3864
3865 if (++n < this.istack.size())
3866 {
3867 this.insert = n;
3868 }
3869 else
3870 {
3871 this.insert = -1;
3872 }
3873
3874 return node;
3875 }
3876
3877 /**
3878 * Can the given element be removed?
3879 * @param element node
3880 * @return <code>true</code> if he element can be removed
3881 */
3882 public boolean canPrune(Node element)
3883 {
3884 if (element.type == Node.TEXT_NODE)
3885 {
3886 return true;
3887 }
3888
3889 if (element.content != null)
3890 {
3891 return false;
3892 }
3893
3894 if (element.tag == this.configuration.tt.tagA && element.attributes != null)
3895 {
3896 return false;
3897 }
3898
3899 if (element.tag == this.configuration.tt.tagP && !this.configuration.dropEmptyParas)
3900 {
3901 return false;
3902 }
3903
3904 if (element.tag == null)
3905 {
3906 return false;
3907 }
3908
3909 if (TidyUtils.toBoolean(element.tag.model & Dict.CM_ROW))
3910 {
3911 return false;
3912 }
3913
3914 if (TidyUtils.toBoolean(element.tag.model & Dict.CM_EMPTY))
3915 {
3916 return false;
3917 }
3918
3919 if (element.tag == this.configuration.tt.tagApplet)
3920 {
3921 return false;
3922 }
3923
3924 if (element.tag == this.configuration.tt.tagObject)
3925 {
3926 return false;
3927 }
3928
3929 if (element.tag == this.configuration.tt.tagScript && element.getAttrByName("src") != null)
3930 {
3931 return false;
3932 }
3933
3934
3935 if (element.tag == this.configuration.tt.tagTitle)
3936 {
3937 return false;
3938 }
3939
3940
3941 if (element.tag == this.configuration.tt.tagIframe)
3942 {
3943 return false;
3944 }
3945
3946 if (element.getAttrByName("id") != null || element.getAttrByName("name") != null)
3947 {
3948 return false;
3949 }
3950
3951 return true;
3952 }
3953
3954 /**
3955 * duplicate name attribute as an id and check if id and name match.
3956 * @param node Node to check for name/it attributes
3957 */
3958 public void fixId(Node node)
3959 {
3960 AttVal name = node.getAttrByName("name");
3961 AttVal id = node.getAttrByName("id");
3962
3963 if (name != null)
3964 {
3965 if (id != null)
3966 {
3967 if (id.value != null && !id.value.equals(name.value))
3968 {
3969 report.attrError(this, node, name, Report.ID_NAME_MISMATCH);
3970 }
3971 }
3972 else if (this.configuration.xmlOut)
3973 {
3974 node.addAttribute("id", name.value);
3975 }
3976 }
3977 }
3978
3979 /**
3980 * Defer duplicates when entering a table or other element where the inlines shouldn't be duplicated.
3981 */
3982 public void deferDup()
3983 {
3984 this.insert = -1;
3985 this.inode = null;
3986 }
3987
3988 /**
3989 * Constraint the html version in the document to the given one. Everything is allowed in proprietary version of
3990 * HTML this is handled here rather than in the tag/attr dicts.
3991 * @param vers html version code
3992 */
3993 void constrainVersion(int vers)
3994 {
3995 this.versions &= (vers | Dict.VERS_PROPRIETARY);
3996 }
3997
3998 /**
3999 * Is content acceptable for pre elements?
4000 * @param node content
4001 * @return <code>true</code> if node is acceptable in pre elements
4002 */
4003 protected boolean preContent(Node node)
4004 {
4005
4006 if (node.tag == this.configuration.tt.tagP)
4007 {
4008 return true;
4009 }
4010
4011 if (node.tag == null
4012 || node.tag == this.configuration.tt.tagP
4013 || !TidyUtils.toBoolean(node.tag.model & (Dict.CM_INLINE | Dict.CM_NEW)))
4014 {
4015 return false;
4016 }
4017 return true;
4018 }
4019
4020 /**
4021 * document type.
4022 */
4023 private static class W3CVersionInfo
4024 {
4025
4026 /**
4027 * name.
4028 */
4029 String name;
4030
4031 /**
4032 * voyager name.
4033 */
4034 String voyagerName;
4035
4036 /**
4037 * profile.
4038 */
4039 String profile;
4040
4041 /**
4042 * code.
4043 */
4044 short code;
4045
4046 /**
4047 * Instantiates a new W3CVersionInfo.
4048 * @param name version name
4049 * @param voyagerName voyager (xhtml) name
4050 * @param profile VOYAGER_STRICT | VOYAGER_LOOSE | VOYAGER_FRAMESET
4051 * @param code unique code for this version info
4052 */
4053 public W3CVersionInfo(String name, String voyagerName, String profile, short code)
4054 {
4055 this.name = name;
4056 this.voyagerName = voyagerName;
4057 this.profile = profile;
4058 this.code = code;
4059 }
4060 }
4061
4062 }