1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54 package org.w3c.tidy;
55
56 import java.io.PrintWriter;
57 import java.util.List;
58 import java.util.Stack;
59 import java.util.Vector;
60
61
/**
 * Lexer for html parser.
 * <p>
 * Given a file stream fp it returns a sequence of tokens. GetToken(fp) gets the next token; UngetToken(fp) provides
 * one level of undo. The tags include an attribute list:
 * </p>
 * <ul>
 * <li>a linked list of attribute/value nodes;</li>
 * <li>each node has 2 null-terminated strings;</li>
 * <li>entities are replaced in attribute values.</li>
 * </ul>
 * <p>
 * White space is compacted if not in preformatted mode: leading white space is discarded and subsequent white space
 * sequences are compacted to single space chars. If XmlTags is no then tag names are folded to lower case and
 * attribute names to lower case. Not yet done: Doctype subset and marked sections.
 * </p>
 * @author Dave Raggett <a href="mailto:dsr@w3.org">dsr@w3.org </a>
 * @author Andy Quick <a href="mailto:ac.quick@sympatico.ca">ac.quick@sympatico.ca </a> (translation to Java)
 * @author Fabrizio Giustina
 * @version $Revision: 1.93 $ ($Author: fgiust $)
 */
77 public class Lexer
78 {
79
80 /***
81 * state: ignore whitespace.
82 */
83 public static final short IGNORE_WHITESPACE = 0;
84
85 /***
86 * state: mixed content.
87 */
88 public static final short MIXED_CONTENT = 1;
89
90 /***
91 * state: preformatted.
92 */
93 public static final short PREFORMATTED = 2;
94
95 /***
96 * state: ignore markup.
97 */
98 public static final short IGNORE_MARKUP = 3;
99
100 /***
101 * URI for XHTML 1.0 transitional DTD.
102 */
103 private static final String VOYAGER_LOOSE = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd";
104
105 /***
106 * URI for XHTML 1.0 strict DTD.
107 */
108 private static final String VOYAGER_STRICT = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd";
109
110 /***
111 * URI for XHTML 1.0 frameset DTD.
112 */
113 private static final String VOYAGER_FRAMESET = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd";
114
115 /***
116 * URI for XHTML 1.1.
117 */
118 private static final String VOYAGER_11 = "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd";
119
/*
 * URI for XHTML Basic 1.0: no constant is currently declared for this DTD.
 */
123
124 /***
125 * xhtml namespace.
126 */
127 private static final String XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml";
128
129 /***
130 * lists all the known versions.
131 */
132 private static final Lexer.W3CVersionInfo[] W3CVERSION = {
133 new W3CVersionInfo("HTML 4.01", "XHTML 1.0 Strict", VOYAGER_STRICT, Dict.VERS_HTML40_STRICT),
134 new W3CVersionInfo("HTML 4.01 Transitional", "XHTML 1.0 Transitional", VOYAGER_LOOSE, Dict.VERS_HTML40_LOOSE),
135 new W3CVersionInfo("HTML 4.01 Frameset", "XHTML 1.0 Frameset", VOYAGER_FRAMESET, Dict.VERS_FRAMESET),
136 new W3CVersionInfo("HTML 4.0", "XHTML 1.0 Strict", VOYAGER_STRICT, Dict.VERS_HTML40_STRICT),
137 new W3CVersionInfo("HTML 4.0 Transitional", "XHTML 1.0 Transitional", VOYAGER_LOOSE, Dict.VERS_HTML40_LOOSE),
138 new W3CVersionInfo("HTML 4.0 Frameset", "XHTML 1.0 Frameset", VOYAGER_FRAMESET, Dict.VERS_FRAMESET),
139 new W3CVersionInfo("HTML 3.2", "XHTML 1.0 Transitional", VOYAGER_LOOSE, Dict.VERS_HTML32),
140 new W3CVersionInfo("HTML 3.2 Final", "XHTML 1.0 Transitional", VOYAGER_LOOSE, Dict.VERS_HTML32),
141 new W3CVersionInfo("HTML 3.2 Draft", "XHTML 1.0 Transitional", VOYAGER_LOOSE, Dict.VERS_HTML32),
142 new W3CVersionInfo("HTML 2.0", "XHTML 1.0 Strict", VOYAGER_STRICT, Dict.VERS_HTML20),
143 new W3CVersionInfo("HTML 4.01", "XHTML 1.1", VOYAGER_STRICT, Dict.VERS_XHTML11)};
144
145 /***
146 * getToken state: content.
147 */
148 private static final short LEX_CONTENT = 0;
149
150 /***
151 * getToken state: gt.
152 */
153 private static final short LEX_GT = 1;
154
155 /***
156 * getToken state: endtag.
157 */
158 private static final short LEX_ENDTAG = 2;
159
160 /***
161 * getToken state: start tag.
162 */
163 private static final short LEX_STARTTAG = 3;
164
165 /***
166 * getToken state: comment.
167 */
168 private static final short LEX_COMMENT = 4;
169
170 /***
171 * getToken state: doctype.
172 */
173 private static final short LEX_DOCTYPE = 5;
174
175 /***
176 * getToken state: procinstr.
177 */
178 private static final short LEX_PROCINSTR = 6;
179
180 /***
181 * getToken state: cdata.
182 */
183 private static final short LEX_CDATA = 8;
184
185 /***
186 * getToken state: section.
187 */
188 private static final short LEX_SECTION = 9;
189
190 /***
191 * getToken state: asp.
192 */
193 private static final short LEX_ASP = 10;
194
195 /***
196 * getToken state: jste.
197 */
198 private static final short LEX_JSTE = 11;
199
200 /***
201 * getToken state: php.
202 */
203 private static final short LEX_PHP = 12;
204
205 /***
206 * getToken state: xml declaration.
207 */
208 private static final short LEX_XMLDECL = 13;
209
210 /***
211 * file stream.
212 */
213 protected StreamIn in;
214
215 /***
216 * error output stream.
217 */
218 protected PrintWriter errout;
219
220 /***
221 * for accessibility errors.
222 */
223 protected short badAccess;
224
225 /***
226 * for bad style errors.
227 */
228 protected short badLayout;
229
230 /***
231 * for bad char encodings.
232 */
233 protected short badChars;
234
235 /***
236 * for mismatched/mispositioned form tags.
237 */
238 protected short badForm;
239
240 /***
241 * count of warnings in this document.
242 */
243 protected short warnings;
244
245 /***
246 * count of errors.
247 */
248 protected short errors;
249
250 /***
251 * lines seen.
252 */
253 protected int lines;
254
255 /***
256 * at start of current token.
257 */
258 protected int columns;
259
260 /***
261 * used to collapse contiguous white space.
262 */
263 protected boolean waswhite;
264
265 /***
266 * true after token has been pushed back.
267 */
268 protected boolean pushed;
269
270 /***
271 * when space is moved after end tag.
272 */
273 protected boolean insertspace;
274
275 /***
276 * Netscape compatibility.
277 */
278 protected boolean excludeBlocks;
279
280 /***
281 * true if moved out of table.
282 */
283 protected boolean exiled;
284
285 /***
286 * true if xmlns attribute on html element.
287 */
288 protected boolean isvoyager;
289
290 /***
291 * bit vector of HTML versions.
292 */
293 protected short versions;
294
295 /***
296 * version as given by doctype (if any).
297 */
298 protected int doctype;
299
300 /***
301 * set if html or PUBLIC is missing.
302 */
303 protected boolean badDoctype;
304
305 /***
306 * start of current node.
307 */
308 protected int txtstart;
309
310 /***
311 * end of current node.
312 */
313 protected int txtend;
314
315 /***
316 * state of lexer's finite state machine.
317 */
318 protected short state;
319
320 /***
321 * current node.
322 */
323 protected Node token;
324
325 /***
326 * Lexer character buffer parse tree nodes span onto this buffer which contains the concatenated text contents of
327 * all of the elements. Lexsize must be reset for each file. Byte buffer of UTF-8 chars.
328 */
329 protected byte[] lexbuf;
330
331 /***
332 * allocated.
333 */
334 protected int lexlength;
335
336 /***
337 * used.
338 */
339 protected int lexsize;
340
341 /***
342 * Inline stack for compatibility with Mosaic. For deferring text node.
343 */
344 protected Node inode;
345
346 /***
347 * for inferring inline tags.
348 */
349 protected int insert;
350
351 /***
352 * stack.
353 */
354 protected Stack istack;
355
356 /***
357 * start of frame.
358 */
359 protected int istackbase;
360
361 /***
362 * used for cleaning up presentation markup.
363 */
364 protected Style styles;
365
366 /***
367 * configuration.
368 */
369 protected Configuration configuration;
370
371 /***
372 * already seen end body tag?
373 */
374 protected boolean seenEndBody;
375
376 /***
377 * already seen end html tag?
378 */
379 protected boolean seenEndHtml;
380
381 /***
382 * report.
383 */
384 protected Report report;
385
386 /***
387 * Root node is saved here.
388 */
389 protected Node root;
390
391 /***
392 * node list.
393 */
394 private List nodeList;
395
/**
 * Creates a lexer reading from the given input stream.
 * <p>
 * The lexer starts at line 1, column 1 in the content state; all HTML versions (including proprietary ones) are
 * initially flagged as possible and the doctype version is unknown.
 * </p>
 * @param in input stream the tokens are read from
 * @param configuration configuration instance
 * @param report report instance, used for reporting errors
 */
public Lexer(StreamIn in, Configuration configuration, Report report)
{
    // collaborators
    this.in = in;
    this.configuration = configuration;
    this.report = report;

    // position and state machine
    this.lines = 1;
    this.columns = 1;
    this.state = LEX_CONTENT;

    // version tracking
    this.versions = (Dict.VERS_ALL | Dict.VERS_PROPRIETARY);
    this.doctype = Dict.VERS_UNKNOWN;

    // inline stack and registry of the nodes created through this lexer
    this.insert = -1;
    this.istack = new Stack();
    this.nodeList = new Vector();
}
416
417 /***
418 * Creates a new node and add it to nodelist.
419 * @return Node
420 */
421 public Node newNode()
422 {
423 Node node = new Node();
424 this.nodeList.add(node);
425 return node;
426 }
427
428 /***
429 * Creates a new node and add it to nodelist.
430 * @param type node type: Node.ROOT_NODE | Node.DOCTYPE_TAG | Node.COMMENT_TAG | Node.PROC_INS_TAG | Node.TEXT_NODE |
431 * Node.START_TAG | Node.END_TAG | Node.START_END_TAG | Node.CDATA_TAG | Node.SECTION_TAG | Node. ASP_TAG |
432 * Node.JSTE_TAG | Node.PHP_TAG | Node.XML_DECL
433 * @param textarray array of bytes contained in the Node
434 * @param start start position
435 * @param end end position
436 * @return Node
437 */
438 public Node newNode(short type, byte[] textarray, int start, int end)
439 {
440 Node node = new Node(type, textarray, start, end);
441 this.nodeList.add(node);
442 return node;
443 }
444
445 /***
446 * Creates a new node and add it to nodelist.
447 * @param type node type: Node.ROOT_NODE | Node.DOCTYPE_TAG | Node.COMMENT_TAG | Node.PROC_INS_TAG | Node.TEXT_NODE |
448 * Node.START_TAG | Node.END_TAG | Node.START_END_TAG | Node.CDATA_TAG | Node.SECTION_TAG | Node. ASP_TAG |
449 * Node.JSTE_TAG | Node.PHP_TAG | Node.XML_DECL
450 * @param textarray array of bytes contained in the Node
451 * @param start start position
452 * @param end end position
453 * @param element tag name
454 * @return Node
455 */
456 public Node newNode(short type, byte[] textarray, int start, int end, String element)
457 {
458 Node node = new Node(type, textarray, start, end, element, this.configuration.tt);
459 this.nodeList.add(node);
460 return node;
461 }
462
463 /***
464 * Clones a node and add it to node list.
465 * @param node Node
466 * @return cloned Node
467 */
468 public Node cloneNode(Node node)
469 {
470 Node cnode = (Node) node.clone();
471 this.nodeList.add(cnode);
472 for (AttVal att = cnode.attributes; att != null; att = att.next)
473 {
474 if (att.asp != null)
475 {
476 this.nodeList.add(att.asp);
477 }
478 if (att.php != null)
479 {
480 this.nodeList.add(att.php);
481 }
482 }
483 return cnode;
484 }
485
486 /***
487 * Clones an attribute value and add eventual asp or php node to node list.
488 * @param attrs original AttVal
489 * @return cloned AttVal
490 */
491 public AttVal cloneAttributes(AttVal attrs)
492 {
493 AttVal cattrs = (AttVal) attrs.clone();
494 for (AttVal att = cattrs; att != null; att = att.next)
495 {
496 if (att.asp != null)
497 {
498 this.nodeList.add(att.asp);
499 }
500 if (att.php != null)
501 {
502 this.nodeList.add(att.php);
503 }
504 }
505 return cattrs;
506 }
507
508 /***
509 * Update <code>oldtextarray</code> in the current nodes.
510 * @param oldtextarray previous text array
511 * @param newtextarray new text array
512 */
513 protected void updateNodeTextArrays(byte[] oldtextarray, byte[] newtextarray)
514 {
515 Node node;
516 for (int i = 0; i < this.nodeList.size(); i++)
517 {
518 node = (Node) (this.nodeList.get(i));
519 if (node.textarray == oldtextarray)
520 {
521 node.textarray = newtextarray;
522 }
523 }
524 }
525
526 /***
527 * Adds a new line node. Used for creating preformatted text from Word2000.
528 * @return new line node
529 */
530 public Node newLineNode()
531 {
532 Node node = newNode();
533
534 node.textarray = this.lexbuf;
535 node.start = this.lexsize;
536 addCharToLexer('\n');
537 node.end = this.lexsize;
538 return node;
539 }
540
541 /***
542 * Has end of input stream been reached?
543 * @return <code>true</code> if end of input stream been reached
544 */
545 public boolean endOfInput()
546 {
547 return this.in.isEndOfStream();
548 }
549
550 /***
551 * Adds a byte to lexer buffer.
552 * @param c byte to add
553 */
554 public void addByte(int c)
555 {
556 if (this.lexsize + 1 >= this.lexlength)
557 {
558 while (this.lexsize + 1 >= this.lexlength)
559 {
560 if (this.lexlength == 0)
561 {
562 this.lexlength = 8192;
563 }
564 else
565 {
566 this.lexlength = this.lexlength * 2;
567 }
568 }
569
570 byte[] temp = this.lexbuf;
571 this.lexbuf = new byte[this.lexlength];
572 if (temp != null)
573 {
574 System.arraycopy(temp, 0, this.lexbuf, 0, temp.length);
575 updateNodeTextArrays(temp, this.lexbuf);
576 }
577 }
578
579 this.lexbuf[this.lexsize++] = (byte) c;
580 this.lexbuf[this.lexsize] = (byte) '\0';
581 }
582
583 /***
584 * Substitute the last char in buffer.
585 * @param c new char
586 */
587 public void changeChar(byte c)
588 {
589 if (this.lexsize > 0)
590 {
591 this.lexbuf[this.lexsize - 1] = c;
592 }
593 }
594
595 /***
596 * Store char c as UTF-8 encoded byte stream.
597 * @param c char to store
598 */
599 public void addCharToLexer(int c)
600 {
601
602
603
604 if ((this.configuration.xmlOut || this.configuration.xHTML)
605 && !((c >= 0x20 && c <= 0xD7FF)
606 || c == 0x9 || c == 0xA || c == 0xD
607 || (c >= 0xE000 && c <= 0xFFFD)
608 || (c >= 0x10000 && c <= 0x10FFFF)))
609 {
610 return;
611 }
612
613 int i = 0;
614 int[] count = new int[]{0};
615 byte[] buf = new byte[10];
616
617 boolean err = EncodingUtils.encodeCharToUTF8Bytes(c, buf, null, count);
618 if (err)
619 {
620
621 buf[0] = (byte) 0xEF;
622 buf[1] = (byte) 0xBF;
623 buf[2] = (byte) 0xBD;
624 count[0] = 3;
625 }
626
627 for (i = 0; i < count[0]; i++)
628 {
629 addByte(buf[i]);
630 }
631
632 }
633
634 /***
635 * Adds a string to lexer buffer.
636 * @param str String to add
637 */
638 public void addStringToLexer(String str)
639 {
640 for (int i = 0; i < str.length(); i++)
641 {
642 addCharToLexer(str.charAt(i));
643 }
644 }
645
646 /***
647 * Parse an html entity.
648 * @param mode mode
649 */
650 public void parseEntity(short mode)
651 {
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678 int start;
679 boolean first = true;
680 boolean semicolon = false;
681 int c, ch, startcol;
682 String str;
683
684 start = this.lexsize - 1;
685 startcol = this.in.getCurcol() - 1;
686
687 while ((c = this.in.readChar()) != StreamIn.END_OF_STREAM)
688 {
689 if (c == ';')
690 {
691 semicolon = true;
692 break;
693 }
694
695 if (first && c == '#')
696 {
697
698 if (!this.configuration.ncr
699 || this.configuration.getInCharEncoding() == Configuration.BIG5
700 || this.configuration.getInCharEncoding() == Configuration.SHIFTJIS)
701 {
702 this.in.ungetChar(c);
703 return;
704 }
705
706
707 addCharToLexer(c);
708 first = false;
709 continue;
710 }
711
712 first = false;
713
714 if (TidyUtils.isNamechar((char) c))
715 {
716 addCharToLexer(c);
717 continue;
718 }
719
720
721 this.in.ungetChar(c);
722 break;
723 }
724
725 str = TidyUtils.getString(this.lexbuf, start, this.lexsize - start);
726
727 if ("&apos".equals(str) && !configuration.xmlOut && !this.isvoyager && !configuration.xHTML)
728 {
729 report.entityError(this, Report.APOS_UNDEFINED, str, 39);
730 }
731
732 ch = EntityTable.getDefaultEntityTable().entityCode(str);
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747 if (ch <= 0 || (ch >= 256 && c != ';'))
748 {
749
750 this.lines = this.in.getCurline();
751 this.columns = startcol;
752
753 if (this.lexsize > start + 1)
754 {
755 if (ch >= 128 && ch <= 159)
756 {
757
758 int c1 = 0;
759
760 if (configuration.replacementCharEncoding == Configuration.WIN1252)
761 {
762 c1 = EncodingUtils.decodeWin1252(ch);
763 }
764 else if (configuration.replacementCharEncoding == Configuration.MACROMAN)
765 {
766 c1 = EncodingUtils.decodeMacRoman(ch);
767 }
768
769
770
771 int replaceMode = c1 != 0 ? Report.REPLACED_CHAR : Report.DISCARDED_CHAR;
772
773 if (c != ';')
774 {
775 report.entityError(this, Report.MISSING_SEMICOLON_NCR, str, c);
776 }
777
778 report.encodingError(this, (short) (Report.INVALID_NCR | replaceMode), ch);
779
780 if (c1 != 0)
781 {
782
783 this.lexsize = start;
784 addCharToLexer(c1);
785 semicolon = false;
786 }
787 else
788 {
789
790 this.lexsize = start;
791 semicolon = false;
792 }
793
794 }
795 else
796 {
797 report.entityError(this, Report.UNKNOWN_ENTITY, str, ch);
798 }
799
800 if (semicolon)
801 {
802 addCharToLexer(';');
803 }
804 }
805 else
806 {
807
808 report.entityError(this, Report.UNESCAPED_AMPERSAND, str, ch);
809 }
810 }
811 else
812 {
813
814 if (c != ';')
815 {
816
817 this.lines = this.in.getCurline();
818 this.columns = startcol;
819 report.entityError(this, Report.MISSING_SEMICOLON, str, c);
820 }
821
822 this.lexsize = start;
823
824 if (ch == 160 && TidyUtils.toBoolean(mode & PREFORMATTED))
825 {
826 ch = ' ';
827 }
828
829 addCharToLexer(ch);
830
831 if (ch == '&' && !this.configuration.quoteAmpersand)
832 {
833 addCharToLexer('a');
834 addCharToLexer('m');
835 addCharToLexer('p');
836 addCharToLexer(';');
837 }
838 }
839 }
840
841 /***
842 * Parses a tag name.
843 * @return first char after the tag name
844 */
845 public char parseTagName()
846 {
847 int c;
848
849
850 c = this.lexbuf[this.txtstart];
851
852 if (!this.configuration.xmlTags && TidyUtils.isUpper((char) c))
853 {
854 c = TidyUtils.toLower((char) c);
855 this.lexbuf[this.txtstart] = (byte) c;
856 }
857
858 while ((c = this.in.readChar()) != StreamIn.END_OF_STREAM)
859 {
860 if (!TidyUtils.isNamechar((char) c))
861 {
862 break;
863 }
864
865
866 if (!this.configuration.xmlTags && TidyUtils.isUpper((char) c))
867 {
868 c = TidyUtils.toLower((char) c);
869 }
870
871 addCharToLexer(c);
872 }
873
874 this.txtend = this.lexsize;
875 return (char) c;
876 }
877
878 /***
879 * calls addCharToLexer for any char in the string.
880 * @param str input String
881 */
882 public void addStringLiteral(String str)
883 {
884 int len = str.length();
885 for (int i = 0; i < len; i++)
886 {
887 addCharToLexer(str.charAt(i));
888 }
889 }
890
891 /***
892 * calls addCharToLexer for any char in the string till len is reached.
893 * @param str input String
894 * @param len length of the substring to be added
895 */
896 void addStringLiteralLen(String str, int len)
897 {
898 int strlen = str.length();
899 if (strlen < len)
900 {
901 len = strlen;
902 }
903 for (int i = 0; i < len; i++)
904 {
905 addCharToLexer(str.charAt(i));
906 }
907 }
908
909 /***
910 * Choose what version to use for new doctype.
911 * @return html version constant
912 */
913 public short htmlVersion()
914 {
915 if (TidyUtils.toBoolean(versions & Dict.VERS_HTML20))
916 {
917 return Dict.VERS_HTML20;
918 }
919
920 if (!(this.configuration.xmlOut | this.configuration.xmlTags | this.isvoyager)
921 && TidyUtils.toBoolean(versions & Dict.VERS_HTML32))
922 {
923 return Dict.VERS_HTML32;
924 }
925 if (TidyUtils.toBoolean(versions & Dict.VERS_XHTML11))
926 {
927 return Dict.VERS_XHTML11;
928 }
929 if (TidyUtils.toBoolean(versions & Dict.VERS_HTML40_STRICT))
930 {
931 return Dict.VERS_HTML40_STRICT;
932 }
933
934 if (TidyUtils.toBoolean(versions & Dict.VERS_HTML40_LOOSE))
935 {
936 return Dict.VERS_HTML40_LOOSE;
937 }
938
939 if (TidyUtils.toBoolean(versions & Dict.VERS_FRAMESET))
940 {
941 return Dict.VERS_FRAMESET;
942 }
943
944 return Dict.VERS_UNKNOWN;
945 }
946
947 /***
948 * Choose what version to use for new doctype.
949 * @return html version name
950 */
951 public String htmlVersionName()
952 {
953 short guessed;
954 int j;
955
956 guessed = apparentVersion();
957
958 for (j = 0; j < W3CVERSION.length; ++j)
959 {
960 if (guessed == W3CVERSION[j].code)
961 {
962 if (this.isvoyager)
963 {
964 return W3CVERSION[j].voyagerName;
965 }
966
967 return W3CVERSION[j].name;
968 }
969 }
970
971 return null;
972 }
973
974 /***
975 * Add meta element for Tidy. If the meta tag is already present, update release date.
976 * @param root root node
977 * @return <code>true</code> if the tag has been added
978 */
979 public boolean addGenerator(Node root)
980 {
981 AttVal attval;
982 Node node;
983 Node head = root.findHEAD(this.configuration.tt);
984
985 if (head != null)
986 {
987 String meta = "HTML Tidy for Java (vers. " + Report.RELEASE_DATE_STRING + "), see www.w3.org";
988
989 for (node = head.content; node != null; node = node.next)
990 {
991 if (node.tag == this.configuration.tt.tagMeta)
992 {
993 attval = node.getAttrByName("name");
994
995 if (attval != null && attval.value != null && "generator".equalsIgnoreCase(attval.value))
996 {
997 attval = node.getAttrByName("content");
998
999 if (attval != null
1000 && attval.value != null
1001 && attval.value.length() >= 9
1002 && "HTML Tidy".equalsIgnoreCase(attval.value.substring(0, 9)))
1003 {
1004 attval.value = meta;
1005 return false;
1006 }
1007 }
1008 }
1009 }
1010
1011 node = this.inferredTag("meta");
1012 node.addAttribute("content", meta);
1013 node.addAttribute("name", "generator");
1014 head.insertNodeAtStart(node);
1015 return true;
1016 }
1017
1018 return false;
1019 }
1020
1021 /***
1022 * Check system keywords (keywords should be uppercase).
1023 * @param doctype doctype node
1024 * @return true if doctype keywords are all uppercase
1025 */
1026 public boolean checkDocTypeKeyWords(Node doctype)
1027 {
1028 int len = doctype.end - doctype.start;
1029 String s = TidyUtils.getString(this.lexbuf, doctype.start, len);
1030
1031 return !(TidyUtils.findBadSubString("SYSTEM", s, len)
1032 || TidyUtils.findBadSubString("PUBLIC", s, len)
1033 || TidyUtils.findBadSubString("//DTD", s, len)
1034 || TidyUtils.findBadSubString("//W3C", s, len) || TidyUtils.findBadSubString("//EN", s, len));
1035 }
1036
1037 /***
1038 * Examine DOCTYPE to identify version.
1039 * @param doctype doctype node
1040 * @return version code
1041 */
1042 public short findGivenVersion(Node doctype)
1043 {
1044 String p, s;
1045 int i, j;
1046 int len;
1047 String str1;
1048 String str2;
1049
1050
1051 str1 = TidyUtils.getString(this.lexbuf, doctype.start, 5);
1052 if (!"html ".equalsIgnoreCase(str1))
1053 {
1054 return 0;
1055 }
1056
1057 if (!checkDocTypeKeyWords(doctype))
1058 {
1059 report.warning(this, doctype, null, Report.DTYPE_NOT_UPPER_CASE);
1060 }
1061
1062
1063 str1 = TidyUtils.getString(this.lexbuf, doctype.start + 5, 7);
1064 if ("SYSTEM ".equalsIgnoreCase(str1))
1065 {
1066
1067 if (!str1.substring(0, 6).equals("SYSTEM"))
1068 {
1069 System.arraycopy(TidyUtils.getBytes("SYSTEM"), 0, this.lexbuf, doctype.start + 5, 6);
1070 }
1071 return 0;
1072 }
1073
1074 if ("PUBLIC ".equalsIgnoreCase(str1))
1075 {
1076 if (!str1.substring(0, 6).equals("PUBLIC"))
1077 {
1078 System.arraycopy(TidyUtils.getBytes("PUBLIC "), 0, this.lexbuf, doctype.start + 5, 6);
1079 }
1080 }
1081 else
1082 {
1083 this.badDoctype = true;
1084 }
1085
1086 for (i = doctype.start; i < doctype.end; ++i)
1087 {
1088 if (this.lexbuf[i] == (byte) '"')
1089 {
1090 str1 = TidyUtils.getString(this.lexbuf, i + 1, 12);
1091 str2 = TidyUtils.getString(this.lexbuf, i + 1, 13);
1092 if (str1.equals("-//W3C//DTD "))
1093 {
1094
1095 for (j = i + 13; j < doctype.end && this.lexbuf[j] != (byte) '/'; ++j)
1096 {
1097
1098 }
1099 len = j - i - 13;
1100 p = TidyUtils.getString(this.lexbuf, i + 13, len);
1101
1102 for (j = 1; j < W3CVERSION.length; ++j)
1103 {
1104 s = W3CVERSION[j].name;
1105 if (len == s.length() && s.equals(p))
1106 {
1107 return W3CVERSION[j].code;
1108 }
1109 }
1110
1111
1112 }
1113 else if (str2.equals("-//IETF//DTD "))
1114 {
1115
1116 for (j = i + 14; j < doctype.end && this.lexbuf[j] != (byte) '/'; ++j)
1117 {
1118
1119 }
1120 len = j - i - 14;
1121
1122 p = TidyUtils.getString(this.lexbuf, i + 14, len);
1123 s = W3CVERSION[0].name;
1124 if (len == s.length() && s.equals(p))
1125 {
1126 return W3CVERSION[0].code;
1127 }
1128
1129
1130 }
1131 break;
1132 }
1133 }
1134
1135 return 0;
1136 }
1137
1138 /***
1139 * Fix xhtml namespace.
1140 * @param root root Node
1141 * @param profile current profile
1142 */
1143 public void fixHTMLNameSpace(Node root, String profile)
1144 {
1145 Node node;
1146 AttVal attr;
1147
1148 node = root.content;
1149 while (node != null && node.tag != this.configuration.tt.tagHtml)
1150 {
1151 node = node.next;
1152 }
1153
1154 if (node != null)
1155 {
1156
1157 for (attr = node.attributes; attr != null; attr = attr.next)
1158 {
1159 if (attr.attribute.equals("xmlns"))
1160 {
1161 break;
1162 }
1163
1164 }
1165
1166 if (attr != null)
1167 {
1168 if (!attr.value.equals(profile))
1169 {
1170 report.warning(this, node, null, Report.INCONSISTENT_NAMESPACE);
1171 attr.value = profile;
1172 }
1173 }
1174 else
1175 {
1176 attr = new AttVal(node.attributes, null, '"', "xmlns", profile);
1177 attr.dict = AttributeTable.getDefaultAttributeTable().findAttribute(attr);
1178 node.attributes = attr;
1179 }
1180 }
1181 }
1182
1183 /***
1184 * Put DOCTYPE declaration between the <:?xml version "1.0" ... ?> declaration, if any, and the
1185 * <code>html</code> tag. Should also work for any comments, etc. that may precede the <code>html</code> tag.
1186 * @param root root node
1187 * @return new doctype node
1188 */
1189 Node newXhtmlDocTypeNode(Node root)
1190 {
1191 Node html = root.findHTML(this.configuration.tt);
1192 if (html == null)
1193 {
1194 return null;
1195 }
1196
1197 Node newdoctype = newNode();
1198 newdoctype.setType(Node.DOCTYPE_TAG);
1199 newdoctype.next = html;
1200 newdoctype.parent = root;
1201 newdoctype.prev = null;
1202
1203 if (html == root.content)
1204 {
1205
1206 root.content.prev = newdoctype;
1207 root.content = newdoctype;
1208 newdoctype.prev = null;
1209 }
1210 else
1211 {
1212
1213 newdoctype.prev = html.prev;
1214 newdoctype.prev.next = newdoctype;
1215 }
1216 html.prev = newdoctype;
1217 return newdoctype;
1218 }
1219
1220 /***
1221 * Adds a new xhtml doctype to the document.
1222 * @param root root node
1223 * @return <code>true</code> if a doctype has been added
1224 */
1225 public boolean setXHTMLDocType(Node root)
1226 {
1227 String fpi = " ";
1228 String sysid = "";
1229 String namespace = XHTML_NAMESPACE;
1230 String dtdsub = null;
1231 Node doctype;
1232 int dtdlen = 0;
1233
1234 doctype = root.findDocType();
1235
1236 fixHTMLNameSpace(root, namespace);
1237
1238 if (this.configuration.docTypeMode == Configuration.DOCTYPE_OMIT)
1239 {
1240 if (doctype != null)
1241 {
1242 Node.discardElement(doctype);
1243 }
1244 return true;
1245 }
1246
1247 if (this.configuration.docTypeMode == Configuration.DOCTYPE_AUTO)
1248 {
1249
1250 if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML40_STRICT))
1251 {
1252
1253 fpi = "-//W3C//DTD XHTML 1.0 Strict//EN";
1254 sysid = VOYAGER_STRICT;
1255 }
1256 else if (TidyUtils.toBoolean(this.versions & Dict.VERS_FRAMESET))
1257 {
1258
1259 fpi = "-//W3C//DTD XHTML 1.0 Frameset//EN";
1260 sysid = VOYAGER_FRAMESET;
1261 }
1262 else if (TidyUtils.toBoolean(this.versions & Dict.VERS_LOOSE))
1263 {
1264 fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN";
1265 sysid = VOYAGER_LOOSE;
1266 }
1267 else if (TidyUtils.toBoolean(this.versions & Dict.VERS_XHTML11))
1268 {
1269
1270 fpi = "-//W3C//DTD XHTML 1.1//EN";
1271 sysid = VOYAGER_11;
1272 }
1273 else
1274 {
1275
1276 fpi = null;
1277 sysid = "";
1278 if (doctype != null)
1279 {
1280 Node.discardElement(doctype);
1281 }
1282 }
1283 }
1284 else if (this.configuration.docTypeMode == Configuration.DOCTYPE_STRICT)
1285 {
1286 fpi = "-//W3C//DTD XHTML 1.0 Strict//EN";
1287 sysid = VOYAGER_STRICT;
1288 }
1289 else if (this.configuration.docTypeMode == Configuration.DOCTYPE_LOOSE)
1290 {
1291 fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN";
1292 sysid = VOYAGER_LOOSE;
1293 }
1294
1295 if (this.configuration.docTypeMode == Configuration.DOCTYPE_USER && this.configuration.docTypeStr != null)
1296 {
1297 fpi = this.configuration.docTypeStr;
1298 sysid = "";
1299 }
1300
1301 if (fpi == null)
1302 {
1303 return false;
1304 }
1305
1306 if (doctype != null)
1307 {
1308
1309 if (configuration.xHTML || configuration.xmlOut)
1310 {
1311
1312 int len = doctype.end - doctype.start + 1;
1313 String start = TidyUtils.getString(this.lexbuf, doctype.start, len);
1314
1315 int dtdbeg = start.indexOf('[');
1316 if (dtdbeg >= 0)
1317 {
1318 int dtdend = start.substring(dtdbeg).indexOf(']');
1319 if (dtdend >= 0)
1320 {
1321 dtdlen = dtdend + 1;
1322 dtdsub = start.substring(dtdbeg);
1323 }
1324 }
1325 }
1326 }
1327 else
1328 {
1329 if ((doctype = newXhtmlDocTypeNode(root)) == null)
1330 {
1331 return false;
1332 }
1333 }
1334
1335 this.txtstart = this.lexsize;
1336 this.txtend = this.lexsize;
1337
1338
1339 addStringLiteral("html PUBLIC ");
1340
1341
1342 if (fpi.charAt(0) == '"')
1343 {
1344 addStringLiteral(fpi);
1345 }
1346 else
1347 {
1348 addStringLiteral("\"");
1349 addStringLiteral(fpi);
1350 addStringLiteral("\"");
1351 }
1352
1353 if (this.configuration.wraplen != 0 && sysid.length() + 6 >= this.configuration.wraplen)
1354 {
1355 addStringLiteral("\n\"");
1356 }
1357 else
1358 {
1359
1360 addStringLiteral(" \"");
1361 }
1362
1363
1364 addStringLiteral(sysid);
1365 addStringLiteral("\"");
1366
1367 if (dtdlen > 0 && dtdsub != null)
1368 {
1369 addCharToLexer(' ');
1370 addStringLiteralLen(dtdsub, dtdlen);
1371 }
1372
1373 this.txtend = this.lexsize;
1374
1375 int length = this.txtend - this.txtstart;
1376 doctype.textarray = new byte[length];
1377
1378 System.arraycopy(this.lexbuf, this.txtstart, doctype.textarray, 0, length);
1379 doctype.start = 0;
1380 doctype.end = length;
1381
1382 return false;
1383 }
1384
1385 /***
1386 * Return the html version used in document.
1387 * @return version code
1388 */
1389 public short apparentVersion()
1390 {
1391 switch (this.doctype)
1392 {
1393 case Dict.VERS_UNKNOWN :
1394 return htmlVersion();
1395
1396 case Dict.VERS_HTML20 :
1397 if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML20))
1398 {
1399 return Dict.VERS_HTML20;
1400 }
1401
1402 break;
1403
1404 case Dict.VERS_HTML32 :
1405 if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML32))
1406 {
1407 return Dict.VERS_HTML32;
1408 }
1409
1410 break;
1411
1412 case Dict.VERS_HTML40_STRICT :
1413 if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML40_STRICT))
1414 {
1415 return Dict.VERS_HTML40_STRICT;
1416 }
1417
1418 break;
1419
1420 case Dict.VERS_HTML40_LOOSE :
1421 if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML40_LOOSE))
1422 {
1423 return Dict.VERS_HTML40_LOOSE;
1424 }
1425
1426 break;
1427
1428 case Dict.VERS_FRAMESET :
1429 if (TidyUtils.toBoolean(this.versions & Dict.VERS_FRAMESET))
1430 {
1431 return Dict.VERS_FRAMESET;
1432 }
1433
1434 break;
1435
1436 case Dict.VERS_XHTML11 :
1437 if (TidyUtils.toBoolean(this.versions & Dict.VERS_XHTML11))
1438 {
1439 return Dict.VERS_XHTML11;
1440 }
1441
1442 break;
1443 default :
1444
1445 break;
1446 }
1447
1448
1449
1450
1451
1452 this.lines = 1;
1453 this.columns = 1;
1454
1455 report.warning(this, null, null, Report.INCONSISTENT_VERSION);
1456 return this.htmlVersion();
1457 }
1458
1459 /***
1460 * Fixup doctype if missing.
1461 * @param root root node
1462 * @return <code>false</code> if current version has not been identified
1463 */
1464 public boolean fixDocType(Node root)
1465 {
1466 Node doctype;
1467 int guessed = Dict.VERS_HTML40_STRICT, i;
1468
1469 if (this.badDoctype)
1470 {
1471 report.warning(this, null, null, Report.MALFORMED_DOCTYPE);
1472 }
1473
1474 doctype = root.findDocType();
1475
1476 if (this.configuration.docTypeMode == Configuration.DOCTYPE_OMIT)
1477 {
1478 if (doctype != null)
1479 {
1480 Node.discardElement(doctype);
1481 }
1482 return true;
1483 }
1484
1485 if (this.configuration.xmlOut)
1486 {
1487 return true;
1488 }
1489
1490 if (this.configuration.docTypeMode == Configuration.DOCTYPE_STRICT)
1491 {
1492 Node.discardElement(doctype);
1493 doctype = null;
1494 guessed = Dict.VERS_HTML40_STRICT;
1495 }
1496 else if (this.configuration.docTypeMode == Configuration.DOCTYPE_LOOSE)
1497 {
1498 Node.discardElement(doctype);
1499 doctype = null;
1500 guessed = Dict.VERS_HTML40_LOOSE;
1501 }
1502 else if (this.configuration.docTypeMode == Configuration.DOCTYPE_AUTO)
1503 {
1504 if (doctype != null)
1505 {
1506 if (this.doctype == Dict.VERS_UNKNOWN)
1507 {
1508 return false;
1509 }
1510
1511 switch (this.doctype)
1512 {
1513 case Dict.VERS_UNKNOWN :
1514 return false;
1515
1516 case Dict.VERS_HTML20 :
1517 if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML20))
1518 {
1519 return true;
1520 }
1521
1522 break;
1523
1524 case Dict.VERS_HTML32 :
1525 if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML32))
1526 {
1527 return true;
1528 }
1529
1530 break;
1531
1532 case Dict.VERS_HTML40_STRICT :
1533 if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML40_STRICT))
1534 {
1535 return true;
1536 }
1537
1538 break;
1539
1540 case Dict.VERS_HTML40_LOOSE :
1541 if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML40_LOOSE))
1542 {
1543 return true;
1544 }
1545
1546 break;
1547
1548 case Dict.VERS_FRAMESET :
1549 if (TidyUtils.toBoolean(this.versions & Dict.VERS_FRAMESET))
1550 {
1551 return true;
1552 }
1553
1554 break;
1555
1556 case Dict.VERS_XHTML11 :
1557 if (TidyUtils.toBoolean(this.versions & Dict.VERS_XHTML11))
1558 {
1559 return true;
1560 }
1561
1562 break;
1563 default :
1564
1565 break;
1566 }
1567
1568
1569 }
1570
1571
1572 guessed = htmlVersion();
1573 }
1574
1575 if (guessed == Dict.VERS_UNKNOWN)
1576 {
1577 return false;
1578 }
1579
1580
1581 if (this.configuration.xmlOut || this.configuration.xmlTags || this.isvoyager)
1582 {
1583 if (doctype != null)
1584 {
1585 Node.discardElement(doctype);
1586 }
1587
1588 fixHTMLNameSpace(root, XHTML_NAMESPACE);
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602 }
1603
1604 if (doctype == null)
1605 {
1606 if ((doctype = newXhtmlDocTypeNode(root)) == null)
1607 {
1608 return false;
1609 }
1610 }
1611
1612 this.txtstart = this.lexsize;
1613 this.txtend = this.lexsize;
1614
1615
1616 addStringLiteral("html PUBLIC ");
1617
1618 if (this.configuration.docTypeMode == Configuration.DOCTYPE_USER
1619 && this.configuration.docTypeStr != null
1620 && this.configuration.docTypeStr.length() > 0)
1621 {
1622
1623 if (this.configuration.docTypeStr.charAt(0) == '"')
1624 {
1625 addStringLiteral(this.configuration.docTypeStr);
1626 }
1627 else
1628 {
1629 addStringLiteral("\"");
1630 addStringLiteral(this.configuration.docTypeStr);
1631 addStringLiteral("\"");
1632 }
1633 }
1634 else if (guessed == Dict.VERS_HTML20)
1635 {
1636 addStringLiteral("\"-//IETF//DTD HTML 2.0//EN\"");
1637 }
1638 else
1639 {
1640 addStringLiteral("\"-//W3C//DTD ");
1641
1642 for (i = 0; i < W3CVERSION.length; ++i)
1643 {
1644 if (guessed == W3CVERSION[i].code)
1645 {
1646 addStringLiteral(W3CVERSION[i].name);
1647 break;
1648 }
1649 }
1650
1651 addStringLiteral("//EN\"");
1652 }
1653
1654 this.txtend = this.lexsize;
1655
1656 int length = this.txtend - this.txtstart;
1657 doctype.textarray = new byte[length];
1658
1659 System.arraycopy(this.lexbuf, this.txtstart, doctype.textarray, 0, length);
1660 doctype.start = 0;
1661 doctype.end = length;
1662
1663 return true;
1664 }
1665
1666 /***
1667 * Ensure XML document starts with <code><?XML version="1.0"?></code>. Add encoding attribute if not using
1668 * ASCII or UTF-8 output.
1669 * @param root root node
1670 * @return always true
1671 */
1672 public boolean fixXmlDecl(Node root)
1673 {
1674 Node xml;
1675 AttVal version;
1676 AttVal encoding;
1677
1678 if (root.content != null && root.content.type == Node.XML_DECL)
1679 {
1680 xml = root.content;
1681 }
1682 else
1683 {
1684 xml = newNode(Node.XML_DECL, this.lexbuf, 0, 0);
1685 xml.next = root.content;
1686
1687 if (root.content != null)
1688 {
1689 root.content.prev = xml;
1690 xml.next = root.content;
1691 }
1692
1693 root.content = xml;
1694 }
1695
1696 version = xml.getAttrByName("version");
1697 encoding = xml.getAttrByName("encoding");
1698
1699
1700
1701 if (encoding == null && this.configuration.getOutCharEncoding() != Configuration.UTF8)
1702 {
1703 if (this.configuration.getOutCharEncoding() == Configuration.LATIN1)
1704 {
1705 xml.addAttribute("encoding", "iso-8859-1");
1706 }
1707 if (this.configuration.getOutCharEncoding() == Configuration.ISO2022)
1708 {
1709 xml.addAttribute("encoding", "iso-2022");
1710 }
1711 }
1712
1713 if (version == null)
1714 {
1715 xml.addAttribute("version", "1.0");
1716 }
1717
1718 return true;
1719 }
1720
1721 /***
1722 * Generates and inserts a new node.
1723 * @param name tag name
1724 * @return generated node
1725 */
1726 public Node inferredTag(String name)
1727 {
1728 Node node;
1729
1730 node = newNode(Node.START_TAG, this.lexbuf, this.txtstart, this.txtend, name);
1731 node.implicit = true;
1732 return node;
1733 }
1734
1735 /***
1736 * Create a text node for the contents of a CDATA element like style or script which ends with </foo> for some
1737 * foo.
1738 * @param container container node
1739 * @return cdata node
1740 */
1741 public Node getCDATA(Node container)
1742 {
1743 int c, lastc, start, len, i;
1744 int qt = 0;
1745 int esc = 0;
1746 String str;
1747 boolean endtag = false;
1748 boolean begtag = false;
1749
1750 if (container.isJavaScript())
1751 {
1752 esc = '//';
1753 }
1754
1755 this.lines = this.in.getCurline();
1756 this.columns = this.in.getCurcol();
1757 this.waswhite = false;
1758 this.txtstart = this.lexsize;
1759 this.txtend = this.lexsize;
1760
1761 lastc = '\0';
1762 start = -1;
1763
1764 while ((c = this.in.readChar()) != StreamIn.END_OF_STREAM)
1765 {
1766
1767 if (qt > 0)
1768 {
1769
1770
1771 if ((c == '\r' || c == '\n' || c == qt) && (!TidyUtils.toBoolean(esc) || lastc != esc))
1772 {
1773 qt = 0;
1774 }
1775 else if (c == '/' && lastc == '<')
1776 {
1777 start = this.lexsize + 1;
1778 }
1779
1780 else if (c == '>' && start >= 0)
1781 {
1782 len = this.lexsize - start;
1783
1784 this.lines = this.in.getCurline();
1785 this.columns = this.in.getCurcol() - 3;
1786
1787 report.warning(this, null, null, Report.BAD_CDATA_CONTENT);
1788
1789
1790 if (TidyUtils.toBoolean(esc))
1791 {
1792 for (i = this.lexsize; i > start - 1; --i)
1793 {
1794 this.lexbuf[i] = this.lexbuf[i - 1];
1795 }
1796
1797 this.lexbuf[start - 1] = (byte) esc;
1798 this.lexsize++;
1799 }
1800
1801 start = -1;
1802 }
1803 }
1804 else if (TidyUtils.isQuote(c) && (!TidyUtils.toBoolean(esc) || lastc != esc))
1805 {
1806 qt = c;
1807 }
1808 else if (c == '<')
1809 {
1810 start = this.lexsize + 1;
1811 endtag = false;
1812 begtag = true;
1813 }
1814 else if (c == '!' && lastc == '<')
1815 {
1816 start = -1;
1817 endtag = false;
1818 begtag = false;
1819 }
1820 else if (c == '/' && lastc == '<')
1821 {
1822 start = this.lexsize + 1;
1823 endtag = true;
1824 begtag = false;
1825 }
1826 else if (c == '>' && start >= 0)
1827 {
1828 int decr = 2;
1829
1830 if (endtag && ((len = this.lexsize - start) == container.element.length()))
1831 {
1832
1833 str = TidyUtils.getString(this.lexbuf, start, len);
1834 if (container.element.equalsIgnoreCase(str))
1835 {
1836 this.txtend = start - decr;
1837 this.lexsize = start - decr;
1838 break;
1839 }
1840 }
1841
1842
1843
1844 this.lines = this.in.getCurline();
1845 this.columns = this.in.getCurcol() - 3;
1846
1847 report.warning(this, null, null, Report.BAD_CDATA_CONTENT);
1848 if (begtag)
1849 {
1850 decr = 1;
1851 }
1852 this.txtend = start - decr;
1853 this.lexsize = start - decr;
1854 break;
1855 }
1856
1857 else if (c == '\r')
1858 {
1859 if (begtag || endtag)
1860 {
1861 continue;
1862 }
1863
1864 c = this.in.readChar();
1865
1866 if (c != '\n')
1867 {
1868 this.in.ungetChar(c);
1869 }
1870
1871 c = '\n';
1872
1873 }
1874 else if ((c == '\n' || c == '\t' || c == ' ') && (begtag || endtag))
1875 {
1876 continue;
1877 }
1878
1879 addCharToLexer(c);
1880 this.txtend = this.lexsize;
1881 lastc = c;
1882 }
1883
1884 if (c == StreamIn.END_OF_STREAM)
1885 {
1886 report.warning(this, container, null, Report.MISSING_ENDTAG_FOR);
1887 }
1888
1889 if (this.txtend > this.txtstart)
1890 {
1891 this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend);
1892 return this.token;
1893 }
1894
1895 return null;
1896 }
1897
1898 /***
1899 *
1900 *
1901 */
1902 public void ungetToken()
1903 {
1904 this.pushed = true;
1905 }
1906
1907 /***
1908 * Gets the next token from the input, driven by the lexer state kept in <code>this.state</code>.
 * <p>
 * A token pushed back with <code>ungetToken()</code> is returned first, except that a pushed text node yields to
 * pending inserted tokens. Text found in front of a markup construct is returned as a text node of its own; the
 * construct itself is then lexed by the following call, which resumes in the state left behind.
 * </p>
1909 * @param mode one of the following:
1910 * <ul>
 * <li><code>IGNORE_WHITESPACE</code>-- leading white space is discarded</li>
1911 * <li><code>MIXED_CONTENT</code>-- runs of white space are compacted to a single space</li>
1912 * <li><code>PREFORMATTED</code>-- white space preserved as is</li>
1913 * <li><code>IGNORE_MARKUP</code>-- for CDATA elements such as script, style: only end tags are recognized</li>
1914 * </ul>
1915 * @return next Node, or <code>null</code> when the input is exhausted and no token is pending
1916 */
1917 public Node getToken(short mode)
1918 {
1919 int c = 0;
1920 int badcomment = 0;
1921 // one element array so that parseAttrs() / parseAttribute() can report an empty tag back
1922 boolean[] isempty = new boolean[1];
1923 boolean inDTDSubset = false;
1924 AttVal attributes = null;
1925
1926 if (this.pushed)
1927 {
1928 // a pushed text node yields to pending inserted tokens, any other pushed token is returned right away
1929 if (this.token.type != Node.TEXT_NODE || (this.insert == -1 && this.inode == null))
1930 {
1931 this.pushed = false;
1932 return this.token;
1933 }
1934 }
1935
1936 // pending inserted tokens are delivered before any new input is read
1937 if (this.insert != -1 || this.inode != null)
1938 {
1939 return insertedToken();
1940 }
1941
1942 this.lines = this.in.getCurline();
1943 this.columns = this.in.getCurcol();
1944 this.waswhite = false;
1945
1946 this.txtstart = this.lexsize;
1947 this.txtend = this.lexsize;
1948
1949 while ((c = this.in.readChar()) != StreamIn.END_OF_STREAM)
1950 {
1951 // a space owed from an earlier call (this.insertspace) is emitted first,
1952 // unless leading white space is being ignored
1953 if (this.insertspace && mode != IGNORE_WHITESPACE)
1954 {
1955 addCharToLexer(' ');
1956 }
1957 if (this.insertspace && (!TidyUtils.toBoolean(mode & IGNORE_WHITESPACE)))
1958 {
1959 this.waswhite = true;
1960 this.insertspace = false;
1961 }
1962
1963 // treat \r\n as \n and \r as \n
1964 if (c == '\r')
1965 {
1966 c = this.in.readChar();
1967
1968 if (c != '\n')
1969 {
1970 this.in.ungetChar(c);
1971 }
1972
1973 c = '\n';
1974 }
1975
1976 addCharToLexer(c);
1977
1978 switch (this.state)
1979 {
1980 case LEX_CONTENT :
1981 // element content
1982
1983 // discard white space at the very start of the text when in IGNORE_WHITESPACE mode:
1984 // the char just added is removed again and the token position is moved on
1985
1986 if (TidyUtils.isWhite((char) c) && (mode == IGNORE_WHITESPACE) && this.lexsize == this.txtstart + 1)
1987 {
1988 --this.lexsize;
1989 this.waswhite = false;
1990 this.lines = this.in.getCurline();
1991 this.columns = this.in.getCurcol();
1992 continue;
1993 }
1994
1995 if (c == '<')
1996 {
1997 this.state = LEX_GT;
1998 continue;
1999 }
2000
2001 if (TidyUtils.isWhite((char) c))
2002 {
2003 // was the previous char white? then drop this one unless white space must be kept
2004 if (this.waswhite)
2005 {
2006 if (mode != PREFORMATTED && mode != IGNORE_MARKUP)
2007 {
2008 --this.lexsize;
2009 this.lines = this.in.getCurline();
2010 this.columns = this.in.getCurcol();
2011 }
2012 }
2013 else
2014 {
2015 // previous char wasn't white: keep this one, turned into a plain space when compacting
2016 this.waswhite = true;
2017
2018 if (mode != PREFORMATTED && mode != IGNORE_MARKUP && c != ' ')
2019 {
2020 changeChar((byte) ' ');
2021 }
2022 }
2023
2024 continue;
2025 }
2026 else if (c == '&' && mode != IGNORE_MARKUP)
2027 {
2028 parseEntity(mode);
2029 }
2030
2031 // non-white content was seen: leave IGNORE_WHITESPACE so that the trimming tied to it no longer applies
2032 if (mode == IGNORE_WHITESPACE)
2033 {
2034 mode = MIXED_CONTENT;
2035 }
2036
2037 this.waswhite = false;
2038 continue;
2039
2040 case LEX_GT :
2041 // the previous char was '<'
2042
2043 // check for an end tag
2044 if (c == '/')
2045 {
2046 c = this.in.readChar();
2047 if (c == StreamIn.END_OF_STREAM)
2048 {
2049 this.in.ungetChar(c);
2050 continue;
2051 }
2052
2053 addCharToLexer(c);
2054
2055 if (TidyUtils.isLetter((char) c))
2056 {
2057 this.lexsize -= 3;
2058 this.txtend = this.lexsize;
2059 this.in.ungetChar(c);
2060 this.state = LEX_ENDTAG;
2061 this.lexbuf[this.lexsize] = (byte) '\0';
2062
2063 // NOTE(review): the reported column is moved back by two,
2064 // presumably to point at the "</" - confirm
2065 this.columns -= 2;
2066
2067 // if there is some text before the "</" return it now, the end tag is lexed by the next call
2068 if (this.txtend > this.txtstart)
2069 {
2070 // trim a space char before the end tag
2071 if (mode == IGNORE_WHITESPACE && this.lexbuf[this.lexsize - 1] == (byte) ' ')
2072 {
2073 this.lexsize -= 1;
2074 this.txtend = this.lexsize;
2075 }
2076
2077 this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend);
2078 return this.token;
2079 }
2080
2081 continue;
2082 }
2083
2084 // "</" not followed by a letter: treat it as ordinary content
2085 this.waswhite = false;
2086 this.state = LEX_CONTENT;
2087 continue;
2088 }
2089
2090 if (mode == IGNORE_MARKUP)
2091 {
2092 // only end tags are recognized in this mode: treat the '<' as content
2093 this.waswhite = false;
2094 this.state = LEX_CONTENT;
2095 continue;
2096 }
2097
2098 // look out for comments, doctype or marked sections
2099 if (c == '!')
2100 {
2101 c = this.in.readChar();
2102
2103 if (c == '-')
2104 {
2105 c = this.in.readChar();
2106
2107 if (c == '-')
2108 {
2109 this.state = LEX_COMMENT;
2110 this.lexsize -= 2;
2111 this.txtend = this.lexsize;
2112
2113 // if there is some text before the '<' return it now
2114 if (this.txtend > this.txtstart)
2115 {
2116 this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend);
2117 return this.token;
2118 }
2119
2120 this.txtstart = this.lexsize;
2121 continue;
2122 }
2123
2124 report.warning(this, null, null, Report.MALFORMED_COMMENT);
2125 }
2126 else if (c == 'd' || c == 'D')
2127 {
2128 this.state = LEX_DOCTYPE;
2129 this.lexsize -= 2;
2130 this.txtend = this.lexsize;
2131 mode = IGNORE_WHITESPACE;
2132
2133 // skip until white space or '>'
2134
2135 for (;;)
2136 {
2137 c = this.in.readChar();
2138
2139 if (c == StreamIn.END_OF_STREAM || c == '>')
2140 {
2141 this.in.ungetChar(c);
2142 break;
2143 }
2144
2145 if (!TidyUtils.isWhite((char) c))
2146 {
2147 continue;
2148 }
2149
2150 // and skip to the end of that white space
2151
2152 for (;;)
2153 {
2154 c = this.in.readChar();
2155
2156 if (c == StreamIn.END_OF_STREAM || c == '>')
2157 {
2158 this.in.ungetChar(c);
2159 break;
2160 }
2161
2162 if (TidyUtils.isWhite((char) c))
2163 {
2164 continue;
2165 }
2166
2167 this.in.ungetChar(c);
2168 break;
2169 }
2170
2171 break;
2172 }
2173
2174 // if there is some text before the '<' return it now
2175 if (this.txtend > this.txtstart)
2176 {
2177 this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend);
2178 return this.token;
2179 }
2180
2181 this.txtstart = this.lexsize;
2182 continue;
2183 }
2184 else if (c == '[')
2185 {
2186 // "<![" starts a marked section, handled by LEX_SECTION
2187 this.lexsize -= 2;
2188 this.state = LEX_SECTION;
2189 this.txtend = this.lexsize;
2190
2191 // if there is some text before the '<' return it now
2192 if (this.txtend > this.txtstart)
2193 {
2194 this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend);
2195 return this.token;
2196 }
2197
2198 this.txtstart = this.lexsize;
2199 continue;
2200 }
2201
2202 // otherwise swallow chars up to and including the next '>'
2203 while (true)
2204 {
2205 c = this.in.readChar();
2206 if (c == '>')
2207 {
2208 break;
2209 }
2210 if (c == -1)
2211 {
2212 this.in.ungetChar(c);
2213 break;
2214 }
2215 }
2216
2217 this.lexsize -= 2;
2218 this.lexbuf[this.lexsize] = (byte) '\0';
2219 this.state = LEX_CONTENT;
2220 continue;
2221 }
2222
2223 // processing instructions: "<?"
2224
2225 if (c == '?')
2226 {
2227 this.lexsize -= 2;
2228 this.state = LEX_PROCINSTR;
2229 this.txtend = this.lexsize;
2230
2231 // if there is some text before the '<' return it now
2232 if (this.txtend > this.txtstart)
2233 {
2234 this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend);
2235 return this.token;
2236 }
2237
2238 this.txtstart = this.lexsize;
2239 continue;
2240 }
2241
2242 // ASP: "<%"
2243 if (c == '%')
2244 {
2245 this.lexsize -= 2;
2246 this.state = LEX_ASP;
2247 this.txtend = this.lexsize;
2248
2249 // if there is some text before the '<' return it now
2250 if (this.txtend > this.txtstart)
2251 {
2252 this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend);
2253 return this.token;
2254 }
2255
2256 this.txtstart = this.lexsize;
2257 continue;
2258 }
2259
2260 // JSTE: "<#"
2261 if (c == '#')
2262 {
2263 this.lexsize -= 2;
2264 this.state = LEX_JSTE;
2265 this.txtend = this.lexsize;
2266
2267 // if there is some text before the '<' return it now
2268 if (this.txtend > this.txtstart)
2269 {
2270 this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend);
2271 return this.token;
2272 }
2273
2274 this.txtstart = this.lexsize;
2275 continue;
2276 }
2277
2278 // start tag: '<' followed by a letter, which is pushed back for LEX_STARTTAG
2279 if (TidyUtils.isLetter((char) c))
2280 {
2281 this.in.ungetChar(c);
2282 this.lexsize -= 2;
2283 this.txtend = this.lexsize;
2284 this.state = LEX_STARTTAG;
2285
2286 // if there is some text before the '<' return it now
2287 if (this.txtend > this.txtstart)
2288 {
2289 this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend);
2290 return this.token;
2291 }
2292
2293 continue;
2294 }
2295
2296 // otherwise treat the '<' as ordinary content
2297 this.state = LEX_CONTENT;
2298 this.waswhite = false;
2299 continue;
2300
2301 case LEX_ENDTAG :
2302 // seen "</" followed by a letter, which is the char just added to the buffer
2303 this.txtstart = this.lexsize - 1;
2304
2305 // NOTE(review): the reported column is moved back by two,
2306 // as is already done when the "</" is first recognized - confirm
2307 this.columns -= 2;
2308
2309 c = parseTagName();
2310 this.token = newNode(Node.END_TAG,
2311 this.lexbuf, this.txtstart, this.txtend, TidyUtils.getString(
2312 this.lexbuf,
2313 this.txtstart,
2314 this.txtend - this.txtstart));
2315 this.lexsize = this.txtstart;
2316 this.txtend = this.txtstart;
2317
2318 // skip to the closing '>'
2319 while (c != '>')
2320 {
2321 c = this.in.readChar();
2322
2323 if (c == StreamIn.END_OF_STREAM)
2324 {
2325 break;
2326 }
2327 }
2328
2329 if (c == StreamIn.END_OF_STREAM)
2330 {
2331 this.in.ungetChar(c);
2332 continue;
2333 }
2334
2335 this.state = LEX_CONTENT;
2336 this.waswhite = false;
2337 return this.token;
2338
2339 case LEX_STARTTAG :
2340 // the first letter of the tag name is the char just added to the buffer
2341 this.txtstart = this.lexsize - 1;
2342 c = parseTagName();
2343 isempty[0] = false;
2344 attributes = null;
2345 this.token = newNode(
2346 (isempty[0] ? Node.START_END_TAG : Node.START_TAG),
2347 this.lexbuf,
2348 this.txtstart,
2349 this.txtend,
2350 TidyUtils.getString(this.lexbuf, this.txtstart, this.txtend - this.txtstart));
2351
2352 // parse the attributes; NOTE(review): parseAttrs() is expected to consume the closing '>' - confirm
2353 if (c != '>')
2354 {
2355 if (c == '/')
2356 {
2357 this.in.ungetChar(c);
2358 }
2359
2360 attributes = parseAttrs(isempty);
2361 }
2362
2363 if (isempty[0])
2364 {
2365 this.token.type = Node.START_END_TAG;
2366 }
2367
2368 this.token.attributes = attributes;
2369 this.lexsize = this.txtstart;
2370 this.txtend = this.txtstart;
2371
2372 // swallow a newline directly following the start tag,
2373 // with a special check for the CRLF sequence;
2374 // this applies only to elements expecting content (and to br)
2375 // and, in PREFORMATTED mode, only when preContent() holds for the token
2376
2377 if (
2378
2379 (mode != PREFORMATTED || preContent(this.token))
2380 && (this.token.expectsContent() || this.token.tag == this.configuration.tt.tagBr))
2381 {
2382
2383 c = this.in.readChar();
2384
2385 if (c == '\r')
2386 {
2387 c = this.in.readChar();
2388
2389 if (c != '\n')
2390 {
2391 this.in.ungetChar(c);
2392 }
2393 }
2394 else if (c != '\n' && c != '\f')
2395 {
2396 this.in.ungetChar(c);
2397 }
2398
2399 this.waswhite = true;
2400 }
2401 else
2402 {
2403 this.waswhite = false;
2404 }
2405
2406 this.state = LEX_CONTENT;
2407
2408 if (this.token.tag == null)
2409 {
2410 report.error(this, null, this.token, Report.UNKNOWN_ELEMENT);
2411 }
2412 else if (!this.configuration.xmlTags)
2413 {
2414 constrainVersion(this.token.tag.versions);
2415
2416 if (TidyUtils.toBoolean(this.token.tag.versions & Dict.VERS_PROPRIETARY))
2417 {
2418 // proprietary element: warn, except for nobr and wbr when makeClean is set
2419 if (this.configuration.makeClean && (this.token.tag != this.configuration.tt.tagNobr &&
2420 this.token.tag != this.configuration.tt.tagWbr))
2421 {
2422 report.warning(this, null, this.token, Report.PROPRIETARY_ELEMENT);
2423 }
2424
2425 else if (!this.configuration.makeClean)
2426 {
2427 report.warning(this, null, this.token, Report.PROPRIETARY_ELEMENT);
2428 }
2429 }
2430
2431 if (this.token.tag.getChkattrs() != null)
2432 {
2433 this.token.tag.getChkattrs().check(this, this.token);
2434 }
2435 else
2436 {
2437 this.token.checkAttributes(this);
2438 }
2439
2440 // duplicate attributes are repaired after the attribute checks above
2441 this.token.repairDuplicateAttributes(this);
2442
2443 }
2444
2445 return this.token;
2446
2447 case LEX_COMMENT :
2448 // seen "<!--" so look for "-->"
2449
2450 if (c != '-')
2451 {
2452 continue;
2453 }
2454
2455 c = this.in.readChar();
2456 addCharToLexer(c);
2457
2458 if (c != '-')
2459 {
2460 continue;
2461 }
2462
2463 end_comment : while (true)
2464 {
2465 c = this.in.readChar();
2466
2467 if (c == '>')
2468 {
2469 if (badcomment != 0)
2470 {
2471 report.warning(this, null, null, Report.MALFORMED_COMMENT);
2472 }
2473
2474 this.txtend = this.lexsize - 2;
2475 this.lexbuf[this.lexsize] = (byte) '\0';
2476 this.state = LEX_CONTENT;
2477 this.waswhite = false;
2478 this.token = newNode(Node.COMMENT_TAG, this.lexbuf, this.txtstart, this.txtend);
2479
2480 // now look for a line break directly after the comment
2481
2482 c = this.in.readChar();
2483
2484 if (c == '\r')
2485 {
2486 c = this.in.readChar();
2487
2488 if (c != '\n')
2489 {
2490 this.token.linebreak = true;
2491 }
2492 }
2493
2494 if (c == '\n')
2495 {
2496 this.token.linebreak = true;
2497 }
2498 else
2499 {
2500 this.in.ungetChar(c);
2501 }
2502
2503 return this.token;
2504 }
2505
2506 // "--" not followed by '>': note the position of the first such error in the comment
2507 if (badcomment == 0)
2508 {
2509 this.lines = this.in.getCurline();
2510 this.columns = this.in.getCurcol() - 3;
2511 }
2512
2513 badcomment++;
2514 if (this.configuration.fixComments)
2515 {
2516 this.lexbuf[this.lexsize - 2] = (byte) '=';
2517 }
2518
2519 addCharToLexer(c);
2520
2521 // if the char is '-' stay in the loop and look for '>' again
2522 if (c != '-')
2523 {
2524 break end_comment;
2525 }
2526
2527 }
2528 // otherwise continue to look for "-->"
2529 this.lexbuf[this.lexsize - 2] = (byte) '=';
2530 continue;
2531
2532 case LEX_DOCTYPE :
2533 // seen "<!d" so look for the '>' ending the doctype,
2534 // compacting runs of white space on the way
2535 if (TidyUtils.isWhite((char) c))
2536 {
2537 if (this.waswhite)
2538 {
2539 this.lexsize -= 1;
2540 }
2541
2542 this.waswhite = true;
2543 }
2544 else
2545 {
2546 this.waswhite = false;
2547 }
2548
2549 if (inDTDSubset)
2550 {
2551 if (c == ']')
2552 {
2553 inDTDSubset = false;
2554 }
2555 }
2556 else if (c == '[')
2557 {
2558 inDTDSubset = true;
2559 }
2560 if (inDTDSubset || c != '>')
2561 {
2562 continue;
2563 }
2564
2565 this.lexsize -= 1;
2566 this.txtend = this.lexsize;
2567 this.lexbuf[this.lexsize] = (byte) '\0';
2568 this.state = LEX_CONTENT;
2569 this.waswhite = false;
2570 this.token = newNode(Node.DOCTYPE_TAG, this.lexbuf, this.txtstart, this.txtend);
2571 // make a note of the version named by the doctype
2572 this.doctype = findGivenVersion(this.token);
2573 return this.token;
2574
2575 case LEX_PROCINSTR :
2576 // seen "<?": once enough of the target is buffered, "php" switches to LEX_PHP
2577 // and "xml" followed by white space switches to LEX_XMLDECL
2578
2579 if (this.lexsize - this.txtstart == 3)
2580 {
2581 if ((TidyUtils.getString(this.lexbuf, this.txtstart, 3)).equals("php"))
2582 {
2583 this.state = LEX_PHP;
2584 continue;
2585 }
2586 }
2587
2588 if (this.lexsize - this.txtstart == 4)
2589 {
2590 if ((TidyUtils.getString(this.lexbuf, this.txtstart, 3)).equals("xml")
2591 && TidyUtils.isWhite((char) this.lexbuf[this.txtstart + 3]))
2592 {
2593 this.state = LEX_XMLDECL;
2594 attributes = null;
2595 continue;
2596 }
2597 }
2598
2599 if (this.configuration.xmlPIs)
2600 {
2601 if (c != '?')
2602 {
2603 continue;
2604 }
2605
2606 // got '?', now look for '>'
2607 c = this.in.readChar();
2608
2609 if (c == StreamIn.END_OF_STREAM)
2610 {
2611 report.warning(this, null, null, Report.UNEXPECTED_END_OF_FILE);
2612 this.in.ungetChar(c);
2613 continue;
2614 }
2615
2616 addCharToLexer(c);
2617 }
2618
2619 if (c != '>')
2620 {
2621 continue;
2622 }
2623
2624 this.lexsize -= 1;
2625 this.txtend = this.lexsize;
2626 this.lexbuf[this.lexsize] = (byte) '\0';
2627 this.state = LEX_CONTENT;
2628 this.waswhite = false;
2629 this.token = newNode(Node.PROC_INS_TAG, this.lexbuf, this.txtstart, this.txtend);
2630 return this.token;
2631
2632 case LEX_ASP :
2633 // seen "<%" so look for "%>"
2634 if (c != '%')
2635 {
2636 continue;
2637 }
2638
2639 // got '%', now look for '>'
2640 c = this.in.readChar();
2641
2642 if (c != '>')
2643 {
2644 this.in.ungetChar(c);
2645 continue;
2646 }
2647
2648 this.lexsize -= 1;
2649 this.txtend = this.lexsize;
2650 this.lexbuf[this.lexsize] = (byte) '\0';
2651 this.state = LEX_CONTENT;
2652 this.waswhite = false;
2653 this.token = newNode(Node.ASP_TAG, this.lexbuf, this.txtstart, this.txtend);
2654 return this.token;
2655
2656 case LEX_JSTE :
2657 // seen "<#" so look for "#>"
2658 if (c != '#')
2659 {
2660 continue;
2661 }
2662
2663 // got '#', now look for '>'
2664 c = this.in.readChar();
2665
2666 if (c != '>')
2667 {
2668 this.in.ungetChar(c);
2669 continue;
2670 }
2671
2672 this.lexsize -= 1;
2673 this.txtend = this.lexsize;
2674 this.lexbuf[this.lexsize] = (byte) '\0';
2675 this.state = LEX_CONTENT;
2676 this.waswhite = false;
2677 this.token = newNode(Node.JSTE_TAG, this.lexbuf, this.txtstart, this.txtend);
2678 return this.token;
2679
2680 case LEX_PHP :
2681 // seen "<?php" so look for "?>"
2682 if (c != '?')
2683 {
2684 continue;
2685 }
2686
2687 // got '?', now look for '>'
2688 c = this.in.readChar();
2689
2690 if (c != '>')
2691 {
2692 this.in.ungetChar(c);
2693 continue;
2694 }
2695
2696 this.lexsize -= 1;
2697 this.txtend = this.lexsize;
2698 this.lexbuf[this.lexsize] = (byte) '\0';
2699 this.state = LEX_CONTENT;
2700 this.waswhite = false;
2701 this.token = newNode(Node.PHP_TAG, this.lexbuf, this.txtstart, this.txtend);
2702 return this.token;
2703
2704 case LEX_XMLDECL :
2705 // seen "<?xml" and white space: collect the pseudo attributes until "?>"
2706 if (TidyUtils.isWhite((char) c) && c != '?')
2707 {
2708 continue;
2709 }
2710
2711 // anything but '?' starts an attribute: push the char back and parse name and value
2712 if (c != '?')
2713 {
2714 String name;
2715 Node[] asp = new Node[1];
2716 Node[] php = new Node[1];
2717 AttVal av = new AttVal();
2718 int[] pdelim = new int[1];
2719 isempty[0] = false;
2720
2721 this.in.ungetChar(c);
2722
2723 name = this.parseAttribute(isempty, asp, php);
2724 av.attribute = name;
2725
2726 av.value = this.parseValue(name, true, isempty, pdelim);
2727 av.delim = pdelim[0];
2728 av.next = attributes;
2729 // the new attribute becomes the head of the list
2730 attributes = av;
2731
2732 }
2733
2734 // now look for '>'
2735 c = this.in.readChar();
2736
2737 if (c != '>')
2738 {
2739 this.in.ungetChar(c);
2740 continue;
2741 }
2742 this.lexsize -= 1;
2743 this.txtend = this.txtstart;
2744 this.lexbuf[this.txtend] = '\0';
2745 this.state = LEX_CONTENT;
2746 this.waswhite = false;
2747 this.token = newNode(Node.XML_DECL, this.lexbuf, this.txtstart, this.txtend);
2748 this.token.attributes = attributes;
2749 return this.token;
2750
2751 case LEX_SECTION :
2752 // seen "<![" so look for "]>", switching to LEX_CDATA when the section starts with "CDATA["
2753 if (c == '[')
2754 {
2755 if (this.lexsize == (this.txtstart + 6)
2756 && (TidyUtils.getString(this.lexbuf, this.txtstart, 6)).equals("CDATA["))
2757 {
2758 this.state = LEX_CDATA;
2759 this.lexsize -= 6;
2760 continue;
2761 }
2762 }
2763
2764 if (c != ']')
2765 {
2766 continue;
2767 }
2768
2769 // got ']', now look for '>'
2770 c = this.in.readChar();
2771
2772 if (c != '>')
2773 {
2774 this.in.ungetChar(c);
2775 continue;
2776 }
2777
2778 this.lexsize -= 1;
2779 this.txtend = this.lexsize;
2780 this.lexbuf[this.lexsize] = (byte) '\0';
2781 this.state = LEX_CONTENT;
2782 this.waswhite = false;
2783 this.token = newNode(Node.SECTION_TAG, this.lexbuf, this.txtstart, this.txtend);
2784 return this.token;
2785
2786 case LEX_CDATA :
2787 // seen "<![CDATA[" so look for "]]>"
2788 if (c != ']')
2789 {
2790 continue;
2791 }
2792
2793 // got ']', now look for the second ']'
2794 c = this.in.readChar();
2795
2796 if (c != ']')
2797 {
2798 this.in.ungetChar(c);
2799 continue;
2800 }
2801
2802 // got "]]", now look for '>'
2803 c = this.in.readChar();
2804
2805 if (c != '>')
2806 {
2807 this.in.ungetChar(c);
2808 continue;
2809 }
2810
2811 this.lexsize -= 1;
2812 this.txtend = this.lexsize;
2813 this.lexbuf[this.lexsize] = (byte) '\0';
2814 this.state = LEX_CONTENT;
2815 this.waswhite = false;
2816 this.token = newNode(Node.CDATA_TAG, this.lexbuf, this.txtstart, this.txtend);
2817 return this.token;
2818
2819 default :
2820
2821 break;
2822 }
2823 }
2824 // end of input: return pending text, or the unterminated comment, if there is any
2825 if (this.state == LEX_CONTENT)
2826 {
2827 this.txtend = this.lexsize;
2828
2829 if (this.txtend > this.txtstart)
2830 {
2831 this.in.ungetChar(c);
2832
2833 if (this.lexbuf[this.lexsize - 1] == (byte) ' ')
2834 {
2835 this.lexsize -= 1;
2836 this.txtend = this.lexsize;
2837 }
2838
2839 this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend);
2840 return this.token;
2841 }
2842 }
2843 else if (this.state == LEX_COMMENT)
2844 {
2845 if (c == StreamIn.END_OF_STREAM)
2846 {
2847 report.warning(this, null, null, Report.MALFORMED_COMMENT);
2848 }
2849
2850 this.txtend = this.lexsize;
2851 this.lexbuf[this.lexsize] = (byte) '\0';
2852 this.state = LEX_CONTENT;
2853 this.waswhite = false;
2854 this.token = newNode(Node.COMMENT_TAG, this.lexbuf, this.txtstart, this.txtend);
2855 return this.token;
2856 }
2857
2858 return null;
2859 }
2860
2861 /***
2862 * parser for ASP within start tags Some people use ASP for to customize attributes Tidy isn't really well suited to
2863 * dealing with ASP This is a workaround for attributes, but won't deal with the case where the ASP is used to
2864 * tailor the attribute value. Here is an example of a work around for using ASP in attribute values:
2865 * <code>href='<%=rsSchool.Fields("ID").Value%>'</code> where the ASP that generates the attribute value is
2866 * masked from Tidy by the quotemarks.
2867 * @return parsed Node
2868 */
2869 public Node parseAsp()
2870 {
2871 int c;
2872 Node asp = null;
2873
2874 this.txtstart = this.lexsize;
2875
2876 while ((c = this.in.readChar()) != StreamIn.END_OF_STREAM)
2877 {
2878
2879 addCharToLexer(c);
2880
2881 if (c != '%')
2882 {
2883 continue;
2884 }
2885
2886 if ((c = this.in.readChar()) == StreamIn.END_OF_STREAM)
2887 {
2888 break;
2889 }
2890 addCharToLexer(c);
2891
2892 if (c == '>')
2893 {
2894 break;
2895 }
2896 }
2897
2898 this.lexsize -= 2;
2899 this.txtend = this.lexsize;
2900
2901 if (this.txtend > this.txtstart)
2902 {
2903 asp = newNode(Node.ASP_TAG, this.lexbuf, this.txtstart, this.txtend);
2904 }
2905
2906 this.txtstart = this.txtend;
2907 return asp;
2908 }
2909
2910 /***
2911 * PHP is like ASP but is based upon XML processing instructions, e.g. <code><?php ... ?></code>.
2912 * @return parsed Node
2913 */
2914 public Node parsePhp()
2915 {
2916 int c;
2917 Node php = null;
2918
2919 this.txtstart = this.lexsize;
2920
2921 while ((c = this.in.readChar()) != StreamIn.END_OF_STREAM)
2922 {
2923 addCharToLexer(c);
2924
2925 if (c != '?')
2926 {
2927 continue;
2928 }
2929
2930 if ((c = this.in.readChar()) == StreamIn.END_OF_STREAM)
2931 {
2932 break;
2933 }
2934 addCharToLexer(c);
2935
2936 if (c == '>')
2937 {
2938 break;
2939 }
2940 }
2941
2942 this.lexsize -= 2;
2943 this.txtend = this.lexsize;
2944
2945 if (this.txtend > this.txtstart)
2946 {
2947 php = newNode(Node.PHP_TAG, this.lexbuf, this.txtstart, this.txtend);
2948 }
2949
2950 this.txtstart = this.txtend;
2951 return php;
2952 }
2953
2954 /***
2955 * consumes the '>' terminating start tags.
2956 * @param isempty flag is passed as array so it can be modified
2957 * @param asp asp Node, passed as array so it can be modified
2958 * @param php php Node, passed as array so it can be modified
2959 * @return parsed attribute
2960 */
2961 public String parseAttribute(boolean[] isempty, Node[] asp, Node[] php)
2962 {
2963 int start = 0;
2964 String attr;
2965 int c = 0;
2966 int lastc = 0;
2967
2968 asp[0] = null;
2969 php[0] = null;
2970
2971
2972 for (;;)
2973 {
2974 c = this.in.readChar();
2975
2976 if (c == '/')
2977 {
2978 c = this.in.readChar();
2979
2980 if (c == '>')
2981 {
2982 isempty[0] = true;
2983 return null;
2984 }
2985
2986 this.in.ungetChar(c);
2987 c = '/';
2988 break;
2989 }
2990
2991 if (c == '>')
2992 {
2993 return null;
2994 }
2995
2996 if (c == '<')
2997 {
2998 c = this.in.readChar();
2999
3000 if (c == '%')
3001 {
3002 asp[0] = parseAsp();
3003 return null;
3004 }
3005 else if (c == '?')
3006 {
3007 php[0] = parsePhp();
3008 return null;
3009 }
3010
3011 this.in.ungetChar(c);
3012 if (this.state != LEX_XMLDECL)
3013 {
3014 this.in.ungetChar('<');
3015 }
3016 report.attrError(this, this.token, null, Report.UNEXPECTED_GT);
3017 return null;
3018 }
3019
3020 if (c == '=')
3021 {
3022 report.attrError(this, this.token, null, Report.UNEXPECTED_EQUALSIGN);
3023 continue;
3024 }
3025
3026 if (c == '"' || c == '\'')
3027 {
3028 report.attrError(this, this.token, null, Report.UNEXPECTED_QUOTEMARK);
3029 continue;
3030 }
3031
3032 if (c == StreamIn.END_OF_STREAM)
3033 {
3034 report.attrError(this, this.token, null, Report.UNEXPECTED_END_OF_FILE);
3035 this.in.ungetChar(c);
3036 return null;
3037 }
3038
3039 if (!TidyUtils.isWhite((char) c))
3040 {
3041 break;
3042 }
3043 }
3044
3045 start = this.lexsize;
3046 lastc = c;
3047
3048 for (;;)
3049 {
3050
3051 if (c == '=' || c == '>')
3052 {
3053 this.in.ungetChar(c);
3054 break;
3055 }
3056
3057 if (c == '<' || c == StreamIn.END_OF_STREAM)
3058 {
3059 this.in.ungetChar(c);
3060 break;
3061 }
3062 if (lastc == '-' && (c == '"' || c == '\''))
3063 {
3064 this.lexsize--;
3065 this.in.ungetChar(c);
3066 break;
3067 }
3068 if (TidyUtils.isWhite((char) c))
3069 {
3070 break;
3071 }
3072
3073
3074
3075
3076 if (!this.configuration.xmlTags && TidyUtils.isUpper((char) c))
3077 {
3078 c = TidyUtils.toLower((char) c);
3079 }
3080
3081
3082 addCharToLexer(c);
3083
3084 lastc = c;
3085 c = this.in.readChar();
3086 }
3087
3088
3089 int len = this.lexsize - start;
3090 attr = (len > 0 ? TidyUtils.getString(this.lexbuf, start, len) : null);
3091 this.lexsize = start;
3092
3093 return attr;
3094 }
3095
3096 /***
3097 * Invoked when < is seen in place of attribute value but terminates on whitespace if not ASP, PHP or Tango this
3098 * routine recognizes ' and " quoted strings.
3099 * @return delimiter
3100 */
3101 public int parseServerInstruction()
3102 {
3103 int c, delim = '"';
3104 boolean isrule = false;
3105
3106 c = this.in.readChar();
3107 addCharToLexer(c);
3108
3109
3110 if (c == '%' || c == '?' || c == '@')
3111 {
3112 isrule = true;
3113 }
3114
3115 for (;;)
3116 {
3117 c = this.in.readChar();
3118
3119 if (c == StreamIn.END_OF_STREAM)
3120 {
3121 break;
3122 }
3123
3124 if (c == '>')
3125 {
3126 if (isrule)
3127 {
3128 addCharToLexer(c);
3129 }
3130 else
3131 {
3132 this.in.ungetChar(c);
3133 }
3134
3135 break;
3136 }
3137
3138
3139
3140 if (!isrule)
3141 {
3142 if (TidyUtils.isWhite((char) c))
3143 {
3144 break;
3145 }
3146 }
3147
3148 addCharToLexer(c);
3149
3150 if (c == '"')
3151 {
3152 do
3153 {
3154 c = this.in.readChar();
3155
3156 if (endOfInput())
3157 {
3158 report.attrError(this, this.token, null, Report.UNEXPECTED_END_OF_FILE);
3159 this.in.ungetChar(c);
3160 return 0;
3161 }
3162 if (c == '>')
3163 {
3164 this.in.ungetChar(c);
3165 report.attrError(this, this.token, null, Report.UNEXPECTED_GT);
3166 return 0;
3167 }
3168
3169 addCharToLexer(c);
3170 }
3171 while (c != '"');
3172 delim = '\'';
3173 continue;
3174 }
3175
3176 if (c == '\'')
3177 {
3178 do
3179 {
3180 c = this.in.readChar();
3181
3182 if (endOfInput())
3183 {
3184 report.attrError(this, this.token, null, Report.UNEXPECTED_END_OF_FILE);
3185 this.in.ungetChar(c);
3186 return 0;
3187 }
3188 if (c == '>')
3189 {
3190 this.in.ungetChar(c);
3191 report.attrError(this, this.token, null, Report.UNEXPECTED_GT);
3192 return 0;
3193 }
3194
3195 addCharToLexer(c);
3196 }
3197 while (c != '\'');
3198 }
3199 }
3200
3201 return delim;
3202 }
3203
3204 /***
3205 * Parse an attribute value.
3206 * @param name attribute name
3207 * @param foldCase fold case?
3208 * @param isempty is attribute empty? Passed as an array reference to allow modification
3209 * @param pdelim delimiter, passed as an array reference to allow modification
3210 * @return parsed value
3211 */
3212 public String parseValue(String name, boolean foldCase, boolean[] isempty, int[] pdelim)
3213 {
3214
3215
3216
3217 int len = 0;
3218 int start;
3219 boolean seenGt = false;
3220 boolean munge = true;
3221 int c = 0;
3222 int lastc, delim, quotewarning;
3223 String value;
3224
3225 delim = 0;
3226 pdelim[0] = '"';
3227
3228
3229
3230
3231 if (this.configuration.literalAttribs)
3232 {
3233 munge = false;
3234 }
3235
3236
3237 while (true)
3238 {
3239 c = this.in.readChar();
3240
3241 if (c == StreamIn.END_OF_STREAM)
3242 {
3243 this.in.ungetChar(c);
3244 break;
3245 }
3246
3247 if (!TidyUtils.isWhite((char) c))
3248 {
3249 break;
3250 }
3251 }
3252
3253
3254
3255 if (c != '=' && c != '"' && c != '\'')
3256 {
3257 this.in.ungetChar(c);
3258 return null;
3259 }
3260
3261
3262
3263 while (true)
3264 {
3265 c = this.in.readChar();
3266
3267 if (c == StreamIn.END_OF_STREAM)
3268 {
3269 this.in.ungetChar(c);
3270 break;
3271 }
3272
3273 if (!TidyUtils.isWhite((char) c))
3274 {
3275 break;
3276 }
3277 }
3278
3279
3280
3281 if (c == '"' || c == '\'')
3282 {
3283 delim = c;
3284 }
3285 else if (c == '<')
3286 {
3287 start = this.lexsize;
3288 addCharToLexer(c);
3289 pdelim[0] = parseServerInstruction();
3290 len = this.lexsize - start;
3291 this.lexsize = start;
3292 return (len > 0 ? TidyUtils.getString(this.lexbuf, start, len) : null);
3293 }
3294 else
3295 {
3296 this.in.ungetChar(c);
3297 }
3298
3299
3300
3301 quotewarning = 0;
3302 start = this.lexsize;
3303 c = '\0';
3304
3305 while (true)
3306 {
3307 lastc = c;
3308 c = this.in.readChar();
3309
3310 if (c == StreamIn.END_OF_STREAM)
3311 {
3312 report.attrError(this, this.token, null, Report.UNEXPECTED_END_OF_FILE);
3313 this.in.ungetChar(c);
3314 break;
3315 }
3316
3317 if (delim == (char) 0)
3318 {
3319 if (c == '>')
3320 {
3321 this.in.ungetChar(c);
3322 break;
3323 }
3324
3325 if (c == '"' || c == '\'')
3326 {
3327 report.attrError(this, this.token, null, Report.UNEXPECTED_QUOTEMARK);
3328 break;
3329 }
3330
3331 if (c == '<')
3332 {
3333 this.in.ungetChar(c);
3334 c = '>';
3335 this.in.ungetChar(c);
3336 report.attrError(this, this.token, null, Report.UNEXPECTED_GT);
3337 break;
3338 }
3339
3340
3341
3342
3343
3344 if (c == '/')
3345 {
3346
3347 c = this.in.readChar();
3348
3349 if (c == '>' && !AttributeTable.getDefaultAttributeTable().isUrl(name))
3350 {
3351 isempty[0] = true;
3352 this.in.ungetChar(c);
3353 break;
3354 }
3355
3356
3357 this.in.ungetChar(c);
3358 c = '/';
3359 }
3360 }
3361 else
3362 {
3363
3364 if (c == delim)
3365 {
3366 break;
3367 }
3368
3369
3370
3371 if (c == '\r')
3372 {
3373 c = this.in.readChar();
3374 if (c != '\n')
3375 {
3376 this.in.ungetChar(c);
3377 }
3378
3379 c = '\n';
3380 }
3381
3382 if (c == '\n' || c == '<' || c == '>')
3383 {
3384 ++quotewarning;
3385 }
3386
3387 if (c == '>')
3388 {
3389 seenGt = true;
3390 }
3391 }
3392
3393 if (c == '&')
3394 {
3395
3396 if ("id".equalsIgnoreCase(name))
3397 {
3398 report.attrError(this, null, null, Report.ENTITY_IN_ID);
3399 continue;
3400 }
3401
3402 addCharToLexer(c);
3403 parseEntity((short) 0);
3404 continue;
3405
3406 }
3407
3408
3409
3410 if (c == '//')
3411 {
3412 c = this.in.readChar();
3413
3414 if (c != '\n')
3415 {
3416 this.in.ungetChar(c);
3417 c = '//';
3418 }
3419 }
3420
3421 if (TidyUtils.isWhite((char) c))
3422 {
3423 if (delim == (char) 0)
3424 {
3425 break;
3426 }
3427
3428 if (munge)
3429 {
3430
3431
3432 if (c == '\n' && AttributeTable.getDefaultAttributeTable().isUrl(name))
3433 {
3434
3435 report.attrError(this, this.token, null, Report.NEWLINE_IN_URI);
3436 continue;
3437 }
3438
3439 c = ' ';
3440
3441 if (lastc == ' ')
3442 {
3443 continue;
3444 }
3445 }
3446 }
3447 else if (foldCase && TidyUtils.isUpper((char) c))
3448 {
3449 c = TidyUtils.toLower((char) c);
3450 }
3451
3452 addCharToLexer(c);
3453 }
3454
3455 if (quotewarning > 10 && seenGt && munge)
3456 {
3457
3458
3459
3460
3461 if (!AttributeTable.getDefaultAttributeTable().isScript(name)
3462 && !(AttributeTable.getDefaultAttributeTable().isUrl(name) && "javascript:".equals(TidyUtils.getString(
3463 this.lexbuf,
3464 start,
3465 11)))
3466 && !"<xml ".equals(TidyUtils.getString(this.lexbuf, start, 5)))
3467
3468 {
3469 report.error(this, null, null, Report.SUSPECTED_MISSING_QUOTE);
3470 }
3471 }
3472
3473 len = this.lexsize - start;
3474 this.lexsize = start;
3475
3476 if (len > 0 || delim != 0)
3477 {
3478
3479
3480
3481
3482 if (munge && !TidyUtils.isInValuesIgnoreCase(new String[]{"alt", "title", "value", "prompt"}, name))
3483 {
3484 while (TidyUtils.isWhite((char) this.lexbuf[start + len - 1]))
3485 {
3486 --len;
3487 }
3488
3489 while (TidyUtils.isWhite((char) this.lexbuf[start]) && start < len)
3490 {
3491 ++start;
3492 --len;
3493 }
3494 }
3495
3496 value = TidyUtils.getString(this.lexbuf, start, len);
3497 }
3498 else
3499 {
3500 value = null;
3501 }
3502
3503
3504 if (delim != 0)
3505 {
3506 pdelim[0] = delim;
3507 }
3508 else
3509 {
3510 pdelim[0] = '"';
3511 }
3512
3513 return value;
3514 }
3515
3516 /***
3517 * Check if attr is a valid name.
3518 * @param attr String to check, must be non-null
3519 * @return <code>true</code> if attr is a valid name.
3520 */
3521 public static boolean isValidAttrName(String attr)
3522 {
3523 char c;
3524 int i;
3525
3526
3527 c = attr.charAt(0);
3528
3529 if (!TidyUtils.isLetter(c))
3530 {
3531 return false;
3532 }
3533
3534
3535 for (i = 1; i < attr.length(); i++)
3536 {
3537 c = attr.charAt(i);
3538
3539 if (TidyUtils.isNamechar(c))
3540 {
3541 continue;
3542 }
3543
3544 return false;
3545 }
3546
3547 return true;
3548 }
3549
3550 /***
3551 * In CSS1, selectors can contain only the characters A-Z, 0-9, and Unicode characters 161-255, plus dash (-); they
3552 * cannot start with a dash or a digit; they can also contain escaped characters and any Unicode character as a
3553 * numeric code (see next item). The backslash followed by at most four hexadecimal digits (0..9A..F) stands for the
3554 * Unicode character with that number. Any character except a hexadecimal digit can be escaped to remove its special
3555 * meaning, by putting a backslash in front.
3556 * @param buf css selector name
3557 * @return <code>true</code> if the given string is a valid css1 selector name
3558 */
3559 public static boolean isCSS1Selector(String buf)
3560 {
3561 if (buf == null)
3562 {
3563 return false;
3564 }
3565
3566
3567 boolean valid = true;
3568 int esclen = 0;
3569 char c;
3570 int pos;
3571
3572 for (pos = 0; valid && pos < buf.length(); ++pos)
3573 {
3574 c = buf.charAt(pos);
3575 if (c == '//')
3576 {
3577 esclen = 1;
3578 }
3579 else if (Character.isDigit(c))
3580 {
3581
3582 if (esclen > 0)
3583 {
3584 valid = (++esclen < 6);
3585 }
3586 if (valid)
3587 {
3588 valid = (pos > 0 || esclen > 0);
3589 }
3590 }
3591 else
3592 {
3593 valid = (esclen > 0
3594 || (pos > 0 && c == '-')
3595 || Character.isLetter(c)
3596 || (c >= 161 && c <= 255));
3597 esclen = 0;
3598 }
3599 }
3600 return valid;
3601 }
3602
3603 /***
3604 * Parse tag attributes.
3605 * @param isempty is tag empty?
3606 * @return parsed attribute/value list
3607 */
3608 public AttVal parseAttrs(boolean[] isempty)
3609 {
3610 AttVal av, list;
3611 String attribute, value;
3612 int[] delim = new int[1];
3613 Node[] asp = new Node[1];
3614 Node[] php = new Node[1];
3615
3616 list = null;
3617
3618 while (!endOfInput())
3619 {
3620 attribute = parseAttribute(isempty, asp, php);
3621
3622 if (attribute == null)
3623 {
3624
3625 if (asp[0] != null)
3626 {
3627 av = new AttVal(list, null, asp[0], null, '\0', null, null);
3628 list = av;
3629 continue;
3630 }
3631
3632
3633 if (php[0] != null)
3634 {
3635 av = new AttVal(list, null, null, php[0], '\0', null, null);
3636 list = av;
3637 continue;
3638 }
3639
3640 break;
3641 }
3642
3643 value = parseValue(attribute, false, isempty, delim);
3644
3645 if (attribute != null && isValidAttrName(attribute))
3646 {
3647 av = new AttVal(list, null, null, null, delim[0], attribute, value);
3648 av.dict = AttributeTable.getDefaultAttributeTable().findAttribute(av);
3649 list = av;
3650 }
3651 else
3652 {
3653 av = new AttVal(null, null, null, null, 0, attribute, value);
3654
3655
3656 if (value != null)
3657 {
3658 report.attrError(this, this.token, av, Report.BAD_ATTRIBUTE_VALUE);
3659 }
3660 else if (TidyUtils.lastChar(attribute) == '"')
3661 {
3662 report.attrError(this, this.token, av, Report.MISSING_QUOTEMARK);
3663 }
3664 else
3665 {
3666 report.attrError(this, this.token, av, Report.UNKNOWN_ATTRIBUTE);
3667 }
3668 }
3669 }
3670
3671 return list;
3672 }
3673
3674 /***
3675 * Push a copy of an inline node onto stack but don't push if implicit or OBJECT or APPLET (implicit tags are ones
3676 * generated from the istack) One issue arises with pushing inlines when the tag is already pushed. For instance:
3677 * <code><p><em> text <p><em> more text</code> Shouldn't be mapped to
3678 * <code><p><em> text </em></p><p><em><em> more text </em></em></code>
3679 * @param node Node to be pushed
3680 */
3681 public void pushInline(Node node)
3682 {
3683 IStack is;
3684
3685 if (node.implicit)
3686 {
3687 return;
3688 }
3689
3690 if (node.tag == null)
3691 {
3692 return;
3693 }
3694
3695 if (!TidyUtils.toBoolean(node.tag.model & Dict.CM_INLINE))
3696 {
3697 return;
3698 }
3699
3700 if (TidyUtils.toBoolean(node.tag.model & Dict.CM_OBJECT))
3701 {
3702 return;
3703 }
3704
3705 if (node.tag != this.configuration.tt.tagFont && isPushed(node))
3706 {
3707 return;
3708 }
3709
3710
3711 is = new IStack();
3712 is.tag = node.tag;
3713 is.element = node.element;
3714 if (node.attributes != null)
3715 {
3716 is.attributes = cloneAttributes(node.attributes);
3717 }
3718 this.istack.push(is);
3719 }
3720
3721 /***
3722 * Pop a copy of an inline node from the stack.
3723 * @param node Node to be popped
3724 */
3725 public void popInline(Node node)
3726 {
3727 IStack is;
3728
3729 if (node != null)
3730 {
3731
3732 if (node.tag == null)
3733 {
3734 return;
3735 }
3736
3737 if (!TidyUtils.toBoolean(node.tag.model & Dict.CM_INLINE))
3738 {
3739 return;
3740 }
3741
3742 if (TidyUtils.toBoolean(node.tag.model & Dict.CM_OBJECT))
3743 {
3744 return;
3745 }
3746
3747
3748 if (node.tag == this.configuration.tt.tagA)
3749 {
3750
3751 while (this.istack.size() > 0)
3752 {
3753 is = (IStack) this.istack.pop();
3754 if (is.tag == this.configuration.tt.tagA)
3755 {
3756 break;
3757 }
3758 }
3759
3760 if (this.insert >= this.istack.size())
3761 {
3762 this.insert = -1;
3763 }
3764 return;
3765 }
3766 }
3767
3768 if (this.istack.size() > 0)
3769 {
3770 is = (IStack) this.istack.pop();
3771 if (this.insert >= this.istack.size())
3772 {
3773 this.insert = -1;
3774 }
3775 }
3776 }
3777
3778 /***
3779 * Is the node in the stack?
3780 * @param node Node
3781 * @return <code>true</code> is the node is found in the stack
3782 */
3783 public boolean isPushed(Node node)
3784 {
3785 int i;
3786 IStack is;
3787
3788 for (i = this.istack.size() - 1; i >= 0; --i)
3789 {
3790 is = (IStack) this.istack.elementAt(i);
3791 if (is.tag == node.tag)
3792 {
3793 return true;
3794 }
3795 }
3796
3797 return false;
3798 }
3799
3800 /***
3801 * This has the effect of inserting "missing" inline elements around the contents of blocklevel elements such as P,
3802 * TD, TH, DIV, PRE etc. This procedure is called at the start of ParseBlock. When the inline stack is not empty, as
3803 * will be the case in: <code><i><h1>italic heading</h1></i></code> which is then treated as
3804 * equivalent to <code><h1><i>italic heading</i></h1></code> This is implemented by setting the lexer
3805 * into a mode where it gets tokens from the inline stack rather than from the input stream.
3806 * @param node original node
3807 * @return stack size
3808 */
3809 public int inlineDup(Node node)
3810 {
3811 int n;
3812
3813 n = this.istack.size() - this.istackbase;
3814 if (n > 0)
3815 {
3816 this.insert = this.istackbase;
3817 this.inode = node;
3818 }
3819
3820 return n;
3821 }
3822
3823 /***
3824 * @return
3825 */
3826 public Node insertedToken()
3827 {
3828 Node node;
3829 IStack is;
3830 int n;
3831
3832
3833 if (this.insert == -1)
3834 {
3835 node = this.inode;
3836 this.inode = null;
3837 return node;
3838 }
3839
3840
3841 if (this.inode == null)
3842 {
3843 this.lines = this.in.getCurline();
3844 this.columns = this.in.getCurcol();
3845 }
3846
3847 node = newNode(Node.START_TAG, this.lexbuf, this.txtstart, this.txtend);
3848
3849
3850 node.implicit = true;
3851 is = (IStack) this.istack.elementAt(this.insert);
3852 node.element = is.element;
3853 node.tag = is.tag;
3854 if (is.attributes != null)
3855 {
3856 node.attributes = cloneAttributes(is.attributes);
3857 }
3858
3859
3860 n = this.insert;
3861
3862
3863 if (++n < this.istack.size())
3864 {
3865 this.insert = n;
3866 }
3867 else
3868 {
3869 this.insert = -1;
3870 }
3871
3872 return node;
3873 }
3874
3875 /***
3876 * Can the given element be removed?
3877 * @param element node
3878 * @return <code>true</code> if he element can be removed
3879 */
3880 public boolean canPrune(Node element)
3881 {
3882 if (element.type == Node.TEXT_NODE)
3883 {
3884 return true;
3885 }
3886
3887 if (element.content != null)
3888 {
3889 return false;
3890 }
3891
3892 if (element.tag == this.configuration.tt.tagA && element.attributes != null)
3893 {
3894 return false;
3895 }
3896
3897 if (element.tag == this.configuration.tt.tagP && !this.configuration.dropEmptyParas)
3898 {
3899 return false;
3900 }
3901
3902 if (element.tag == null)
3903 {
3904 return false;
3905 }
3906
3907 if (TidyUtils.toBoolean(element.tag.model & Dict.CM_ROW))
3908 {
3909 return false;
3910 }
3911
3912 if (TidyUtils.toBoolean(element.tag.model & Dict.CM_EMPTY))
3913 {
3914 return false;
3915 }
3916
3917 if (element.tag == this.configuration.tt.tagApplet)
3918 {
3919 return false;
3920 }
3921
3922 if (element.tag == this.configuration.tt.tagObject)
3923 {
3924 return false;
3925 }
3926
3927 if (element.tag == this.configuration.tt.tagScript && element.getAttrByName("src") != null)
3928 {
3929 return false;
3930 }
3931
3932
3933 if (element.tag == this.configuration.tt.tagTitle)
3934 {
3935 return false;
3936 }
3937
3938
3939 if (element.tag == this.configuration.tt.tagIframe)
3940 {
3941 return false;
3942 }
3943
3944 if (element.getAttrByName("id") != null || element.getAttrByName("name") != null)
3945 {
3946 return false;
3947 }
3948
3949 return true;
3950 }
3951
3952 /***
3953 * duplicate name attribute as an id and check if id and name match.
3954 * @param node Node to check for name/it attributes
3955 */
3956 public void fixId(Node node)
3957 {
3958 AttVal name = node.getAttrByName("name");
3959 AttVal id = node.getAttrByName("id");
3960
3961 if (name != null)
3962 {
3963 if (id != null)
3964 {
3965 if (id.value != null && !id.value.equals(name.value))
3966 {
3967 report.attrError(this, node, name, Report.ID_NAME_MISMATCH);
3968 }
3969 }
3970 else if (this.configuration.xmlOut)
3971 {
3972 node.addAttribute("id", name.value);
3973 }
3974 }
3975 }
3976
3977 /***
3978 * Defer duplicates when entering a table or other element where the inlines shouldn't be duplicated.
3979 */
3980 public void deferDup()
3981 {
3982 this.insert = -1;
3983 this.inode = null;
3984 }
3985
3986 /***
3987 * Constraint the html version in the document to the given one. Everything is allowed in proprietary version of
3988 * HTML this is handled here rather than in the tag/attr dicts.
3989 * @param vers html version code
3990 */
3991 void constrainVersion(int vers)
3992 {
3993 this.versions &= (vers | Dict.VERS_PROPRIETARY);
3994 }
3995
3996 /***
3997 * Is content acceptable for pre elements?
3998 * @param node content
3999 * @return <code>true</code> if node is acceptable in pre elements
4000 */
4001 protected boolean preContent(Node node)
4002 {
4003
4004 if (node.tag == this.configuration.tt.tagP)
4005 {
4006 return true;
4007 }
4008
4009 if (node.tag == null
4010 || node.tag == this.configuration.tt.tagP
4011 || !TidyUtils.toBoolean(node.tag.model & (Dict.CM_INLINE | Dict.CM_NEW)))
4012 {
4013 return false;
4014 }
4015 return true;
4016 }
4017
4018 /***
4019 * document type.
4020 */
4021 private static class W3CVersionInfo
4022 {
4023
4024 /***
4025 * name.
4026 */
4027 String name;
4028
4029 /***
4030 * voyager name.
4031 */
4032 String voyagerName;
4033
4034 /***
4035 * profile.
4036 */
4037 String profile;
4038
4039 /***
4040 * code.
4041 */
4042 short code;
4043
4044 /***
4045 * Instantiates a new W3CVersionInfo.
4046 * @param name version name
4047 * @param voyagerName voyager (xhtml) name
4048 * @param profile VOYAGER_STRICT | VOYAGER_LOOSE | VOYAGER_FRAMESET
4049 * @param code unique code for this version info
4050 */
4051 public W3CVersionInfo(String name, String voyagerName, String profile, short code)
4052 {
4053 this.name = name;
4054 this.voyagerName = voyagerName;
4055 this.profile = profile;
4056 this.code = code;
4057 }
4058 }
4059
4060 }