1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54 package org.w3c.tidy;
55
56 /**
57 * HTML Parser implementation.
58 * @author Dave Raggett <a href="mailto:dsr@w3.org">dsr@w3.org </a>
59 * @author Andy Quick <a href="mailto:ac.quick@sympatico.ca">ac.quick@sympatico.ca </a> (translation to Java)
60 * @author Fabrizio Giustina
61 * @version $Revision: 806 $ ($Author: fgiust $)
62 */
63 public final class ParserImpl
64 {
65
66 /**
67 * parser for html.
68 */
69 public static final Parser HTML = new ParseHTML();
70
71 /**
72 * parser for head.
73 */
74 public static final Parser HEAD = new ParseHead();
75
76 /**
77 * parser for title.
78 */
79 public static final Parser TITLE = new ParseTitle();
80
81 /**
82 * parser for script.
83 */
84 public static final Parser SCRIPT = new ParseScript();
85
86 /**
87 * parser for body.
88 */
89 public static final Parser BODY = new ParseBody();
90
91 /**
92 * parser for frameset.
93 */
94 public static final Parser FRAMESET = new ParseFrameSet();
95
96 /**
97 * parser for inline.
98 */
99 public static final Parser INLINE = new ParseInline();
100
101 /**
102 * parser for list.
103 */
104 public static final Parser LIST = new ParseList();
105
106 /**
107 * parser for definition lists.
108 */
109 public static final Parser DEFLIST = new ParseDefList();
110
111 /**
112 * parser for pre.
113 */
114 public static final Parser PRE = new ParsePre();
115
116 /**
117 * parser for block elements.
118 */
119 public static final Parser BLOCK = new ParseBlock();
120
121 /**
122 * parser for table.
123 */
124 public static final Parser TABLETAG = new ParseTableTag();
125
126 /**
127 * parser for colgroup.
128 */
129 public static final Parser COLGROUP = new ParseColGroup();
130
131 /**
132 * parser for rowgroup.
133 */
134 public static final Parser ROWGROUP = new ParseRowGroup();
135
136 /**
137 * parser for row.
138 */
139 public static final Parser ROW = new ParseRow();
140
141 /**
142 * parser for noframes.
143 */
144 public static final Parser NOFRAMES = new ParseNoFrames();
145
146 /**
147 * parser for select.
148 */
149 public static final Parser SELECT = new ParseSelect();
150
151 /**
152 * parser for text.
153 */
154 public static final Parser TEXT = new ParseText();
155
156 /**
157 * parser for empty elements.
158 */
159 public static final Parser EMPTY = new ParseEmpty();
160
161 /**
162 * parser for optgroup.
163 */
164 public static final Parser OPTGROUP = new ParseOptGroup();
165
/**
 * Private constructor: ParserImpl only exposes static members and should not be instantiated.
 */
private ParserImpl()
{
    // intentionally empty
}
173
174 /**
175 * @param lexer
176 * @param node
177 * @param mode
178 */
179 protected static void parseTag(Lexer lexer, Node node, short mode)
180 {
181
182
183 if ((node.tag.model & Dict.CM_EMPTY) != 0)
184 {
185 lexer.waswhite = false;
186 }
187 else if (!((node.tag.model & Dict.CM_INLINE) != 0))
188 {
189 lexer.insertspace = false;
190 }
191
192 if (node.tag.getParser() == null)
193 {
194 return;
195 }
196
197 if (node.type == Node.START_END_TAG)
198 {
199 Node.trimEmptyElement(lexer, node);
200 return;
201 }
202
203 node.tag.getParser().parse(lexer, node, mode);
204 }
205
206 /**
207 * Move node to the head, where element is used as starting point in hunt for head. Normally called during parsing.
208 * @param lexer
209 * @param element
210 * @param node
211 */
212 protected static void moveToHead(Lexer lexer, Node element, Node node)
213 {
214 Node head;
215 node.removeNode();
216
217 TagTable tt = lexer.configuration.tt;
218
219 if (node.type == Node.START_TAG || node.type == Node.START_END_TAG)
220 {
221 lexer.report.warning(lexer, element, node, Report.TAG_NOT_ALLOWED_IN);
222
223 while (element.tag != tt.tagHtml)
224 {
225 element = element.parent;
226 }
227
228 for (head = element.content; head != null; head = head.next)
229 {
230 if (head.tag == tt.tagHead)
231 {
232 head.insertNodeAtEnd(node);
233 break;
234 }
235 }
236
237 if (node.tag.getParser() != null)
238 {
239 parseTag(lexer, node, Lexer.IGNORE_WHITESPACE);
240 }
241 }
242 else
243 {
244 lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
245 }
246 }
247
248 /**
249 * moves given node to end of body element.
250 * @param lexer Lexer
251 * @param node Node to insert
252 */
253 static void moveNodeToBody(Lexer lexer, Node node)
254 {
255 node.removeNode();
256 Node body = lexer.root.findBody(lexer.configuration.tt);
257 body.insertNodeAtEnd(node);
258 }
259
260 /**
261 * Parser for HTML.
262 */
263 public static class ParseHTML implements Parser
264 {
265
266 /**
267 * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
268 */
269 public void parse(Lexer lexer, Node html, short mode)
270 {
271 Node node, head;
272 Node frameset = null;
273 Node noframes = null;
274
275 lexer.configuration.xmlTags = false;
276 lexer.seenEndBody = false;
277 TagTable tt = lexer.configuration.tt;
278
279 while (true)
280 {
281 node = lexer.getToken(Lexer.IGNORE_WHITESPACE);
282
283 if (node == null)
284 {
285 node = lexer.inferredTag("head");
286 break;
287 }
288
289 if (node.tag == tt.tagHead)
290 {
291 break;
292 }
293
294 if (node.tag == html.tag && node.type == Node.END_TAG)
295 {
296 lexer.report.warning(lexer, html, node, Report.DISCARDING_UNEXPECTED);
297 continue;
298 }
299
300
301 if (Node.insertMisc(html, node))
302 {
303 continue;
304 }
305
306 lexer.ungetToken();
307 node = lexer.inferredTag("head");
308 break;
309 }
310
311 head = node;
312 html.insertNodeAtEnd(head);
313 HEAD.parse(lexer, head, mode);
314
315 while (true)
316 {
317 node = lexer.getToken(Lexer.IGNORE_WHITESPACE);
318
319 if (node == null)
320 {
321 if (frameset == null)
322 {
323
324 node = lexer.inferredTag("body");
325 html.insertNodeAtEnd(node);
326 BODY.parse(lexer, node, mode);
327 }
328
329 return;
330 }
331
332
333 if (node.tag == html.tag)
334 {
335 if (node.type != Node.START_TAG && frameset == null)
336 {
337 lexer.report.warning(lexer, html, node, Report.DISCARDING_UNEXPECTED);
338 }
339 else if (node.type == Node.END_TAG)
340 {
341 lexer.seenEndHtml = true;
342 }
343
344 continue;
345 }
346
347
348 if (Node.insertMisc(html, node))
349 {
350 continue;
351 }
352
353
354 if (node.tag == tt.tagBody)
355 {
356 if (node.type != Node.START_TAG)
357 {
358 lexer.report.warning(lexer, html, node, Report.DISCARDING_UNEXPECTED);
359 continue;
360 }
361
362 if (frameset != null)
363 {
364 lexer.ungetToken();
365
366 if (noframes == null)
367 {
368 noframes = lexer.inferredTag("noframes");
369 frameset.insertNodeAtEnd(noframes);
370 lexer.report.warning(lexer, html, noframes, Report.INSERTING_TAG);
371 }
372
373 parseTag(lexer, noframes, mode);
374 continue;
375 }
376
377 lexer.constrainVersion(~Dict.VERS_FRAMESET);
378 break;
379 }
380
381
382 if (node.tag == tt.tagFrameset)
383 {
384 if (node.type != Node.START_TAG)
385 {
386 lexer.report.warning(lexer, html, node, Report.DISCARDING_UNEXPECTED);
387 continue;
388 }
389
390 if (frameset != null)
391 {
392 lexer.report.error(lexer, html, node, Report.DUPLICATE_FRAMESET);
393 }
394 else
395 {
396 frameset = node;
397 }
398
399 html.insertNodeAtEnd(node);
400 parseTag(lexer, node, mode);
401
402
403
404 for (node = frameset.content; node != null; node = node.next)
405 {
406 if (node.tag == tt.tagNoframes)
407 {
408 noframes = node;
409 }
410 }
411 continue;
412 }
413
414
415 if (node.tag == tt.tagNoframes)
416 {
417 if (node.type != Node.START_TAG)
418 {
419 lexer.report.warning(lexer, html, node, Report.DISCARDING_UNEXPECTED);
420 continue;
421 }
422
423 if (frameset == null)
424 {
425 lexer.report.warning(lexer, html, node, Report.DISCARDING_UNEXPECTED);
426 node = lexer.inferredTag("body");
427 break;
428 }
429
430 if (noframes == null)
431 {
432 noframes = node;
433 frameset.insertNodeAtEnd(noframes);
434 }
435
436 parseTag(lexer, noframes, mode);
437 continue;
438 }
439
440 if (node.type == Node.START_TAG || node.type == Node.START_END_TAG)
441 {
442 if (node.tag != null && (node.tag.model & Dict.CM_HEAD) != 0)
443 {
444 moveToHead(lexer, html, node);
445 continue;
446 }
447
448
449 if (frameset != null && node.tag == tt.tagFrame)
450 {
451 lexer.report.warning(lexer, html, node, Report.DISCARDING_UNEXPECTED);
452 continue;
453 }
454 }
455
456 lexer.ungetToken();
457
458
459 if (frameset != null)
460 {
461 if (noframes == null)
462 {
463 noframes = lexer.inferredTag("noframes");
464 frameset.insertNodeAtEnd(noframes);
465 }
466 else
467 {
468 lexer.report.warning(lexer, html, node, Report.NOFRAMES_CONTENT);
469 }
470
471 lexer.constrainVersion(Dict.VERS_FRAMESET);
472 parseTag(lexer, noframes, mode);
473 continue;
474 }
475
476 node = lexer.inferredTag("body");
477 lexer.constrainVersion(~Dict.VERS_FRAMESET);
478 break;
479 }
480
481
482 html.insertNodeAtEnd(node);
483 parseTag(lexer, node, mode);
484 lexer.seenEndHtml = true;
485 }
486
487 }
488
489 /**
490 * Parser for HEAD.
491 */
492 public static class ParseHead implements Parser
493 {
494
495 /**
496 * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
497 */
498 public void parse(Lexer lexer, Node head, short mode)
499 {
500 Node node;
501 int hasTitle = 0;
502 int hasBase = 0;
503 TagTable tt = lexer.configuration.tt;
504
505 while ((node = lexer.getToken(Lexer.IGNORE_WHITESPACE)) != null)
506 {
507 if (node.tag == head.tag && node.type == Node.END_TAG)
508 {
509 head.closed = true;
510 break;
511 }
512
513 if (node.type == Node.TEXT_NODE)
514 {
515 lexer.report.warning(lexer, head, node, Report.TAG_NOT_ALLOWED_IN);
516 lexer.ungetToken();
517 break;
518 }
519
520
521 if (Node.insertMisc(head, node))
522 {
523 continue;
524 }
525
526 if (node.type == Node.DOCTYPE_TAG)
527 {
528 Node.insertDocType(lexer, head, node);
529 continue;
530 }
531
532
533 if (node.tag == null)
534 {
535 lexer.report.warning(lexer, head, node, Report.DISCARDING_UNEXPECTED);
536 continue;
537 }
538
539 if (!TidyUtils.toBoolean(node.tag.model & Dict.CM_HEAD))
540 {
541
542 if (lexer.isvoyager)
543 {
544 lexer.report.warning(lexer, head, node, Report.TAG_NOT_ALLOWED_IN);
545 }
546 lexer.ungetToken();
547 break;
548 }
549
550 if (node.type == Node.START_TAG || node.type == Node.START_END_TAG)
551 {
552 if (node.tag == tt.tagTitle)
553 {
554 ++hasTitle;
555
556 if (hasTitle > 1)
557 {
558 lexer.report.warning(lexer, head, node, Report.TOO_MANY_ELEMENTS);
559 }
560 }
561 else if (node.tag == tt.tagBase)
562 {
563 ++hasBase;
564
565 if (hasBase > 1)
566 {
567 lexer.report.warning(lexer, head, node, Report.TOO_MANY_ELEMENTS);
568 }
569 }
570 else if (node.tag == tt.tagNoscript)
571 {
572 lexer.report.warning(lexer, head, node, Report.TAG_NOT_ALLOWED_IN);
573 }
574
575 head.insertNodeAtEnd(node);
576 parseTag(lexer, node, Lexer.IGNORE_WHITESPACE);
577 continue;
578 }
579
580
581 lexer.report.warning(lexer, head, node, Report.DISCARDING_UNEXPECTED);
582 }
583
584 if (hasTitle == 0)
585 {
586 if (!lexer.configuration.bodyOnly)
587 {
588 lexer.report.warning(lexer, head, null, Report.MISSING_TITLE_ELEMENT);
589 }
590 head.insertNodeAtEnd(lexer.inferredTag("title"));
591 }
592 }
593
594 }
595
596 /**
597 * Parser for TITLE.
598 */
599 public static class ParseTitle implements Parser
600 {
601
602 /**
603 * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
604 */
605 public void parse(Lexer lexer, Node title, short mode)
606 {
607 Node node;
608
609 while ((node = lexer.getToken(Lexer.MIXED_CONTENT)) != null)
610 {
611
612 if (node.tag == title.tag && node.type == Node.START_TAG)
613 {
614 lexer.report.warning(lexer, title, node, Report.COERCE_TO_ENDTAG);
615 node.type = Node.END_TAG;
616 continue;
617 }
618 else if (node.tag == title.tag && node.type == Node.END_TAG)
619 {
620 title.closed = true;
621 Node.trimSpaces(lexer, title);
622 return;
623 }
624
625 if (node.type == Node.TEXT_NODE)
626 {
627
628 if (title.content == null)
629 {
630 Node.trimInitialSpace(lexer, title, node);
631 }
632
633 if (node.start >= node.end)
634 {
635 continue;
636 }
637
638 title.insertNodeAtEnd(node);
639 continue;
640 }
641
642
643 if (Node.insertMisc(title, node))
644 {
645 continue;
646 }
647
648
649 if (node.tag == null)
650 {
651 lexer.report.warning(lexer, title, node, Report.DISCARDING_UNEXPECTED);
652 continue;
653 }
654
655
656 lexer.report.warning(lexer, title, node, Report.MISSING_ENDTAG_BEFORE);
657 lexer.ungetToken();
658 Node.trimSpaces(lexer, title);
659 return;
660 }
661
662 lexer.report.warning(lexer, title, node, Report.MISSING_ENDTAG_FOR);
663 }
664
665 }
666
667 /**
668 * Parser for SCRIPT.
669 */
670 public static class ParseScript implements Parser
671 {
672
673 /**
674 * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
675 */
676 public void parse(Lexer lexer, Node script, short mode)
677 {
678
679
680
681
682 Node node = lexer.getCDATA(script);
683
684 if (node != null)
685 {
686 script.insertNodeAtEnd(node);
687 }
688 }
689
690 }
691
692 /**
693 * Parser for BODY.
694 */
695 public static class ParseBody implements Parser
696 {
697
698 /**
699 * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
700 */
701 public void parse(Lexer lexer, Node body, short mode)
702 {
703 Node node;
704 boolean checkstack, iswhitenode;
705
706 mode = Lexer.IGNORE_WHITESPACE;
707 checkstack = true;
708 TagTable tt = lexer.configuration.tt;
709
710 Clean.bumpObject(lexer, body.parent);
711
712 while ((node = lexer.getToken(mode)) != null)
713 {
714
715
716 if (node.tag == tt.tagHtml)
717 {
718 if (node.type == Node.START_TAG || node.type == Node.START_END_TAG || lexer.seenEndHtml)
719 {
720 lexer.report.warning(lexer, body, node, Report.DISCARDING_UNEXPECTED);
721 }
722 else
723 {
724 lexer.seenEndHtml = true;
725 }
726
727 continue;
728 }
729
730 if (lexer.seenEndBody
731 && (node.type == Node.START_TAG || node.type == Node.END_TAG || node.type == Node.START_END_TAG))
732 {
733 lexer.report.warning(lexer, body, node, Report.CONTENT_AFTER_BODY);
734 }
735
736 if (node.tag == body.tag && node.type == Node.END_TAG)
737 {
738 body.closed = true;
739 Node.trimSpaces(lexer, body);
740 lexer.seenEndBody = true;
741 mode = Lexer.IGNORE_WHITESPACE;
742
743 if (body.parent.tag == tt.tagNoframes)
744 {
745 break;
746 }
747
748 continue;
749 }
750
751 if (node.tag == tt.tagNoframes)
752 {
753 if (node.type == Node.START_TAG)
754 {
755 body.insertNodeAtEnd(node);
756 BLOCK.parse(lexer, node, mode);
757 continue;
758 }
759
760 if (node.type == Node.END_TAG && body.parent.tag == tt.tagNoframes)
761 {
762 Node.trimSpaces(lexer, body);
763 lexer.ungetToken();
764 break;
765 }
766 }
767
768 if ((node.tag == tt.tagFrame || node.tag == tt.tagFrameset) && body.parent.tag == tt.tagNoframes)
769 {
770 Node.trimSpaces(lexer, body);
771 lexer.ungetToken();
772 break;
773 }
774
775 iswhitenode = false;
776
777 if (node.type == Node.TEXT_NODE
778 && node.end <= node.start + 1
779 && node.textarray[node.start] == (byte) ' ')
780 {
781 iswhitenode = true;
782 }
783
784
785 if (Node.insertMisc(body, node))
786 {
787 continue;
788 }
789
790
791
792
793
794
795
796
797
798 if (node.type == Node.TEXT_NODE)
799 {
800 if (iswhitenode && mode == Lexer.IGNORE_WHITESPACE)
801 {
802 continue;
803 }
804
805 if (lexer.configuration.encloseBodyText && !iswhitenode)
806 {
807 Node para;
808
809 lexer.ungetToken();
810 para = lexer.inferredTag("p");
811 body.insertNodeAtEnd(para);
812 parseTag(lexer, para, mode);
813 mode = Lexer.MIXED_CONTENT;
814 continue;
815 }
816
817
818 lexer.constrainVersion(~(Dict.VERS_HTML40_STRICT | Dict.VERS_HTML20));
819
820 if (checkstack)
821 {
822 checkstack = false;
823
824 if (lexer.inlineDup(node) > 0)
825 {
826 continue;
827 }
828 }
829
830 body.insertNodeAtEnd(node);
831 mode = Lexer.MIXED_CONTENT;
832 continue;
833 }
834
835 if (node.type == Node.DOCTYPE_TAG)
836 {
837 Node.insertDocType(lexer, body, node);
838 continue;
839 }
840
841 if (node.tag == null || node.tag == tt.tagParam)
842 {
843 lexer.report.warning(lexer, body, node, Report.DISCARDING_UNEXPECTED);
844 continue;
845 }
846
847
848
849
850 lexer.excludeBlocks = false;
851
852 if ((!((node.tag.model & Dict.CM_BLOCK) != 0) && !((node.tag.model & Dict.CM_INLINE) != 0))
853 || node.tag == tt.tagInput)
854 {
855
856 if (!((node.tag.model & Dict.CM_HEAD) != 0))
857 {
858 lexer.report.warning(lexer, body, node, Report.TAG_NOT_ALLOWED_IN);
859 }
860
861 if ((node.tag.model & Dict.CM_HTML) != 0)
862 {
863
864 if (node.tag == tt.tagBody && body.implicit && body.attributes == null)
865 {
866 body.attributes = node.attributes;
867 node.attributes = null;
868 }
869
870 continue;
871 }
872
873 if ((node.tag.model & Dict.CM_HEAD) != 0)
874 {
875 moveToHead(lexer, body, node);
876 continue;
877 }
878
879 if ((node.tag.model & Dict.CM_LIST) != 0)
880 {
881 lexer.ungetToken();
882 node = lexer.inferredTag("ul");
883 node.addClass("noindent");
884 lexer.excludeBlocks = true;
885 }
886 else if ((node.tag.model & Dict.CM_DEFLIST) != 0)
887 {
888 lexer.ungetToken();
889 node = lexer.inferredTag("dl");
890 lexer.excludeBlocks = true;
891 }
892 else if ((node.tag.model & (Dict.CM_TABLE | Dict.CM_ROWGRP | Dict.CM_ROW)) != 0)
893 {
894 lexer.ungetToken();
895 node = lexer.inferredTag("table");
896 lexer.excludeBlocks = true;
897 }
898 else if (node.tag == tt.tagInput)
899 {
900 lexer.ungetToken();
901 node = lexer.inferredTag("form");
902 lexer.excludeBlocks = true;
903 }
904 else
905 {
906 if (!((node.tag.model & (Dict.CM_ROW | Dict.CM_FIELD)) != 0))
907 {
908 lexer.ungetToken();
909 return;
910 }
911
912
913 continue;
914 }
915 }
916
917 if (node.type == Node.END_TAG)
918 {
919 if (node.tag == tt.tagBr)
920 {
921 node.type = Node.START_TAG;
922 }
923 else if (node.tag == tt.tagP)
924 {
925 Node.coerceNode(lexer, node, tt.tagBr);
926 body.insertNodeAtEnd(node);
927 node = lexer.inferredTag("br");
928 }
929 else if ((node.tag.model & Dict.CM_INLINE) != 0)
930 {
931 lexer.popInline(node);
932 }
933 }
934
935 if (node.type == Node.START_TAG || node.type == Node.START_END_TAG)
936 {
937 if (((node.tag.model & Dict.CM_INLINE) != 0) && !((node.tag.model & Dict.CM_MIXED) != 0))
938 {
939
940
941 if (node.tag == tt.tagImg)
942 {
943 lexer.constrainVersion(~Dict.VERS_HTML40_STRICT);
944 }
945 else
946 {
947 lexer.constrainVersion(~(Dict.VERS_HTML40_STRICT | Dict.VERS_HTML20));
948 }
949
950 if (checkstack && !node.implicit)
951 {
952 checkstack = false;
953
954 if (lexer.inlineDup(node) > 0)
955 {
956 continue;
957 }
958 }
959
960 mode = Lexer.MIXED_CONTENT;
961 }
962 else
963 {
964 checkstack = true;
965 mode = Lexer.IGNORE_WHITESPACE;
966 }
967
968 if (node.implicit)
969 {
970 lexer.report.warning(lexer, body, node, Report.INSERTING_TAG);
971 }
972
973 body.insertNodeAtEnd(node);
974 parseTag(lexer, node, mode);
975 continue;
976 }
977
978
979 lexer.report.warning(lexer, body, node, Report.DISCARDING_UNEXPECTED);
980 }
981 }
982
983 }
984
985 /**
986 * Parser for FRAMESET.
987 */
988 public static class ParseFrameSet implements Parser
989 {
990
991 /**
992 * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
993 */
994 public void parse(Lexer lexer, Node frameset, short mode)
995 {
996 Node node;
997 TagTable tt = lexer.configuration.tt;
998
999 lexer.badAccess |= Report.USING_FRAMES;
1000
1001 while ((node = lexer.getToken(Lexer.IGNORE_WHITESPACE)) != null)
1002 {
1003 if (node.tag == frameset.tag && node.type == Node.END_TAG)
1004 {
1005 frameset.closed = true;
1006 Node.trimSpaces(lexer, frameset);
1007 return;
1008 }
1009
1010
1011 if (Node.insertMisc(frameset, node))
1012 {
1013 continue;
1014 }
1015
1016 if (node.tag == null)
1017 {
1018 lexer.report.warning(lexer, frameset, node, Report.DISCARDING_UNEXPECTED);
1019 continue;
1020 }
1021
1022 if (node.type == Node.START_TAG || node.type == Node.START_END_TAG)
1023 {
1024 if (node.tag != null && (node.tag.model & Dict.CM_HEAD) != 0)
1025 {
1026 moveToHead(lexer, frameset, node);
1027 continue;
1028 }
1029 }
1030
1031 if (node.tag == tt.tagBody)
1032 {
1033 lexer.ungetToken();
1034 node = lexer.inferredTag("noframes");
1035 lexer.report.warning(lexer, frameset, node, Report.INSERTING_TAG);
1036 }
1037
1038 if (node.type == Node.START_TAG && (node.tag.model & Dict.CM_FRAMES) != 0)
1039 {
1040 frameset.insertNodeAtEnd(node);
1041 lexer.excludeBlocks = false;
1042 parseTag(lexer, node, Lexer.MIXED_CONTENT);
1043 continue;
1044 }
1045 else if (node.type == Node.START_END_TAG && (node.tag.model & Dict.CM_FRAMES) != 0)
1046 {
1047 frameset.insertNodeAtEnd(node);
1048 continue;
1049 }
1050
1051
1052 lexer.report.warning(lexer, frameset, node, Report.DISCARDING_UNEXPECTED);
1053 }
1054
1055 lexer.report.warning(lexer, frameset, node, Report.MISSING_ENDTAG_FOR);
1056 }
1057
1058 }
1059
1060 /**
1061 * Parser for INLINE.
1062 */
1063 public static class ParseInline implements Parser
1064 {
1065
1066 /**
1067 * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
1068 */
1069 public void parse(Lexer lexer, Node element, short mode)
1070 {
1071 Node node, parent;
1072 TagTable tt = lexer.configuration.tt;
1073
1074 if (TidyUtils.toBoolean(element.tag.model & Dict.CM_EMPTY))
1075 {
1076 return;
1077 }
1078
1079
1080
1081
1082
1083
1084 if (TidyUtils.toBoolean(element.tag.model & Dict.CM_BLOCK) || (element.tag == tt.tagDt))
1085 {
1086 lexer.inlineDup(null);
1087 }
1088 else if (TidyUtils.toBoolean(element.tag.model & Dict.CM_INLINE))
1089 {
1090
1091 lexer.pushInline(element);
1092 }
1093
1094 if (element.tag == tt.tagNobr)
1095 {
1096 lexer.badLayout |= Report.USING_NOBR;
1097 }
1098 else if (element.tag == tt.tagFont)
1099 {
1100 lexer.badLayout |= Report.USING_FONT;
1101 }
1102
1103
1104 if (mode != Lexer.PREFORMATTED)
1105 {
1106 mode = Lexer.MIXED_CONTENT;
1107 }
1108
1109 while ((node = lexer.getToken(mode)) != null)
1110 {
1111
1112 if (node.tag == element.tag && node.type == Node.END_TAG)
1113 {
1114 if (TidyUtils.toBoolean(element.tag.model & Dict.CM_INLINE))
1115 {
1116 lexer.popInline(node);
1117 }
1118
1119 if (!TidyUtils.toBoolean(mode & Lexer.PREFORMATTED))
1120 {
1121 Node.trimSpaces(lexer, element);
1122 }
1123
1124
1125
1126
1127 if (element.tag == tt.tagFont && element.content != null && element.content == element.last)
1128 {
1129 Node child = element.content;
1130
1131 if (child.tag == tt.tagA)
1132 {
1133 child.parent = element.parent;
1134 child.next = element.next;
1135 child.prev = element.prev;
1136
1137 if (child.prev != null)
1138 {
1139 child.prev.next = child;
1140 }
1141 else
1142 {
1143 child.parent.content = child;
1144 }
1145
1146 if (child.next != null)
1147 {
1148 child.next.prev = child;
1149 }
1150 else
1151 {
1152 child.parent.last = child;
1153 }
1154
1155 element.next = null;
1156 element.prev = null;
1157 element.parent = child;
1158 element.content = child.content;
1159 element.last = child.last;
1160 child.content = element;
1161 child.last = element;
1162 for (child = element.content; child != null; child = child.next)
1163 {
1164 child.parent = element;
1165 }
1166 }
1167 }
1168 element.closed = true;
1169 Node.trimSpaces(lexer, element);
1170 Node.trimEmptyElement(lexer, element);
1171 return;
1172 }
1173
1174
1175
1176
1177 if (node.type == Node.START_TAG
1178 && node.tag == element.tag
1179 && lexer.isPushed(node)
1180 && !node.implicit
1181 && !element.implicit
1182 && node.tag != null
1183 && ((node.tag.model & Dict.CM_INLINE) != 0)
1184 && node.tag != tt.tagA
1185 && node.tag != tt.tagFont
1186 && node.tag != tt.tagBig
1187 && node.tag != tt.tagSmall
1188 && node.tag != tt.tagQ)
1189 {
1190 if (element.content != null && node.attributes == null)
1191 {
1192 lexer.report.warning(lexer, element, node, Report.COERCE_TO_ENDTAG);
1193 node.type = Node.END_TAG;
1194 lexer.ungetToken();
1195 continue;
1196 }
1197
1198 lexer.report.warning(lexer, element, node, Report.NESTED_EMPHASIS);
1199 }
1200 else if (lexer.isPushed(node) && node.type == Node.START_TAG && node.tag == tt.tagQ)
1201 {
1202 lexer.report.warning(lexer, element, node, Report.NESTED_QUOTATION);
1203 }
1204
1205 if (node.type == Node.TEXT_NODE)
1206 {
1207
1208 if (element.content == null && !TidyUtils.toBoolean(mode & Lexer.PREFORMATTED))
1209 {
1210 Node.trimSpaces(lexer, element);
1211 }
1212
1213 if (node.start >= node.end)
1214 {
1215 continue;
1216 }
1217
1218 element.insertNodeAtEnd(node);
1219 continue;
1220 }
1221
1222
1223 if (Node.insertMisc(element, node))
1224 {
1225 continue;
1226 }
1227
1228
1229 if (node.tag == tt.tagHtml)
1230 {
1231 if (node.type == Node.START_TAG || node.type == Node.START_END_TAG)
1232 {
1233 lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
1234 continue;
1235 }
1236
1237
1238 lexer.ungetToken();
1239 if (!((mode & Lexer.PREFORMATTED) != 0))
1240 {
1241 Node.trimSpaces(lexer, element);
1242 }
1243 Node.trimEmptyElement(lexer, element);
1244 return;
1245 }
1246
1247
1248 if (node.tag == tt.tagP
1249 && node.type == Node.START_TAG
1250 && ((mode & Lexer.PREFORMATTED) != 0 || element.tag == tt.tagDt || element.isDescendantOf(tt.tagDt)))
1251 {
1252 node.tag = tt.tagBr;
1253 node.element = "br";
1254 Node.trimSpaces(lexer, element);
1255 element.insertNodeAtEnd(node);
1256 continue;
1257 }
1258
1259
1260 if (node.tag == null || node.tag == tt.tagParam)
1261 {
1262 lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
1263 continue;
1264 }
1265
1266 if (node.tag == tt.tagBr && node.type == Node.END_TAG)
1267 {
1268 node.type = Node.START_TAG;
1269 }
1270
1271 if (node.type == Node.END_TAG)
1272 {
1273
1274 if (node.tag == tt.tagBr)
1275 {
1276 node.type = Node.START_TAG;
1277 }
1278 else if (node.tag == tt.tagP)
1279 {
1280
1281 if (!element.isDescendantOf(tt.tagP))
1282 {
1283 Node.coerceNode(lexer, node, tt.tagBr);
1284 Node.trimSpaces(lexer, element);
1285 element.insertNodeAtEnd(node);
1286 node = lexer.inferredTag("br");
1287 continue;
1288 }
1289 }
1290 else if ((node.tag.model & Dict.CM_INLINE) != 0
1291 && node.tag != tt.tagA
1292 && !((node.tag.model & Dict.CM_OBJECT) != 0)
1293 && (element.tag.model & Dict.CM_INLINE) != 0)
1294 {
1295
1296 lexer.popInline(element);
1297
1298 if (element.tag != tt.tagA)
1299 {
1300 if (node.tag == tt.tagA && node.tag != element.tag)
1301 {
1302 lexer.report.warning(lexer, element, node, Report.MISSING_ENDTAG_BEFORE);
1303 lexer.ungetToken();
1304 }
1305 else
1306 {
1307 lexer.report.warning(lexer, element, node, Report.NON_MATCHING_ENDTAG);
1308 }
1309
1310 if (!((mode & Lexer.PREFORMATTED) != 0))
1311 {
1312 Node.trimSpaces(lexer, element);
1313 }
1314 Node.trimEmptyElement(lexer, element);
1315 return;
1316 }
1317
1318
1319 lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
1320 continue;
1321 }
1322 else if (lexer.exiled && node.tag.model != 0 && (node.tag.model & Dict.CM_TABLE) != 0)
1323 {
1324 lexer.ungetToken();
1325 Node.trimSpaces(lexer, element);
1326 Node.trimEmptyElement(lexer, element);
1327 return;
1328 }
1329 }
1330
1331
1332 if ((node.tag.model & Dict.CM_HEADING) != 0 && (element.tag.model & Dict.CM_HEADING) != 0)
1333 {
1334 if (node.tag == element.tag)
1335 {
1336 lexer.report.warning(lexer, element, node, Report.NON_MATCHING_ENDTAG);
1337 }
1338 else
1339 {
1340 lexer.report.warning(lexer, element, node, Report.MISSING_ENDTAG_BEFORE);
1341 lexer.ungetToken();
1342 }
1343 if (!((mode & Lexer.PREFORMATTED) != 0))
1344 {
1345 Node.trimSpaces(lexer, element);
1346 }
1347 Node.trimEmptyElement(lexer, element);
1348 return;
1349 }
1350
1351
1352
1353
1354
1355 if (node.tag == tt.tagA
1356 && !node.implicit
1357 && (element.tag == tt.tagA || element.isDescendantOf(tt.tagA)))
1358 {
1359
1360
1361
1362
1363 if (node.type != Node.END_TAG && node.attributes == null)
1364 {
1365 node.type = Node.END_TAG;
1366 lexer.report.warning(lexer, element, node, Report.COERCE_TO_ENDTAG);
1367
1368 lexer.ungetToken();
1369 continue;
1370 }
1371
1372 lexer.ungetToken();
1373 lexer.report.warning(lexer, element, node, Report.MISSING_ENDTAG_BEFORE);
1374
1375 if (!((mode & Lexer.PREFORMATTED) != 0))
1376 {
1377 Node.trimSpaces(lexer, element);
1378 }
1379 Node.trimEmptyElement(lexer, element);
1380 return;
1381 }
1382
1383 if ((element.tag.model & Dict.CM_HEADING) != 0)
1384 {
1385 if (node.tag == tt.tagCenter || node.tag == tt.tagDiv)
1386 {
1387 if (node.type != Node.START_TAG && node.type != Node.START_END_TAG)
1388 {
1389 lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
1390 continue;
1391 }
1392
1393 lexer.report.warning(lexer, element, node, Report.TAG_NOT_ALLOWED_IN);
1394
1395
1396 if (element.content == null)
1397 {
1398 Node.insertNodeAsParent(element, node);
1399 continue;
1400 }
1401
1402
1403 element.insertNodeAfterElement(node);
1404
1405 if (!((mode & Lexer.PREFORMATTED) != 0))
1406 {
1407 Node.trimSpaces(lexer, element);
1408 }
1409
1410 element = lexer.cloneNode(element);
1411 element.start = lexer.lexsize;
1412 element.end = lexer.lexsize;
1413 node.insertNodeAtEnd(element);
1414 continue;
1415 }
1416
1417 if (node.tag == tt.tagHr)
1418 {
1419 if (node.type != Node.START_TAG && node.type != Node.START_END_TAG)
1420 {
1421 lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
1422 continue;
1423 }
1424
1425 lexer.report.warning(lexer, element, node, Report.TAG_NOT_ALLOWED_IN);
1426
1427
1428 if (element.content == null)
1429 {
1430 Node.insertNodeBeforeElement(element, node);
1431 continue;
1432 }
1433
1434
1435 element.insertNodeAfterElement(node);
1436
1437 if (!((mode & Lexer.PREFORMATTED) != 0))
1438 {
1439 Node.trimSpaces(lexer, element);
1440 }
1441
1442 element = lexer.cloneNode(element);
1443 element.start = lexer.lexsize;
1444 element.end = lexer.lexsize;
1445 node.insertNodeAfterElement(element);
1446 continue;
1447 }
1448 }
1449
1450 if (element.tag == tt.tagDt)
1451 {
1452 if (node.tag == tt.tagHr)
1453 {
1454 Node dd;
1455
1456 if (node.type != Node.START_TAG && node.type != Node.START_END_TAG)
1457 {
1458 lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
1459 continue;
1460 }
1461
1462 lexer.report.warning(lexer, element, node, Report.TAG_NOT_ALLOWED_IN);
1463 dd = lexer.inferredTag("dd");
1464
1465
1466 if (element.content == null)
1467 {
1468 Node.insertNodeBeforeElement(element, dd);
1469 dd.insertNodeAtEnd(node);
1470 continue;
1471 }
1472
1473
1474 element.insertNodeAfterElement(dd);
1475 dd.insertNodeAtEnd(node);
1476
1477 if (!((mode & Lexer.PREFORMATTED) != 0))
1478 {
1479 Node.trimSpaces(lexer, element);
1480 }
1481
1482 element = lexer.cloneNode(element);
1483 element.start = lexer.lexsize;
1484 element.end = lexer.lexsize;
1485 dd.insertNodeAfterElement(element);
1486 continue;
1487 }
1488 }
1489
1490
1491
1492 if (node.type == Node.END_TAG)
1493 {
1494 for (parent = element.parent; parent != null; parent = parent.parent)
1495 {
1496 if (node.tag == parent.tag)
1497 {
1498 if (!((element.tag.model & Dict.CM_OPT) != 0) && !element.implicit)
1499 {
1500 lexer.report.warning(lexer, element, node, Report.MISSING_ENDTAG_BEFORE);
1501 }
1502
1503 if (element.tag == tt.tagA)
1504 {
1505 lexer.popInline(element);
1506 }
1507
1508 lexer.ungetToken();
1509
1510 if (!((mode & Lexer.PREFORMATTED) != 0))
1511 {
1512 Node.trimSpaces(lexer, element);
1513 }
1514
1515 Node.trimEmptyElement(lexer, element);
1516 return;
1517 }
1518 }
1519 }
1520
1521
1522 if (!((node.tag.model & Dict.CM_INLINE) != 0))
1523 {
1524 if (node.type != Node.START_TAG)
1525 {
1526 lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
1527 continue;
1528 }
1529
1530 if (!((element.tag.model & Dict.CM_OPT) != 0))
1531 {
1532 lexer.report.warning(lexer, element, node, Report.MISSING_ENDTAG_BEFORE);
1533 }
1534
1535 if ((node.tag.model & Dict.CM_HEAD) != 0 && !((node.tag.model & Dict.CM_BLOCK) != 0))
1536 {
1537 moveToHead(lexer, element, node);
1538 continue;
1539 }
1540
1541
1542
1543 if (element.tag == tt.tagA)
1544 {
1545 if (node.tag != null && !((node.tag.model & Dict.CM_HEADING) != 0))
1546 {
1547 lexer.popInline(element);
1548 }
1549 else if (!(element.content != null))
1550 {
1551 Node.discardElement(element);
1552 lexer.ungetToken();
1553 return;
1554 }
1555 }
1556
1557 lexer.ungetToken();
1558
1559 if (!((mode & Lexer.PREFORMATTED) != 0))
1560 {
1561 Node.trimSpaces(lexer, element);
1562 }
1563
1564 Node.trimEmptyElement(lexer, element);
1565 return;
1566 }
1567
1568
1569 if (node.type == Node.START_TAG || node.type == Node.START_END_TAG)
1570 {
1571 if (node.implicit)
1572 {
1573 lexer.report.warning(lexer, element, node, Report.INSERTING_TAG);
1574 }
1575
1576
1577 if (node.tag == tt.tagBr)
1578 {
1579 Node.trimSpaces(lexer, element);
1580 }
1581
1582 element.insertNodeAtEnd(node);
1583 parseTag(lexer, node, mode);
1584 continue;
1585 }
1586
1587
1588 lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
1589 continue;
1590 }
1591
1592 if (!((element.tag.model & Dict.CM_OPT) != 0))
1593 {
1594 lexer.report.warning(lexer, element, node, Report.MISSING_ENDTAG_FOR);
1595 }
1596
1597 Node.trimEmptyElement(lexer, element);
1598 }
1599 }
1600
1601 /**
1602 * Parser for LIST.
1603 */
1604 public static class ParseList implements Parser
1605 {
1606
1607 public void parse(Lexer lexer, Node list, short mode)
1608 {
1609 Node node;
1610 Node parent;
1611 TagTable tt = lexer.configuration.tt;
1612
1613 if ((list.tag.model & Dict.CM_EMPTY) != 0)
1614 {
1615 return;
1616 }
1617
1618 lexer.insert = -1;
1619
1620 while ((node = lexer.getToken(Lexer.IGNORE_WHITESPACE)) != null)
1621 {
1622 if (node.tag == list.tag && node.type == Node.END_TAG)
1623 {
1624 if ((list.tag.model & Dict.CM_OBSOLETE) != 0)
1625 {
1626 Node.coerceNode(lexer, list, tt.tagUl);
1627 }
1628
1629 list.closed = true;
1630 Node.trimEmptyElement(lexer, list);
1631 return;
1632 }
1633
1634
1635 if (Node.insertMisc(list, node))
1636 {
1637 continue;
1638 }
1639
1640 if (node.type != Node.TEXT_NODE && node.tag == null)
1641 {
1642 lexer.report.warning(lexer, list, node, Report.DISCARDING_UNEXPECTED);
1643 continue;
1644 }
1645
1646
1647
1648 if (node.type == Node.END_TAG)
1649 {
1650 if (node.tag == tt.tagForm)
1651 {
1652 badForm(lexer);
1653 lexer.report.warning(lexer, list, node, Report.DISCARDING_UNEXPECTED);
1654 continue;
1655 }
1656
1657 if (node.tag != null && (node.tag.model & Dict.CM_INLINE) != 0)
1658 {
1659 lexer.report.warning(lexer, list, node, Report.DISCARDING_UNEXPECTED);
1660 lexer.popInline(node);
1661 continue;
1662 }
1663
1664 for (parent = list.parent; parent != null; parent = parent.parent)
1665 {
1666 if (node.tag == parent.tag)
1667 {
1668 lexer.report.warning(lexer, list, node, Report.MISSING_ENDTAG_BEFORE);
1669 lexer.ungetToken();
1670
1671 if ((list.tag.model & Dict.CM_OBSOLETE) != 0)
1672 {
1673 Node.coerceNode(lexer, list, tt.tagUl);
1674 }
1675
1676 Node.trimEmptyElement(lexer, list);
1677 return;
1678 }
1679 }
1680
1681 lexer.report.warning(lexer, list, node, Report.DISCARDING_UNEXPECTED);
1682 continue;
1683 }
1684
1685 if (node.tag != tt.tagLi)
1686 {
1687 lexer.ungetToken();
1688
1689 if (node.tag != null && (node.tag.model & Dict.CM_BLOCK) != 0 && lexer.excludeBlocks)
1690 {
1691 lexer.report.warning(lexer, list, node, Report.MISSING_ENDTAG_BEFORE);
1692 Node.trimEmptyElement(lexer, list);
1693 return;
1694 }
1695
1696 node = lexer.inferredTag("li");
1697 node.addAttribute("style", "list-style: none");
1698 lexer.report.warning(lexer, list, node, Report.MISSING_STARTTAG);
1699 }
1700
1701
1702 list.insertNodeAtEnd(node);
1703 parseTag(lexer, node, Lexer.IGNORE_WHITESPACE);
1704 }
1705
1706 if ((list.tag.model & Dict.CM_OBSOLETE) != 0)
1707 {
1708 Node.coerceNode(lexer, list, tt.tagUl);
1709 }
1710
1711 lexer.report.warning(lexer, list, node, Report.MISSING_ENDTAG_FOR);
1712 Node.trimEmptyElement(lexer, list);
1713 }
1714
1715 }
1716
1717 /**
1718 * Parser for empty elements.
1719 */
1720 public static class ParseEmpty implements Parser
1721 {
1722
1723 /**
1724 * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
1725 */
1726 public void parse(Lexer lexer, Node element, short mode)
1727 {
1728 if (lexer.isvoyager)
1729 {
1730 Node node = lexer.getToken(mode);
1731 if (node != null && !(node.type == Node.END_TAG && node.tag == element.tag))
1732 {
1733 lexer.report.warning(lexer, element, node, Report.ELEMENT_NOT_EMPTY);
1734 lexer.ungetToken();
1735 }
1736 }
1737 }
1738 }
1739
1740 /**
1741 * Parser for DEFLIST.
1742 */
1743 public static class ParseDefList implements Parser
1744 {
1745
1746 /**
1747 * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
1748 */
1749 public void parse(Lexer lexer, Node list, short mode)
1750 {
1751 Node node, parent;
1752 TagTable tt = lexer.configuration.tt;
1753
1754 if ((list.tag.model & Dict.CM_EMPTY) != 0)
1755 {
1756 return;
1757 }
1758
1759 lexer.insert = -1;
1760
1761 while ((node = lexer.getToken(Lexer.IGNORE_WHITESPACE)) != null)
1762 {
1763 if (node.tag == list.tag && node.type == Node.END_TAG)
1764 {
1765 list.closed = true;
1766 Node.trimEmptyElement(lexer, list);
1767 return;
1768 }
1769
1770
1771 if (Node.insertMisc(list, node))
1772 {
1773 continue;
1774 }
1775
1776 if (node.type == Node.TEXT_NODE)
1777 {
1778 lexer.ungetToken();
1779 node = lexer.inferredTag("dt");
1780 lexer.report.warning(lexer, list, node, Report.MISSING_STARTTAG);
1781 }
1782
1783 if (node.tag == null)
1784 {
1785 lexer.report.warning(lexer, list, node, Report.DISCARDING_UNEXPECTED);
1786 continue;
1787 }
1788
1789
1790
1791 if (node.type == Node.END_TAG)
1792 {
1793 if (node.tag == tt.tagForm)
1794 {
1795 badForm(lexer);
1796 lexer.report.warning(lexer, list, node, Report.DISCARDING_UNEXPECTED);
1797 continue;
1798 }
1799
1800 for (parent = list.parent; parent != null; parent = parent.parent)
1801 {
1802 if (node.tag == parent.tag)
1803 {
1804 lexer.report.warning(lexer, list, node, Report.MISSING_ENDTAG_BEFORE);
1805
1806 lexer.ungetToken();
1807 Node.trimEmptyElement(lexer, list);
1808 return;
1809 }
1810 }
1811 }
1812
1813
1814 if (node.tag == tt.tagCenter)
1815 {
1816 if (list.content != null)
1817 {
1818 list.insertNodeAfterElement(node);
1819 }
1820 else
1821 {
1822
1823 Node.insertNodeBeforeElement(list, node);
1824
1825
1826 Node.discardElement(list);
1827 }
1828
1829
1830 parseTag(lexer, node, mode);
1831
1832
1833 list = lexer.inferredTag("dl");
1834 node.insertNodeAfterElement(list);
1835 continue;
1836 }
1837
1838 if (!(node.tag == tt.tagDt || node.tag == tt.tagDd))
1839 {
1840 lexer.ungetToken();
1841
1842 if (!((node.tag.model & (Dict.CM_BLOCK | Dict.CM_INLINE)) != 0))
1843 {
1844 lexer.report.warning(lexer, list, node, Report.TAG_NOT_ALLOWED_IN);
1845 Node.trimEmptyElement(lexer, list);
1846 return;
1847 }
1848
1849
1850 if (!((node.tag.model & Dict.CM_INLINE) != 0) && lexer.excludeBlocks)
1851 {
1852 Node.trimEmptyElement(lexer, list);
1853 return;
1854 }
1855
1856 node = lexer.inferredTag("dd");
1857 lexer.report.warning(lexer, list, node, Report.MISSING_STARTTAG);
1858 }
1859
1860 if (node.type == Node.END_TAG)
1861 {
1862 lexer.report.warning(lexer, list, node, Report.DISCARDING_UNEXPECTED);
1863 continue;
1864 }
1865
1866
1867 list.insertNodeAtEnd(node);
1868 parseTag(lexer, node, Lexer.IGNORE_WHITESPACE);
1869 }
1870
1871 lexer.report.warning(lexer, list, node, Report.MISSING_ENDTAG_FOR);
1872 Node.trimEmptyElement(lexer, list);
1873 }
1874
1875 }
1876
1877 /**
1878 * Parser for PRE.
1879 */
1880 public static class ParsePre implements Parser
1881 {
1882
1883 /**
1884 * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
1885 */
1886 public void parse(Lexer lexer, Node pre, short mode)
1887 {
1888 Node node;
1889 TagTable tt = lexer.configuration.tt;
1890
1891 if ((pre.tag.model & Dict.CM_EMPTY) != 0)
1892 {
1893 return;
1894 }
1895
1896 if ((pre.tag.model & Dict.CM_OBSOLETE) != 0)
1897 {
1898 Node.coerceNode(lexer, pre, tt.tagPre);
1899 }
1900
1901 lexer.inlineDup(null);
1902
1903 while ((node = lexer.getToken(Lexer.PREFORMATTED)) != null)
1904 {
1905 if (node.tag == pre.tag && node.type == Node.END_TAG)
1906 {
1907 Node.trimSpaces(lexer, pre);
1908 pre.closed = true;
1909 Node.trimEmptyElement(lexer, pre);
1910 return;
1911 }
1912
1913 if (node.tag == tt.tagHtml)
1914 {
1915 if (node.type == Node.START_TAG || node.type == Node.START_END_TAG)
1916 {
1917 lexer.report.warning(lexer, pre, node, Report.DISCARDING_UNEXPECTED);
1918 }
1919
1920 continue;
1921 }
1922
1923 if (node.type == Node.TEXT_NODE)
1924 {
1925
1926 if (pre.content == null)
1927 {
1928 if (node.textarray[node.start] == (byte) '\n')
1929 {
1930 ++node.start;
1931 }
1932
1933 if (node.start >= node.end)
1934 {
1935 continue;
1936 }
1937 }
1938
1939 pre.insertNodeAtEnd(node);
1940 continue;
1941 }
1942
1943
1944 if (Node.insertMisc(pre, node))
1945 {
1946 continue;
1947 }
1948
1949
1950 if (!lexer.preContent(node))
1951 {
1952 Node newnode;
1953
1954 lexer.report.warning(lexer, pre, node, Report.UNESCAPED_ELEMENT);
1955 newnode = Node.escapeTag(lexer, node);
1956 pre.insertNodeAtEnd(newnode);
1957 continue;
1958 }
1959
1960 if (node.tag == tt.tagP)
1961 {
1962 if (node.type == Node.START_TAG)
1963 {
1964 lexer.report.warning(lexer, pre, node, Report.USING_BR_INPLACE_OF);
1965
1966
1967 Node.trimSpaces(lexer, pre);
1968
1969
1970 Node.coerceNode(lexer, node, tt.tagBr);
1971 pre.insertNodeAtEnd(node);
1972 }
1973 else
1974 {
1975 lexer.report.warning(lexer, pre, node, Report.DISCARDING_UNEXPECTED);
1976 }
1977 continue;
1978 }
1979
1980 if (node.type == Node.START_TAG || node.type == Node.START_END_TAG)
1981 {
1982
1983 if (node.tag == tt.tagBr)
1984 {
1985 Node.trimSpaces(lexer, pre);
1986 }
1987
1988 pre.insertNodeAtEnd(node);
1989 parseTag(lexer, node, Lexer.PREFORMATTED);
1990 continue;
1991 }
1992
1993
1994 lexer.report.warning(lexer, pre, node, Report.DISCARDING_UNEXPECTED);
1995 }
1996
1997 lexer.report.warning(lexer, pre, node, Report.MISSING_ENDTAG_FOR);
1998 Node.trimEmptyElement(lexer, pre);
1999 }
2000
2001 }
2002
2003 /**
2004 * Parser for block elements.
2005 */
2006 public static class ParseBlock implements Parser
2007 {
2008
2009 /**
2010 * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
2011 */
2012 public void parse(Lexer lexer, Node element, short mode)
2013 {
2014
2015
2016 Node node, parent;
2017 boolean checkstack;
2018 int istackbase = 0;
2019 TagTable tt = lexer.configuration.tt;
2020
2021 checkstack = true;
2022
2023 if ((element.tag.model & Dict.CM_EMPTY) != 0)
2024 {
2025 return;
2026 }
2027
2028 if (element.tag == tt.tagForm && element.isDescendantOf(tt.tagForm))
2029 {
2030 lexer.report.warning(lexer, element, null, Report.ILLEGAL_NESTING);
2031 }
2032
2033
2034
2035
2036
2037
2038 if ((element.tag.model & Dict.CM_OBJECT) != 0)
2039 {
2040 istackbase = lexer.istackbase;
2041 lexer.istackbase = lexer.istack.size();
2042 }
2043
2044 if (!((element.tag.model & Dict.CM_MIXED) != 0))
2045 {
2046 lexer.inlineDup(null);
2047 }
2048
2049 mode = Lexer.IGNORE_WHITESPACE;
2050
2051 while ((node = lexer.getToken(mode)) != null)
2052 {
2053
2054 if (node.type == Node.END_TAG
2055 && node.tag != null
2056 && (node.tag == element.tag || element.was == node.tag))
2057 {
2058
2059 if ((element.tag.model & Dict.CM_OBJECT) != 0)
2060 {
2061
2062 while (lexer.istack.size() > lexer.istackbase)
2063 {
2064 lexer.popInline(null);
2065 }
2066 lexer.istackbase = istackbase;
2067 }
2068
2069 element.closed = true;
2070 Node.trimSpaces(lexer, element);
2071 Node.trimEmptyElement(lexer, element);
2072 return;
2073 }
2074
2075 if (node.tag == tt.tagHtml || node.tag == tt.tagHead || node.tag == tt.tagBody)
2076 {
2077 if (node.type == Node.START_TAG || node.type == Node.START_END_TAG)
2078 {
2079 lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
2080 }
2081
2082 continue;
2083 }
2084
2085 if (node.type == Node.END_TAG)
2086 {
2087 if (node.tag == null)
2088 {
2089 lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
2090
2091 continue;
2092 }
2093 else if (node.tag == tt.tagBr)
2094 {
2095 node.type = Node.START_TAG;
2096 }
2097 else if (node.tag == tt.tagP)
2098 {
2099 Node.coerceNode(lexer, node, tt.tagBr);
2100 element.insertNodeAtEnd(node);
2101 node = lexer.inferredTag("br");
2102 }
2103 else
2104 {
2105
2106
2107 for (parent = element.parent; parent != null; parent = parent.parent)
2108 {
2109 if (node.tag == parent.tag)
2110 {
2111 if (!((element.tag.model & Dict.CM_OPT) != 0))
2112 {
2113 lexer.report.warning(lexer, element, node, Report.MISSING_ENDTAG_BEFORE);
2114 }
2115
2116 lexer.ungetToken();
2117
2118 if ((element.tag.model & Dict.CM_OBJECT) != 0)
2119 {
2120
2121 while (lexer.istack.size() > lexer.istackbase)
2122 {
2123 lexer.popInline(null);
2124 }
2125 lexer.istackbase = istackbase;
2126 }
2127
2128 Node.trimSpaces(lexer, element);
2129 Node.trimEmptyElement(lexer, element);
2130 return;
2131 }
2132 }
2133
2134 if (lexer.exiled && node.tag.model != 0 && (node.tag.model & Dict.CM_TABLE) != 0)
2135 {
2136 lexer.ungetToken();
2137 Node.trimSpaces(lexer, element);
2138 Node.trimEmptyElement(lexer, element);
2139 return;
2140 }
2141 }
2142 }
2143
2144
2145 if (node.type == Node.TEXT_NODE)
2146 {
2147 boolean iswhitenode = false;
2148
2149 if (node.type == Node.TEXT_NODE
2150 && node.end <= node.start + 1
2151 && lexer.lexbuf[node.start] == (byte) ' ')
2152 {
2153 iswhitenode = true;
2154 }
2155
2156 if (lexer.configuration.encloseBlockText && !iswhitenode)
2157 {
2158 lexer.ungetToken();
2159 node = lexer.inferredTag("p");
2160 element.insertNodeAtEnd(node);
2161 parseTag(lexer, node, Lexer.MIXED_CONTENT);
2162 continue;
2163 }
2164
2165 if (checkstack)
2166 {
2167 checkstack = false;
2168
2169 if (!((element.tag.model & Dict.CM_MIXED) != 0))
2170 {
2171 if (lexer.inlineDup(node) > 0)
2172 {
2173 continue;
2174 }
2175 }
2176 }
2177
2178 element.insertNodeAtEnd(node);
2179 mode = Lexer.MIXED_CONTENT;
2180
2181
2182
2183 if (element.tag == tt.tagBody
2184 || element.tag == tt.tagMap
2185 || element.tag == tt.tagBlockquote
2186 || element.tag == tt.tagForm
2187 || element.tag == tt.tagNoscript)
2188 {
2189 lexer.constrainVersion(~Dict.VERS_HTML40_STRICT);
2190 }
2191 continue;
2192 }
2193
2194 if (Node.insertMisc(element, node))
2195 {
2196 continue;
2197 }
2198
2199
2200 if (node.tag == tt.tagParam)
2201 {
2202 if (((element.tag.model & Dict.CM_PARAM) != 0)
2203 && (node.type == Node.START_TAG || node.type == Node.START_END_TAG))
2204 {
2205 element.insertNodeAtEnd(node);
2206 continue;
2207 }
2208
2209
2210 lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
2211 continue;
2212 }
2213
2214
2215 if (node.tag == tt.tagArea)
2216 {
2217 if ((element.tag == tt.tagMap) && (node.type == Node.START_TAG || node.type == Node.START_END_TAG))
2218 {
2219 element.insertNodeAtEnd(node);
2220 continue;
2221 }
2222
2223
2224 lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
2225 continue;
2226 }
2227
2228
2229 if (node.tag == null)
2230 {
2231 lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
2232 continue;
2233 }
2234
2235
2236
2237
2238 if (!((node.tag.model & Dict.CM_INLINE) != 0))
2239 {
2240 if (node.type != Node.START_TAG && node.type != Node.START_END_TAG)
2241 {
2242 if (node.tag == tt.tagForm)
2243 {
2244 badForm(lexer);
2245 }
2246 lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
2247 continue;
2248 }
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260 if (element.tag == tt.tagLi)
2261 {
2262 if (node.tag == tt.tagFrame
2263 || node.tag == tt.tagFrameset
2264 || node.tag == tt.tagOptgroup
2265 || node.tag == tt.tagOption)
2266 {
2267 lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
2268 continue;
2269 }
2270 }
2271
2272 if (element.tag == tt.tagTd || element.tag == tt.tagTh)
2273 {
2274
2275
2276 if ((node.tag.model & Dict.CM_HEAD) != 0)
2277 {
2278 moveToHead(lexer, element, node);
2279 continue;
2280 }
2281
2282 if ((node.tag.model & Dict.CM_LIST) != 0)
2283 {
2284 lexer.ungetToken();
2285 node = lexer.inferredTag("ul");
2286 node.addClass("noindent");
2287 lexer.excludeBlocks = true;
2288 }
2289 else if ((node.tag.model & Dict.CM_DEFLIST) != 0)
2290 {
2291 lexer.ungetToken();
2292 node = lexer.inferredTag("dl");
2293 lexer.excludeBlocks = true;
2294 }
2295
2296
2297 if (!((node.tag.model & Dict.CM_BLOCK) != 0))
2298 {
2299 lexer.ungetToken();
2300 Node.trimSpaces(lexer, element);
2301 Node.trimEmptyElement(lexer, element);
2302 return;
2303 }
2304 }
2305 else if ((node.tag.model & Dict.CM_BLOCK) != 0)
2306 {
2307 if (lexer.excludeBlocks)
2308 {
2309 if (!((element.tag.model & Dict.CM_OPT) != 0))
2310 {
2311 lexer.report.warning(lexer, element, node, Report.MISSING_ENDTAG_BEFORE);
2312 }
2313
2314 lexer.ungetToken();
2315
2316 if ((element.tag.model & Dict.CM_OBJECT) != 0)
2317 {
2318 lexer.istackbase = istackbase;
2319 }
2320
2321 Node.trimSpaces(lexer, element);
2322 Node.trimEmptyElement(lexer, element);
2323 return;
2324 }
2325 }
2326 else
2327 {
2328
2329
2330 if ((node.tag.model & Dict.CM_HEAD) != 0)
2331 {
2332 moveToHead(lexer, element, node);
2333 continue;
2334 }
2335
2336
2337 if (element.tag == tt.tagForm && element.parent.tag == tt.tagTd && element.parent.implicit)
2338 {
2339 if (node.tag == tt.tagTd)
2340 {
2341 lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
2342 continue;
2343 }
2344
2345 if (node.tag == tt.tagTh)
2346 {
2347 lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
2348 node = element.parent;
2349 node.element = "th";
2350 node.tag = tt.tagTh;
2351 continue;
2352 }
2353 }
2354
2355 if (!((element.tag.model & Dict.CM_OPT) != 0) && !element.implicit)
2356 {
2357 lexer.report.warning(lexer, element, node, Report.MISSING_ENDTAG_BEFORE);
2358 }
2359
2360 lexer.ungetToken();
2361
2362 if ((node.tag.model & Dict.CM_LIST) != 0)
2363 {
2364 if (element.parent != null
2365 && element.parent.tag != null
2366 && element.parent.tag.getParser() == LIST)
2367 {
2368 Node.trimSpaces(lexer, element);
2369 Node.trimEmptyElement(lexer, element);
2370 return;
2371 }
2372
2373 node = lexer.inferredTag("ul");
2374 node.addClass("noindent");
2375 }
2376 else if ((node.tag.model & Dict.CM_DEFLIST) != 0)
2377 {
2378 if (element.parent.tag == tt.tagDl)
2379 {
2380 Node.trimSpaces(lexer, element);
2381 Node.trimEmptyElement(lexer, element);
2382 return;
2383 }
2384
2385 node = lexer.inferredTag("dl");
2386 }
2387 else if ((node.tag.model & Dict.CM_TABLE) != 0 || (node.tag.model & Dict.CM_ROW) != 0)
2388 {
2389 node = lexer.inferredTag("table");
2390 }
2391 else if ((element.tag.model & Dict.CM_OBJECT) != 0)
2392 {
2393
2394 while (lexer.istack.size() > lexer.istackbase)
2395 {
2396 lexer.popInline(null);
2397 }
2398 lexer.istackbase = istackbase;
2399 Node.trimSpaces(lexer, element);
2400 Node.trimEmptyElement(lexer, element);
2401 return;
2402
2403 }
2404 else
2405 {
2406 Node.trimSpaces(lexer, element);
2407 Node.trimEmptyElement(lexer, element);
2408 return;
2409 }
2410 }
2411 }
2412
2413
2414 if (node.type == Node.START_TAG || node.type == Node.START_END_TAG)
2415 {
2416 if (TidyUtils.toBoolean(node.tag.model & Dict.CM_INLINE))
2417 {
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429 if (checkstack && !node.implicit)
2430 {
2431 checkstack = false;
2432
2433
2434 if (!TidyUtils.toBoolean(element.tag.model & Dict.CM_MIXED))
2435 {
2436 if (lexer.inlineDup(node) > 0)
2437 {
2438 continue;
2439 }
2440 }
2441 }
2442
2443 mode = Lexer.MIXED_CONTENT;
2444 }
2445 else
2446 {
2447 checkstack = true;
2448 mode = Lexer.IGNORE_WHITESPACE;
2449 }
2450
2451
2452 if (node.tag == tt.tagBr)
2453 {
2454 Node.trimSpaces(lexer, element);
2455 }
2456
2457 element.insertNodeAtEnd(node);
2458
2459 if (node.implicit)
2460 {
2461 lexer.report.warning(lexer, element, node, Report.INSERTING_TAG);
2462 }
2463
2464 parseTag(lexer, node, Lexer.IGNORE_WHITESPACE
2465 );
2466 continue;
2467 }
2468
2469
2470 if (node.type == Node.END_TAG)
2471 {
2472 lexer.popInline(node);
2473 }
2474
2475 lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
2476 continue;
2477 }
2478
2479 if (!((element.tag.model & Dict.CM_OPT) != 0))
2480 {
2481 lexer.report.warning(lexer, element, node, Report.MISSING_ENDTAG_FOR);
2482 }
2483
2484 if ((element.tag.model & Dict.CM_OBJECT) != 0)
2485 {
2486
2487 while (lexer.istack.size() > lexer.istackbase)
2488 {
2489 lexer.popInline(null);
2490 }
2491 lexer.istackbase = istackbase;
2492 }
2493
2494 Node.trimSpaces(lexer, element);
2495 Node.trimEmptyElement(lexer, element);
2496 }
2497
2498 }
2499
2500 /**
2501 * Parser for TABLE.
2502 */
2503 public static class ParseTableTag implements Parser
2504 {
2505
2506 /**
2507 * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
2508 */
2509 public void parse(Lexer lexer, Node table, short mode)
2510 {
2511 Node node, parent;
2512 int istackbase;
2513 TagTable tt = lexer.configuration.tt;
2514
2515 lexer.deferDup();
2516 istackbase = lexer.istackbase;
2517 lexer.istackbase = lexer.istack.size();
2518
2519 while ((node = lexer.getToken(Lexer.IGNORE_WHITESPACE)) != null)
2520 {
2521 if (node.tag == table.tag && node.type == Node.END_TAG)
2522 {
2523 lexer.istackbase = istackbase;
2524 table.closed = true;
2525 Node.trimEmptyElement(lexer, table);
2526 return;
2527 }
2528
2529
2530 if (Node.insertMisc(table, node))
2531 {
2532 continue;
2533 }
2534
2535
2536 if (node.tag == null && node.type != Node.TEXT_NODE)
2537 {
2538 lexer.report.warning(lexer, table, node, Report.DISCARDING_UNEXPECTED);
2539 continue;
2540 }
2541
2542
2543
2544 if (node.type != Node.END_TAG)
2545 {
2546 if (node.tag == tt.tagTd || node.tag == tt.tagTh || node.tag == tt.tagTable)
2547 {
2548 lexer.ungetToken();
2549 node = lexer.inferredTag("tr");
2550 lexer.report.warning(lexer, table, node, Report.MISSING_STARTTAG);
2551 }
2552 else if (node.type == Node.TEXT_NODE || (node.tag.model & (Dict.CM_BLOCK | Dict.CM_INLINE)) != 0)
2553 {
2554 Node.insertNodeBeforeElement(table, node);
2555 lexer.report.warning(lexer, table, node, Report.TAG_NOT_ALLOWED_IN);
2556 lexer.exiled = true;
2557
2558 if (!(node.type == Node.TEXT_NODE))
2559 {
2560 parseTag(lexer, node, Lexer.IGNORE_WHITESPACE);
2561 }
2562
2563 lexer.exiled = false;
2564 continue;
2565 }
2566 else if ((node.tag.model & Dict.CM_HEAD) != 0)
2567 {
2568 moveToHead(lexer, table, node);
2569 continue;
2570 }
2571 }
2572
2573
2574
2575 if (node.type == Node.END_TAG)
2576 {
2577 if (node.tag == tt.tagForm
2578 || (node.tag != null && ((node.tag.model & (Dict.CM_BLOCK | Dict.CM_INLINE)) != 0)))
2579 {
2580 badForm(lexer);
2581 lexer.report.warning(lexer, table, node, Report.DISCARDING_UNEXPECTED);
2582 continue;
2583 }
2584
2585 if ((node.tag != null && (node.tag.model & (Dict.CM_TABLE | Dict.CM_ROW)) != 0)
2586 || (node.tag != null && (node.tag.model & (Dict.CM_BLOCK | Dict.CM_INLINE)) != 0))
2587 {
2588 lexer.report.warning(lexer, table, node, Report.DISCARDING_UNEXPECTED);
2589 continue;
2590 }
2591
2592 for (parent = table.parent; parent != null; parent = parent.parent)
2593 {
2594 if (node.tag == parent.tag)
2595 {
2596 lexer.report.warning(lexer, table, node, Report.MISSING_ENDTAG_BEFORE);
2597 lexer.ungetToken();
2598 lexer.istackbase = istackbase;
2599 Node.trimEmptyElement(lexer, table);
2600 return;
2601 }
2602 }
2603 }
2604
2605 if (!((node.tag.model & Dict.CM_TABLE) != 0))
2606 {
2607 lexer.ungetToken();
2608 lexer.report.warning(lexer, table, node, Report.TAG_NOT_ALLOWED_IN);
2609 lexer.istackbase = istackbase;
2610 Node.trimEmptyElement(lexer, table);
2611 return;
2612 }
2613
2614 if (node.type == Node.START_TAG || node.type == Node.START_END_TAG)
2615 {
2616 table.insertNodeAtEnd(node);
2617
2618 parseTag(lexer, node, Lexer.IGNORE_WHITESPACE);
2619 continue;
2620 }
2621
2622
2623 lexer.report.warning(lexer, table, node, Report.DISCARDING_UNEXPECTED);
2624 }
2625
2626 lexer.report.warning(lexer, table, node, Report.MISSING_ENDTAG_FOR);
2627 Node.trimEmptyElement(lexer, table);
2628 lexer.istackbase = istackbase;
2629 }
2630
2631 }
2632
2633 /**
2634 * Parser for COLGROUP.
2635 */
2636 public static class ParseColGroup implements Parser
2637 {
2638
2639 /**
2640 * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
2641 */
2642 public void parse(Lexer lexer, Node colgroup, short mode)
2643 {
2644 Node node, parent;
2645 TagTable tt = lexer.configuration.tt;
2646
2647 if ((colgroup.tag.model & Dict.CM_EMPTY) != 0)
2648 {
2649 return;
2650 }
2651
2652 while ((node = lexer.getToken(Lexer.IGNORE_WHITESPACE)) != null)
2653 {
2654 if (node.tag == colgroup.tag && node.type == Node.END_TAG)
2655 {
2656 colgroup.closed = true;
2657 return;
2658 }
2659
2660
2661
2662 if (node.type == Node.END_TAG)
2663 {
2664 if (node.tag == tt.tagForm)
2665 {
2666 badForm(lexer);
2667 lexer.report.warning(lexer, colgroup, node, Report.DISCARDING_UNEXPECTED);
2668 continue;
2669 }
2670
2671 for (parent = colgroup.parent; parent != null; parent = parent.parent)
2672 {
2673
2674 if (node.tag == parent.tag)
2675 {
2676 lexer.ungetToken();
2677 return;
2678 }
2679 }
2680 }
2681
2682 if (node.type == Node.TEXT_NODE)
2683 {
2684 lexer.ungetToken();
2685 return;
2686 }
2687
2688
2689 if (Node.insertMisc(colgroup, node))
2690 {
2691 continue;
2692 }
2693
2694
2695 if (node.tag == null)
2696 {
2697 lexer.report.warning(lexer, colgroup, node, Report.DISCARDING_UNEXPECTED);
2698 continue;
2699 }
2700
2701 if (node.tag != tt.tagCol)
2702 {
2703 lexer.ungetToken();
2704 return;
2705 }
2706
2707 if (node.type == Node.END_TAG)
2708 {
2709 lexer.report.warning(lexer, colgroup, node, Report.DISCARDING_UNEXPECTED);
2710 continue;
2711 }
2712
2713
2714 colgroup.insertNodeAtEnd(node);
2715 parseTag(lexer, node, Lexer.IGNORE_WHITESPACE);
2716 }
2717 }
2718
2719 }
2720
2721 /**
2722 * Parser for ROWGROUP.
2723 */
2724 public static class ParseRowGroup implements Parser
2725 {
2726
2727 /**
2728 * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
2729 */
2730 public void parse(Lexer lexer, Node rowgroup, short mode)
2731 {
2732 Node node, parent;
2733 TagTable tt = lexer.configuration.tt;
2734
2735 if ((rowgroup.tag.model & Dict.CM_EMPTY) != 0)
2736 {
2737 return;
2738 }
2739
2740 while ((node = lexer.getToken(Lexer.IGNORE_WHITESPACE)) != null)
2741 {
2742 if (node.tag == rowgroup.tag)
2743 {
2744 if (node.type == Node.END_TAG)
2745 {
2746 rowgroup.closed = true;
2747 Node.trimEmptyElement(lexer, rowgroup);
2748 return;
2749 }
2750
2751 lexer.ungetToken();
2752 return;
2753 }
2754
2755
2756 if (node.tag == tt.tagTable && node.type == Node.END_TAG)
2757 {
2758 lexer.ungetToken();
2759 Node.trimEmptyElement(lexer, rowgroup);
2760 return;
2761 }
2762
2763
2764 if (Node.insertMisc(rowgroup, node))
2765 {
2766 continue;
2767 }
2768
2769
2770 if (node.tag == null && node.type != Node.TEXT_NODE)
2771 {
2772 lexer.report.warning(lexer, rowgroup, node, Report.DISCARDING_UNEXPECTED);
2773 continue;
2774 }
2775
2776
2777
2778
2779 if (node.type != Node.END_TAG)
2780 {
2781 if (node.tag == tt.tagTd || node.tag == tt.tagTh)
2782 {
2783 lexer.ungetToken();
2784 node = lexer.inferredTag("tr");
2785 lexer.report.warning(lexer, rowgroup, node, Report.MISSING_STARTTAG);
2786 }
2787 else if (node.type == Node.TEXT_NODE || (node.tag.model & (Dict.CM_BLOCK | Dict.CM_INLINE)) != 0)
2788 {
2789 Node.moveBeforeTable(rowgroup, node, tt);
2790 lexer.report.warning(lexer, rowgroup, node, Report.TAG_NOT_ALLOWED_IN);
2791 lexer.exiled = true;
2792
2793
2794 if (node.type != Node.TEXT_NODE)
2795 {
2796 parseTag(lexer, node, Lexer.IGNORE_WHITESPACE);
2797 }
2798
2799 lexer.exiled = false;
2800 continue;
2801 }
2802 else if ((node.tag.model & Dict.CM_HEAD) != 0)
2803 {
2804 lexer.report.warning(lexer, rowgroup, node, Report.TAG_NOT_ALLOWED_IN);
2805 moveToHead(lexer, rowgroup, node);
2806 continue;
2807 }
2808 }
2809
2810
2811
2812 if (node.type == Node.END_TAG)
2813 {
2814
2815 if (node.tag == tt.tagForm
2816 || (node.tag != null && (node.tag.model & (Dict.CM_BLOCK | Dict.CM_INLINE)) != 0))
2817 {
2818 if (node.tag == tt.tagForm)
2819 {
2820 badForm(lexer);
2821 }
2822 lexer.report.warning(lexer, rowgroup, node, Report.DISCARDING_UNEXPECTED);
2823 continue;
2824 }
2825
2826 if (node.tag == tt.tagTr || node.tag == tt.tagTd || node.tag == tt.tagTh)
2827 {
2828 lexer.report.warning(lexer, rowgroup, node, Report.DISCARDING_UNEXPECTED);
2829 continue;
2830 }
2831
2832 for (parent = rowgroup.parent; parent != null; parent = parent.parent)
2833 {
2834 if (node.tag == parent.tag)
2835 {
2836 lexer.ungetToken();
2837 Node.trimEmptyElement(lexer, rowgroup);
2838 return;
2839 }
2840 }
2841
2842 }
2843
2844
2845
2846 if ((node.tag.model & Dict.CM_ROWGRP) != 0)
2847 {
2848 if (node.type != Node.END_TAG)
2849 {
2850 lexer.ungetToken();
2851 }
2852
2853 Node.trimEmptyElement(lexer, rowgroup);
2854 return;
2855 }
2856
2857 if (node.type == Node.END_TAG)
2858 {
2859 lexer.report.warning(lexer, rowgroup, node, Report.DISCARDING_UNEXPECTED);
2860 continue;
2861 }
2862
2863 if (!(node.tag == tt.tagTr))
2864 {
2865 node = lexer.inferredTag("tr");
2866 lexer.report.warning(lexer, rowgroup, node, Report.MISSING_STARTTAG);
2867 lexer.ungetToken();
2868 }
2869
2870
2871 rowgroup.insertNodeAtEnd(node);
2872 parseTag(lexer, node, Lexer.IGNORE_WHITESPACE);
2873 }
2874 Node.trimEmptyElement(lexer, rowgroup);
2875 }
2876 }
2877
2878 /**
2879 * Parser for ROW.
2880 */
2881 public static class ParseRow implements Parser
2882 {
2883
2884 /**
2885 * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
2886 */
2887 public void parse(Lexer lexer, Node row, short mode)
2888 {
2889 Node node, parent;
2890 boolean excludeState;
2891 TagTable tt = lexer.configuration.tt;
2892
2893 if ((row.tag.model & Dict.CM_EMPTY) != 0)
2894 {
2895 return;
2896 }
2897
2898 while ((node = lexer.getToken(Lexer.IGNORE_WHITESPACE)) != null)
2899 {
2900 if (node.tag == row.tag)
2901 {
2902 if (node.type == Node.END_TAG)
2903 {
2904 row.closed = true;
2905 Node.fixEmptyRow(lexer, row);
2906 return;
2907 }
2908
2909 lexer.ungetToken();
2910 Node.fixEmptyRow(lexer, row);
2911 return;
2912 }
2913
2914
2915 if (node.type == Node.END_TAG)
2916 {
2917 if (node.tag == tt.tagForm
2918 || (node.tag != null && (node.tag.model & (Dict.CM_BLOCK | Dict.CM_INLINE)) != 0))
2919 {
2920 if (node.tag == tt.tagForm)
2921 {
2922 badForm(lexer);
2923 }
2924 lexer.report.warning(lexer, row, node, Report.DISCARDING_UNEXPECTED);
2925 continue;
2926 }
2927
2928 if (node.tag == tt.tagTd || node.tag == tt.tagTh)
2929 {
2930 lexer.report.warning(lexer, row, node, Report.DISCARDING_UNEXPECTED);
2931 continue;
2932 }
2933
2934 for (parent = row.parent; parent != null; parent = parent.parent)
2935 {
2936 if (node.tag == parent.tag)
2937 {
2938 lexer.ungetToken();
2939 Node.trimEmptyElement(lexer, row);
2940 return;
2941 }
2942 }
2943 }
2944
2945
2946 if (Node.insertMisc(row, node))
2947 {
2948 continue;
2949 }
2950
2951
2952 if (node.tag == null && node.type != Node.TEXT_NODE)
2953 {
2954 lexer.report.warning(lexer, row, node, Report.DISCARDING_UNEXPECTED);
2955 continue;
2956 }
2957
2958
2959 if (node.tag == tt.tagTable)
2960 {
2961 lexer.report.warning(lexer, row, node, Report.DISCARDING_UNEXPECTED);
2962 continue;
2963 }
2964
2965
2966 if (node.tag != null && (node.tag.model & Dict.CM_ROWGRP) != 0)
2967 {
2968 lexer.ungetToken();
2969 Node.trimEmptyElement(lexer, row);
2970 return;
2971 }
2972
2973 if (node.type == Node.END_TAG)
2974 {
2975 lexer.report.warning(lexer, row, node, Report.DISCARDING_UNEXPECTED);
2976 continue;
2977 }
2978
2979
2980
2981 if (node.type != Node.END_TAG)
2982 {
2983 if (node.tag == tt.tagForm)
2984 {
2985 lexer.ungetToken();
2986 node = lexer.inferredTag("td");
2987 lexer.report.warning(lexer, row, node, Report.MISSING_STARTTAG);
2988 }
2989 else if (node.type == Node.TEXT_NODE || (node.tag.model & (Dict.CM_BLOCK | Dict.CM_INLINE)) != 0)
2990 {
2991 Node.moveBeforeTable(row, node, tt);
2992 lexer.report.warning(lexer, row, node, Report.TAG_NOT_ALLOWED_IN);
2993 lexer.exiled = true;
2994
2995 if (node.type != Node.TEXT_NODE)
2996 {
2997 parseTag(lexer, node, Lexer.IGNORE_WHITESPACE);
2998 }
2999
3000 lexer.exiled = false;
3001 continue;
3002 }
3003 else if ((node.tag.model & Dict.CM_HEAD) != 0)
3004 {
3005 lexer.report.warning(lexer, row, node, Report.TAG_NOT_ALLOWED_IN);
3006 moveToHead(lexer, row, node);
3007 continue;
3008 }
3009 }
3010
3011 if (!(node.tag == tt.tagTd || node.tag == tt.tagTh))
3012 {
3013 lexer.report.warning(lexer, row, node, Report.TAG_NOT_ALLOWED_IN);
3014 continue;
3015 }
3016
3017
3018 row.insertNodeAtEnd(node);
3019 excludeState = lexer.excludeBlocks;
3020 lexer.excludeBlocks = false;
3021 parseTag(lexer, node, Lexer.IGNORE_WHITESPACE);
3022 lexer.excludeBlocks = excludeState;
3023
3024
3025
3026 while (lexer.istack.size() > lexer.istackbase)
3027 {
3028 lexer.popInline(null);
3029 }
3030 }
3031
3032 Node.trimEmptyElement(lexer, row);
3033 }
3034
3035 }
3036
3037 /**
3038 * Parser for NOFRAMES.
3039 */
3040 public static class ParseNoFrames implements Parser
3041 {
3042
3043 /**
3044 * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
3045 */
3046 public void parse(Lexer lexer, Node noframes, short mode)
3047 {
3048 Node node;
3049 TagTable tt = lexer.configuration.tt;
3050
3051 lexer.badAccess |= Report.USING_NOFRAMES;
3052 mode = Lexer.IGNORE_WHITESPACE;
3053
3054 while ((node = lexer.getToken(mode)) != null)
3055 {
3056 if (node.tag == noframes.tag && node.type == Node.END_TAG)
3057 {
3058 noframes.closed = true;
3059 Node.trimSpaces(lexer, noframes);
3060 return;
3061 }
3062
3063 if ((node.tag == tt.tagFrame || node.tag == tt.tagFrameset))
3064 {
3065
3066 Node.trimSpaces(lexer, noframes);
3067
3068
3069 if (node.type == Node.END_TAG)
3070 {
3071 lexer.report.warning(lexer, noframes, node, Report.DISCARDING_UNEXPECTED);
3072
3073 }
3074 else
3075 {
3076 lexer.report.warning(lexer, noframes, node, Report.MISSING_ENDTAG_BEFORE);
3077
3078 lexer.ungetToken();
3079 }
3080 return;
3081 }
3082
3083 if (node.tag == tt.tagHtml)
3084 {
3085 if (node.type == Node.START_TAG || node.type == Node.START_END_TAG)
3086 {
3087 lexer.report.warning(lexer, noframes, node, Report.DISCARDING_UNEXPECTED);
3088 }
3089
3090 continue;
3091 }
3092
3093
3094 if (Node.insertMisc(noframes, node))
3095 {
3096 continue;
3097 }
3098
3099 if (node.tag == tt.tagBody && node.type == Node.START_TAG)
3100 {
3101 boolean seenbody = lexer.seenEndBody;
3102 noframes.insertNodeAtEnd(node);
3103 parseTag(lexer, node, Lexer.IGNORE_WHITESPACE);
3104
3105 if (seenbody)
3106 {
3107 Node.coerceNode(lexer, node, tt.tagDiv);
3108 moveNodeToBody(lexer, node);
3109 }
3110 continue;
3111 }
3112
3113
3114 if (node.type == Node.TEXT_NODE || (node.tag != null && node.type != Node.END_TAG))
3115 {
3116 if (lexer.seenEndBody)
3117 {
3118 Node body = lexer.root.findBody(tt);
3119
3120 if (node.type == Node.TEXT_NODE)
3121 {
3122 lexer.ungetToken();
3123 node = lexer.inferredTag("p");
3124 lexer.report.warning(lexer, noframes, node, Report.CONTENT_AFTER_BODY);
3125 }
3126
3127 body.insertNodeAtEnd(node);
3128 }
3129 else
3130 {
3131 lexer.ungetToken();
3132 node = lexer.inferredTag("body");
3133 if (lexer.configuration.xmlOut)
3134 {
3135 lexer.report.warning(lexer, noframes, node, Report.INSERTING_TAG);
3136 }
3137 noframes.insertNodeAtEnd(node);
3138 }
3139 parseTag(lexer, node, Lexer.IGNORE_WHITESPACE);
3140
3141 continue;
3142 }
3143
3144 lexer.report.warning(lexer, noframes, node, Report.DISCARDING_UNEXPECTED);
3145 }
3146
3147 lexer.report.warning(lexer, noframes, node, Report.MISSING_ENDTAG_FOR);
3148 }
3149
3150 }
3151
3152 /**
3153 * Parser for SELECT.
3154 */
3155 public static class ParseSelect implements Parser
3156 {
3157
3158 /**
3159 * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
3160 */
3161 public void parse(Lexer lexer, Node field, short mode)
3162 {
3163 Node node;
3164 TagTable tt = lexer.configuration.tt;
3165
3166 lexer.insert = -1;
3167
3168 while ((node = lexer.getToken(Lexer.IGNORE_WHITESPACE)) != null)
3169 {
3170 if (node.tag == field.tag && node.type == Node.END_TAG)
3171 {
3172 field.closed = true;
3173 Node.trimSpaces(lexer, field);
3174 return;
3175 }
3176
3177
3178 if (Node.insertMisc(field, node))
3179 {
3180 continue;
3181 }
3182
3183 if (node.type == Node.START_TAG
3184 && (node.tag == tt.tagOption || node.tag == tt.tagOptgroup || node.tag == tt.tagScript))
3185 {
3186 field.insertNodeAtEnd(node);
3187 parseTag(lexer, node, Lexer.IGNORE_WHITESPACE);
3188 continue;
3189 }
3190
3191
3192 lexer.report.warning(lexer, field, node, Report.DISCARDING_UNEXPECTED);
3193 }
3194
3195 lexer.report.warning(lexer, field, node, Report.MISSING_ENDTAG_FOR);
3196 }
3197
3198 }
3199
3200 /**
3201 * Parser for text nodes.
3202 */
3203 public static class ParseText implements Parser
3204 {
3205
3206 /**
3207 * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
3208 */
3209 public void parse(Lexer lexer, Node field, short mode)
3210 {
3211 Node node;
3212 TagTable tt = lexer.configuration.tt;
3213
3214 lexer.insert = -1;
3215
3216 if (field.tag == tt.tagTextarea)
3217 {
3218 mode = Lexer.PREFORMATTED;
3219 }
3220 else
3221 {
3222 mode = Lexer.MIXED_CONTENT;
3223 }
3224
3225 while ((node = lexer.getToken(mode)) != null)
3226 {
3227 if (node.tag == field.tag && node.type == Node.END_TAG)
3228 {
3229 field.closed = true;
3230 Node.trimSpaces(lexer, field);
3231 return;
3232 }
3233
3234
3235 if (Node.insertMisc(field, node))
3236 {
3237 continue;
3238 }
3239
3240 if (node.type == Node.TEXT_NODE)
3241 {
3242
3243 if (field.content == null && !((mode & Lexer.PREFORMATTED) != 0))
3244 {
3245 Node.trimSpaces(lexer, field);
3246 }
3247
3248 if (node.start >= node.end)
3249 {
3250 continue;
3251 }
3252
3253 field.insertNodeAtEnd(node);
3254 continue;
3255 }
3256
3257
3258
3259 if (node.tag != null
3260 && ((node.tag.model & Dict.CM_INLINE) != 0)
3261 && (node.tag.model & Dict.CM_FIELD) == 0)
3262 {
3263 lexer.report.warning(lexer, field, node, Report.DISCARDING_UNEXPECTED);
3264 continue;
3265 }
3266
3267
3268 if (!((field.tag.model & Dict.CM_OPT) != 0))
3269 {
3270 lexer.report.warning(lexer, field, node, Report.MISSING_ENDTAG_BEFORE);
3271 }
3272
3273 lexer.ungetToken();
3274 Node.trimSpaces(lexer, field);
3275 return;
3276 }
3277
3278 if (!((field.tag.model & Dict.CM_OPT) != 0))
3279 {
3280 lexer.report.warning(lexer, field, node, Report.MISSING_ENDTAG_FOR);
3281 }
3282 }
3283
3284 }
3285
3286 /**
3287 * Parser for OPTGROUP.
3288 */
3289 public static class ParseOptGroup implements Parser
3290 {
3291
3292 /**
3293 * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
3294 */
3295 public void parse(Lexer lexer, Node field, short mode)
3296 {
3297 Node node;
3298 TagTable tt = lexer.configuration.tt;
3299
3300 lexer.insert = -1;
3301
3302 while ((node = lexer.getToken(Lexer.IGNORE_WHITESPACE)) != null)
3303 {
3304 if (node.tag == field.tag && node.type == Node.END_TAG)
3305 {
3306 field.closed = true;
3307 Node.trimSpaces(lexer, field);
3308 return;
3309 }
3310
3311
3312 if (Node.insertMisc(field, node))
3313 {
3314 continue;
3315 }
3316
3317 if (node.type == Node.START_TAG && (node.tag == tt.tagOption || node.tag == tt.tagOptgroup))
3318 {
3319 if (node.tag == tt.tagOptgroup)
3320 {
3321 lexer.report.warning(lexer, field, node, Report.CANT_BE_NESTED);
3322 }
3323
3324 field.insertNodeAtEnd(node);
3325 parseTag(lexer, node, Lexer.MIXED_CONTENT);
3326 continue;
3327 }
3328
3329
3330 lexer.report.warning(lexer, field, node, Report.DISCARDING_UNEXPECTED);
3331 }
3332 }
3333
3334 }
3335
3336 /**
3337 * HTML is the top level element.
3338 */
3339 public static Node parseDocument(Lexer lexer)
3340 {
3341 Node node, document, html;
3342 Node doctype = null;
3343 TagTable tt = lexer.configuration.tt;
3344
3345 document = lexer.newNode();
3346 document.type = Node.ROOT_NODE;
3347
3348 lexer.root = document;
3349
3350 while ((node = lexer.getToken(Lexer.IGNORE_WHITESPACE)) != null)
3351 {
3352
3353 if (Node.insertMisc(document, node))
3354 {
3355 continue;
3356 }
3357
3358 if (node.type == Node.DOCTYPE_TAG)
3359 {
3360 if (doctype == null)
3361 {
3362 document.insertNodeAtEnd(node);
3363 doctype = node;
3364 }
3365 else
3366 {
3367 lexer.report.warning(lexer, document, node, Report.DISCARDING_UNEXPECTED);
3368 }
3369 continue;
3370 }
3371
3372 if (node.type == Node.END_TAG)
3373 {
3374 lexer.report.warning(lexer, document, node, Report.DISCARDING_UNEXPECTED);
3375 continue;
3376 }
3377
3378 if (node.type != Node.START_TAG || node.tag != tt.tagHtml)
3379 {
3380 lexer.ungetToken();
3381 html = lexer.inferredTag("html");
3382 }
3383 else
3384 {
3385 html = node;
3386 }
3387
3388 if (document.findDocType() == null && !lexer.configuration.bodyOnly)
3389 {
3390 lexer.report.warning(lexer, null, null, Report.MISSING_DOCTYPE);
3391 }
3392
3393 document.insertNodeAtEnd(html);
3394 HTML.parse(lexer, html, (short) 0);
3395 break;
3396 }
3397
3398 return document;
3399 }
3400
3401 /**
3402 * Indicates whether or not whitespace should be preserved for this element. If an <code>xml:space</code>
3403 * attribute is found, then if the attribute value is <code>preserve</code>, returns <code>true</code>. For
3404 * any other value, returns <code>false</code>. If an <code>xml:space</code> attribute was <em>not</em>
3405 * found, then the following element names result in a return value of <code>true:
3406 * pre, script, style,</code> and
3407 * <code>xsl:text</code>. Finally, if a <code>TagTable</code> was passed in and the element appears as the
3408 * "pre" element in the <code>TagTable</code>, then <code>true</code> will be returned. Otherwise,
3409 * <code>false</code> is returned.
3410 * @param element The <code>Node</code> to test to see if whitespace should be preserved.
3411 * @param tt The <code>TagTable</code> to test for the <code>getNodePre()</code> function. This may be
3412 * <code>null</code>, in which case this test is bypassed.
3413 * @return <code>true</code> or <code>false</code>, as explained above.
3414 */
3415 public static boolean XMLPreserveWhiteSpace(Node element, TagTable tt)
3416 {
3417 AttVal attribute;
3418
3419
3420 for (attribute = element.attributes; attribute != null; attribute = attribute.next)
3421 {
3422 if (attribute.attribute.equals("xml:space"))
3423 {
3424 if (attribute.value.equals("preserve"))
3425 {
3426 return true;
3427 }
3428
3429 return false;
3430 }
3431 }
3432
3433 if (element.element == null)
3434 {
3435 return false;
3436 }
3437
3438
3439 if ("pre".equalsIgnoreCase(element.element)
3440 || "script".equalsIgnoreCase(element.element)
3441 || "style".equalsIgnoreCase(element.element))
3442 {
3443 return true;
3444 }
3445
3446 if ((tt != null) && (tt.findParser(element) == PRE))
3447 {
3448 return true;
3449 }
3450
3451
3452 if ("xsl:text".equalsIgnoreCase(element.element))
3453 {
3454 return true;
3455 }
3456
3457 return false;
3458 }
3459
3460 /**
3461 * XML documents.
3462 */
3463 public static void parseXMLElement(Lexer lexer, Node element, short mode)
3464 {
3465 Node node;
3466
3467
3468
3469 if (XMLPreserveWhiteSpace(element, lexer.configuration.tt))
3470 {
3471 mode = Lexer.PREFORMATTED;
3472 }
3473
3474 while ((node = lexer.getToken(mode)) != null)
3475 {
3476 if (node.type == Node.END_TAG && node.element.equals(element.element))
3477 {
3478 element.closed = true;
3479 break;
3480 }
3481
3482
3483 if (node.type == Node.END_TAG)
3484 {
3485 lexer.report.error(lexer, element, node, Report.UNEXPECTED_ENDTAG);
3486 continue;
3487 }
3488
3489
3490 if (node.type == Node.START_TAG)
3491 {
3492 parseXMLElement(lexer, node, mode);
3493 }
3494
3495 element.insertNodeAtEnd(node);
3496 }
3497
3498
3499
3500 node = element.content;
3501
3502 if (node != null && node.type == Node.TEXT_NODE && mode != Lexer.PREFORMATTED)
3503 {
3504 if (node.textarray[node.start] == (byte) ' ')
3505 {
3506 node.start++;
3507
3508 if (node.start >= node.end)
3509 {
3510 Node.discardElement(node);
3511 }
3512 }
3513 }
3514
3515
3516
3517 node = element.last;
3518
3519 if (node != null && node.type == Node.TEXT_NODE && mode != Lexer.PREFORMATTED)
3520 {
3521 if (node.textarray[node.end - 1] == (byte) ' ')
3522 {
3523 node.end--;
3524
3525 if (node.start >= node.end)
3526 {
3527 Node.discardElement(node);
3528 }
3529 }
3530 }
3531 }
3532
3533 public static Node parseXMLDocument(Lexer lexer)
3534 {
3535 Node node, document, doctype;
3536
3537 document = lexer.newNode();
3538 document.type = Node.ROOT_NODE;
3539 doctype = null;
3540 lexer.configuration.xmlTags = true;
3541
3542 while ((node = lexer.getToken(Lexer.IGNORE_WHITESPACE)) != null)
3543 {
3544
3545 if (node.type == Node.END_TAG)
3546 {
3547 lexer.report.warning(lexer, null, node, Report.UNEXPECTED_ENDTAG);
3548 continue;
3549 }
3550
3551
3552 if (Node.insertMisc(document, node))
3553 {
3554 continue;
3555 }
3556
3557 if (node.type == Node.DOCTYPE_TAG)
3558 {
3559 if (doctype == null)
3560 {
3561 document.insertNodeAtEnd(node);
3562 doctype = node;
3563 }
3564 else
3565 {
3566 lexer.report.warning(lexer, document, node, Report.DISCARDING_UNEXPECTED);
3567 }
3568 continue;
3569 }
3570
3571 if (node.type == Node.START_END_TAG)
3572 {
3573 document.insertNodeAtEnd(node);
3574 continue;
3575 }
3576
3577
3578 if (node.type == Node.START_TAG)
3579 {
3580 document.insertNodeAtEnd(node);
3581 parseXMLElement(lexer, node, Lexer.IGNORE_WHITESPACE);
3582 }
3583
3584 }
3585
3586 if (doctype != null && !lexer.checkDocTypeKeyWords(doctype))
3587 {
3588 lexer.report.warning(lexer, doctype, null, Report.DTYPE_NOT_UPPER_CASE);
3589 }
3590
3591
3592 if (lexer.configuration.xmlPi)
3593 {
3594 lexer.fixXmlDecl(document);
3595 }
3596
3597 return document;
3598 }
3599
3600 /**
3601 * errors in positioning of form start or end tags generally require human intervention to fix.
3602 */
3603 static void badForm(Lexer lexer)
3604 {
3605 lexer.badForm = 1;
3606 lexer.errors++;
3607 }
3608
3609 }