View Javadoc

1   /*
2    *  Java HTML Tidy - JTidy
3    *  HTML parser and pretty printer
4    *
5    *  Copyright (c) 1998-2000 World Wide Web Consortium (Massachusetts
6    *  Institute of Technology, Institut National de Recherche en
7    *  Informatique et en Automatique, Keio University). All Rights
8    *  Reserved.
9    *
10   *  Contributing Author(s):
11   *
12   *     Dave Raggett <dsr@w3.org>
13   *     Andy Quick <ac.quick@sympatico.ca> (translation to Java)
14   *     Gary L Peskin <garyp@firstech.com> (Java development)
15   *     Sami Lempinen <sami@lempinen.net> (release management)
16   *     Fabrizio Giustina <fgiust at users.sourceforge.net>
17   *
18   *  The contributing author(s) would like to thank all those who
19   *  helped with testing, bug fixes, and patience.  This wouldn't
20   *  have been possible without all of you.
21   *
22   *  COPYRIGHT NOTICE:
23   *
24   *  This software and documentation is provided "as is," and
25   *  the copyright holders and contributing author(s) make no
26   *  representations or warranties, express or implied, including
27   *  but not limited to, warranties of merchantability or fitness
28   *  for any particular purpose or that the use of the software or
29   *  documentation will not infringe any third party patents,
30   *  copyrights, trademarks or other rights.
31   *
32   *  The copyright holders and contributing author(s) will not be
33   *  liable for any direct, indirect, special or consequential damages
34   *  arising out of any use of the software or documentation, even if
35   *  advised of the possibility of such damage.
36   *
37   *  Permission is hereby granted to use, copy, modify, and distribute
38   *  this source code, or portions hereof, documentation and executables,
39   *  for any purpose, without fee, subject to the following restrictions:
40   *
41   *  1. The origin of this source code must not be misrepresented.
42   *  2. Altered versions must be plainly marked as such and must
43   *     not be misrepresented as being the original source.
44   *  3. This Copyright notice may not be removed or altered from any
45   *     source or altered source distribution.
46   *
47   *  The copyright holders and contributing author(s) specifically
48   *  permit, without fee, and encourage the use of this source code
49   *  as a component for supporting the Hypertext Markup Language in
50   *  commercial products. If you use this source code in a product,
51   *  acknowledgment is not required but would be appreciated.
52   *
53   */
54  package org.w3c.tidy;
55  
56  /**
57   * HTML Parser implementation.
58   * @author Dave Raggett <a href="mailto:dsr@w3.org">dsr@w3.org </a>
59   * @author Andy Quick <a href="mailto:ac.quick@sympatico.ca">ac.quick@sympatico.ca </a> (translation to Java)
60   * @author Fabrizio Giustina
61   * @version $Revision: 806 $ ($Author: fgiust $)
62   */
63  public final class ParserImpl
64  {
65  
66      /**
67       * Shared parser for html (a <code>ParseHTML</code> instance).
68       */
69      public static final Parser HTML = new ParseHTML();
70  
71      /**
72       * Shared parser for head (a <code>ParseHead</code> instance).
73       */
74      public static final Parser HEAD = new ParseHead();
75  
76      /**
77       * Shared parser for title (a <code>ParseTitle</code> instance).
78       */
79      public static final Parser TITLE = new ParseTitle();
80  
81      /**
82       * Shared parser for script (a <code>ParseScript</code> instance).
83       */
84      public static final Parser SCRIPT = new ParseScript();
85  
86      /**
87       * Shared parser for body (a <code>ParseBody</code> instance).
88       */
89      public static final Parser BODY = new ParseBody();
90  
91      /**
92       * Shared parser for frameset (a <code>ParseFrameSet</code> instance).
93       */
94      public static final Parser FRAMESET = new ParseFrameSet();
95  
96      /**
97       * Shared parser for inline (a <code>ParseInline</code> instance).
98       */
99      public static final Parser INLINE = new ParseInline();
100 
101     /**
102      * Shared parser for list (a <code>ParseList</code> instance).
103      */
104     public static final Parser LIST = new ParseList();
105 
106     /**
107      * Shared parser for definition lists (a <code>ParseDefList</code> instance).
108      */
109     public static final Parser DEFLIST = new ParseDefList();
110 
111     /**
112      * Shared parser for pre (a <code>ParsePre</code> instance).
113      */
114     public static final Parser PRE = new ParsePre();
115 
116     /**
117      * Shared parser for block elements (a <code>ParseBlock</code> instance).
118      */
119     public static final Parser BLOCK = new ParseBlock();
120 
121     /**
122      * Shared parser for table (a <code>ParseTableTag</code> instance).
123      */
124     public static final Parser TABLETAG = new ParseTableTag();
125 
126     /**
127      * Shared parser for colgroup (a <code>ParseColGroup</code> instance).
128      */
129     public static final Parser COLGROUP = new ParseColGroup();
130 
131     /**
132      * Shared parser for rowgroup (a <code>ParseRowGroup</code> instance).
133      */
134     public static final Parser ROWGROUP = new ParseRowGroup();
135 
136     /**
137      * Shared parser for row (a <code>ParseRow</code> instance).
138      */
139     public static final Parser ROW = new ParseRow();
140 
141     /**
142      * Shared parser for noframes (a <code>ParseNoFrames</code> instance).
143      */
144     public static final Parser NOFRAMES = new ParseNoFrames();
145 
146     /**
147      * Shared parser for select (a <code>ParseSelect</code> instance).
148      */
149     public static final Parser SELECT = new ParseSelect();
150 
151     /**
152      * Shared parser for text (a <code>ParseText</code> instance).
153      */
154     public static final Parser TEXT = new ParseText();
155 
156     /**
157      * Shared parser for empty elements (a <code>ParseEmpty</code> instance).
158      */
159     public static final Parser EMPTY = new ParseEmpty();
160 
161     /**
162      * Shared parser for optgroup (a <code>ParseOptGroup</code> instance).
163      */
164     public static final Parser OPTGROUP = new ParseOptGroup();
165 
166     /**
167      * ParserImpl should not be instantiated: the constructor is private and does nothing.
168      */
169     private ParserImpl()
170     {
171         // unused
172     }
173 
174     /**
         * Resets the lexer's whitespace flags according to the content model of the node's tag and then hands the node
         * to the parser registered for that tag.
         * <ul>
         * <li>A tag whose model has <code>Dict.CM_EMPTY</code> clears <code>lexer.waswhite</code>; any other tag that is
         * not <code>Dict.CM_INLINE</code> clears <code>lexer.insertspace</code>.</li>
         * <li>Nothing more is done when the tag has no parser.</li>
         * <li>A <code>Node.START_END_TAG</code> node is passed to <code>Node.trimEmptyElement</code> and is not parsed.</li>
         * </ul>
175      * @param lexer lexer whose flags are reset and which is passed on to the tag's parser
176      * @param node node to parse; <code>node.tag</code> is dereferenced without a null check
177      * @param mode passed unchanged to the tag's parser
178      */
179     protected static void parseTag(Lexer lexer, Node node, short mode)
180     {
181         // Fix by GLP 2000-12-21. Need to reset insertspace if this
182         // is both a non-inline and empty tag (base, link, meta, isindex, hr, area).
            // NOTE(review): the comment above talks about insertspace, but the empty-tag branch clears waswhite;
            // insertspace is only cleared for non-empty tags that are not inline.
183         if ((node.tag.model & Dict.CM_EMPTY) != 0)
184         {
185             lexer.waswhite = false;
186         }
187         else if (!((node.tag.model & Dict.CM_INLINE) != 0))
188         {
189             lexer.insertspace = false;
190         }
191 
            // tags without a registered parser have no content to parse
192         if (node.tag.getParser() == null)
193         {
194             return;
195         }
196 
            // a start-end tag has no content to read: trim it instead of invoking the parser
197         if (node.type == Node.START_END_TAG)
198         {
199             Node.trimEmptyElement(lexer, node);
200             return;
201         }
202 
203         node.tag.getParser().parse(lexer, node, mode);
204     }
205 
206     /**
207      * Move node to the head, where element is used as starting point in hunt for head. Normally called during parsing.
         * <p>
         * The node is first detached from its current position. A start or start-end tag is reported with
         * <code>Report.TAG_NOT_ALLOWED_IN</code>, appended to the first head child of the html element reached by walking
         * up from <code>element</code>, and then parsed with <code>Lexer.IGNORE_WHITESPACE</code> if its tag has a parser.
         * Any other node is only reported with <code>Report.DISCARDING_UNEXPECTED</code> and stays detached.
         * </p>
208      * @param lexer lexer used for reporting and for parsing the moved node
209      * @param element node the warning is reported against and from which the html element is searched upwards
210      * @param node node to move; <code>node.tag</code> is dereferenced for start and start-end tags
211      */
212     protected static void moveToHead(Lexer lexer, Node element, Node node)
213     {
214         Node head;
215         node.removeNode(); // make sure that node is isolated
216 
217         TagTable tt = lexer.configuration.tt;
218 
219         if (node.type == Node.START_TAG || node.type == Node.START_END_TAG)
220         {
221             lexer.report.warning(lexer, element, node, Report.TAG_NOT_ALLOWED_IN);
222 
                // climb to the html element; there is no null check on parent, so element is assumed to be html
                // itself or to have an html ancestor
223             while (element.tag != tt.tagHtml)
224             {
225                 element = element.parent;
226             }
227 
                // append node to the first head child of html; if html has no head child the node stays detached
228             for (head = element.content; head != null; head = head.next)
229             {
230                 if (head.tag == tt.tagHead)
231                 {
232                     head.insertNodeAtEnd(node);
233                     break;
234                 }
235             }
236 
                // runs whether or not a head was found above; parseTag repeats this parser null check itself
237             if (node.tag.getParser() != null)
238             {
239                 parseTag(lexer, node, Lexer.IGNORE_WHITESPACE);
240             }
241         }
242         else
243         {
                // not an element: report it; the node was already detached above and is not reinserted
244             lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
245         }
246     }
247 
248     /**
249      * moves given node to end of body element.
         * The node is detached first; the body is looked up with <code>lexer.root.findBody</code> and its result is used
         * without a null check.
250      * @param lexer Lexer
251      * @param node Node to insert
252      */
253     static void moveNodeToBody(Lexer lexer, Node node)
254     {
255         node.removeNode();
256         Node body = lexer.root.findBody(lexer.configuration.tt);
257         body.insertNodeAtEnd(node);
258     }
259 
260     /**
261      * Parser for HTML.
         * <p>
         * Builds the children of the html element: first a head (found or inferred), then either a body or, once a
         * frameset has been seen, a noframes element inside that frameset which receives the remaining content.
         * </p>
262      */
263     public static class ParseHTML implements Parser
264     {
265 
266         /**
             * Parses the content of the html element.
             * <ol>
             * <li>Clears <code>lexer.configuration.xmlTags</code> and <code>lexer.seenEndBody</code>.</li>
             * <li>Reads tokens until one carrying the head tag is seen. If the input ends, or another token that
             * <code>Node.insertMisc</code> does not take is found first, a head is inferred (the other token is pushed
             * back). The head is appended to <code>html</code> and parsed with <code>HEAD</code>.</li>
             * <li>Reads further tokens, handling stray html, body, frameset and noframes tags and moving head-only
             * elements into the head. If the input ends here, a body is inferred and parsed unless a frameset was seen,
             * and the method returns.</li>
             * <li>Otherwise the loop ends with a real or inferred body node, which is appended to <code>html</code> and
             * parsed via <code>parseTag</code>; <code>lexer.seenEndHtml</code> is then set.</li>
             * </ol>
             * @param lexer token source; also used for reporting and for inferring missing tags
             * @param html the html element whose children are built
             * @param mode passed on to the parsers invoked for head, body, frameset and noframes
267          * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
268          */
269         public void parse(Lexer lexer, Node html, short mode)
270         {
271             Node node, head;
272             Node frameset = null;
273             Node noframes = null;
274 
275             lexer.configuration.xmlTags = false;
276             lexer.seenEndBody = false;
277             TagTable tt = lexer.configuration.tt;
278 
                // first loop: find the head element or infer one
279             while (true)
280             {
281                 node = lexer.getToken(Lexer.IGNORE_WHITESPACE);
282 
283                 if (node == null)
284                 {
                        // end of input before any head
285                     node = lexer.inferredTag("head");
286                     break;
287                 }
288 
                    // note: the node type is not checked, so any token carrying the head tag ends the search
289                 if (node.tag == tt.tagHead)
290                 {
291                     break;
292                 }
293 
294                 if (node.tag == html.tag && node.type == Node.END_TAG)
295                 {
296                     lexer.report.warning(lexer, html, node, Report.DISCARDING_UNEXPECTED);
297                     continue;
298                 }
299 
300                 // deal with comments etc.
301                 if (Node.insertMisc(html, node))
302                 {
303                     continue;
304                 }
305 
                    // anything else belongs after the head: push it back and infer the head
306                 lexer.ungetToken();
307                 node = lexer.inferredTag("head");
308                 break;
309             }
310 
311             head = node;
312             html.insertNodeAtEnd(head);
313             HEAD.parse(lexer, head, mode);
314 
                // second loop: decide between a body and a frameset document
315             while (true)
316             {
317                 node = lexer.getToken(Lexer.IGNORE_WHITESPACE);
318 
319                 if (node == null)
320                 {
321                     if (frameset == null)
322                     {
323                         // implied body
324                         node = lexer.inferredTag("body");
325                         html.insertNodeAtEnd(node);
326                         BODY.parse(lexer, node, mode);
327                     }
328 
329                     return;
330                 }
331 
332                 // robustly handle html tags
                    // the token is dropped in every case: a non-start html tag is reported unless a frameset was seen;
                    // once a frameset was seen, an html end tag sets seenEndHtml instead
333                 if (node.tag == html.tag)
334                 {
335                     if (node.type != Node.START_TAG && frameset == null)
336                     {
337                         lexer.report.warning(lexer, html, node, Report.DISCARDING_UNEXPECTED);
338                     }
339                     else if (node.type == Node.END_TAG)
340                     {
341                         lexer.seenEndHtml = true;
342                     }
343 
344                     continue;
345                 }
346 
347                 // deal with comments etc.
348                 if (Node.insertMisc(html, node))
349                 {
350                     continue;
351                 }
352 
353                 // if frameset document coerce <body> to <noframes>
354                 if (node.tag == tt.tagBody)
355                 {
356                     if (node.type != Node.START_TAG)
357                     {
358                         lexer.report.warning(lexer, html, node, Report.DISCARDING_UNEXPECTED);
359                         continue;
360                     }
361 
362                     if (frameset != null)
363                     {
                            // push <body> back so that it is read again while parsing noframes
364                         lexer.ungetToken();
365 
366                         if (noframes == null)
367                         {
368                             noframes = lexer.inferredTag("noframes");
369                             frameset.insertNodeAtEnd(noframes);
370                             lexer.report.warning(lexer, html, noframes, Report.INSERTING_TAG);
371                         }
372 
373                         parseTag(lexer, noframes, mode);
374                         continue;
375                     }
376 
377                     lexer.constrainVersion(~Dict.VERS_FRAMESET);
378                     break; // to parse body
379                 }
380 
381                 // flag an error if we see more than one frameset
382                 if (node.tag == tt.tagFrameset)
383                 {
384                     if (node.type != Node.START_TAG)
385                     {
386                         lexer.report.warning(lexer, html, node, Report.DISCARDING_UNEXPECTED);
387                         continue;
388                     }
389 
                        // a later frameset is reported but still inserted and parsed below; the frameset variable
                        // keeps pointing at the first one
390                     if (frameset != null)
391                     {
392                         lexer.report.error(lexer, html, node, Report.DUPLICATE_FRAMESET);
393                     }
394                     else
395                     {
396                         frameset = node;
397                     }
398 
399                     html.insertNodeAtEnd(node);
400                     parseTag(lexer, node, mode);
401 
402                     // see if it includes a noframes element so that we can merge subsequent noframes elements
403 
                        // scans the children of the first frameset; there is no break, so the last noframes child wins
404                     for (node = frameset.content; node != null; node = node.next)
405                     {
406                         if (node.tag == tt.tagNoframes)
407                         {
408                             noframes = node;
409                         }
410                     }
411                     continue;
412                 }
413 
414                 // if not a frameset document coerce <noframes> to <body>
415                 if (node.tag == tt.tagNoframes)
416                 {
417                     if (node.type != Node.START_TAG)
418                     {
419                         lexer.report.warning(lexer, html, node, Report.DISCARDING_UNEXPECTED);
420                         continue;
421                     }
422 
423                     if (frameset == null)
424                     {
425                         lexer.report.warning(lexer, html, node, Report.DISCARDING_UNEXPECTED);
426                         node = lexer.inferredTag("body");
427                         break;
428                     }
429 
                        // only the first noframes is inserted; content of later ones is parsed into that same node
430                     if (noframes == null)
431                     {
432                         noframes = node;
433                         frameset.insertNodeAtEnd(noframes);
434                     }
435 
436                     parseTag(lexer, noframes, mode);
437                     continue;
438                 }
439 
440                 if (node.type == Node.START_TAG || node.type == Node.START_END_TAG)
441                 {
                        // elements whose model includes CM_HEAD are moved into the head
442                     if (node.tag != null && (node.tag.model & Dict.CM_HEAD) != 0)
443                     {
444                         moveToHead(lexer, html, node);
445                         continue;
446                     }
447 
448                     // #427675 - discard illegal frame element following a frameset - fix by Randy Waki 11 Oct 00
449                     if (frameset != null && node.tag == tt.tagFrame)
450                     {
451                         lexer.report.warning(lexer, html, node, Report.DISCARDING_UNEXPECTED);
452                         continue;
453                     }
454                 }
455 
                    // everything else is pushed back and read again by the noframes or body parser
456                 lexer.ungetToken();
457 
458                 // insert other content into noframes element
459                 if (frameset != null)
460                 {
461                     if (noframes == null)
462                     {
463                         noframes = lexer.inferredTag("noframes");
464                         frameset.insertNodeAtEnd(noframes);
465                     }
466                     else
467                     {
468                         lexer.report.warning(lexer, html, node, Report.NOFRAMES_CONTENT);
469                     }
470 
471                     lexer.constrainVersion(Dict.VERS_FRAMESET);
472                     parseTag(lexer, noframes, mode);
473                     continue;
474                 }
475 
476                 node = lexer.inferredTag("body");
477                 lexer.constrainVersion(~Dict.VERS_FRAMESET);
478                 break;
479             }
480 
481             // node must be body
482             html.insertNodeAtEnd(node);
483             parseTag(lexer, node, mode);
484             lexer.seenEndHtml = true;
485         }
486 
487     }
488 
489     /**
490      * Parser for HEAD.
491      */
492     public static class ParseHead implements Parser
493     {
494 
495         /**
             * Parses the content of the head element.
             * <p>
             * Tokens are read with <code>Lexer.IGNORE_WHITESPACE</code> until the input ends or one of these is found:
             * the head end tag (which sets <code>head.closed</code>), a text node, or a tag whose model lacks
             * <code>Dict.CM_HEAD</code>. The latter two are pushed back for the caller. Start and start-end tags of head
             * elements are appended to <code>head</code> and parsed; a second title or base is reported with
             * <code>Report.TOO_MANY_ELEMENTS</code> and noscript with <code>Report.TAG_NOT_ALLOWED_IN</code>. If no title
             * was seen, an inferred title is appended.
             * </p>
             * @param lexer token source; also used for reporting and for inferring the title
             * @param head the head element whose children are built
             * @param mode not read by this method; children are parsed with <code>Lexer.IGNORE_WHITESPACE</code>
496          * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
497          */
498         public void parse(Lexer lexer, Node head, short mode)
499         {
500             Node node;
                // number of title and base start tags seen so far
501             int hasTitle = 0;
502             int hasBase = 0;
503             TagTable tt = lexer.configuration.tt;
504 
505             while ((node = lexer.getToken(Lexer.IGNORE_WHITESPACE)) != null)
506             {
507                 if (node.tag == head.tag && node.type == Node.END_TAG)
508                 {
509                     head.closed = true;
510                     break;
511                 }
512 
                    // text ends the head: report it and push it back
513                 if (node.type == Node.TEXT_NODE)
514                 {
515                     lexer.report.warning(lexer, head, node, Report.TAG_NOT_ALLOWED_IN);
516                     lexer.ungetToken();
517                     break;
518                 }
519 
520                 // deal with comments etc.
521                 if (Node.insertMisc(head, node))
522                 {
523                     continue;
524                 }
525 
526                 if (node.type == Node.DOCTYPE_TAG)
527                 {
528                     Node.insertDocType(lexer, head, node);
529                     continue;
530                 }
531 
532                 // discard unknown tags
533                 if (node.tag == null)
534                 {
535                     lexer.report.warning(lexer, head, node, Report.DISCARDING_UNEXPECTED);
536                     continue;
537                 }
538 
                    // a tag that is not a head element implicitly closes the head and is pushed back
539                 if (!TidyUtils.toBoolean(node.tag.model & Dict.CM_HEAD))
540                 {
541                     // #545067 Implicit closing of head broken - warn only for XHTML input
542                     if (lexer.isvoyager)
543                     {
544                         lexer.report.warning(lexer, head, node, Report.TAG_NOT_ALLOWED_IN);
545                     }
546                     lexer.ungetToken();
547                     break;
548                 }
549 
550                 if (node.type == Node.START_TAG || node.type == Node.START_END_TAG)
551                 {
552                     if (node.tag == tt.tagTitle)
553                     {
554                         ++hasTitle;
555 
556                         if (hasTitle > 1)
557                         {
558                             lexer.report.warning(lexer, head, node, Report.TOO_MANY_ELEMENTS);
559                         }
560                     }
561                     else if (node.tag == tt.tagBase)
562                     {
563                         ++hasBase;
564 
565                         if (hasBase > 1)
566                         {
567                             lexer.report.warning(lexer, head, node, Report.TOO_MANY_ELEMENTS);
568                         }
569                     }
570                     else if (node.tag == tt.tagNoscript)
571                     {
572                         lexer.report.warning(lexer, head, node, Report.TAG_NOT_ALLOWED_IN);
573                     }
574 
                        // the element is kept even when a warning was issued above
575                     head.insertNodeAtEnd(node);
576                     parseTag(lexer, node, Lexer.IGNORE_WHITESPACE);
577                     continue;
578                 }
579 
580                 // discard what is left: tokens with a head-element tag that are not start tags, e.g. end tags
                    // (text nodes never get here, they end the loop above)
581                 lexer.report.warning(lexer, head, node, Report.DISCARDING_UNEXPECTED);
582             }
583 
                // always supply a title; the warning is suppressed when only the body is requested
584             if (hasTitle == 0)
585             {
586                 if (!lexer.configuration.bodyOnly)
587                 {
588                     lexer.report.warning(lexer, head, null, Report.MISSING_TITLE_ELEMENT);
589                 }
590                 head.insertNodeAtEnd(lexer.inferredTag("title"));
591             }
592         }
593 
594     }
595 
596     /**
597      * Parser for TITLE.
598      */
599     public static class ParseTitle implements Parser
600     {
601 
602         /**
             * Parses the content of the title element.
             * <p>
             * Tokens are read with <code>Lexer.MIXED_CONTENT</code>. Text nodes are appended (leading space of the first
             * child is trimmed and empty text is dropped). The title end tag closes the element and trims its spaces. A
             * second title start tag is treated as a mistyped end tag: it is reported with
             * <code>Report.COERCE_TO_ENDTAG</code>, turned into an end tag and pushed back so that it closes the title.
             * Tokens with an unknown tag are discarded; any other token is reported with
             * <code>Report.MISSING_ENDTAG_BEFORE</code> and pushed back for the caller. If the input ends first,
             * <code>Report.MISSING_ENDTAG_FOR</code> is reported.
             * </p>
             * @param lexer token source; also used for reporting
             * @param title the title element whose children are built
             * @param mode not read by this method
603          * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
604          */
605         public void parse(Lexer lexer, Node title, short mode)
606         {
607             Node node;
608 
609             while ((node = lexer.getToken(Lexer.MIXED_CONTENT)) != null)
610             {
611                 // [438658] : Missing / in title endtag makes 2 titles
612                 if (node.tag == title.tag && node.type == Node.START_TAG)
613                 {
614                     lexer.report.warning(lexer, title, node, Report.COERCE_TO_ENDTAG);
615                     node.type = Node.END_TAG;
                        // push the coerced token back so that the end-tag branch below closes the title on the next
                        // pass; without this the type change had no effect and the title stayed open
                        lexer.ungetToken();
616                     continue;
617                 }
618                 else if (node.tag == title.tag && node.type == Node.END_TAG)
619                 {
620                     title.closed = true;
621                     Node.trimSpaces(lexer, title);
622                     return;
623                 }
624 
625                 if (node.type == Node.TEXT_NODE)
626                 {
627                     // only called for 1st child
628                     if (title.content == null)
629                     {
630                         Node.trimInitialSpace(lexer, title, node);
631                     }
632 
                        // drop text that is empty (possibly after trimming)
633                     if (node.start >= node.end)
634                     {
635                         continue;
636                     }
637 
638                     title.insertNodeAtEnd(node);
639                     continue;
640                 }
641 
642                 // deal with comments etc.
643                 if (Node.insertMisc(title, node))
644                 {
645                     continue;
646                 }
647 
648                 // discard unknown tags
649                 if (node.tag == null)
650                 {
651                     lexer.report.warning(lexer, title, node, Report.DISCARDING_UNEXPECTED);
652                     continue;
653                 }
654 
655                 // pushback unexpected tokens
656                 lexer.report.warning(lexer, title, node, Report.MISSING_ENDTAG_BEFORE);
657                 lexer.ungetToken();
658                 Node.trimSpaces(lexer, title);
659                 return;
660             }
661 
                // input ended inside the title; node is null at this point
662             lexer.report.warning(lexer, title, node, Report.MISSING_ENDTAG_FOR);
663         }
664 
665     }
666 
667     /**
668      * Parser for SCRIPT.
669      */
670     public static class ParseScript implements Parser
671     {
672 
673         /**
             * Reads the content of the script element with <code>lexer.getCDATA</code> and, if a node is returned,
             * appends it to <code>script</code>.
             * @param lexer source of the CDATA content
             * @param script the script element that receives the content node
             * @param mode not read by this method
674          * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
675          */
676         public void parse(Lexer lexer, Node script, short mode)
677         {
678             // This isn't quite right for CDATA content as it recognises tags within the content and parses them
679             // accordingly. This will unfortunately screw up scripts which include < + letter, < + !, < + ? or < + / +
680             // letter
681 
682             Node node = lexer.getCDATA(script);
683 
                // getCDATA may return null; nothing is appended in that case
684             if (node != null)
685             {
686                 script.insertNodeAtEnd(node);
687             }
688         }
689 
690     }
691 
692     /**
693      * Parser for BODY.
694      */
695     public static class ParseBody implements Parser
696     {
697 
698         /**
699          * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
700          */
701         public void parse(Lexer lexer, Node body, short mode)
702         {
703             Node node;
704             boolean checkstack, iswhitenode;
705 
706             mode = Lexer.IGNORE_WHITESPACE;
707             checkstack = true;
708             TagTable tt = lexer.configuration.tt;
709 
710             Clean.bumpObject(lexer, body.parent);
711 
712             while ((node = lexer.getToken(mode)) != null)
713             {
714 
715                 // #538536 Extra endtags not detected
716                 if (node.tag == tt.tagHtml)
717                 {
718                     if (node.type == Node.START_TAG || node.type == Node.START_END_TAG || lexer.seenEndHtml)
719                     {
720                         lexer.report.warning(lexer, body, node, Report.DISCARDING_UNEXPECTED);
721                     }
722                     else
723                     {
724                         lexer.seenEndHtml = true;
725                     }
726 
727                     continue;
728                 }
729 
730                 if (lexer.seenEndBody
731                     && (node.type == Node.START_TAG || node.type == Node.END_TAG || node.type == Node.START_END_TAG))
732                 {
733                     lexer.report.warning(lexer, body, node, Report.CONTENT_AFTER_BODY);
734                 }
735 
736                 if (node.tag == body.tag && node.type == Node.END_TAG)
737                 {
738                     body.closed = true;
739                     Node.trimSpaces(lexer, body);
740                     lexer.seenEndBody = true;
741                     mode = Lexer.IGNORE_WHITESPACE;
742 
743                     if (body.parent.tag == tt.tagNoframes)
744                     {
745                         break;
746                     }
747 
748                     continue;
749                 }
750 
751                 if (node.tag == tt.tagNoframes)
752                 {
753                     if (node.type == Node.START_TAG)
754                     {
755                         body.insertNodeAtEnd(node);
756                         BLOCK.parse(lexer, node, mode);
757                         continue;
758                     }
759 
760                     if (node.type == Node.END_TAG && body.parent.tag == tt.tagNoframes)
761                     {
762                         Node.trimSpaces(lexer, body);
763                         lexer.ungetToken();
764                         break;
765                     }
766                 }
767 
768                 if ((node.tag == tt.tagFrame || node.tag == tt.tagFrameset) && body.parent.tag == tt.tagNoframes)
769                 {
770                     Node.trimSpaces(lexer, body);
771                     lexer.ungetToken();
772                     break;
773                 }
774 
775                 iswhitenode = false;
776 
777                 if (node.type == Node.TEXT_NODE
778                     && node.end <= node.start + 1
779                     && node.textarray[node.start] == (byte) ' ')
780                 {
781                     iswhitenode = true;
782                 }
783 
784                 // deal with comments etc.
785                 if (Node.insertMisc(body, node))
786                 {
787                     continue;
788                 }
789 
790                 // #538536 Extra endtags not detected
791                 // if (lexer.seenEndBody && !iswhitenode)
792                 // {
793                 // lexer.seenEndBody = true;
794                 // lexer.report.warning(lexer, body, node, Report.CONTENT_AFTER_BODY);
795                 // }
796 
797                 // mixed content model permits text
798                 if (node.type == Node.TEXT_NODE)
799                 {
800                     if (iswhitenode && mode == Lexer.IGNORE_WHITESPACE)
801                     {
802                         continue;
803                     }
804 
805                     if (lexer.configuration.encloseBodyText && !iswhitenode)
806                     {
807                         Node para;
808 
809                         lexer.ungetToken();
810                         para = lexer.inferredTag("p");
811                         body.insertNodeAtEnd(para);
812                         parseTag(lexer, para, mode);
813                         mode = Lexer.MIXED_CONTENT;
814                         continue;
815                     }
816 
817                     // HTML2 and HTML4 strict doesn't allow text here
818                     lexer.constrainVersion(~(Dict.VERS_HTML40_STRICT | Dict.VERS_HTML20));
819 
820                     if (checkstack)
821                     {
822                         checkstack = false;
823 
824                         if (lexer.inlineDup(node) > 0)
825                         {
826                             continue;
827                         }
828                     }
829 
830                     body.insertNodeAtEnd(node);
831                     mode = Lexer.MIXED_CONTENT;
832                     continue;
833                 }
834 
835                 if (node.type == Node.DOCTYPE_TAG)
836                 {
837                     Node.insertDocType(lexer, body, node);
838                     continue;
839                 }
840                 // discard unknown and PARAM tags
841                 if (node.tag == null || node.tag == tt.tagParam)
842                 {
843                     lexer.report.warning(lexer, body, node, Report.DISCARDING_UNEXPECTED);
844                     continue;
845                 }
846 
847                 // Netscape allows LI and DD directly in BODY We infer UL or DL respectively and use this boolean to
848                 // exclude block-level elements so as to match Netscape's observed behaviour.
849 
850                 lexer.excludeBlocks = false;
851 
852                 if ((!((node.tag.model & Dict.CM_BLOCK) != 0) && !((node.tag.model & Dict.CM_INLINE) != 0))
853                     || node.tag == tt.tagInput)
854                 {
855                     // avoid this error message being issued twice
856                     if (!((node.tag.model & Dict.CM_HEAD) != 0))
857                     {
858                         lexer.report.warning(lexer, body, node, Report.TAG_NOT_ALLOWED_IN);
859                     }
860 
861                     if ((node.tag.model & Dict.CM_HTML) != 0)
862                     {
863                         // copy body attributes if current body was inferred
864                         if (node.tag == tt.tagBody && body.implicit && body.attributes == null)
865                         {
866                             body.attributes = node.attributes;
867                             node.attributes = null;
868                         }
869 
870                         continue;
871                     }
872 
873                     if ((node.tag.model & Dict.CM_HEAD) != 0)
874                     {
875                         moveToHead(lexer, body, node);
876                         continue;
877                     }
878 
879                     if ((node.tag.model & Dict.CM_LIST) != 0)
880                     {
881                         lexer.ungetToken();
882                         node = lexer.inferredTag("ul");
883                         node.addClass("noindent");
884                         lexer.excludeBlocks = true;
885                     }
886                     else if ((node.tag.model & Dict.CM_DEFLIST) != 0)
887                     {
888                         lexer.ungetToken();
889                         node = lexer.inferredTag("dl");
890                         lexer.excludeBlocks = true;
891                     }
892                     else if ((node.tag.model & (Dict.CM_TABLE | Dict.CM_ROWGRP | Dict.CM_ROW)) != 0)
893                     {
894                         lexer.ungetToken();
895                         node = lexer.inferredTag("table");
896                         lexer.excludeBlocks = true;
897                     }
898                     else if (node.tag == tt.tagInput)
899                     {
900                         lexer.ungetToken();
901                         node = lexer.inferredTag("form");
902                         lexer.excludeBlocks = true;
903                     }
904                     else
905                     {
906                         if (!((node.tag.model & (Dict.CM_ROW | Dict.CM_FIELD)) != 0))
907                         {
908                             lexer.ungetToken();
909                             return;
910                         }
911 
912                         // ignore </td></th> <option> etc.
913                         continue;
914                     }
915                 }
916 
917                 if (node.type == Node.END_TAG)
918                 {
919                     if (node.tag == tt.tagBr)
920                     {
921                         node.type = Node.START_TAG;
922                     }
923                     else if (node.tag == tt.tagP)
924                     {
925                         Node.coerceNode(lexer, node, tt.tagBr);
926                         body.insertNodeAtEnd(node);
927                         node = lexer.inferredTag("br");
928                     }
929                     else if ((node.tag.model & Dict.CM_INLINE) != 0)
930                     {
931                         lexer.popInline(node);
932                     }
933                 }
934 
935                 if (node.type == Node.START_TAG || node.type == Node.START_END_TAG)
936                 {
937                     if (((node.tag.model & Dict.CM_INLINE) != 0) && !((node.tag.model & Dict.CM_MIXED) != 0))
938                     {
939                         // HTML4 strict doesn't allow inline content here
940                         // but HTML2 does allow img elements as children of body
941                         if (node.tag == tt.tagImg)
942                         {
943                             lexer.constrainVersion(~Dict.VERS_HTML40_STRICT);
944                         }
945                         else
946                         {
947                             lexer.constrainVersion(~(Dict.VERS_HTML40_STRICT | Dict.VERS_HTML20));
948                         }
949 
950                         if (checkstack && !node.implicit)
951                         {
952                             checkstack = false;
953 
954                             if (lexer.inlineDup(node) > 0)
955                             {
956                                 continue;
957                             }
958                         }
959 
960                         mode = Lexer.MIXED_CONTENT;
961                     }
962                     else
963                     {
964                         checkstack = true;
965                         mode = Lexer.IGNORE_WHITESPACE;
966                     }
967 
968                     if (node.implicit)
969                     {
970                         lexer.report.warning(lexer, body, node, Report.INSERTING_TAG);
971                     }
972 
973                     body.insertNodeAtEnd(node);
974                     parseTag(lexer, node, mode);
975                     continue;
976                 }
977 
978                 // discard unexpected tags
979                 lexer.report.warning(lexer, body, node, Report.DISCARDING_UNEXPECTED);
980             }
981         }
982 
983     }
984 
985     /**
986      * Parser for FRAMESET.
987      */
988     public static class ParseFrameSet implements Parser
989     {
990 
991         /**
992          * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
993          */
994         public void parse(Lexer lexer, Node frameset, short mode)
995         {
996             Node node;
997             TagTable tt = lexer.configuration.tt;
998 
999             lexer.badAccess |= Report.USING_FRAMES;
1000 
1001             while ((node = lexer.getToken(Lexer.IGNORE_WHITESPACE)) != null)
1002             {
1003                 if (node.tag == frameset.tag && node.type == Node.END_TAG)
1004                 {
1005                     frameset.closed = true;
1006                     Node.trimSpaces(lexer, frameset);
1007                     return;
1008                 }
1009 
1010                 // deal with comments etc.
1011                 if (Node.insertMisc(frameset, node))
1012                 {
1013                     continue;
1014                 }
1015 
1016                 if (node.tag == null)
1017                 {
1018                     lexer.report.warning(lexer, frameset, node, Report.DISCARDING_UNEXPECTED);
1019                     continue;
1020                 }
1021 
1022                 if (node.type == Node.START_TAG || node.type == Node.START_END_TAG)
1023                 {
1024                     if (node.tag != null && (node.tag.model & Dict.CM_HEAD) != 0)
1025                     {
1026                         moveToHead(lexer, frameset, node);
1027                         continue;
1028                     }
1029                 }
1030 
1031                 if (node.tag == tt.tagBody)
1032                 {
1033                     lexer.ungetToken();
1034                     node = lexer.inferredTag("noframes");
1035                     lexer.report.warning(lexer, frameset, node, Report.INSERTING_TAG);
1036                 }
1037 
1038                 if (node.type == Node.START_TAG && (node.tag.model & Dict.CM_FRAMES) != 0)
1039                 {
1040                     frameset.insertNodeAtEnd(node);
1041                     lexer.excludeBlocks = false;
1042                     parseTag(lexer, node, Lexer.MIXED_CONTENT);
1043                     continue;
1044                 }
1045                 else if (node.type == Node.START_END_TAG && (node.tag.model & Dict.CM_FRAMES) != 0)
1046                 {
1047                     frameset.insertNodeAtEnd(node);
1048                     continue;
1049                 }
1050 
1051                 // discard unexpected tags
1052                 lexer.report.warning(lexer, frameset, node, Report.DISCARDING_UNEXPECTED);
1053             }
1054 
1055             lexer.report.warning(lexer, frameset, node, Report.MISSING_ENDTAG_FOR);
1056         }
1057 
1058     }
1059 
1060     /**
1061      * Parser for INLINE.
1062      */
1063     public static class ParseInline implements Parser
1064     {
1065 
1066         /**
1067          * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
1068          */
1069         public void parse(Lexer lexer, Node element, short mode)
1070         {
1071             Node node, parent;
1072             TagTable tt = lexer.configuration.tt;
1073 
1074             if (TidyUtils.toBoolean(element.tag.model & Dict.CM_EMPTY))
1075             {
1076                 return;
1077             }
1078 
1079             // ParseInline is used for some block level elements like H1 to H6 For such elements we need to insert
1080             // inline emphasis tags currently on the inline stack. For Inline elements, we normally push them onto the
1081             // inline stack provided they aren't implicit or OBJECT/APPLET. This test is carried out in PushInline and
1082             // PopInline, see istack.c We don't push SPAN to replicate current browser behavior
1083 
1084             if (TidyUtils.toBoolean(element.tag.model & Dict.CM_BLOCK) || (element.tag == tt.tagDt))
1085             {
1086                 lexer.inlineDup(null);
1087             }
1088             else if (TidyUtils.toBoolean(element.tag.model & Dict.CM_INLINE))
1089             {
1090                 // && element.tag != tt.tagSpan #540571 Inconsistent behaviour with span inline element
1091                 lexer.pushInline(element);
1092             }
1093 
1094             if (element.tag == tt.tagNobr)
1095             {
1096                 lexer.badLayout |= Report.USING_NOBR;
1097             }
1098             else if (element.tag == tt.tagFont)
1099             {
1100                 lexer.badLayout |= Report.USING_FONT;
1101             }
1102 
1103             // Inline elements may or may not be within a preformatted element
1104             if (mode != Lexer.PREFORMATTED)
1105             {
1106                 mode = Lexer.MIXED_CONTENT;
1107             }
1108 
1109             while ((node = lexer.getToken(mode)) != null)
1110             {
1111                 // end tag for current element
1112                 if (node.tag == element.tag && node.type == Node.END_TAG)
1113                 {
1114                     if (TidyUtils.toBoolean(element.tag.model & Dict.CM_INLINE))
1115                     {
1116                         lexer.popInline(node);
1117                     }
1118 
1119                     if (!TidyUtils.toBoolean(mode & Lexer.PREFORMATTED))
1120                     {
1121                         Node.trimSpaces(lexer, element);
1122                     }
1123 
1124                     // if a font element wraps an anchor and nothing else then move the font element inside the anchor
1125                     // since otherwise it won't alter the anchor text color
1126 
1127                     if (element.tag == tt.tagFont && element.content != null && element.content == element.last)
1128                     {
1129                         Node child = element.content;
1130 
1131                         if (child.tag == tt.tagA)
1132                         {
1133                             child.parent = element.parent;
1134                             child.next = element.next;
1135                             child.prev = element.prev;
1136 
1137                             if (child.prev != null)
1138                             {
1139                                 child.prev.next = child;
1140                             }
1141                             else
1142                             {
1143                                 child.parent.content = child;
1144                             }
1145 
1146                             if (child.next != null)
1147                             {
1148                                 child.next.prev = child;
1149                             }
1150                             else
1151                             {
1152                                 child.parent.last = child;
1153                             }
1154 
1155                             element.next = null;
1156                             element.prev = null;
1157                             element.parent = child;
1158                             element.content = child.content;
1159                             element.last = child.last;
1160                             child.content = element;
1161                             child.last = element;
1162                             for (child = element.content; child != null; child = child.next)
1163                             {
1164                                 child.parent = element;
1165                             }
1166                         }
1167                     }
1168                     element.closed = true;
1169                     Node.trimSpaces(lexer, element);
1170                     Node.trimEmptyElement(lexer, element);
1171                     return;
1172                 }
1173 
1174                 // <u> ... <u> map 2nd <u> to </u> if 1st is explicit
1175                 // otherwise emphasis nesting is probably unintentional
1176                 // big and small have cumulative effect to leave them alone
1177                 if (node.type == Node.START_TAG
1178                     && node.tag == element.tag
1179                     && lexer.isPushed(node)
1180                     && !node.implicit
1181                     && !element.implicit
1182                     && node.tag != null
1183                     && ((node.tag.model & Dict.CM_INLINE) != 0)
1184                     && node.tag != tt.tagA
1185                     && node.tag != tt.tagFont
1186                     && node.tag != tt.tagBig
1187                     && node.tag != tt.tagSmall
1188                     && node.tag != tt.tagQ)
1189                 {
1190                     if (element.content != null && node.attributes == null)
1191                     {
1192                         lexer.report.warning(lexer, element, node, Report.COERCE_TO_ENDTAG);
1193                         node.type = Node.END_TAG;
1194                         lexer.ungetToken();
1195                         continue;
1196                     }
1197 
1198                     lexer.report.warning(lexer, element, node, Report.NESTED_EMPHASIS);
1199                 }
1200                 else if (lexer.isPushed(node) && node.type == Node.START_TAG && node.tag == tt.tagQ)
1201                 {
1202                     lexer.report.warning(lexer, element, node, Report.NESTED_QUOTATION);
1203                 }
1204 
1205                 if (node.type == Node.TEXT_NODE)
1206                 {
1207                     // only called for 1st child
1208                     if (element.content == null && !TidyUtils.toBoolean(mode & Lexer.PREFORMATTED))
1209                     {
1210                         Node.trimSpaces(lexer, element);
1211                     }
1212 
1213                     if (node.start >= node.end)
1214                     {
1215                         continue;
1216                     }
1217 
1218                     element.insertNodeAtEnd(node);
1219                     continue;
1220                 }
1221 
1222                 // mixed content model so allow text
1223                 if (Node.insertMisc(element, node))
1224                 {
1225                     continue;
1226                 }
1227 
1228                 // deal with HTML tags
1229                 if (node.tag == tt.tagHtml)
1230                 {
1231                     if (node.type == Node.START_TAG || node.type == Node.START_END_TAG)
1232                     {
1233                         lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
1234                         continue;
1235                     }
1236 
1237                     // otherwise infer end of inline element
1238                     lexer.ungetToken();
1239                     if (!((mode & Lexer.PREFORMATTED) != 0))
1240                     {
1241                         Node.trimSpaces(lexer, element);
1242                     }
1243                     Node.trimEmptyElement(lexer, element);
1244                     return;
1245                 }
1246 
1247                 // within <dt> or <pre> map <p> to <br>
1248                 if (node.tag == tt.tagP
1249                     && node.type == Node.START_TAG
1250                     && ((mode & Lexer.PREFORMATTED) != 0 || element.tag == tt.tagDt || element.isDescendantOf(tt.tagDt)))
1251                 {
1252                     node.tag = tt.tagBr;
1253                     node.element = "br";
1254                     Node.trimSpaces(lexer, element);
1255                     element.insertNodeAtEnd(node);
1256                     continue;
1257                 }
1258 
1259                 // ignore unknown and PARAM tags
1260                 if (node.tag == null || node.tag == tt.tagParam)
1261                 {
1262                     lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
1263                     continue;
1264                 }
1265 
1266                 if (node.tag == tt.tagBr && node.type == Node.END_TAG)
1267                 {
1268                     node.type = Node.START_TAG;
1269                 }
1270 
1271                 if (node.type == Node.END_TAG)
1272                 {
1273                     // coerce </br> to <br>
1274                     if (node.tag == tt.tagBr)
1275                     {
1276                         node.type = Node.START_TAG;
1277                     }
1278                     else if (node.tag == tt.tagP)
1279                     {
1280                         // coerce unmatched </p> to <br><br>
1281                         if (!element.isDescendantOf(tt.tagP))
1282                         {
1283                             Node.coerceNode(lexer, node, tt.tagBr);
1284                             Node.trimSpaces(lexer, element);
1285                             element.insertNodeAtEnd(node);
1286                             node = lexer.inferredTag("br");
1287                             continue;
1288                         }
1289                     }
1290                     else if ((node.tag.model & Dict.CM_INLINE) != 0
1291                         && node.tag != tt.tagA
1292                         && !((node.tag.model & Dict.CM_OBJECT) != 0)
1293                         && (element.tag.model & Dict.CM_INLINE) != 0)
1294                     {
1295                         // allow any inline end tag to end current element
1296                         lexer.popInline(element);
1297 
1298                         if (element.tag != tt.tagA)
1299                         {
1300                             if (node.tag == tt.tagA && node.tag != element.tag)
1301                             {
1302                                 lexer.report.warning(lexer, element, node, Report.MISSING_ENDTAG_BEFORE);
1303                                 lexer.ungetToken();
1304                             }
1305                             else
1306                             {
1307                                 lexer.report.warning(lexer, element, node, Report.NON_MATCHING_ENDTAG);
1308                             }
1309 
1310                             if (!((mode & Lexer.PREFORMATTED) != 0))
1311                             {
1312                                 Node.trimSpaces(lexer, element);
1313                             }
1314                             Node.trimEmptyElement(lexer, element);
1315                             return;
1316                         }
1317 
1318                         // if parent is <a> then discard unexpected inline end tag
1319                         lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
1320                         continue;
1321                     } // special case </tr> etc. for stuff moved in front of table
1322                     else if (lexer.exiled && node.tag.model != 0 && (node.tag.model & Dict.CM_TABLE) != 0)
1323                     {
1324                         lexer.ungetToken();
1325                         Node.trimSpaces(lexer, element);
1326                         Node.trimEmptyElement(lexer, element);
1327                         return;
1328                     }
1329                 }
1330 
1331                 // allow any header tag to end current header
1332                 if ((node.tag.model & Dict.CM_HEADING) != 0 && (element.tag.model & Dict.CM_HEADING) != 0)
1333                 {
1334                     if (node.tag == element.tag)
1335                     {
1336                         lexer.report.warning(lexer, element, node, Report.NON_MATCHING_ENDTAG);
1337                     }
1338                     else
1339                     {
1340                         lexer.report.warning(lexer, element, node, Report.MISSING_ENDTAG_BEFORE);
1341                         lexer.ungetToken();
1342                     }
1343                     if (!((mode & Lexer.PREFORMATTED) != 0))
1344                     {
1345                         Node.trimSpaces(lexer, element);
1346                     }
1347                     Node.trimEmptyElement(lexer, element);
1348                     return;
1349                 }
1350 
1351                 // an <A> tag to ends any open <A> element but <A href=...> is mapped to </A><A href=...>
1352 
1353                 // #427827 - fix by Randy Waki and Bjoern Hoehrmann 23 Aug 00
1354                 // if (node.tag == tt.tagA && !node.implicit && lexer.isPushed(node))
1355                 if (node.tag == tt.tagA
1356                     && !node.implicit
1357                     && (element.tag == tt.tagA || element.isDescendantOf(tt.tagA)))
1358                 {
1359                     // coerce <a> to </a> unless it has some attributes
1360                     // #427827 - fix by Randy Waki and Bjoern Hoehrmann 23 Aug 00
1361                     // other fixes by Dave Raggett
1362                     // if (node.attributes == null)
1363                     if (node.type != Node.END_TAG && node.attributes == null)
1364                     {
1365                         node.type = Node.END_TAG;
1366                         lexer.report.warning(lexer, element, node, Report.COERCE_TO_ENDTAG);
1367                         // lexer.popInline(node);
1368                         lexer.ungetToken();
1369                         continue;
1370                     }
1371 
1372                     lexer.ungetToken();
1373                     lexer.report.warning(lexer, element, node, Report.MISSING_ENDTAG_BEFORE);
1374                     // lexer.popInline(element);
1375                     if (!((mode & Lexer.PREFORMATTED) != 0))
1376                     {
1377                         Node.trimSpaces(lexer, element);
1378                     }
1379                     Node.trimEmptyElement(lexer, element);
1380                     return;
1381                 }
1382 
1383                 if ((element.tag.model & Dict.CM_HEADING) != 0)
1384                 {
1385                     if (node.tag == tt.tagCenter || node.tag == tt.tagDiv)
1386                     {
1387                         if (node.type != Node.START_TAG && node.type != Node.START_END_TAG)
1388                         {
1389                             lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
1390                             continue;
1391                         }
1392 
1393                         lexer.report.warning(lexer, element, node, Report.TAG_NOT_ALLOWED_IN);
1394 
1395                         // insert center as parent if heading is empty
1396                         if (element.content == null)
1397                         {
1398                             Node.insertNodeAsParent(element, node);
1399                             continue;
1400                         }
1401 
1402                         // split heading and make center parent of 2nd part
1403                         element.insertNodeAfterElement(node);
1404 
1405                         if (!((mode & Lexer.PREFORMATTED) != 0))
1406                         {
1407                             Node.trimSpaces(lexer, element);
1408                         }
1409 
1410                         element = lexer.cloneNode(element);
1411                         element.start = lexer.lexsize;
1412                         element.end = lexer.lexsize;
1413                         node.insertNodeAtEnd(element);
1414                         continue;
1415                     }
1416 
1417                     if (node.tag == tt.tagHr)
1418                     {
1419                         if (node.type != Node.START_TAG && node.type != Node.START_END_TAG)
1420                         {
1421                             lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
1422                             continue;
1423                         }
1424 
1425                         lexer.report.warning(lexer, element, node, Report.TAG_NOT_ALLOWED_IN);
1426 
1427                         // insert hr before heading if heading is empty
1428                         if (element.content == null)
1429                         {
1430                             Node.insertNodeBeforeElement(element, node);
1431                             continue;
1432                         }
1433 
1434                         // split heading and insert hr before 2nd part
1435                         element.insertNodeAfterElement(node);
1436 
1437                         if (!((mode & Lexer.PREFORMATTED) != 0))
1438                         {
1439                             Node.trimSpaces(lexer, element);
1440                         }
1441 
1442                         element = lexer.cloneNode(element);
1443                         element.start = lexer.lexsize;
1444                         element.end = lexer.lexsize;
1445                         node.insertNodeAfterElement(element);
1446                         continue;
1447                     }
1448                 }
1449 
1450                 if (element.tag == tt.tagDt)
1451                 {
1452                     if (node.tag == tt.tagHr)
1453                     {
1454                         Node dd;
1455 
1456                         if (node.type != Node.START_TAG && node.type != Node.START_END_TAG)
1457                         {
1458                             lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
1459                             continue;
1460                         }
1461 
1462                         lexer.report.warning(lexer, element, node, Report.TAG_NOT_ALLOWED_IN);
1463                         dd = lexer.inferredTag("dd");
1464 
1465                         // insert hr within dd before dt if dt is empty
1466                         if (element.content == null)
1467                         {
1468                             Node.insertNodeBeforeElement(element, dd);
1469                             dd.insertNodeAtEnd(node);
1470                             continue;
1471                         }
1472 
1473                         // split dt and insert hr within dd before 2nd part
1474                         element.insertNodeAfterElement(dd);
1475                         dd.insertNodeAtEnd(node);
1476 
1477                         if (!((mode & Lexer.PREFORMATTED) != 0))
1478                         {
1479                             Node.trimSpaces(lexer, element);
1480                         }
1481 
1482                         element = lexer.cloneNode(element);
1483                         element.start = lexer.lexsize;
1484                         element.end = lexer.lexsize;
1485                         dd.insertNodeAfterElement(element);
1486                         continue;
1487                     }
1488                 }
1489 
1490                 // if this is the end tag for an ancestor element then infer end tag for this element
1491 
1492                 if (node.type == Node.END_TAG)
1493                 {
1494                     for (parent = element.parent; parent != null; parent = parent.parent)
1495                     {
1496                         if (node.tag == parent.tag)
1497                         {
1498                             if (!((element.tag.model & Dict.CM_OPT) != 0) && !element.implicit)
1499                             {
1500                                 lexer.report.warning(lexer, element, node, Report.MISSING_ENDTAG_BEFORE);
1501                             }
1502 
1503                             if (element.tag == tt.tagA)
1504                             {
1505                                 lexer.popInline(element);
1506                             }
1507 
1508                             lexer.ungetToken();
1509 
1510                             if (!((mode & Lexer.PREFORMATTED) != 0))
1511                             {
1512                                 Node.trimSpaces(lexer, element);
1513                             }
1514 
1515                             Node.trimEmptyElement(lexer, element);
1516                             return;
1517                         }
1518                     }
1519                 }
1520 
1521                 // block level tags end this element
1522                 if (!((node.tag.model & Dict.CM_INLINE) != 0))
1523                 {
1524                     if (node.type != Node.START_TAG)
1525                     {
1526                         lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
1527                         continue;
1528                     }
1529 
1530                     if (!((element.tag.model & Dict.CM_OPT) != 0))
1531                     {
1532                         lexer.report.warning(lexer, element, node, Report.MISSING_ENDTAG_BEFORE);
1533                     }
1534 
1535                     if ((node.tag.model & Dict.CM_HEAD) != 0 && !((node.tag.model & Dict.CM_BLOCK) != 0))
1536                     {
1537                         moveToHead(lexer, element, node);
1538                         continue;
1539                     }
1540 
1541                     // prevent anchors from propagating into block tags except for headings h1 to h6
1542 
1543                     if (element.tag == tt.tagA)
1544                     {
1545                         if (node.tag != null && !((node.tag.model & Dict.CM_HEADING) != 0))
1546                         {
1547                             lexer.popInline(element);
1548                         }
1549                         else if (!(element.content != null))
1550                         {
1551                             Node.discardElement(element);
1552                             lexer.ungetToken();
1553                             return;
1554                         }
1555                     }
1556 
1557                     lexer.ungetToken();
1558 
1559                     if (!((mode & Lexer.PREFORMATTED) != 0))
1560                     {
1561                         Node.trimSpaces(lexer, element);
1562                     }
1563 
1564                     Node.trimEmptyElement(lexer, element);
1565                     return;
1566                 }
1567 
1568                 // parse inline element
1569                 if (node.type == Node.START_TAG || node.type == Node.START_END_TAG)
1570                 {
1571                     if (node.implicit)
1572                     {
1573                         lexer.report.warning(lexer, element, node, Report.INSERTING_TAG);
1574                     }
1575 
1576                     // trim white space before <br>
1577                     if (node.tag == tt.tagBr)
1578                     {
1579                         Node.trimSpaces(lexer, element);
1580                     }
1581 
1582                     element.insertNodeAtEnd(node);
1583                     parseTag(lexer, node, mode);
1584                     continue;
1585                 }
1586 
1587                 // discard unexpected tags
1588                 lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
1589                 continue;
1590             }
1591 
1592             if (!((element.tag.model & Dict.CM_OPT) != 0))
1593             {
1594                 lexer.report.warning(lexer, element, node, Report.MISSING_ENDTAG_FOR);
1595             }
1596 
1597             Node.trimEmptyElement(lexer, element);
1598         }
1599     }
1600 
1601     /**
1602      * Parser for LIST: fills a list element with LI children, wrapping any other content in an inferred LI.
1603      */
1604     public static class ParseList implements Parser
1605     {
1606         // Returns at the list's end tag, an ancestor's end tag, a block token while lexer.excludeBlocks is set, or end of input; mode is never read.
1607         public void parse(Lexer lexer, Node list, short mode)
1608         {
1609             Node node;
1610             Node parent;
1611             TagTable tt = lexer.configuration.tt;
1612             // an element whose content model has CM_EMPTY set gets no children
1613             if ((list.tag.model & Dict.CM_EMPTY) != 0)
1614             {
1615                 return;
1616             }
1617 
1618             lexer.insert = -1; // defer implicit inline start tags
1619             // tokens are requested in IGNORE_WHITESPACE mode until getToken returns null
1620             while ((node = lexer.getToken(Lexer.IGNORE_WHITESPACE)) != null)
1621             {
1622                 if (node.tag == list.tag && node.type == Node.END_TAG) // the list's own end tag
1623                 {
1624                     if ((list.tag.model & Dict.CM_OBSOLETE) != 0)
1625                     {
1626                         Node.coerceNode(lexer, list, tt.tagUl); // CM_OBSOLETE list elements are coerced to tt.tagUl
1627                     }
1628                     // record that the end tag was seen, then hand the finished list to trimEmptyElement
1629                     list.closed = true;
1630                     Node.trimEmptyElement(lexer, list);
1631                     return;
1632                 }
1633 
1634                 // deal with comments etc.
1635                 if (Node.insertMisc(list, node))
1636                 {
1637                     continue;
1638                 }
1639                 // a non-text token without a known tag is reported and dropped (it is never inserted)
1640                 if (node.type != Node.TEXT_NODE && node.tag == null)
1641                 {
1642                     lexer.report.warning(lexer, list, node, Report.DISCARDING_UNEXPECTED);
1643                     continue;
1644                 }
1645 
1646                 // if this is the end tag for an ancestor element then infer end tag for this element
1647 
1648                 if (node.type == Node.END_TAG)
1649                 {
1650                     if (node.tag == tt.tagForm) // stray </form>: badForm is called and the tag is dropped
1651                     {
1652                         badForm(lexer);
1653                         lexer.report.warning(lexer, list, node, Report.DISCARDING_UNEXPECTED);
1654                         continue;
1655                     }
1656                     // a stray inline end tag is dropped and passed to lexer.popInline
1657                     if (node.tag != null && (node.tag.model & Dict.CM_INLINE) != 0)
1658                     {
1659                         lexer.report.warning(lexer, list, node, Report.DISCARDING_UNEXPECTED);
1660                         lexer.popInline(node);
1661                         continue;
1662                     }
1663                     // walk up the ancestors: if the end tag has the tag of one of them, push it back and end this list here
1664                     for (parent = list.parent; parent != null; parent = parent.parent)
1665                     {
1666                         if (node.tag == parent.tag)
1667                         {
1668                             lexer.report.warning(lexer, list, node, Report.MISSING_ENDTAG_BEFORE);
1669                             lexer.ungetToken();
1670 
1671                             if ((list.tag.model & Dict.CM_OBSOLETE) != 0)
1672                             {
1673                                 Node.coerceNode(lexer, list, tt.tagUl);
1674                             }
1675                             // list.closed is left untouched on this path
1676                             Node.trimEmptyElement(lexer, list);
1677                             return;
1678                         }
1679                     }
1680                     // the end tag matched no ancestor: drop it
1681                     lexer.report.warning(lexer, list, node, Report.DISCARDING_UNEXPECTED);
1682                     continue;
1683                 }
1684                 // the token is not an end tag here; anything whose tag is not tt.tagLi is pushed back
1685                 if (node.tag != tt.tagLi)
1686                 {
1687                     lexer.ungetToken();
1688                     // while lexer.excludeBlocks is set, a CM_BLOCK token ends the list instead of being wrapped
1689                     if (node.tag != null && (node.tag.model & Dict.CM_BLOCK) != 0 && lexer.excludeBlocks)
1690                     {
1691                         lexer.report.warning(lexer, list, node, Report.MISSING_ENDTAG_BEFORE);
1692                         Node.trimEmptyElement(lexer, list);
1693                         return;
1694                     }
1695                     // otherwise infer an <li> with a "list-style: none" style; the pushed-back token is left for the parseTag call below
1696                     node = lexer.inferredTag("li");
1697                     node.addAttribute("style", "list-style: none");
1698                     lexer.report.warning(lexer, list, node, Report.MISSING_STARTTAG);
1699                 }
1700 
1701                 // node should be <LI>
1702                 list.insertNodeAtEnd(node);
1703                 parseTag(lexer, node, Lexer.IGNORE_WHITESPACE);
1704             }
1705             // only reached when getToken returned null, i.e. no end tag for the list was found
1706             if ((list.tag.model & Dict.CM_OBSOLETE) != 0)
1707             {
1708                 Node.coerceNode(lexer, list, tt.tagUl);
1709             }
1710             // node is null at this point, so the warning is issued without an offending node
1711             lexer.report.warning(lexer, list, node, Report.MISSING_ENDTAG_FOR);
1712             Node.trimEmptyElement(lexer, list);
1713         }
1714 
1715     }
1716 
1717     /**
1718      * Parser for empty elements: never adds children to the element; it only inspects the next token when lexer.isvoyager is set.
1719      */
1720     public static class ParseEmpty implements Parser
1721     {
1722         // When lexer.isvoyager is set: a following end tag with the element's own tag is consumed; any other token is warned about and pushed back.
1723         /**
1724          * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
1725          */
1726         public void parse(Lexer lexer, Node element, short mode)
1727         {
1728             if (lexer.isvoyager) // NOTE(review): presumably set for XHTML input - confirm in Lexer
1729             {
1730                 Node node = lexer.getToken(mode); // null means there is no further token; nothing is done then
1731                 if (node != null && !(node.type == Node.END_TAG && node.tag == element.tag))
1732                 {
1733                     lexer.report.warning(lexer, element, node, Report.ELEMENT_NOT_EMPTY);
1734                     lexer.ungetToken(); // leave the token for the caller to read again
1735                 }
1736             }
1737         }
1738     }
1739 
1740     /**
1741      * Parser for DEFLIST: fills a definition list with DT and DD children, inferring a DT for bare text and a DD for other content.
1742      */
1743     public static class ParseDefList implements Parser
1744     {
1745         // mode is only forwarded to parseTag for a <center> child; all other tokens are read and parsed in IGNORE_WHITESPACE mode.
1746         /**
1747          * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
1748          */
1749         public void parse(Lexer lexer, Node list, short mode)
1750         {
1751             Node node, parent;
1752             TagTable tt = lexer.configuration.tt;
1753             // an element whose content model has CM_EMPTY set gets no children
1754             if ((list.tag.model & Dict.CM_EMPTY) != 0)
1755             {
1756                 return;
1757             }
1758 
1759             lexer.insert = -1; // defer implicit inline start tags
1760             // note: list may be replaced by a newly inferred <dl> inside the loop (see the <center> case)
1761             while ((node = lexer.getToken(Lexer.IGNORE_WHITESPACE)) != null)
1762             {
1763                 if (node.tag == list.tag && node.type == Node.END_TAG) // end tag of the current list
1764                 {
1765                     list.closed = true;
1766                     Node.trimEmptyElement(lexer, list);
1767                     return;
1768                 }
1769 
1770                 // deal with comments etc.
1771                 if (Node.insertMisc(list, node))
1772                 {
1773                     continue;
1774                 }
1775                 // bare text: push the text token back and continue with an inferred <dt> as the current node
1776                 if (node.type == Node.TEXT_NODE)
1777                 {
1778                     lexer.ungetToken();
1779                     node = lexer.inferredTag("dt");
1780                     lexer.report.warning(lexer, list, node, Report.MISSING_STARTTAG);
1781                 }
1782                 // tokens without a known tag are reported and dropped
1783                 if (node.tag == null)
1784                 {
1785                     lexer.report.warning(lexer, list, node, Report.DISCARDING_UNEXPECTED);
1786                     continue;
1787                 }
1788 
1789                 // if this is the end tag for an ancestor element then infer end tag for this element
1790 
1791                 if (node.type == Node.END_TAG)
1792                 {
1793                     if (node.tag == tt.tagForm) // stray </form>: badForm is called and the tag is dropped
1794                     {
1795                         badForm(lexer);
1796                         lexer.report.warning(lexer, list, node, Report.DISCARDING_UNEXPECTED);
1797                         continue;
1798                     }
1799                     // walk up the ancestors: if the end tag has the tag of one of them, push it back and end this list here
1800                     for (parent = list.parent; parent != null; parent = parent.parent)
1801                     {
1802                         if (node.tag == parent.tag)
1803                         {
1804                             lexer.report.warning(lexer, list, node, Report.MISSING_ENDTAG_BEFORE);
1805 
1806                             lexer.ungetToken();
1807                             Node.trimEmptyElement(lexer, list);
1808                             return;
1809                         }
1810                     }
1811                 }
1812                 // an end tag that matched no ancestor is not dropped here; it falls through to the checks below
1813                 // center in a dt or a dl breaks the dl list in two
1814                 if (node.tag == tt.tagCenter)
1815                 {
1816                     if (list.content != null) // the list already has content: the center goes after it
1817                     {
1818                         list.insertNodeAfterElement(node);
1819                     }
1820                     else
1821                     {
1822                         // trim empty dl list
1823                         Node.insertNodeBeforeElement(list, node);
1824 
1825                         // #540296 tidy dumps with empty definition list
1826                         Node.discardElement(list);
1827                     }
1828 
1829                     // and parse contents of center
1830                     parseTag(lexer, node, mode);
1831                     // later tokens are added to this new <dl>, which is placed right after the center
1832                     // now create a new dl element
1833                     list = lexer.inferredTag("dl");
1834                     node.insertNodeAfterElement(list);
1835                     continue;
1836                 }
1837                 // anything other than <dt>/<dd> is pushed back and, unless one of the returns below is taken, replaced by an inferred <dd>
1838                 if (!(node.tag == tt.tagDt || node.tag == tt.tagDd))
1839                 {
1840                     lexer.ungetToken();
1841                     // a tag that is neither CM_BLOCK nor CM_INLINE ends the list with a TAG_NOT_ALLOWED_IN warning
1842                     if (!((node.tag.model & (Dict.CM_BLOCK | Dict.CM_INLINE)) != 0))
1843                     {
1844                         lexer.report.warning(lexer, list, node, Report.TAG_NOT_ALLOWED_IN);
1845                         Node.trimEmptyElement(lexer, list);
1846                         return;
1847                     }
1848 
1849                     // if DD appeared directly in BODY then exclude blocks
1850                     if (!((node.tag.model & Dict.CM_INLINE) != 0) && lexer.excludeBlocks)
1851                     {
1852                         Node.trimEmptyElement(lexer, list); // no warning is issued on this path
1853                         return;
1854                     }
1855                     // the pushed-back token is left for the parseTag call on the inferred <dd> below
1856                     node = lexer.inferredTag("dd");
1857                     lexer.report.warning(lexer, list, node, Report.MISSING_STARTTAG);
1858                 }
1859                 // a </dt> or </dd> that got this far is discarded (other tags were handled above or replaced by the inferred <dd>)
1860                 if (node.type == Node.END_TAG)
1861                 {
1862                     lexer.report.warning(lexer, list, node, Report.DISCARDING_UNEXPECTED);
1863                     continue;
1864                 }
1865 
1866                 // node should be <DT> or <DD>
1867                 list.insertNodeAtEnd(node);
1868                 parseTag(lexer, node, Lexer.IGNORE_WHITESPACE);
1869             }
1870             // only reached when getToken returned null; node is null, so the warning carries no offending node
1871             lexer.report.warning(lexer, list, node, Report.MISSING_ENDTAG_FOR);
1872             Node.trimEmptyElement(lexer, list);
1873         }
1874 
1875     }
1876 
1877     /**
1878      * Parser for PRE: collects text and permitted tags in PREFORMATTED mode, escaping tags rejected by lexer.preContent.
1879      */
1880     public static class ParsePre implements Parser
1881     {
1882         // The mode argument is never read here: tokens are always requested, and children parsed, with Lexer.PREFORMATTED.
1883         /**
1884          * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
1885          */
1886         public void parse(Lexer lexer, Node pre, short mode)
1887         {
1888             Node node;
1889             TagTable tt = lexer.configuration.tt;
1890             // an element whose content model has CM_EMPTY set gets no children
1891             if ((pre.tag.model & Dict.CM_EMPTY) != 0)
1892             {
1893                 return;
1894             }
1895             // elements flagged CM_OBSOLETE are coerced to tt.tagPre before their content is read
1896             if ((pre.tag.model & Dict.CM_OBSOLETE) != 0)
1897             {
1898                 Node.coerceNode(lexer, pre, tt.tagPre);
1899             }
1900 
1901             lexer.inlineDup(null); // tell lexer to insert inlines if needed
1902             // NOTE(review): the end-tag test below compares against pre.tag after coerceNode - confirm the original end tag still matches
1903             while ((node = lexer.getToken(Lexer.PREFORMATTED)) != null)
1904             {
1905                 if (node.tag == pre.tag && node.type == Node.END_TAG)
1906                 {
1907                     Node.trimSpaces(lexer, pre);
1908                     pre.closed = true;
1909                     Node.trimEmptyElement(lexer, pre);
1910                     return;
1911                 }
1912                 // <html> tokens are always dropped; only start tags produce a warning
1913                 if (node.tag == tt.tagHtml)
1914                 {
1915                     if (node.type == Node.START_TAG || node.type == Node.START_END_TAG)
1916                     {
1917                         lexer.report.warning(lexer, pre, node, Report.DISCARDING_UNEXPECTED);
1918                     }
1919 
1920                     continue;
1921                 }
1922 
1923                 if (node.type == Node.TEXT_NODE)
1924                 {
1925                     // for the first child, check for an initial newline and skip it
1926                     if (pre.content == null)
1927                     {
1928                         if (node.textarray[node.start] == (byte) '\n')
1929                         {
1930                             ++node.start;
1931                         }
1932                         // nothing left after skipping the newline: drop the text node
1933                         if (node.start >= node.end)
1934                         {
1935                             continue;
1936                         }
1937                     }
1938 
1939                     pre.insertNodeAtEnd(node);
1940                     continue;
1941                 }
1942 
1943                 // deal with comments etc.
1944                 if (Node.insertMisc(pre, node))
1945                 {
1946                     continue;
1947                 }
1948 
1949                 // tags rejected by lexer.preContent are not parsed as markup: the node returned by Node.escapeTag is inserted instead
1950                 if (!lexer.preContent(node))
1951                 {
1952                     Node newnode;
1953 
1954                     lexer.report.warning(lexer, pre, node, Report.UNESCAPED_ELEMENT);
1955                     newnode = Node.escapeTag(lexer, node);
1956                     pre.insertNodeAtEnd(newnode);
1957                     continue;
1958                 }
1959 
1960                 if (node.tag == tt.tagP)
1961                 {
1962                     if (node.type == Node.START_TAG)
1963                     {
1964                         lexer.report.warning(lexer, pre, node, Report.USING_BR_INPLACE_OF);
1965 
1966                         // trim white space before <p> in <pre>
1967                         Node.trimSpaces(lexer, pre);
1968 
1969                         // coerce the <p> start tag to <br>; any other <p> token (e.g. </p>) is discarded in the else branch
1970                         Node.coerceNode(lexer, node, tt.tagBr);
1971                         pre.insertNodeAtEnd(node);
1972                     }
1973                     else
1974                     {
1975                         lexer.report.warning(lexer, pre, node, Report.DISCARDING_UNEXPECTED);
1976                     }
1977                     continue;
1978                 }
1979                 // remaining start tags become children and are parsed in PREFORMATTED mode
1980                 if (node.type == Node.START_TAG || node.type == Node.START_END_TAG)
1981                 {
1982                     // trim white space before <br>
1983                     if (node.tag == tt.tagBr)
1984                     {
1985                         Node.trimSpaces(lexer, pre);
1986                     }
1987 
1988                     pre.insertNodeAtEnd(node);
1989                     parseTag(lexer, node, Lexer.PREFORMATTED);
1990                     continue;
1991                 }
1992 
1993                 // discard unexpected tags
1994                 lexer.report.warning(lexer, pre, node, Report.DISCARDING_UNEXPECTED);
1995             }
1996             // only reached when getToken returned null; node is null, so the warning carries no offending node
1997             lexer.report.warning(lexer, pre, node, Report.MISSING_ENDTAG_FOR);
1998             Node.trimEmptyElement(lexer, pre);
1999         }
2000 
2001     }
2002 
2003     /**
2004      * Parser for block elements.
2005      */
2006     public static class ParseBlock implements Parser
2007     {
2008 
2009         /**
2010          * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
2011          */
2012         public void parse(Lexer lexer, Node element, short mode)
2013         {
2014             // element is node created by the lexer upon seeing the start tag, or by the parser when the start tag is
2015             // inferred.
2016             Node node, parent;
2017             boolean checkstack;
2018             int istackbase = 0;
2019             TagTable tt = lexer.configuration.tt;
2020 
2021             checkstack = true;
2022 
2023             if ((element.tag.model & Dict.CM_EMPTY) != 0)
2024             {
2025                 return;
2026             }
2027 
2028             if (element.tag == tt.tagForm && element.isDescendantOf(tt.tagForm))
2029             {
2030                 lexer.report.warning(lexer, element, null, Report.ILLEGAL_NESTING);
2031             }
2032 
2033             // InlineDup() asks the lexer to insert inline emphasis tags currently pushed on the istack, but take care
2034             // to avoid propagating inline emphasis inside OBJECT or APPLET. For these elements a fresh inline stack
2035             // context is created and disposed of upon reaching the end of the element. They thus behave like table
2036             // cells in this respect.
2037 
2038             if ((element.tag.model & Dict.CM_OBJECT) != 0)
2039             {
2040                 istackbase = lexer.istackbase;
2041                 lexer.istackbase = lexer.istack.size();
2042             }
2043 
2044             if (!((element.tag.model & Dict.CM_MIXED) != 0))
2045             {
2046                 lexer.inlineDup(null);
2047             }
2048 
2049             mode = Lexer.IGNORE_WHITESPACE;
2050 
2051             while ((node = lexer.getToken(mode)) != null)
2052             {
2053                 // end tag for this element
2054                 if (node.type == Node.END_TAG
2055                     && node.tag != null
2056                     && (node.tag == element.tag || element.was == node.tag))
2057                 {
2058 
2059                     if ((element.tag.model & Dict.CM_OBJECT) != 0)
2060                     {
2061                         // pop inline stack
2062                         while (lexer.istack.size() > lexer.istackbase)
2063                         {
2064                             lexer.popInline(null);
2065                         }
2066                         lexer.istackbase = istackbase;
2067                     }
2068 
2069                     element.closed = true;
2070                     Node.trimSpaces(lexer, element);
2071                     Node.trimEmptyElement(lexer, element);
2072                     return;
2073                 }
2074 
2075                 if (node.tag == tt.tagHtml || node.tag == tt.tagHead || node.tag == tt.tagBody)
2076                 {
2077                     if (node.type == Node.START_TAG || node.type == Node.START_END_TAG)
2078                     {
2079                         lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
2080                     }
2081 
2082                     continue;
2083                 }
2084 
2085                 if (node.type == Node.END_TAG)
2086                 {
2087                     if (node.tag == null)
2088                     {
2089                         lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
2090 
2091                         continue;
2092                     }
2093                     else if (node.tag == tt.tagBr)
2094                     {
2095                         node.type = Node.START_TAG;
2096                     }
2097                     else if (node.tag == tt.tagP)
2098                     {
2099                         Node.coerceNode(lexer, node, tt.tagBr);
2100                         element.insertNodeAtEnd(node);
2101                         node = lexer.inferredTag("br");
2102                     }
2103                     else
2104                     {
2105                         // if this is the end tag for an ancestor element then infer end tag for this element
2106 
2107                         for (parent = element.parent; parent != null; parent = parent.parent)
2108                         {
2109                             if (node.tag == parent.tag)
2110                             {
2111                                 if (!((element.tag.model & Dict.CM_OPT) != 0))
2112                                 {
2113                                     lexer.report.warning(lexer, element, node, Report.MISSING_ENDTAG_BEFORE);
2114                                 }
2115 
2116                                 lexer.ungetToken();
2117 
2118                                 if ((element.tag.model & Dict.CM_OBJECT) != 0)
2119                                 {
2120                                     // pop inline stack
2121                                     while (lexer.istack.size() > lexer.istackbase)
2122                                     {
2123                                         lexer.popInline(null);
2124                                     }
2125                                     lexer.istackbase = istackbase;
2126                                 }
2127 
2128                                 Node.trimSpaces(lexer, element);
2129                                 Node.trimEmptyElement(lexer, element);
2130                                 return;
2131                             }
2132                         }
2133                         // special case </tr> etc. for stuff moved in front of table
2134                         if (lexer.exiled && node.tag.model != 0 && (node.tag.model & Dict.CM_TABLE) != 0)
2135                         {
2136                             lexer.ungetToken();
2137                             Node.trimSpaces(lexer, element);
2138                             Node.trimEmptyElement(lexer, element);
2139                             return;
2140                         }
2141                     }
2142                 }
2143 
2144                 // mixed content model permits text
2145                 if (node.type == Node.TEXT_NODE)
2146                 {
2147                     boolean iswhitenode = false;
2148 
2149                     if (node.type == Node.TEXT_NODE
2150                         && node.end <= node.start + 1
2151                         && lexer.lexbuf[node.start] == (byte) ' ')
2152                     {
2153                         iswhitenode = true;
2154                     }
2155 
2156                     if (lexer.configuration.encloseBlockText && !iswhitenode)
2157                     {
2158                         lexer.ungetToken();
2159                         node = lexer.inferredTag("p");
2160                         element.insertNodeAtEnd(node);
2161                         parseTag(lexer, node, Lexer.MIXED_CONTENT);
2162                         continue;
2163                     }
2164 
2165                     if (checkstack)
2166                     {
2167                         checkstack = false;
2168 
2169                         if (!((element.tag.model & Dict.CM_MIXED) != 0))
2170                         {
2171                             if (lexer.inlineDup(node) > 0)
2172                             {
2173                                 continue;
2174                             }
2175                         }
2176                     }
2177 
2178                     element.insertNodeAtEnd(node);
2179                     mode = Lexer.MIXED_CONTENT;
2180 
2181                     // HTML4 strict doesn't allow mixed content for elements with %block; as their content model
2182                     // But only body, map, blockquote, form and noscript have content model %block;
2183                     if (element.tag == tt.tagBody
2184                         || element.tag == tt.tagMap
2185                         || element.tag == tt.tagBlockquote
2186                         || element.tag == tt.tagForm
2187                         || element.tag == tt.tagNoscript)
2188                     {
2189                         lexer.constrainVersion(~Dict.VERS_HTML40_STRICT);
2190                     }
2191                     continue;
2192                 }
2193 
2194                 if (Node.insertMisc(element, node))
2195                 {
2196                     continue;
2197                 }
2198 
2199                 // allow PARAM elements?
2200                 if (node.tag == tt.tagParam)
2201                 {
2202                     if (((element.tag.model & Dict.CM_PARAM) != 0)
2203                         && (node.type == Node.START_TAG || node.type == Node.START_END_TAG))
2204                     {
2205                         element.insertNodeAtEnd(node);
2206                         continue;
2207                     }
2208 
2209                     // otherwise discard it
2210                     lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
2211                     continue;
2212                 }
2213 
2214                 // allow AREA elements?
2215                 if (node.tag == tt.tagArea)
2216                 {
2217                     if ((element.tag == tt.tagMap) && (node.type == Node.START_TAG || node.type == Node.START_END_TAG))
2218                     {
2219                         element.insertNodeAtEnd(node);
2220                         continue;
2221                     }
2222 
2223                     // otherwise discard it
2224                     lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
2225                     continue;
2226                 }
2227 
2228                 // ignore unknown start/end tags
2229                 if (node.tag == null)
2230                 {
2231                     lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
2232                     continue;
2233                 }
2234 
2235                 // Allow Dict.CM_INLINE elements here. Allow Dict.CM_BLOCK elements here unless lexer.excludeBlocks is
2236                 // yes. LI and DD are special cased. Otherwise infer end tag for this element.
2237 
2238                 if (!((node.tag.model & Dict.CM_INLINE) != 0))
2239                 {
2240                     if (node.type != Node.START_TAG && node.type != Node.START_END_TAG)
2241                     {
2242                         if (node.tag == tt.tagForm)
2243                         {
2244                             badForm(lexer);
2245                         }
2246                         lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
2247                         continue;
2248                     }
2249 
2250                     // #427671 - Fix by Randy Waki - 10 Aug 00
2251                     // If an LI contains an illegal FRAME, FRAMESET, OPTGROUP, or OPTION start tag, discard the start
2252                     // tag and let the subsequent content get parsed as content of the enclosing LI. This seems to
2253                     // mimic IE and Netscape, and avoids an infinite loop: without this check, ParseBlock (which is
2254                     // parsing the LI's content) and ParseList (which is parsing the LI's parent's content) repeatedly
2255                     // defer to each other to parse the illegal start tag, each time inferring a missing </li> or <li>
2256                     // respectively. NOTE: This check is a bit fragile. It specifically checks for the four tags that
2257                     // happen to weave their way through the current series of tests performed by ParseBlock and
2258                     // ParseList to trigger the infinite loop.
2259 
2260                     if (element.tag == tt.tagLi)
2261                     {
2262                         if (node.tag == tt.tagFrame
2263                             || node.tag == tt.tagFrameset
2264                             || node.tag == tt.tagOptgroup
2265                             || node.tag == tt.tagOption)
2266                         {
2267                             lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
2268                             continue;
2269                         }
2270                     }
2271 
2272                     if (element.tag == tt.tagTd || element.tag == tt.tagTh)
2273                     {
2274                         // if parent is a table cell, avoid inferring the end of the cell
2275 
2276                         if ((node.tag.model & Dict.CM_HEAD) != 0)
2277                         {
2278                             moveToHead(lexer, element, node);
2279                             continue;
2280                         }
2281 
2282                         if ((node.tag.model & Dict.CM_LIST) != 0)
2283                         {
2284                             lexer.ungetToken();
2285                             node = lexer.inferredTag("ul");
2286                             node.addClass("noindent");
2287                             lexer.excludeBlocks = true;
2288                         }
2289                         else if ((node.tag.model & Dict.CM_DEFLIST) != 0)
2290                         {
2291                             lexer.ungetToken();
2292                             node = lexer.inferredTag("dl");
2293                             lexer.excludeBlocks = true;
2294                         }
2295 
2296                         // infer end of current table cell
2297                         if (!((node.tag.model & Dict.CM_BLOCK) != 0))
2298                         {
2299                             lexer.ungetToken();
2300                             Node.trimSpaces(lexer, element);
2301                             Node.trimEmptyElement(lexer, element);
2302                             return;
2303                         }
2304                     }
2305                     else if ((node.tag.model & Dict.CM_BLOCK) != 0)
2306                     {
2307                         if (lexer.excludeBlocks)
2308                         {
2309                             if (!((element.tag.model & Dict.CM_OPT) != 0))
2310                             {
2311                                 lexer.report.warning(lexer, element, node, Report.MISSING_ENDTAG_BEFORE);
2312                             }
2313 
2314                             lexer.ungetToken();
2315 
2316                             if ((element.tag.model & Dict.CM_OBJECT) != 0)
2317                             {
2318                                 lexer.istackbase = istackbase;
2319                             }
2320 
2321                             Node.trimSpaces(lexer, element);
2322                             Node.trimEmptyElement(lexer, element);
2323                             return;
2324                         }
2325                     }
2326                     else
2327                     {
2328                         // things like list items
2329 
2330                         if ((node.tag.model & Dict.CM_HEAD) != 0)
2331                         {
2332                             moveToHead(lexer, element, node);
2333                             continue;
2334                         }
2335 
2336                         // special case where a form start tag occurs in a tr and is followed by td or th
2337                         if (element.tag == tt.tagForm && element.parent.tag == tt.tagTd && element.parent.implicit)
2338                         {
2339                             if (node.tag == tt.tagTd)
2340                             {
2341                                 lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
2342                                 continue;
2343                             }
2344 
2345                             if (node.tag == tt.tagTh)
2346                             {
2347                                 lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
2348                                 node = element.parent;
2349                                 node.element = "th";
2350                                 node.tag = tt.tagTh;
2351                                 continue;
2352                             }
2353                         }
2354 
2355                         if (!((element.tag.model & Dict.CM_OPT) != 0) && !element.implicit)
2356                         {
2357                             lexer.report.warning(lexer, element, node, Report.MISSING_ENDTAG_BEFORE);
2358                         }
2359 
2360                         lexer.ungetToken();
2361 
2362                         if ((node.tag.model & Dict.CM_LIST) != 0)
2363                         {
2364                             if (element.parent != null
2365                                 && element.parent.tag != null
2366                                 && element.parent.tag.getParser() == LIST)
2367                             {
2368                                 Node.trimSpaces(lexer, element);
2369                                 Node.trimEmptyElement(lexer, element);
2370                                 return;
2371                             }
2372 
2373                             node = lexer.inferredTag("ul");
2374                             node.addClass("noindent");
2375                         }
2376                         else if ((node.tag.model & Dict.CM_DEFLIST) != 0)
2377                         {
2378                             if (element.parent.tag == tt.tagDl)
2379                             {
2380                                 Node.trimSpaces(lexer, element);
2381                                 Node.trimEmptyElement(lexer, element);
2382                                 return;
2383                             }
2384 
2385                             node = lexer.inferredTag("dl");
2386                         }
2387                         else if ((node.tag.model & Dict.CM_TABLE) != 0 || (node.tag.model & Dict.CM_ROW) != 0)
2388                         {
2389                             node = lexer.inferredTag("table");
2390                         }
2391                         else if ((element.tag.model & Dict.CM_OBJECT) != 0)
2392                         {
2393                             // pop inline stack
2394                             while (lexer.istack.size() > lexer.istackbase)
2395                             {
2396                                 lexer.popInline(null);
2397                             }
2398                             lexer.istackbase = istackbase;
2399                             Node.trimSpaces(lexer, element);
2400                             Node.trimEmptyElement(lexer, element);
2401                             return;
2402 
2403                         }
2404                         else
2405                         {
2406                             Node.trimSpaces(lexer, element);
2407                             Node.trimEmptyElement(lexer, element);
2408                             return;
2409                         }
2410                     }
2411                 }
2412 
2413                 // parse known element
2414                 if (node.type == Node.START_TAG || node.type == Node.START_END_TAG)
2415                 {
2416                     if (TidyUtils.toBoolean(node.tag.model & Dict.CM_INLINE))
2417                     {
2418                         // DSR - 27Apr02 ensure we wrap anchors and other inline content
2419                         // fgiust: commented out due to [1403105]: java.lang.StackOverflowError in Tidy.parseDOM()
2420                         // if (lexer.configuration.encloseBlockText)
2421                         // {
2422                         // lexer.ungetToken();
2423                         // node = lexer.inferredTag("p");
2424                         // element.insertNodeAtEnd(node);
2425                         // parseTag(lexer, node, Lexer.MIXED_CONTENT);
2426                         // continue;
2427                         // }
2428 
2429                         if (checkstack && !node.implicit)
2430                         {
2431                             checkstack = false;
2432 
2433                             // #431731 - fix by Randy Waki 25 Dec 00
2434                             if (!TidyUtils.toBoolean(element.tag.model & Dict.CM_MIXED))
2435                             {
2436                                 if (lexer.inlineDup(node) > 0)
2437                                 {
2438                                     continue;
2439                                 }
2440                             }
2441                         }
2442 
2443                         mode = Lexer.MIXED_CONTENT;
2444                     }
2445                     else
2446                     {
2447                         checkstack = true;
2448                         mode = Lexer.IGNORE_WHITESPACE;
2449                     }
2450 
2451                     // trim white space before <br>
2452                     if (node.tag == tt.tagBr)
2453                     {
2454                         Node.trimSpaces(lexer, element);
2455                     }
2456 
2457                     element.insertNodeAtEnd(node);
2458 
2459                     if (node.implicit)
2460                     {
2461                         lexer.report.warning(lexer, element, node, Report.INSERTING_TAG);
2462                     }
2463 
2464                     parseTag(lexer, node, Lexer.IGNORE_WHITESPACE // Lexer.MixedContent
2465                     );
2466                     continue;
2467                 }
2468 
2469                 // discard unexpected tags
2470                 if (node.type == Node.END_TAG)
2471                 {
2472                     lexer.popInline(node); // if inline end tag
2473                 }
2474 
2475                 lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
2476                 continue;
2477             }
2478 
2479             if (!((element.tag.model & Dict.CM_OPT) != 0))
2480             {
2481                 lexer.report.warning(lexer, element, node, Report.MISSING_ENDTAG_FOR);
2482             }
2483 
2484             if ((element.tag.model & Dict.CM_OBJECT) != 0)
2485             {
2486                 // pop inline stack
2487                 while (lexer.istack.size() > lexer.istackbase)
2488                 {
2489                     lexer.popInline(null);
2490                 }
2491                 lexer.istackbase = istackbase;
2492             }
2493 
2494             Node.trimSpaces(lexer, element);
2495             Node.trimEmptyElement(lexer, element);
2496         }
2497 
2498     }
2499 
2500     /**
2501      * Parser for TABLE.
          * <p>
          * Reads tokens with whitespace ignored until one of the following happens: the table's own end tag is seen, the
          * end tag of an ancestor is seen, a token outside the <code>CM_TABLE</code> content model is seen, or the input
          * is exhausted. Stray TD, TH or nested TABLE start tags cause a TR to be inferred; text, block and inline content
          * is inserted before the table element; head content is handed to <code>moveToHead</code>.
          * </p>
          * <p>
          * The <code>mode</code> argument is not read by this parser.
          * </p>
2502      */
2503     public static class ParseTableTag implements Parser
2504     {
2505 
2506         /**
2507          * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
2508          */
2509         public void parse(Lexer lexer, Node table, short mode)
2510         {
2511             Node node, parent;
2512             int istackbase;
2513             TagTable tt = lexer.configuration.tt;
2514 
                 // NOTE(review): deferDup() lives in Lexer, outside this view; presumably it postpones re-opening of
                 // pending inline tags - confirm.
2515             lexer.deferDup();
                 // remember the caller's inline stack base and start a fresh base at the current stack size;
                 // the saved value is restored on every exit path below
2516             istackbase = lexer.istackbase;
2517             lexer.istackbase = lexer.istack.size();
2518 
2519             while ((node = lexer.getToken(Lexer.IGNORE_WHITESPACE)) != null)
2520             {
                     // matching end tag: the table is properly closed
2521                 if (node.tag == table.tag && node.type == Node.END_TAG)
2522                 {
2523                     lexer.istackbase = istackbase;
2524                     table.closed = true;
2525                     Node.trimEmptyElement(lexer, table);
2526                     return;
2527                 }
2528 
2529                 // deal with comments etc.
2530                 if (Node.insertMisc(table, node))
2531                 {
2532                     continue;
2533                 }
2534 
2535                 // discard unknown tags
2536                 if (node.tag == null && node.type != Node.TEXT_NODE)
2537                 {
2538                     lexer.report.warning(lexer, table, node, Report.DISCARDING_UNEXPECTED);
2539                     continue;
2540                 }
2541 
2542                 // if TD or TH or text or inline or block then infer <TR>
2543 
2544                 if (node.type != Node.END_TAG)
2545                 {
2546                     if (node.tag == tt.tagTd || node.tag == tt.tagTh || node.tag == tt.tagTable)
2547                     {
                             // push the offending token back and continue below with an inferred <tr> in its place
2548                         lexer.ungetToken();
2549                         node = lexer.inferredTag("tr");
2550                         lexer.report.warning(lexer, table, node, Report.MISSING_STARTTAG);
2551                     }
2552                     else if (node.type == Node.TEXT_NODE || (node.tag.model & (Dict.CM_BLOCK | Dict.CM_INLINE)) != 0)
2553                     {
                             // content that is not allowed inside a table is inserted before the table element;
                             // lexer.exiled is set only while such an element is being parsed
2554                         Node.insertNodeBeforeElement(table, node);
2555                         lexer.report.warning(lexer, table, node, Report.TAG_NOT_ALLOWED_IN);
2556                         lexer.exiled = true;
2557 
2558                         if (!(node.type == Node.TEXT_NODE)) // #427662 - was (!node.type == TextNode) - fix by Young
2559                         {
2560                             parseTag(lexer, node, Lexer.IGNORE_WHITESPACE);
2561                         }
2562 
2563                         lexer.exiled = false;
2564                         continue;
2565                     }
2566                     else if ((node.tag.model & Dict.CM_HEAD) != 0)
2567                     {
2568                         moveToHead(lexer, table, node);
2569                         continue;
2570                     }
2571                 }
2572 
2573                 // if this is the end tag for an ancestor element then infer end tag for this element
2574 
2575                 if (node.type == Node.END_TAG)
2576                 {
                         // stray </form> or block/inline end tag: flag the form problem and drop the token
2577                     if (node.tag == tt.tagForm
2578                         || (node.tag != null && ((node.tag.model & (Dict.CM_BLOCK | Dict.CM_INLINE)) != 0)))
2579                     {
2580                         badForm(lexer);
2581                         lexer.report.warning(lexer, table, node, Report.DISCARDING_UNEXPECTED);
2582                         continue;
2583                     }
2584 
                         // end tags of table/row level elements are dropped as well.
                         // NOTE(review): the second alternative (block/inline) can never be true here because the
                         // previous if statement already consumed those tokens.
2585                     if ((node.tag != null && (node.tag.model & (Dict.CM_TABLE | Dict.CM_ROW)) != 0)
2586                         || (node.tag != null && (node.tag.model & (Dict.CM_BLOCK | Dict.CM_INLINE)) != 0))
2587                     {
2588                         lexer.report.warning(lexer, table, node, Report.DISCARDING_UNEXPECTED);
2589                         continue;
2590                     }
2591 
                         // walk up the ancestors: if one of them has the same tag, push the end tag back and return
                         // without marking the table as closed
2592                     for (parent = table.parent; parent != null; parent = parent.parent)
2593                     {
2594                         if (node.tag == parent.tag)
2595                         {
2596                             lexer.report.warning(lexer, table, node, Report.MISSING_ENDTAG_BEFORE);
2597                             lexer.ungetToken();
2598                             lexer.istackbase = istackbase;
2599                             Node.trimEmptyElement(lexer, table);
2600                             return;
2601                         }
2602                     }
2603                 }
2604 
                     // anything whose content model lacks CM_TABLE ends the table: the token is pushed back
2605                 if (!((node.tag.model & Dict.CM_TABLE) != 0))
2606                 {
2607                     lexer.ungetToken();
2608                     lexer.report.warning(lexer, table, node, Report.TAG_NOT_ALLOWED_IN);
2609                     lexer.istackbase = istackbase;
2610                     Node.trimEmptyElement(lexer, table);
2611                     return;
2612                 }
2613 
                     // start tag with CM_TABLE in its model (including an inferred <tr>): append and parse it
2614                 if (node.type == Node.START_TAG || node.type == Node.START_END_TAG)
2615                 {
2616                     table.insertNodeAtEnd(node);
2617 
2618                     parseTag(lexer, node, Lexer.IGNORE_WHITESPACE);
2619                     continue;
2620                 }
2621 
2622                 // discard unexpected text nodes and end tags
2623                 lexer.report.warning(lexer, table, node, Report.DISCARDING_UNEXPECTED);
2624             }
2625 
                 // input exhausted without an end tag; node is null at this point because the loop ended
2626             lexer.report.warning(lexer, table, node, Report.MISSING_ENDTAG_FOR);
2627             Node.trimEmptyElement(lexer, table);
2628             lexer.istackbase = istackbase;
2629         }
2630 
2631     }
2632 
2633     /**
2634      * Parser for COLGROUP.
          * <p>
          * Only COL start tags are added as children. Text, the end tag of an ancestor, or any other known tag is pushed
          * back to the lexer and ends the colgroup without marking it closed. Unknown tags, stray form end tags and stray
          * COL end tags are discarded with a warning.
          * </p>
          * <p>
          * The <code>mode</code> argument is not read by this parser.
          * </p>
2635      */
2636     public static class ParseColGroup implements Parser
2637     {
2638 
2639         /**
2640          * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
2641          */
2642         public void parse(Lexer lexer, Node colgroup, short mode)
2643         {
2644             Node node, parent;
2645             TagTable tt = lexer.configuration.tt;
2646 
                 // an element whose content model has CM_EMPTY set has no content to parse
2647             if ((colgroup.tag.model & Dict.CM_EMPTY) != 0)
2648             {
2649                 return;
2650             }
2651 
2652             while ((node = lexer.getToken(Lexer.IGNORE_WHITESPACE)) != null)
2653             {
                     // matching end tag: the colgroup is properly closed
2654                 if (node.tag == colgroup.tag && node.type == Node.END_TAG)
2655                 {
2656                     colgroup.closed = true;
2657                     return;
2658                 }
2659 
2660                 // if this is the end tag for an ancestor element then infer end tag for this element
2661 
2662                 if (node.type == Node.END_TAG)
2663                 {
2664                     if (node.tag == tt.tagForm)
2665                     {
2666                         badForm(lexer);
2667                         lexer.report.warning(lexer, colgroup, node, Report.DISCARDING_UNEXPECTED);
2668                         continue;
2669                     }
2670 
2671                     for (parent = colgroup.parent; parent != null; parent = parent.parent)
2672                     {
2673 
2674                         if (node.tag == parent.tag)
2675                         {
                                 // leave the end tag for the ancestor's parser; no warning is reported here
2676                             lexer.ungetToken();
2677                             return;
2678                         }
2679                     }
2680                 }
2681 
                     // text is never colgroup content: push it back and stop
2682                 if (node.type == Node.TEXT_NODE)
2683                 {
2684                     lexer.ungetToken();
2685                     return;
2686                 }
2687 
2688                 // deal with comments etc.
2689                 if (Node.insertMisc(colgroup, node))
2690                 {
2691                     continue;
2692                 }
2693 
2694                 // discard unknown tags
2695                 if (node.tag == null)
2696                 {
2697                     lexer.report.warning(lexer, colgroup, node, Report.DISCARDING_UNEXPECTED);
2698                     continue;
2699                 }
2700 
                     // any known tag other than <col> (start or end) ends the colgroup; the token is pushed back
2701                 if (node.tag != tt.tagCol)
2702                 {
2703                     lexer.ungetToken();
2704                     return;
2705                 }
2706 
                     // a stray </col> is dropped
2707                 if (node.type == Node.END_TAG)
2708                 {
2709                     lexer.report.warning(lexer, colgroup, node, Report.DISCARDING_UNEXPECTED);
2710                     continue;
2711                 }
2712 
2713                 // node should be <COL>
2714                 colgroup.insertNodeAtEnd(node);
2715                 parseTag(lexer, node, Lexer.IGNORE_WHITESPACE);
2716             }
                 // input exhausted: returns without a warning and without marking the colgroup closed
2717         }
2718 
2719     }
2720 
2721     /**
2722      * Parser for ROWGROUP.
          * <p>
          * Collects TR children for the given row group. A TR is inferred for stray TD/TH start tags and for any other
          * start tag that reaches the end of the loop. Text, block and inline content is passed to
          * <code>Node.moveBeforeTable</code>, head content to <code>moveToHead</code>. The row group ends on its own end
          * tag, on a start tag of the same element, on the table end tag, on the end tag of an ancestor, on any tag whose
          * model has <code>CM_ROWGRP</code>, or at end of input.
          * </p>
          * <p>
          * The <code>mode</code> argument is not read by this parser.
          * </p>
2723      */
2724     public static class ParseRowGroup implements Parser
2725     {
2726 
2727         /**
2728          * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
2729          */
2730         public void parse(Lexer lexer, Node rowgroup, short mode)
2731         {
2732             Node node, parent;
2733             TagTable tt = lexer.configuration.tt;
2734 
                 // an element whose content model has CM_EMPTY set has no content to parse
2735             if ((rowgroup.tag.model & Dict.CM_EMPTY) != 0)
2736             {
2737                 return;
2738             }
2739 
2740             while ((node = lexer.getToken(Lexer.IGNORE_WHITESPACE)) != null)
2741             {
2742                 if (node.tag == rowgroup.tag)
2743                 {
                         // matching end tag closes the group
2744                     if (node.type == Node.END_TAG)
2745                     {
2746                         rowgroup.closed = true;
2747                         Node.trimEmptyElement(lexer, rowgroup);
2748                         return;
2749                     }
2750 
                         // a non-end token with the same tag: push it back and return; note that the empty-element
                         // trimming done on the other exit paths is not performed here
2751                     lexer.ungetToken();
2752                     return;
2753                 }
2754 
2755                 // if </table> infer end tag
2756                 if (node.tag == tt.tagTable && node.type == Node.END_TAG)
2757                 {
2758                     lexer.ungetToken();
2759                     Node.trimEmptyElement(lexer, rowgroup);
2760                     return;
2761                 }
2762 
2763                 // deal with comments etc.
2764                 if (Node.insertMisc(rowgroup, node))
2765                 {
2766                     continue;
2767                 }
2768 
2769                 // discard unknown tags
2770                 if (node.tag == null && node.type != Node.TEXT_NODE)
2771                 {
2772                     lexer.report.warning(lexer, rowgroup, node, Report.DISCARDING_UNEXPECTED);
2773                     continue;
2774                 }
2775 
2776                 // if TD or TH then infer <TR> if text or inline or block move before table if head content move to
2777                 // head
2778 
2779                 if (node.type != Node.END_TAG)
2780                 {
2781                     if (node.tag == tt.tagTd || node.tag == tt.tagTh)
2782                     {
                             // push the cell back and continue below with an inferred <tr> in its place
2783                         lexer.ungetToken();
2784                         node = lexer.inferredTag("tr");
2785                         lexer.report.warning(lexer, rowgroup, node, Report.MISSING_STARTTAG);
2786                     }
2787                     else if (node.type == Node.TEXT_NODE || (node.tag.model & (Dict.CM_BLOCK | Dict.CM_INLINE)) != 0)
2788                     {
                             // lexer.exiled is set only while the relocated element is being parsed
2789                         Node.moveBeforeTable(rowgroup, node, tt);
2790                         lexer.report.warning(lexer, rowgroup, node, Report.TAG_NOT_ALLOWED_IN);
2791                         lexer.exiled = true;
2792 
2793                         // #427662 was (!node.type == TextNode) fix by Young 04 Aug 00
2794                         if (node.type != Node.TEXT_NODE)
2795                         {
2796                             parseTag(lexer, node, Lexer.IGNORE_WHITESPACE);
2797                         }
2798 
2799                         lexer.exiled = false;
2800                         continue;
2801                     }
2802                     else if ((node.tag.model & Dict.CM_HEAD) != 0)
2803                     {
2804                         lexer.report.warning(lexer, rowgroup, node, Report.TAG_NOT_ALLOWED_IN);
2805                         moveToHead(lexer, rowgroup, node);
2806                         continue;
2807                     }
2808                 }
2809 
2810                 // if this is the end tag for ancestor element then infer end tag for this element
2811 
2812                 if (node.type == Node.END_TAG)
2813                 {
2814 
                         // stray </form> or block/inline end tag: dropped; badForm is called only for </form>
2815                     if (node.tag == tt.tagForm
2816                         || (node.tag != null && (node.tag.model & (Dict.CM_BLOCK | Dict.CM_INLINE)) != 0))
2817                     {
2818                         if (node.tag == tt.tagForm)
2819                         {
2820                             badForm(lexer);
2821                         }
2822                         lexer.report.warning(lexer, rowgroup, node, Report.DISCARDING_UNEXPECTED);
2823                         continue;
2824                     }
2825 
                         // stray </tr>, </td> or </th>: dropped
2826                     if (node.tag == tt.tagTr || node.tag == tt.tagTd || node.tag == tt.tagTh)
2827                     {
2828                         lexer.report.warning(lexer, rowgroup, node, Report.DISCARDING_UNEXPECTED);
2829                         continue;
2830                     }
2831 
                         // end tag of an ancestor: push it back and return without marking the group closed
2832                     for (parent = rowgroup.parent; parent != null; parent = parent.parent)
2833                     {
2834                         if (node.tag == parent.tag)
2835                         {
2836                             lexer.ungetToken();
2837                             Node.trimEmptyElement(lexer, rowgroup);
2838                             return;
2839                         }
2840                     }
2841 
2842                 }
2843 
2844                 // if THEAD, TFOOT or TBODY then implied end tag
2845 
2846                 if ((node.tag.model & Dict.CM_ROWGRP) != 0)
2847                 {
                         // a start tag is pushed back for the caller; an end tag is consumed here
2848                     if (node.type != Node.END_TAG)
2849                     {
2850                         lexer.ungetToken();
2851                     }
2852 
2853                     Node.trimEmptyElement(lexer, rowgroup);
2854                     return;
2855                 }
2856 
                     // every remaining end tag is dropped
2857                 if (node.type == Node.END_TAG)
2858                 {
2859                     lexer.report.warning(lexer, rowgroup, node, Report.DISCARDING_UNEXPECTED);
2860                     continue;
2861                 }
2862 
                     // any other start tag: infer a <tr> and push the token back so it is read again afterwards
2863                 if (!(node.tag == tt.tagTr))
2864                 {
2865                     node = lexer.inferredTag("tr");
2866                     lexer.report.warning(lexer, rowgroup, node, Report.MISSING_STARTTAG);
2867                     lexer.ungetToken();
2868                 }
2869 
2870                 // node should be <TR>
2871                 rowgroup.insertNodeAtEnd(node);
2872                 parseTag(lexer, node, Lexer.IGNORE_WHITESPACE);
2873             }
                 // input exhausted: no warning is reported and the group is not marked closed
2874             Node.trimEmptyElement(lexer, rowgroup);
2875         }
2876     }
2877 
2878     /**
2879      * Parser for ROW.
          * <p>
          * Collects TD and TH children for the given row. A TD is inferred when a FORM start tag is found directly in the
          * row. Text, block and inline content is passed to <code>Node.moveBeforeTable</code>, head content to
          * <code>moveToHead</code>. The row ends on a token with its own tag, on the end tag of an ancestor, on any tag
          * whose model has <code>CM_ROWGRP</code>, or at end of input.
          * </p>
          * <p>
          * The <code>mode</code> argument is not read by this parser.
          * </p>
2880      */
2881     public static class ParseRow implements Parser
2882     {
2883 
2884         /**
2885          * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
2886          */
2887         public void parse(Lexer lexer, Node row, short mode)
2888         {
2889             Node node, parent;
2890             boolean excludeState;
2891             TagTable tt = lexer.configuration.tt;
2892 
                 // an element whose content model has CM_EMPTY set has no content to parse
2893             if ((row.tag.model & Dict.CM_EMPTY) != 0)
2894             {
2895                 return;
2896             }
2897 
2898             while ((node = lexer.getToken(Lexer.IGNORE_WHITESPACE)) != null)
2899             {
2900                 if (node.tag == row.tag)
2901                 {
                         // matching end tag closes the row
2902                     if (node.type == Node.END_TAG)
2903                     {
2904                         row.closed = true;
2905                         Node.fixEmptyRow(lexer, row);
2906                         return;
2907                     }
2908 
                         // a non-end token with the same tag implies the end of this row: push it back for the caller
2909                     lexer.ungetToken();
2910                     Node.fixEmptyRow(lexer, row);
2911                     return;
2912                 }
2913 
2914                 // if this is the end tag for an ancestor element then infer end tag for this element
2915                 if (node.type == Node.END_TAG)
2916                 {
                         // stray </form> or block/inline end tag: dropped; badForm is called only for </form>
2917                     if (node.tag == tt.tagForm
2918                         || (node.tag != null && (node.tag.model & (Dict.CM_BLOCK | Dict.CM_INLINE)) != 0))
2919                     {
2920                         if (node.tag == tt.tagForm)
2921                         {
2922                             badForm(lexer);
2923                         }
2924                         lexer.report.warning(lexer, row, node, Report.DISCARDING_UNEXPECTED);
2925                         continue;
2926                     }
2927 
                         // stray </td> or </th>: dropped
2928                     if (node.tag == tt.tagTd || node.tag == tt.tagTh)
2929                     {
2930                         lexer.report.warning(lexer, row, node, Report.DISCARDING_UNEXPECTED);
2931                         continue;
2932                     }
2933 
                         // end tag of an ancestor: push it back and return without marking the row closed
2934                     for (parent = row.parent; parent != null; parent = parent.parent)
2935                     {
2936                         if (node.tag == parent.tag)
2937                         {
2938                             lexer.ungetToken();
2939                             Node.trimEmptyElement(lexer, row);
2940                             return;
2941                         }
2942                     }
2943                 }
2944 
2945                 // deal with comments etc.
2946                 if (Node.insertMisc(row, node))
2947                 {
2948                     continue;
2949                 }
2950 
2951                 // discard unknown tags
2952                 if (node.tag == null && node.type != Node.TEXT_NODE)
2953                 {
2954                     lexer.report.warning(lexer, row, node, Report.DISCARDING_UNEXPECTED);
2955                     continue;
2956                 }
2957 
2958                 // discard unexpected <table> element
2959                 if (node.tag == tt.tagTable)
2960                 {
2961                     lexer.report.warning(lexer, row, node, Report.DISCARDING_UNEXPECTED);
2962                     continue;
2963                 }
2964 
2965                 // THEAD, TFOOT or TBODY
2966                 if (node.tag != null && (node.tag.model & Dict.CM_ROWGRP) != 0)
2967                 {
2968                     lexer.ungetToken();
2969                     Node.trimEmptyElement(lexer, row);
2970                     return;
2971                 }
2972 
                     // every remaining end tag is dropped
2973                 if (node.type == Node.END_TAG)
2974                 {
2975                     lexer.report.warning(lexer, row, node, Report.DISCARDING_UNEXPECTED);
2976                     continue;
2977                 }
2978 
2979                 // if text or inline or block move before table if head content move to head
2980 
                     // NOTE(review): this guard is always true here, because all end tags were discarded just above
2981                 if (node.type != Node.END_TAG)
2982                 {
2983                     if (node.tag == tt.tagForm)
2984                     {
                             // a form directly inside a row: push it back and continue below with an inferred <td>
2985                         lexer.ungetToken();
2986                         node = lexer.inferredTag("td");
2987                         lexer.report.warning(lexer, row, node, Report.MISSING_STARTTAG);
2988                     }
2989                     else if (node.type == Node.TEXT_NODE || (node.tag.model & (Dict.CM_BLOCK | Dict.CM_INLINE)) != 0)
2990                     {
                             // lexer.exiled is set only while the relocated element is being parsed
2991                         Node.moveBeforeTable(row, node, tt);
2992                         lexer.report.warning(lexer, row, node, Report.TAG_NOT_ALLOWED_IN);
2993                         lexer.exiled = true;
2994 
2995                         if (node.type != Node.TEXT_NODE)
2996                         {
2997                             parseTag(lexer, node, Lexer.IGNORE_WHITESPACE);
2998                         }
2999 
3000                         lexer.exiled = false;
3001                         continue;
3002                     }
3003                     else if ((node.tag.model & Dict.CM_HEAD) != 0)
3004                     {
3005                         lexer.report.warning(lexer, row, node, Report.TAG_NOT_ALLOWED_IN);
3006                         moveToHead(lexer, row, node);
3007                         continue;
3008                     }
3009                 }
3010 
                     // anything that is still not a cell is reported and skipped
3011                 if (!(node.tag == tt.tagTd || node.tag == tt.tagTh))
3012                 {
3013                     lexer.report.warning(lexer, row, node, Report.TAG_NOT_ALLOWED_IN);
3014                     continue;
3015                 }
3016 
3017                 // node should be <TD> or <TH>
3018                 row.insertNodeAtEnd(node);
                     // excludeBlocks is cleared while the cell is parsed and restored to its previous value afterwards
3019                 excludeState = lexer.excludeBlocks;
3020                 lexer.excludeBlocks = false;
3021                 parseTag(lexer, node, Lexer.IGNORE_WHITESPACE);
3022                 lexer.excludeBlocks = excludeState;
3023 
3024                 // pop inline stack
3025 
                     // removes every inline stack entry above the current base after each cell
3026                 while (lexer.istack.size() > lexer.istackbase)
3027                 {
3028                     lexer.popInline(null);
3029                 }
3030             }
3031 
                 // input exhausted: no warning is reported and the row is not marked closed
3032             Node.trimEmptyElement(lexer, row);
3033         }
3034 
3035     }
3036 
3037     /**
3038      * Parser for NOFRAMES.
          * <p>
          * Sets the <code>Report.USING_NOFRAMES</code> bit in <code>lexer.badAccess</code>, then reads tokens until the
          * matching end tag, a FRAME/FRAMESET tag, or end of input. A BODY start tag is appended and parsed; any text or
          * other known non-end tag gets a BODY inferred for it, unless <code>lexer.seenEndBody</code> is set, in which
          * case the content is appended to the body located via <code>lexer.root.findBody</code>.
          * </p>
          * <p>
          * The incoming <code>mode</code> value is overwritten with <code>Lexer.IGNORE_WHITESPACE</code> before use.
          * </p>
3039      */
3040     public static class ParseNoFrames implements Parser
3041     {
3042 
3043         /**
3044          * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
3045          */
3046         public void parse(Lexer lexer, Node noframes, short mode)
3047         {
3048             Node node;
3049             TagTable tt = lexer.configuration.tt;
3050 
3051             lexer.badAccess |= Report.USING_NOFRAMES;
                 // the caller's mode is ignored from here on
3052             mode = Lexer.IGNORE_WHITESPACE;
3053 
3054             while ((node = lexer.getToken(mode)) != null)
3055             {
                     // matching end tag: the noframes element is properly closed
3056                 if (node.tag == noframes.tag && node.type == Node.END_TAG)
3057                 {
3058                     noframes.closed = true;
3059                     Node.trimSpaces(lexer, noframes);
3060                     return;
3061                 }
3062 
                     // a frame or frameset tag ends the noframes element without marking it closed
3063                 if ((node.tag == tt.tagFrame || node.tag == tt.tagFrameset))
3064                 {
3065 
3066                     Node.trimSpaces(lexer, noframes);
3067 
3068                     // fix for [539369]
3069                     if (node.type == Node.END_TAG)
3070                     {
3071                         lexer.report.warning(lexer, noframes, node, Report.DISCARDING_UNEXPECTED);
3072                         // Throw it away
3073                     }
3074                     else
3075                     {
3076                         lexer.report.warning(lexer, noframes, node, Report.MISSING_ENDTAG_BEFORE);
3077 
                             // leave the start tag for the caller
3078                         lexer.ungetToken();
3079                     }
3080                     return;
3081                 }
3082 
                     // html tags of any type are skipped; a warning is reported only for start tags
3083                 if (node.tag == tt.tagHtml)
3084                 {
3085                     if (node.type == Node.START_TAG || node.type == Node.START_END_TAG)
3086                     {
3087                         lexer.report.warning(lexer, noframes, node, Report.DISCARDING_UNEXPECTED);
3088                     }
3089 
3090                     continue;
3091                 }
3092 
3093                 // deal with comments etc.
3094                 if (Node.insertMisc(noframes, node))
3095                 {
3096                     continue;
3097                 }
3098 
3099                 if (node.tag == tt.tagBody && node.type == Node.START_TAG)
3100                 {
                         // sample the flag before parsing the body, so that only a </body> seen earlier counts
3101                     boolean seenbody = lexer.seenEndBody;
3102                     noframes.insertNodeAtEnd(node);
3103                     parseTag(lexer, node, Lexer.IGNORE_WHITESPACE); // MixedContent
3104 
                         // a body end tag had already been seen: turn this body into a div and move it to the body
3105                     if (seenbody)
3106                     {
3107                         Node.coerceNode(lexer, node, tt.tagDiv);
3108                         moveNodeToBody(lexer, node);
3109                     }
3110                     continue;
3111                 }
3112 
3113                 // implicit body element inferred
3114                 if (node.type == Node.TEXT_NODE || (node.tag != null && node.type != Node.END_TAG))
3115                 {
3116                     if (lexer.seenEndBody)
3117                     {
                             // NOTE(review): the result of findBody is dereferenced below without a null check;
                             // confirm that it cannot return null on this path.
3118                         Node body = lexer.root.findBody(tt);
3119 
                             // text is pushed back and wrapped in an inferred <p>; elements are appended as they are
3120                         if (node.type == Node.TEXT_NODE)
3121                         {
3122                             lexer.ungetToken();
3123                             node = lexer.inferredTag("p");
3124                             lexer.report.warning(lexer, noframes, node, Report.CONTENT_AFTER_BODY);
3125                         }
3126 
3127                         body.insertNodeAtEnd(node);
3128                     }
3129                     else
3130                     {
                             // push the token back and parse it as content of an inferred <body>;
                             // the INSERTING_TAG warning is reported only when the xmlOut option is set
3131                         lexer.ungetToken();
3132                         node = lexer.inferredTag("body");
3133                         if (lexer.configuration.xmlOut)
3134                         {
3135                             lexer.report.warning(lexer, noframes, node, Report.INSERTING_TAG);
3136                         }
3137                         noframes.insertNodeAtEnd(node);
3138                     }
3139                     parseTag(lexer, node, Lexer.IGNORE_WHITESPACE);
3140                     // MixedContent
3141                     continue;
3142                 }
3143                 // discard unexpected end tags
3144                 lexer.report.warning(lexer, noframes, node, Report.DISCARDING_UNEXPECTED);
3145             }
3146 
                 // input exhausted without an end tag; node is null at this point because the loop ended
3147             lexer.report.warning(lexer, noframes, node, Report.MISSING_ENDTAG_FOR);
3148         }
3149 
3150     }
3151 
3152     /**
3153      * Parser for SELECT.
3154      */
3155     public static class ParseSelect implements Parser
3156     {
3157 
3158         /**
3159          * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
3160          */
3161         public void parse(Lexer lexer, Node field, short mode)
3162         {
3163             Node node;
3164             TagTable tt = lexer.configuration.tt;
3165 
3166             lexer.insert = -1; // defer implicit inline start tags
3167 
3168             while ((node = lexer.getToken(Lexer.IGNORE_WHITESPACE)) != null)
3169             {
3170                 if (node.tag == field.tag && node.type == Node.END_TAG)
3171                 {
3172                     field.closed = true;
3173                     Node.trimSpaces(lexer, field);
3174                     return;
3175                 }
3176 
3177                 // deal with comments etc.
3178                 if (Node.insertMisc(field, node))
3179                 {
3180                     continue;
3181                 }
3182 
3183                 if (node.type == Node.START_TAG
3184                     && (node.tag == tt.tagOption || node.tag == tt.tagOptgroup || node.tag == tt.tagScript))
3185                 {
3186                     field.insertNodeAtEnd(node);
3187                     parseTag(lexer, node, Lexer.IGNORE_WHITESPACE);
3188                     continue;
3189                 }
3190 
3191                 // discard unexpected tags
3192                 lexer.report.warning(lexer, field, node, Report.DISCARDING_UNEXPECTED);
3193             }
3194 
3195             lexer.report.warning(lexer, field, node, Report.MISSING_ENDTAG_FOR);
3196         }
3197 
3198     }
3199 
3200     /**
3201      * Parser for text nodes.
3202      */
3203     public static class ParseText implements Parser
3204     {
3205 
3206         /**
3207          * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
3208          */
3209         public void parse(Lexer lexer, Node field, short mode)
3210         {
3211             Node node;
3212             TagTable tt = lexer.configuration.tt;
3213 
3214             lexer.insert = -1; // defer implicit inline start tags
3215 
3216             if (field.tag == tt.tagTextarea)
3217             {
3218                 mode = Lexer.PREFORMATTED;
3219             }
3220             else
3221             {
3222                 mode = Lexer.MIXED_CONTENT; // kludge for font tags
3223             }
3224 
3225             while ((node = lexer.getToken(mode)) != null)
3226             {
3227                 if (node.tag == field.tag && node.type == Node.END_TAG)
3228                 {
3229                     field.closed = true;
3230                     Node.trimSpaces(lexer, field);
3231                     return;
3232                 }
3233 
3234                 // deal with comments etc.
3235                 if (Node.insertMisc(field, node))
3236                 {
3237                     continue;
3238                 }
3239 
3240                 if (node.type == Node.TEXT_NODE)
3241                 {
3242                     // only called for 1st child
3243                     if (field.content == null && !((mode & Lexer.PREFORMATTED) != 0))
3244                     {
3245                         Node.trimSpaces(lexer, field);
3246                     }
3247 
3248                     if (node.start >= node.end)
3249                     {
3250                         continue;
3251                     }
3252 
3253                     field.insertNodeAtEnd(node);
3254                     continue;
3255                 }
3256 
3257                 // for textarea should all cases of < and & be escaped?
3258                 // discard inline tags e.g. font
3259                 if (node.tag != null
3260                     && ((node.tag.model & Dict.CM_INLINE) != 0)
3261                     && (node.tag.model & Dict.CM_FIELD) == 0) // #487283 - fix by Lee Passey 25 Jan 02
3262                 {
3263                     lexer.report.warning(lexer, field, node, Report.DISCARDING_UNEXPECTED);
3264                     continue;
3265                 }
3266 
3267                 // terminate element on other tags
3268                 if (!((field.tag.model & Dict.CM_OPT) != 0))
3269                 {
3270                     lexer.report.warning(lexer, field, node, Report.MISSING_ENDTAG_BEFORE);
3271                 }
3272 
3273                 lexer.ungetToken();
3274                 Node.trimSpaces(lexer, field);
3275                 return;
3276             }
3277 
3278             if (!((field.tag.model & Dict.CM_OPT) != 0))
3279             {
3280                 lexer.report.warning(lexer, field, node, Report.MISSING_ENDTAG_FOR);
3281             }
3282         }
3283 
3284     }
3285 
3286     /**
3287      * Parser for OPTGROUP.
3288      */
3289     public static class ParseOptGroup implements Parser
3290     {
3291 
3292         /**
3293          * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
3294          */
3295         public void parse(Lexer lexer, Node field, short mode)
3296         {
3297             Node node;
3298             TagTable tt = lexer.configuration.tt;
3299 
3300             lexer.insert = -1; // defer implicit inline start tags
3301 
3302             while ((node = lexer.getToken(Lexer.IGNORE_WHITESPACE)) != null)
3303             {
3304                 if (node.tag == field.tag && node.type == Node.END_TAG)
3305                 {
3306                     field.closed = true;
3307                     Node.trimSpaces(lexer, field);
3308                     return;
3309                 }
3310 
3311                 // deal with comments etc.
3312                 if (Node.insertMisc(field, node))
3313                 {
3314                     continue;
3315                 }
3316 
3317                 if (node.type == Node.START_TAG && (node.tag == tt.tagOption || node.tag == tt.tagOptgroup))
3318                 {
3319                     if (node.tag == tt.tagOptgroup)
3320                     {
3321                         lexer.report.warning(lexer, field, node, Report.CANT_BE_NESTED);
3322                     }
3323 
3324                     field.insertNodeAtEnd(node);
3325                     parseTag(lexer, node, Lexer.MIXED_CONTENT);
3326                     continue;
3327                 }
3328 
3329                 // discard unexpected tags
3330                 lexer.report.warning(lexer, field, node, Report.DISCARDING_UNEXPECTED);
3331             }
3332         }
3333 
3334     }
3335 
3336     /**
3337      * HTML is the top level element.
3338      */
3339     public static Node parseDocument(Lexer lexer)
3340     {
3341         Node node, document, html;
3342         Node doctype = null;
3343         TagTable tt = lexer.configuration.tt;
3344 
3345         document = lexer.newNode();
3346         document.type = Node.ROOT_NODE;
3347 
3348         lexer.root = document;
3349 
3350         while ((node = lexer.getToken(Lexer.IGNORE_WHITESPACE)) != null)
3351         {
3352             // deal with comments etc.
3353             if (Node.insertMisc(document, node))
3354             {
3355                 continue;
3356             }
3357 
3358             if (node.type == Node.DOCTYPE_TAG)
3359             {
3360                 if (doctype == null)
3361                 {
3362                     document.insertNodeAtEnd(node);
3363                     doctype = node;
3364                 }
3365                 else
3366                 {
3367                     lexer.report.warning(lexer, document, node, Report.DISCARDING_UNEXPECTED);
3368                 }
3369                 continue;
3370             }
3371 
3372             if (node.type == Node.END_TAG)
3373             {
3374                 lexer.report.warning(lexer, document, node, Report.DISCARDING_UNEXPECTED); // TODO?
3375                 continue;
3376             }
3377 
3378             if (node.type != Node.START_TAG || node.tag != tt.tagHtml)
3379             {
3380                 lexer.ungetToken();
3381                 html = lexer.inferredTag("html");
3382             }
3383             else
3384             {
3385                 html = node;
3386             }
3387 
3388             if (document.findDocType() == null && !lexer.configuration.bodyOnly)
3389             {
3390                 lexer.report.warning(lexer, null, null, Report.MISSING_DOCTYPE);
3391             }
3392 
3393             document.insertNodeAtEnd(html);
3394             HTML.parse(lexer, html, (short) 0); // TODO?
3395             break;
3396         }
3397 
3398         return document;
3399     }
3400 
3401     /**
3402      * Indicates whether or not whitespace should be preserved for this element. If an <code>xml:space</code>
3403      * attribute is found, then if the attribute value is <code>preserve</code>, returns <code>true</code>. For
3404      * any other value, returns <code>false</code>. If an <code>xml:space</code> attribute was <em>not</em>
3405      * found, then the following element names result in a return value of <code>true:
3406      *  pre, script, style,</code> and
3407      * <code>xsl:text</code>. Finally, if a <code>TagTable</code> was passed in and the element appears as the
3408      * "pre" element in the <code>TagTable</code>, then <code>true</code> will be returned. Otherwise,
3409      * <code>false</code> is returned.
3410      * @param element The <code>Node</code> to test to see if whitespace should be preserved.
3411      * @param tt The <code>TagTable</code> to test for the <code>getNodePre()</code> function. This may be
3412      * <code>null</code>, in which case this test is bypassed.
3413      * @return <code>true</code> or <code>false</code>, as explained above.
3414      */
3415     public static boolean XMLPreserveWhiteSpace(Node element, TagTable tt)
3416     {
3417         AttVal attribute;
3418 
3419         // search attributes for xml:space
3420         for (attribute = element.attributes; attribute != null; attribute = attribute.next)
3421         {
3422             if (attribute.attribute.equals("xml:space"))
3423             {
3424                 if (attribute.value.equals("preserve"))
3425                 {
3426                     return true;
3427                 }
3428 
3429                 return false;
3430             }
3431         }
3432 
3433         if (element.element == null) // Debian Bug #137124. Fix based on suggestion by Cesar Eduardo Barros 06 Mar 02
3434         {
3435             return false;
3436         }
3437 
3438         // kludge for html docs without explicit xml:space attribute
3439         if ("pre".equalsIgnoreCase(element.element)
3440             || "script".equalsIgnoreCase(element.element)
3441             || "style".equalsIgnoreCase(element.element))
3442         {
3443             return true;
3444         }
3445 
3446         if ((tt != null) && (tt.findParser(element) == PRE))
3447         {
3448             return true;
3449         }
3450 
3451         // kludge for XSL docs
3452         if ("xsl:text".equalsIgnoreCase(element.element))
3453         {
3454             return true;
3455         }
3456 
3457         return false;
3458     }
3459 
3460     /**
3461      * XML documents.
3462      */
3463     public static void parseXMLElement(Lexer lexer, Node element, short mode)
3464     {
3465         Node node;
3466 
3467         // if node is pre or has xml:space="preserve" then do so
3468 
3469         if (XMLPreserveWhiteSpace(element, lexer.configuration.tt))
3470         {
3471             mode = Lexer.PREFORMATTED;
3472         }
3473 
3474         while ((node = lexer.getToken(mode)) != null)
3475         {
3476             if (node.type == Node.END_TAG && node.element.equals(element.element))
3477             {
3478                 element.closed = true;
3479                 break;
3480             }
3481 
3482             // discard unexpected end tags
3483             if (node.type == Node.END_TAG)
3484             {
3485                 lexer.report.error(lexer, element, node, Report.UNEXPECTED_ENDTAG);
3486                 continue;
3487             }
3488 
3489             // parse content on seeing start tag
3490             if (node.type == Node.START_TAG)
3491             {
3492                 parseXMLElement(lexer, node, mode);
3493             }
3494 
3495             element.insertNodeAtEnd(node);
3496         }
3497 
3498         // if first child is text then trim initial space and delete text node if it is empty.
3499 
3500         node = element.content;
3501 
3502         if (node != null && node.type == Node.TEXT_NODE && mode != Lexer.PREFORMATTED)
3503         {
3504             if (node.textarray[node.start] == (byte) ' ')
3505             {
3506                 node.start++;
3507 
3508                 if (node.start >= node.end)
3509                 {
3510                     Node.discardElement(node);
3511                 }
3512             }
3513         }
3514 
3515         // if last child is text then trim final space and delete the text node if it is empty
3516 
3517         node = element.last;
3518 
3519         if (node != null && node.type == Node.TEXT_NODE && mode != Lexer.PREFORMATTED)
3520         {
3521             if (node.textarray[node.end - 1] == (byte) ' ')
3522             {
3523                 node.end--;
3524 
3525                 if (node.start >= node.end)
3526                 {
3527                     Node.discardElement(node);
3528                 }
3529             }
3530         }
3531     }
3532 
3533     public static Node parseXMLDocument(Lexer lexer)
3534     {
3535         Node node, document, doctype;
3536 
3537         document = lexer.newNode();
3538         document.type = Node.ROOT_NODE;
3539         doctype = null;
3540         lexer.configuration.xmlTags = true;
3541 
3542         while ((node = lexer.getToken(Lexer.IGNORE_WHITESPACE)) != null)
3543         {
3544             // discard unexpected end tags
3545             if (node.type == Node.END_TAG)
3546             {
3547                 lexer.report.warning(lexer, null, node, Report.UNEXPECTED_ENDTAG);
3548                 continue;
3549             }
3550 
3551             // deal with comments etc.
3552             if (Node.insertMisc(document, node))
3553             {
3554                 continue;
3555             }
3556 
3557             if (node.type == Node.DOCTYPE_TAG)
3558             {
3559                 if (doctype == null)
3560                 {
3561                     document.insertNodeAtEnd(node);
3562                     doctype = node;
3563                 }
3564                 else
3565                 {
3566                     lexer.report.warning(lexer, document, node, Report.DISCARDING_UNEXPECTED); // TODO
3567                 }
3568                 continue;
3569             }
3570 
3571             if (node.type == Node.START_END_TAG)
3572             {
3573                 document.insertNodeAtEnd(node);
3574                 continue;
3575             }
3576 
3577             // if start tag then parse element's content
3578             if (node.type == Node.START_TAG)
3579             {
3580                 document.insertNodeAtEnd(node);
3581                 parseXMLElement(lexer, node, Lexer.IGNORE_WHITESPACE);
3582             }
3583 
3584         }
3585 
3586         if (doctype != null && !lexer.checkDocTypeKeyWords(doctype))
3587         {
3588             lexer.report.warning(lexer, doctype, null, Report.DTYPE_NOT_UPPER_CASE);
3589         }
3590 
3591         // ensure presence of initial <?XML version="1.0"?>
3592         if (lexer.configuration.xmlPi)
3593         {
3594             lexer.fixXmlDecl(document);
3595         }
3596 
3597         return document;
3598     }
3599 
3600     /**
3601      * errors in positioning of form start or end tags generally require human intervention to fix.
3602      */
3603     static void badForm(Lexer lexer)
3604     {
3605         lexer.badForm = 1;
3606         lexer.errors++;
3607     }
3608 
3609 }