View Javadoc

1   /*
2    *  Java HTML Tidy - JTidy
3    *  HTML parser and pretty printer
4    *
5    *  Copyright (c) 1998-2000 World Wide Web Consortium (Massachusetts
6    *  Institute of Technology, Institut National de Recherche en
7    *  Informatique et en Automatique, Keio University). All Rights
8    *  Reserved.
9    *
10   *  Contributing Author(s):
11   *
12   *     Dave Raggett <dsr@w3.org>
13   *     Andy Quick <ac.quick@sympatico.ca> (translation to Java)
14   *     Gary L Peskin <garyp@firstech.com> (Java development)
15   *     Sami Lempinen <sami@lempinen.net> (release management)
16   *     Fabrizio Giustina <fgiust at users.sourceforge.net>
17   *
18   *  The contributing author(s) would like to thank all those who
19   *  helped with testing, bug fixes, and patience.  This wouldn't
20   *  have been possible without all of you.
21   *
22   *  COPYRIGHT NOTICE:
23   * 
24   *  This software and documentation is provided "as is," and
25   *  the copyright holders and contributing author(s) make no
26   *  representations or warranties, express or implied, including
27   *  but not limited to, warranties of merchantability or fitness
28   *  for any particular purpose or that the use of the software or
29   *  documentation will not infringe any third party patents,
30   *  copyrights, trademarks or other rights. 
31   *
32   *  The copyright holders and contributing author(s) will not be
33   *  liable for any direct, indirect, special or consequential damages
34   *  arising out of any use of the software or documentation, even if
35   *  advised of the possibility of such damage.
36   *
37   *  Permission is hereby granted to use, copy, modify, and distribute
38   *  this source code, or portions hereof, documentation and executables,
39   *  for any purpose, without fee, subject to the following restrictions:
40   *
41   *  1. The origin of this source code must not be misrepresented.
42   *  2. Altered versions must be plainly marked as such and must
43   *     not be misrepresented as being the original source.
44   *  3. This Copyright notice may not be removed or altered from any
45   *     source or altered source distribution.
46   * 
47   *  The copyright holders and contributing author(s) specifically
48   *  permit, without fee, and encourage the use of this source code
49   *  as a component for supporting the Hypertext Markup Language in
50   *  commercial products. If you use this source code in a product,
51   *  acknowledgment is not required but would be appreciated.
52   *
53   */
54  package org.w3c.tidy;
55  
56  /***
57   * HTML Parser implementation.
58   * @author Dave Raggett <a href="mailto:dsr@w3.org">dsr@w3.org </a>
59   * @author Andy Quick <a href="mailto:ac.quick@sympatico.ca">ac.quick@sympatico.ca </a> (translation to Java)
60   * @author Fabrizio Giustina
61   * @version $Revision: 1.53 $ ($Author: fgiust $)
62   */
63  public final class ParserImpl
64  {
65  
66      /***
67       * Shared parser for the html element; an instance of {@link ParseHTML}.
68       */
69      public static final Parser HTML = new ParseHTML();
70  
71      /***
72       * Shared parser for the head element; an instance of {@link ParseHead}.
73       */
74      public static final Parser HEAD = new ParseHead();
75  
76      /***
77       * Shared parser for the title element; an instance of {@link ParseTitle}.
78       */
79      public static final Parser TITLE = new ParseTitle();
80  
81      /***
82       * Shared parser for the script element; an instance of {@link ParseScript}.
83       */
84      public static final Parser SCRIPT = new ParseScript();
85  
86      /***
87       * Shared parser for the body element; an instance of {@link ParseBody}.
88       */
89      public static final Parser BODY = new ParseBody();
90  
91      /***
92       * Shared parser for the frameset element; an instance of ParseFrameSet.
93       */
94      public static final Parser FRAMESET = new ParseFrameSet();
95  
96      /***
97       * Shared parser for inline elements; an instance of ParseInline.
98       */
99      public static final Parser INLINE = new ParseInline();
100 
101     /***
102      * Shared parser for lists; an instance of ParseList.
103      */
104     public static final Parser LIST = new ParseList();
105 
106     /***
107      * Shared parser for definition lists; an instance of ParseDefList.
108      */
109     public static final Parser DEFLIST = new ParseDefList();
110 
111     /***
112      * Shared parser for the pre element; an instance of ParsePre.
113      */
114     public static final Parser PRE = new ParsePre();
115 
116     /***
117      * Shared parser for block elements; an instance of ParseBlock.
118      */
119     public static final Parser BLOCK = new ParseBlock();
120 
121     /***
122      * Shared parser for the table element; an instance of ParseTableTag.
123      */
124     public static final Parser TABLETAG = new ParseTableTag();
125 
126     /***
127      * Shared parser for the colgroup element; an instance of ParseColGroup.
128      */
129     public static final Parser COLGROUP = new ParseColGroup();
130 
131     /***
132      * Shared parser for row groups; an instance of ParseRowGroup.
133      */
134     public static final Parser ROWGROUP = new ParseRowGroup();
135 
136     /***
137      * Shared parser for table rows; an instance of ParseRow.
138      */
139     public static final Parser ROW = new ParseRow();
140 
141     /***
142      * Shared parser for the noframes element; an instance of ParseNoFrames.
143      */
144     public static final Parser NOFRAMES = new ParseNoFrames();
145 
146     /***
147      * Shared parser for the select element; an instance of ParseSelect.
148      */
149     public static final Parser SELECT = new ParseSelect();
150 
151     /***
152      * Shared parser for text content; an instance of ParseText.
153      */
154     public static final Parser TEXT = new ParseText();
155 
156     /***
157      * Shared parser for empty elements; an instance of ParseEmpty.
158      */
159     public static final Parser EMPTY = new ParseEmpty();
160 
161     /***
162      * Shared parser for the optgroup element; an instance of ParseOptGroup.
163      */
164     public static final Parser OPTGROUP = new ParseOptGroup();
165 
166     /***
167      * Private constructor: ParserImpl is a holder for static parsers and static helpers and is never instantiated.
168      */
169     private ParserImpl()
170     {
171         // intentionally empty - no instance state exists
172     }
173 
174     /***
175      * @param lexer
176      * @param node
177      * @param mode
178      */
179     protected static void parseTag(Lexer lexer, Node node, short mode)
180     {
181         // Fix by GLP 2000-12-21. Need to reset insertspace if this
182         // is both a non-inline and empty tag (base, link, meta, isindex, hr, area).
183         if ((node.tag.model & Dict.CM_EMPTY) != 0)
184         {
185             lexer.waswhite = false;
186         }
187         else if (!((node.tag.model & Dict.CM_INLINE) != 0))
188         {
189             lexer.insertspace = false;
190         }
191 
192         if (node.tag.getParser() == null)
193         {
194             return;
195         }
196 
197         if (node.type == Node.START_END_TAG)
198         {
199             Node.trimEmptyElement(lexer, node);
200             return;
201         }
202 
203         node.tag.getParser().parse(lexer, node, mode);
204     }
205 
206     /***
207      * Move node to the head, where element is used as starting point in hunt for head. Normally called during parsing.
208      * @param lexer
209      * @param element
210      * @param node
211      */
212     protected static void moveToHead(Lexer lexer, Node element, Node node)
213     {
214         Node head;
215         node.removeNode(); // make sure that node is isolated
216 
217         TagTable tt = lexer.configuration.tt;
218 
219         if (node.type == Node.START_TAG || node.type == Node.START_END_TAG)
220         {
221             lexer.report.warning(lexer, element, node, Report.TAG_NOT_ALLOWED_IN);
222 
223             while (element.tag != tt.tagHtml)
224             {
225                 element = element.parent;
226             }
227 
228             for (head = element.content; head != null; head = head.next)
229             {
230                 if (head.tag == tt.tagHead)
231                 {
232                     head.insertNodeAtEnd(node);
233                     break;
234                 }
235             }
236 
237             if (node.tag.getParser() != null)
238             {
239                 parseTag(lexer, node, Lexer.IGNORE_WHITESPACE);
240             }
241         }
242         else
243         {
244             lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
245         }
246     }
247 
248     /***
249      * moves given node to end of body element.
250      * @param lexer Lexer
251      * @param node Node to insert
252      */
253     static void moveNodeToBody(Lexer lexer, Node node)
254     {
255         node.removeNode();
256         Node body = lexer.root.findBody(lexer.configuration.tt);
257         body.insertNodeAtEnd(node);
258     }
259 
260     /***
261      * Parser for HTML.
     * Builds the children of the html element: first a head (real or inferred), then either a body (real or
     * inferred) or one or more framesets, with stray content routed into a noframes element in the latter case.
262      */
263     public static class ParseHTML implements Parser
264     {
265 
266         /***
         * Parses the content of <code>html</code>. Runs in two phases: the first loop finds or infers the head
         * and parses it with {@link ParserImpl#HEAD}; the second loop dispatches on each following token until a
         * body is selected or the input is exhausted.
         * @param lexer token source; its <code>seenEndBody</code>, <code>seenEndHtml</code> and
         * <code>configuration.xmlTags</code> fields are written here
         * @param html the html element whose content is being built
         * @param mode lexer mode forwarded to the head, body, frameset and noframes parsers
267          * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
268          */
269         public void parse(Lexer lexer, Node html, short mode)
270         {
271             Node node, head;
                // first frameset seen / the noframes element that collects non-frame content
272             Node frameset = null;
273             Node noframes = null;
274 
                // clear the xmlTags configuration flag and the seenEndBody marker before reading any token
275             lexer.configuration.xmlTags = false;
276             lexer.seenEndBody = false;
277             TagTable tt = lexer.configuration.tt;
278 
                // phase 1: find the head, or infer one as soon as anything other than misc content shows up
279             while (true)
280             {
281                 node = lexer.getToken(Lexer.IGNORE_WHITESPACE);
282 
283                 if (node == null)
284                 {
                        // input ended before any head was seen
285                     node = lexer.inferredTag("head");
286                     break;
287                 }
288 
289                 if (node.tag == tt.tagHead)
290                 {
291                     break;
292                 }
293 
                    // an </html> before the head is reported and dropped
294                 if (node.tag == html.tag && node.type == Node.END_TAG)
295                 {
296                     lexer.report.warning(lexer, html, node, Report.DISCARDING_UNEXPECTED);
297                     continue;
298                 }
299 
300                 // deal with comments etc.
301                 if (Node.insertMisc(html, node))
302                 {
303                     continue;
304                 }
305 
                    // any other token: push it back and infer a head in front of it
306                 lexer.ungetToken();
307                 node = lexer.inferredTag("head");
308                 break;
309             }
310 
311             head = node;
312             html.insertNodeAtEnd(head);
313             HEAD.parse(lexer, head, mode);
314 
                // phase 2: decide between body and frameset content; leaving the loop via break means
                // "node is the body to parse", leaving via return means the input is exhausted
315             while (true)
316             {
317                 node = lexer.getToken(Lexer.IGNORE_WHITESPACE);
318 
319                 if (node == null)
320                 {
                        // end of input: only a document without frameset gets an inferred body
321                     if (frameset == null)
322                     {
323                         // implied body
324                         node = lexer.inferredTag("body");
325                         html.insertNodeAtEnd(node);
326                         BODY.parse(lexer, node, mode);
327                     }
328 
329                     return;
330                 }
331 
332                 // robustly handle html tags
                    // html start tags are dropped silently; without a frameset every other html token is
                    // reported as discarded; with a frameset an end tag records seenEndHtml
333                 if (node.tag == html.tag)
334                 {
335                     if (node.type != Node.START_TAG && frameset == null)
336                     {
337                         lexer.report.warning(lexer, html, node, Report.DISCARDING_UNEXPECTED);
338                     }
339                     else if (node.type == Node.END_TAG)
340                     {
341                         lexer.seenEndHtml = true;
342                     }
343 
344                     continue;
345                 }
346 
347                 // deal with comments etc.
348                 if (Node.insertMisc(html, node))
349                 {
350                     continue;
351                 }
352 
353                 // if frameset document coerce <body> to <noframes>
354                 if (node.tag == tt.tagBody)
355                 {
356                     if (node.type != Node.START_TAG)
357                     {
358                         lexer.report.warning(lexer, html, node, Report.DISCARDING_UNEXPECTED);
359                         continue;
360                     }
361 
362                     if (frameset != null)
363                     {
                            // push <body> back so that it is read again while parsing noframes
364                         lexer.ungetToken();
365 
366                         if (noframes == null)
367                         {
368                             noframes = lexer.inferredTag("noframes");
369                             frameset.insertNodeAtEnd(noframes);
370                             lexer.report.warning(lexer, html, noframes, Report.INSERTING_TAG);
371                         }
372 
373                         parseTag(lexer, noframes, mode);
374                         continue;
375                     }
376 
                        // NOTE(review): presumably rules out the frameset DTD for this document - confirm
                        // against Lexer.constrainVersion
377                     lexer.constrainVersion(~Dict.VERS_FRAMESET);
378                     break; // to parse body
379                 }
380 
381                 // flag an error if we see more than one frameset
382                 if (node.tag == tt.tagFrameset)
383                 {
384                     if (node.type != Node.START_TAG)
385                     {
386                         lexer.report.warning(lexer, html, node, Report.DISCARDING_UNEXPECTED);
387                         continue;
388                     }
389 
                        // a duplicate is reported but still inserted and parsed below; "frameset" keeps
                        // pointing at the first one
390                     if (frameset != null)
391                     {
392                         lexer.report.error(lexer, html, node, Report.DUPLICATE_FRAMESET);
393                     }
394                     else
395                     {
396                         frameset = node;
397                     }
398 
399                     html.insertNodeAtEnd(node);
400                     parseTag(lexer, node, mode);
401 
402                     // see if it includes a noframes element so that we can merge subsequent noframes elements
403 
                        // scans the children of the first frameset (not of a duplicate); the last noframes
                        // child found wins. "node" is reused as the loop cursor.
404                     for (node = frameset.content; node != null; node = node.next)
405                     {
406                         if (node.tag == tt.tagNoframes)
407                         {
408                             noframes = node;
409                         }
410                     }
411                     continue;
412                 }
413 
414                 // if not a frameset document coerce <noframes> to <body>
415                 if (node.tag == tt.tagNoframes)
416                 {
417                     if (node.type != Node.START_TAG)
418                     {
419                         lexer.report.warning(lexer, html, node, Report.DISCARDING_UNEXPECTED);
420                         continue;
421                     }
422 
423                     if (frameset == null)
424                     {
                            // the <noframes> token itself is discarded and replaced by an inferred body
425                         lexer.report.warning(lexer, html, node, Report.DISCARDING_UNEXPECTED);
426                         node = lexer.inferredTag("body");
427                         break;
428                     }
429 
                        // only the first noframes is attached to the frameset; a later <noframes> start tag
                        // just re-enters parsing of that first element
430                     if (noframes == null)
431                     {
432                         noframes = node;
433                         frameset.insertNodeAtEnd(noframes);
434                     }
435 
436                     parseTag(lexer, noframes, mode);
437                     continue;
438                 }
439 
440                 if (node.type == Node.START_TAG || node.type == Node.START_END_TAG)
441                 {
                        // elements whose content model includes CM_HEAD are relocated into the head
442                     if (node.tag != null && (node.tag.model & Dict.CM_HEAD) != 0)
443                     {
444                         moveToHead(lexer, html, node);
445                         continue;
446                     }
447 
448                     // #427675 - discard illegal frame element following a frameset - fix by Randy Waki 11 Oct 00
449                     if (frameset != null && node.tag == tt.tagFrame)
450                     {
451                         lexer.report.warning(lexer, html, node, Report.DISCARDING_UNEXPECTED);
452                         continue;
453                     }
454                 }
455 
                    // everything else is pushed back and becomes content of noframes or of an inferred body
456                 lexer.ungetToken();
457 
458                 // insert other content into noframes element
459                 if (frameset != null)
460                 {
                        // NOFRAMES_CONTENT is only reported when a noframes element already existed
461                     if (noframes == null)
462                     {
463                         noframes = lexer.inferredTag("noframes");
464                         frameset.insertNodeAtEnd(noframes);
465                     }
466                     else
467                     {
468                         lexer.report.warning(lexer, html, node, Report.NOFRAMES_CONTENT);
469                     }
470 
471                     lexer.constrainVersion(Dict.VERS_FRAMESET);
472                     parseTag(lexer, noframes, mode);
473                     continue;
474                 }
475 
476                 node = lexer.inferredTag("body");
477                 lexer.constrainVersion(~Dict.VERS_FRAMESET);
478                 break;
479             }
480 
481             // node must be body
                // once the body has been parsed the html element is marked as ended
482             html.insertNodeAtEnd(node);
483             parseTag(lexer, node, mode);
484             lexer.seenEndHtml = true;
485         }
486 
487     }
488 
489     /***
490      * Parser for HEAD.
491      */
492     public static class ParseHead implements Parser
493     {
494 
495         /***
496          * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
497          */
498         public void parse(Lexer lexer, Node head, short mode)
499         {
500             Node node;
501             int hasTitle = 0;
502             int hasBase = 0;
503             TagTable tt = lexer.configuration.tt;
504 
505             while ((node = lexer.getToken(Lexer.IGNORE_WHITESPACE)) != null)
506             {
507                 if (node.tag == head.tag && node.type == Node.END_TAG)
508                 {
509                     head.closed = true;
510                     break;
511                 }
512 
513                 if (node.type == Node.TEXT_NODE)
514                 {
515                     lexer.report.warning(lexer, head, node, Report.TAG_NOT_ALLOWED_IN);
516                     lexer.ungetToken();
517                     break;
518                 }
519 
520                 // deal with comments etc.
521                 if (Node.insertMisc(head, node))
522                 {
523                     continue;
524                 }
525 
526                 if (node.type == Node.DOCTYPE_TAG)
527                 {
528                     Node.insertDocType(lexer, head, node);
529                     continue;
530                 }
531 
532                 // discard unknown tags
533                 if (node.tag == null)
534                 {
535                     lexer.report.warning(lexer, head, node, Report.DISCARDING_UNEXPECTED);
536                     continue;
537                 }
538 
539                 if (!TidyUtils.toBoolean(node.tag.model & Dict.CM_HEAD))
540                 {
541                     // #545067 Implicit closing of head broken - warn only for XHTML input
542                     if (lexer.isvoyager)
543                     {
544                         lexer.report.warning(lexer, head, node, Report.TAG_NOT_ALLOWED_IN);
545                     }
546                     lexer.ungetToken();
547                     break;
548                 }
549 
550                 if (node.type == Node.START_TAG || node.type == Node.START_END_TAG)
551                 {
552                     if (node.tag == tt.tagTitle)
553                     {
554                         ++hasTitle;
555 
556                         if (hasTitle > 1)
557                         {
558                             lexer.report.warning(lexer, head, node, Report.TOO_MANY_ELEMENTS);
559                         }
560                     }
561                     else if (node.tag == tt.tagBase)
562                     {
563                         ++hasBase;
564 
565                         if (hasBase > 1)
566                         {
567                             lexer.report.warning(lexer, head, node, Report.TOO_MANY_ELEMENTS);
568                         }
569                     }
570                     else if (node.tag == tt.tagNoscript)
571                     {
572                         lexer.report.warning(lexer, head, node, Report.TAG_NOT_ALLOWED_IN);
573                     }
574 
575                     head.insertNodeAtEnd(node);
576                     parseTag(lexer, node, Lexer.IGNORE_WHITESPACE);
577                     continue;
578                 }
579 
580                 // discard unexpected text nodes and end tags
581                 lexer.report.warning(lexer, head, node, Report.DISCARDING_UNEXPECTED);
582             }
583 
584             if (hasTitle == 0)
585             {
586                 if (!lexer.configuration.bodyOnly)
587                 {
588                     lexer.report.warning(lexer, head, null, Report.MISSING_TITLE_ELEMENT);
589                 }
590                 head.insertNodeAtEnd(lexer.inferredTag("title"));
591             }
592         }
593 
594     }
595 
596     /***
597      * Parser for TITLE.
598      */
599     public static class ParseTitle implements Parser
600     {
601 
602         /***
603          * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
604          */
605         public void parse(Lexer lexer, Node title, short mode)
606         {
607             Node node;
608 
609             while ((node = lexer.getToken(Lexer.MIXED_CONTENT)) != null)
610             {
611                 // [438658] : Missing / in title endtag makes 2 titles
612                 if (node.tag == title.tag && node.type == Node.START_TAG)
613                 {
614                     lexer.report.warning(lexer, title, node, Report.COERCE_TO_ENDTAG);
615                     node.type = Node.END_TAG;
616                     continue;
617                 }
618                 else if (node.tag == title.tag && node.type == Node.END_TAG)
619                 {
620                     title.closed = true;
621                     Node.trimSpaces(lexer, title);
622                     return;
623                 }
624 
625                 if (node.type == Node.TEXT_NODE)
626                 {
627                     // only called for 1st child
628                     if (title.content == null)
629                     {
630                         Node.trimInitialSpace(lexer, title, node);
631                     }
632 
633                     if (node.start >= node.end)
634                     {
635                         continue;
636                     }
637 
638                     title.insertNodeAtEnd(node);
639                     continue;
640                 }
641 
642                 // deal with comments etc.
643                 if (Node.insertMisc(title, node))
644                 {
645                     continue;
646                 }
647 
648                 // discard unknown tags
649                 if (node.tag == null)
650                 {
651                     lexer.report.warning(lexer, title, node, Report.DISCARDING_UNEXPECTED);
652                     continue;
653                 }
654 
655                 // pushback unexpected tokens
656                 lexer.report.warning(lexer, title, node, Report.MISSING_ENDTAG_BEFORE);
657                 lexer.ungetToken();
658                 Node.trimSpaces(lexer, title);
659                 return;
660             }
661 
662             lexer.report.warning(lexer, title, node, Report.MISSING_ENDTAG_FOR);
663         }
664 
665     }
666 
667     /***
668      * Parser for SCRIPT.
669      */
670     public static class ParseScript implements Parser
671     {
672 
673         /***
674          * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
675          */
676         public void parse(Lexer lexer, Node script, short mode)
677         {
678             // This isn't quite right for CDATA content as it recognises tags within the content and parses them
679             // accordingly. This will unfortunately screw up scripts which include < + letter, < + !, < + ? or < + / +
680             // letter
681 
682             Node node = lexer.getCDATA(script);
683 
684             if (node != null)
685             {
686                 script.insertNodeAtEnd(node);
687             }
688         }
689 
690     }
691 
692     /***
693      * Parser for BODY.
694      */
695     public static class ParseBody implements Parser
696     {
697 
698         /***
699          * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
700          */
701         public void parse(Lexer lexer, Node body, short mode)
702         {
703             Node node;
704             boolean checkstack, iswhitenode;
705 
706             mode = Lexer.IGNORE_WHITESPACE;
707             checkstack = true;
708             TagTable tt = lexer.configuration.tt;
709 
710             Clean.bumpObject(lexer, body.parent);
711 
712             while ((node = lexer.getToken(mode)) != null)
713             {
714 
715                 // #538536 Extra endtags not detected
716                 if (node.tag == tt.tagHtml)
717                 {
718                     if (node.type == Node.START_TAG || node.type == Node.START_END_TAG || lexer.seenEndHtml)
719                     {
720                         lexer.report.warning(lexer, body, node, Report.DISCARDING_UNEXPECTED);
721                     }
722                     else
723                     {
724                         lexer.seenEndHtml = true;
725                     }
726 
727                     continue;
728                 }
729 
730                 if (lexer.seenEndBody
731                     && (node.type == Node.START_TAG || node.type == Node.END_TAG || node.type == Node.START_END_TAG))
732                 {
733                     lexer.report.warning(lexer, body, node, Report.CONTENT_AFTER_BODY);
734                 }
735 
736                 if (node.tag == body.tag && node.type == Node.END_TAG)
737                 {
738                     body.closed = true;
739                     Node.trimSpaces(lexer, body);
740                     lexer.seenEndBody = true;
741                     mode = Lexer.IGNORE_WHITESPACE;
742 
743                     if (body.parent.tag == tt.tagNoframes)
744                     {
745                         break;
746                     }
747 
748                     continue;
749                 }
750 
751                 if (node.tag == tt.tagNoframes)
752                 {
753                     if (node.type == Node.START_TAG)
754                     {
755                         body.insertNodeAtEnd(node);
756                         BLOCK.parse(lexer, node, mode);
757                         continue;
758                     }
759 
760                     if (node.type == Node.END_TAG && body.parent.tag == tt.tagNoframes)
761                     {
762                         Node.trimSpaces(lexer, body);
763                         lexer.ungetToken();
764                         break;
765                     }
766                 }
767 
768                 if ((node.tag == tt.tagFrame || node.tag == tt.tagFrameset) && body.parent.tag == tt.tagNoframes)
769                 {
770                     Node.trimSpaces(lexer, body);
771                     lexer.ungetToken();
772                     break;
773                 }
774 
775                 iswhitenode = false;
776 
777                 if (node.type == Node.TEXT_NODE
778                     && node.end <= node.start + 1
779                     && node.textarray[node.start] == (byte) ' ')
780                 {
781                     iswhitenode = true;
782                 }
783 
784                 // deal with comments etc.
785                 if (Node.insertMisc(body, node))
786                 {
787                     continue;
788                 }
789 
790                 // #538536 Extra endtags not detected
791                 // if (lexer.seenEndBody && !iswhitenode)
792                 // {
793                 //     lexer.seenEndBody = true;
794                 //     lexer.report.warning(lexer, body, node, Report.CONTENT_AFTER_BODY);
795                 // }
796 
797                 // mixed content model permits text
798                 if (node.type == Node.TEXT_NODE)
799                 {
800                     if (iswhitenode && mode == Lexer.IGNORE_WHITESPACE)
801                     {
802                         continue;
803                     }
804 
805                     if (lexer.configuration.encloseBodyText && !iswhitenode)
806                     {
807                         Node para;
808 
809                         lexer.ungetToken();
810                         para = lexer.inferredTag("p");
811                         body.insertNodeAtEnd(para);
812                         parseTag(lexer, para, mode);
813                         mode = Lexer.MIXED_CONTENT;
814                         continue;
815                     }
816 
817                     //  HTML2 and HTML4 strict doesn't allow text here
818                     lexer.constrainVersion(~(Dict.VERS_HTML40_STRICT | Dict.VERS_HTML20));
819 
820                     if (checkstack)
821                     {
822                         checkstack = false;
823 
824                         if (lexer.inlineDup(node) > 0)
825                         {
826                             continue;
827                         }
828                     }
829 
830                     body.insertNodeAtEnd(node);
831                     mode = Lexer.MIXED_CONTENT;
832                     continue;
833                 }
834 
835                 if (node.type == Node.DOCTYPE_TAG)
836                 {
837                     Node.insertDocType(lexer, body, node);
838                     continue;
839                 }
840                 // discard unknown and PARAM tags
841                 if (node.tag == null || node.tag == tt.tagParam)
842                 {
843                     lexer.report.warning(lexer, body, node, Report.DISCARDING_UNEXPECTED);
844                     continue;
845                 }
846 
847                 // Netscape allows LI and DD directly in BODY We infer UL or DL respectively and use this boolean to
848                 // exclude block-level elements so as to match Netscape's observed behaviour.
849 
850                 lexer.excludeBlocks = false;
851 
852                 if ((!((node.tag.model & Dict.CM_BLOCK) != 0) && !((node.tag.model & Dict.CM_INLINE) != 0))
853                     || node.tag == tt.tagInput)
854                 {
855                     // avoid this error message being issued twice
856                     if (!((node.tag.model & Dict.CM_HEAD) != 0))
857                     {
858                         lexer.report.warning(lexer, body, node, Report.TAG_NOT_ALLOWED_IN);
859                     }
860 
861                     if ((node.tag.model & Dict.CM_HTML) != 0)
862                     {
863                         // copy body attributes if current body was inferred
864                         if (node.tag == tt.tagBody && body.implicit && body.attributes == null)
865                         {
866                             body.attributes = node.attributes;
867                             node.attributes = null;
868                         }
869 
870                         continue;
871                     }
872 
873                     if ((node.tag.model & Dict.CM_HEAD) != 0)
874                     {
875                         moveToHead(lexer, body, node);
876                         continue;
877                     }
878 
879                     if ((node.tag.model & Dict.CM_LIST) != 0)
880                     {
881                         lexer.ungetToken();
882                         node = lexer.inferredTag("ul");
883                         node.addClass("noindent");
884                         lexer.excludeBlocks = true;
885                     }
886                     else if ((node.tag.model & Dict.CM_DEFLIST) != 0)
887                     {
888                         lexer.ungetToken();
889                         node = lexer.inferredTag("dl");
890                         lexer.excludeBlocks = true;
891                     }
892                     else if ((node.tag.model & (Dict.CM_TABLE | Dict.CM_ROWGRP | Dict.CM_ROW)) != 0)
893                     {
894                         lexer.ungetToken();
895                         node = lexer.inferredTag("table");
896                         lexer.excludeBlocks = true;
897                     }
898                     else if (node.tag == tt.tagInput)
899                     {
900                         lexer.ungetToken();
901                         node = lexer.inferredTag("form");
902                         lexer.excludeBlocks = true;
903                     }
904                     else
905                     {
906                         if (!((node.tag.model & (Dict.CM_ROW | Dict.CM_FIELD)) != 0))
907                         {
908                             lexer.ungetToken();
909                             return;
910                         }
911 
912                         // ignore </td></th> <option> etc.
913                         continue;
914                     }
915                 }
916 
917                 if (node.type == Node.END_TAG)
918                 {
919                     if (node.tag == tt.tagBr)
920                     {
921                         node.type = Node.START_TAG;
922                     }
923                     else if (node.tag == tt.tagP)
924                     {
925                         Node.coerceNode(lexer, node, tt.tagBr);
926                         body.insertNodeAtEnd(node);
927                         node = lexer.inferredTag("br");
928                     }
929                     else if ((node.tag.model & Dict.CM_INLINE) != 0)
930                     {
931                         lexer.popInline(node);
932                     }
933                 }
934 
935                 if (node.type == Node.START_TAG || node.type == Node.START_END_TAG)
936                 {
937                     if (((node.tag.model & Dict.CM_INLINE) != 0) && !((node.tag.model & Dict.CM_MIXED) != 0))
938                     {
939                         // HTML4 strict doesn't allow inline content here
940                         // but HTML2 does allow img elements as children of body
941                         if (node.tag == tt.tagImg)
942                         {
943                             lexer.constrainVersion(~Dict.VERS_HTML40_STRICT);
944                         }
945                         else
946                         {
947                             lexer.constrainVersion(~(Dict.VERS_HTML40_STRICT | Dict.VERS_HTML20));
948                         }
949 
950                         if (checkstack && !node.implicit)
951                         {
952                             checkstack = false;
953 
954                             if (lexer.inlineDup(node) > 0)
955                             {
956                                 continue;
957                             }
958                         }
959 
960                         mode = Lexer.MIXED_CONTENT;
961                     }
962                     else
963                     {
964                         checkstack = true;
965                         mode = Lexer.IGNORE_WHITESPACE;
966                     }
967 
968                     if (node.implicit)
969                     {
970                         lexer.report.warning(lexer, body, node, Report.INSERTING_TAG);
971                     }
972 
973                     body.insertNodeAtEnd(node);
974                     parseTag(lexer, node, mode);
975                     continue;
976                 }
977 
978                 // discard unexpected tags
979                 lexer.report.warning(lexer, body, node, Report.DISCARDING_UNEXPECTED);
980             }
981         }
982 
983     }
984 
985     /***
986      * Parser for FRAMESET.
987      */
988     public static class ParseFrameSet implements Parser
989     {
990 
991         /***
992          * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
993          */
994         public void parse(Lexer lexer, Node frameset, short mode)
995         {
996             Node node;
997             TagTable tt = lexer.configuration.tt;
998 
999             lexer.badAccess |= Report.USING_FRAMES;
1000 
1001             while ((node = lexer.getToken(Lexer.IGNORE_WHITESPACE)) != null)
1002             {
1003                 if (node.tag == frameset.tag && node.type == Node.END_TAG)
1004                 {
1005                     frameset.closed = true;
1006                     Node.trimSpaces(lexer, frameset);
1007                     return;
1008                 }
1009 
1010                 // deal with comments etc.
1011                 if (Node.insertMisc(frameset, node))
1012                 {
1013                     continue;
1014                 }
1015 
1016                 if (node.tag == null)
1017                 {
1018                     lexer.report.warning(lexer, frameset, node, Report.DISCARDING_UNEXPECTED);
1019                     continue;
1020                 }
1021 
1022                 if (node.type == Node.START_TAG || node.type == Node.START_END_TAG)
1023                 {
1024                     if (node.tag != null && (node.tag.model & Dict.CM_HEAD) != 0)
1025                     {
1026                         moveToHead(lexer, frameset, node);
1027                         continue;
1028                     }
1029                 }
1030 
1031                 if (node.tag == tt.tagBody)
1032                 {
1033                     lexer.ungetToken();
1034                     node = lexer.inferredTag("noframes");
1035                     lexer.report.warning(lexer, frameset, node, Report.INSERTING_TAG);
1036                 }
1037 
1038                 if (node.type == Node.START_TAG && (node.tag.model & Dict.CM_FRAMES) != 0)
1039                 {
1040                     frameset.insertNodeAtEnd(node);
1041                     lexer.excludeBlocks = false;
1042                     parseTag(lexer, node, Lexer.MIXED_CONTENT);
1043                     continue;
1044                 }
1045                 else if (node.type == Node.START_END_TAG && (node.tag.model & Dict.CM_FRAMES) != 0)
1046                 {
1047                     frameset.insertNodeAtEnd(node);
1048                     continue;
1049                 }
1050 
1051                 // discard unexpected tags
1052                 lexer.report.warning(lexer, frameset, node, Report.DISCARDING_UNEXPECTED);
1053             }
1054 
1055             lexer.report.warning(lexer, frameset, node, Report.MISSING_ENDTAG_FOR);
1056         }
1057 
1058     }
1059 
1060     /***
1061      * Parser for INLINE.
1062      */
1063     public static class ParseInline implements Parser
1064     {
1065 
1066         /***
1067          * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
1068          */
1069         public void parse(Lexer lexer, Node element, short mode)
1070         {
1071             Node node, parent;
1072             TagTable tt = lexer.configuration.tt;
1073 
1074             if (TidyUtils.toBoolean(element.tag.model & Dict.CM_EMPTY))
1075             {
1076                 return;
1077             }
1078 
1079             // ParseInline is used for some block level elements like H1 to H6 For such elements we need to insert
1080             // inline emphasis tags currently on the inline stack. For Inline elements, we normally push them onto the
1081             // inline stack provided they aren't implicit or OBJECT/APPLET. This test is carried out in PushInline and
1082             // PopInline, see istack.c We don't push SPAN to replicate current browser behavior
1083 
1084             if (TidyUtils.toBoolean(element.tag.model & Dict.CM_BLOCK) || (element.tag == tt.tagDt))
1085             {
1086                 lexer.inlineDup(null);
1087             }
1088             else if (TidyUtils.toBoolean(element.tag.model & Dict.CM_INLINE))
1089             {
1090                 // && element.tag != tt.tagSpan #540571 Inconsistent behaviour with span inline element
1091                 lexer.pushInline(element);
1092             }
1093 
1094             if (element.tag == tt.tagNobr)
1095             {
1096                 lexer.badLayout |= Report.USING_NOBR;
1097             }
1098             else if (element.tag == tt.tagFont)
1099             {
1100                 lexer.badLayout |= Report.USING_FONT;
1101             }
1102 
1103             // Inline elements may or may not be within a preformatted element
1104             if (mode != Lexer.PREFORMATTED)
1105             {
1106                 mode = Lexer.MIXED_CONTENT;
1107             }
1108 
1109             while ((node = lexer.getToken(mode)) != null)
1110             {
1111                 // end tag for current element
1112                 if (node.tag == element.tag && node.type == Node.END_TAG)
1113                 {
1114                     if (TidyUtils.toBoolean(element.tag.model & Dict.CM_INLINE))
1115                     {
1116                         lexer.popInline(node);
1117                     }
1118 
1119                     if (!TidyUtils.toBoolean(mode & Lexer.PREFORMATTED))
1120                     {
1121                         Node.trimSpaces(lexer, element);
1122                     }
1123 
1124                     // if a font element wraps an anchor and nothing else then move the font element inside the anchor
1125                     // since otherwise it won't alter the anchor text color
1126 
1127                     if (element.tag == tt.tagFont && element.content != null && element.content == element.last)
1128                     {
1129                         Node child = element.content;
1130 
1131                         if (child.tag == tt.tagA)
1132                         {
1133                             child.parent = element.parent;
1134                             child.next = element.next;
1135                             child.prev = element.prev;
1136 
1137                             if (child.prev != null)
1138                             {
1139                                 child.prev.next = child;
1140                             }
1141                             else
1142                             {
1143                                 child.parent.content = child;
1144                             }
1145 
1146                             if (child.next != null)
1147                             {
1148                                 child.next.prev = child;
1149                             }
1150                             else
1151                             {
1152                                 child.parent.last = child;
1153                             }
1154 
1155                             element.next = null;
1156                             element.prev = null;
1157                             element.parent = child;
1158                             element.content = child.content;
1159                             element.last = child.last;
1160                             child.content = element;
1161                             child.last = element;
1162                             for (child = element.content; child != null; child = child.next)
1163                             {
1164                                 child.parent = element;
1165                             }
1166                         }
1167                     }
1168                     element.closed = true;
1169                     Node.trimSpaces(lexer, element);
1170                     Node.trimEmptyElement(lexer, element);
1171                     return;
1172                 }
1173 
1174                 // <u> ... <u> map 2nd <u> to </u> if 1st is explicit
1175                 // otherwise emphasis nesting is probably unintentional
1176                 // big and small have cumulative effect to leave them alone
1177                 if (node.type == Node.START_TAG
1178                     && node.tag == element.tag
1179                     && lexer.isPushed(node)
1180                     && !node.implicit
1181                     && !element.implicit
1182                     && node.tag != null
1183                     && ((node.tag.model & Dict.CM_INLINE) != 0)
1184                     && node.tag != tt.tagA
1185                     && node.tag != tt.tagFont
1186                     && node.tag != tt.tagBig
1187                     && node.tag != tt.tagSmall
1188                     && node.tag != tt.tagQ)
1189                 {
1190                     if (element.content != null && node.attributes == null)
1191                     {
1192                         lexer.report.warning(lexer, element, node, Report.COERCE_TO_ENDTAG);
1193                         node.type = Node.END_TAG;
1194                         lexer.ungetToken();
1195                         continue;
1196                     }
1197 
1198                     lexer.report.warning(lexer, element, node, Report.NESTED_EMPHASIS);
1199                 }
1200                 else if (lexer.isPushed(node) && node.type == Node.START_TAG && node.tag == tt.tagQ)
1201                 {
1202                     lexer.report.warning(lexer, element, node, Report.NESTED_QUOTATION);
1203                 }
1204 
1205                 if (node.type == Node.TEXT_NODE)
1206                 {
1207                     // only called for 1st child
1208                     if (element.content == null && !TidyUtils.toBoolean(mode & Lexer.PREFORMATTED))
1209                     {
1210                         Node.trimSpaces(lexer, element);
1211                     }
1212 
1213                     if (node.start >= node.end)
1214                     {
1215                         continue;
1216                     }
1217 
1218                     element.insertNodeAtEnd(node);
1219                     continue;
1220                 }
1221 
1222                 // mixed content model so allow text
1223                 if (Node.insertMisc(element, node))
1224                 {
1225                     continue;
1226                 }
1227 
1228                 // deal with HTML tags
1229                 if (node.tag == tt.tagHtml)
1230                 {
1231                     if (node.type == Node.START_TAG || node.type == Node.START_END_TAG)
1232                     {
1233                         lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
1234                         continue;
1235                     }
1236 
1237                     // otherwise infer end of inline element
1238                     lexer.ungetToken();
1239                     if (!((mode & Lexer.PREFORMATTED) != 0))
1240                     {
1241                         Node.trimSpaces(lexer, element);
1242                     }
1243                     Node.trimEmptyElement(lexer, element);
1244                     return;
1245                 }
1246 
1247                 // within <dt> or <pre> map <p> to <br>
1248                 if (node.tag == tt.tagP
1249                     && node.type == Node.START_TAG
1250                     && ((mode & Lexer.PREFORMATTED) != 0 || element.tag == tt.tagDt || element.isDescendantOf(tt.tagDt)))
1251                 {
1252                     node.tag = tt.tagBr;
1253                     node.element = "br";
1254                     Node.trimSpaces(lexer, element);
1255                     element.insertNodeAtEnd(node);
1256                     continue;
1257                 }
1258 
1259                 // ignore unknown and PARAM tags
1260                 if (node.tag == null || node.tag == tt.tagParam)
1261                 {
1262                     lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
1263                     continue;
1264                 }
1265 
1266                 if (node.tag == tt.tagBr && node.type == Node.END_TAG)
1267                 {
1268                     node.type = Node.START_TAG;
1269                 }
1270 
1271                 if (node.type == Node.END_TAG)
1272                 {
1273                     // coerce </br> to <br>
1274                     if (node.tag == tt.tagBr)
1275                     {
1276                         node.type = Node.START_TAG;
1277                     }
1278                     else if (node.tag == tt.tagP)
1279                     {
1280                         // coerce unmatched </p> to <br><br>
1281                         if (!element.isDescendantOf(tt.tagP))
1282                         {
1283                             Node.coerceNode(lexer, node, tt.tagBr);
1284                             Node.trimSpaces(lexer, element);
1285                             element.insertNodeAtEnd(node);
1286                             node = lexer.inferredTag("br");
1287                             continue;
1288                         }
1289                     }
1290                     else if ((node.tag.model & Dict.CM_INLINE) != 0
1291                         && node.tag != tt.tagA
1292                         && !((node.tag.model & Dict.CM_OBJECT) != 0)
1293                         && (element.tag.model & Dict.CM_INLINE) != 0)
1294                     {
1295                         // allow any inline end tag to end current element
1296                         lexer.popInline(element);
1297 
1298                         if (element.tag != tt.tagA)
1299                         {
1300                             if (node.tag == tt.tagA && node.tag != element.tag)
1301                             {
1302                                 lexer.report.warning(lexer, element, node, Report.MISSING_ENDTAG_BEFORE);
1303                                 lexer.ungetToken();
1304                             }
1305                             else
1306                             {
1307                                 lexer.report.warning(lexer, element, node, Report.NON_MATCHING_ENDTAG);
1308                             }
1309 
1310                             if (!((mode & Lexer.PREFORMATTED) != 0))
1311                             {
1312                                 Node.trimSpaces(lexer, element);
1313                             }
1314                             Node.trimEmptyElement(lexer, element);
1315                             return;
1316                         }
1317 
1318                         // if parent is <a> then discard unexpected inline end tag
1319                         lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
1320                         continue;
1321                     } // special case </tr> etc. for stuff moved in front of table
1322                     else if (lexer.exiled && node.tag.model != 0 && (node.tag.model & Dict.CM_TABLE) != 0)
1323                     {
1324                         lexer.ungetToken();
1325                         Node.trimSpaces(lexer, element);
1326                         Node.trimEmptyElement(lexer, element);
1327                         return;
1328                     }
1329                 }
1330 
1331                 // allow any header tag to end current header
1332                 if ((node.tag.model & Dict.CM_HEADING) != 0 && (element.tag.model & Dict.CM_HEADING) != 0)
1333                 {
1334                     if (node.tag == element.tag)
1335                     {
1336                         lexer.report.warning(lexer, element, node, Report.NON_MATCHING_ENDTAG);
1337                     }
1338                     else
1339                     {
1340                         lexer.report.warning(lexer, element, node, Report.MISSING_ENDTAG_BEFORE);
1341                         lexer.ungetToken();
1342                     }
1343                     if (!((mode & Lexer.PREFORMATTED) != 0))
1344                     {
1345                         Node.trimSpaces(lexer, element);
1346                     }
1347                     Node.trimEmptyElement(lexer, element);
1348                     return;
1349                 }
1350 
1351                 // an <A> tag to ends any open <A> element but <A href=...> is mapped to </A><A href=...>
1352 
1353                 // #427827 - fix by Randy Waki and Bjoern Hoehrmann 23 Aug 00
1354                 // if (node.tag == tt.tagA && !node.implicit && lexer.isPushed(node))
1355                 if (node.tag == tt.tagA
1356                     && !node.implicit
1357                     && (element.tag == tt.tagA || element.isDescendantOf(tt.tagA)))
1358                 {
1359                     // coerce <a> to </a> unless it has some attributes
1360                     // #427827 - fix by Randy Waki and Bjoern Hoehrmann 23 Aug 00
1361                     // other fixes by Dave Raggett
1362                     // if (node.attributes == null)
1363                     if (node.type != Node.END_TAG && node.attributes == null)
1364                     {
1365                         node.type = Node.END_TAG;
1366                         lexer.report.warning(lexer, element, node, Report.COERCE_TO_ENDTAG);
1367                         // lexer.popInline(node);
1368                         lexer.ungetToken();
1369                         continue;
1370                     }
1371 
1372                     lexer.ungetToken();
1373                     lexer.report.warning(lexer, element, node, Report.MISSING_ENDTAG_BEFORE);
1374                     // lexer.popInline(element);
1375                     if (!((mode & Lexer.PREFORMATTED) != 0))
1376                     {
1377                         Node.trimSpaces(lexer, element);
1378                     }
1379                     Node.trimEmptyElement(lexer, element);
1380                     return;
1381                 }
1382 
1383                 if ((element.tag.model & Dict.CM_HEADING) != 0)
1384                 {
1385                     if (node.tag == tt.tagCenter || node.tag == tt.tagDiv)
1386                     {
1387                         if (node.type != Node.START_TAG && node.type != Node.START_END_TAG)
1388                         {
1389                             lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
1390                             continue;
1391                         }
1392 
1393                         lexer.report.warning(lexer, element, node, Report.TAG_NOT_ALLOWED_IN);
1394 
1395                         // insert center as parent if heading is empty
1396                         if (element.content == null)
1397                         {
1398                             Node.insertNodeAsParent(element, node);
1399                             continue;
1400                         }
1401 
1402                         // split heading and make center parent of 2nd part
1403                         element.insertNodeAfterElement(node);
1404 
1405                         if (!((mode & Lexer.PREFORMATTED) != 0))
1406                         {
1407                             Node.trimSpaces(lexer, element);
1408                         }
1409 
1410                         element = lexer.cloneNode(element);
1411                         element.start = lexer.lexsize;
1412                         element.end = lexer.lexsize;
1413                         node.insertNodeAtEnd(element);
1414                         continue;
1415                     }
1416 
1417                     if (node.tag == tt.tagHr)
1418                     {
1419                         if (node.type != Node.START_TAG && node.type != Node.START_END_TAG)
1420                         {
1421                             lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
1422                             continue;
1423                         }
1424 
1425                         lexer.report.warning(lexer, element, node, Report.TAG_NOT_ALLOWED_IN);
1426 
1427                         // insert hr before heading if heading is empty
1428                         if (element.content == null)
1429                         {
1430                             Node.insertNodeBeforeElement(element, node);
1431                             continue;
1432                         }
1433 
1434                         // split heading and insert hr before 2nd part
1435                         element.insertNodeAfterElement(node);
1436 
1437                         if (!((mode & Lexer.PREFORMATTED) != 0))
1438                         {
1439                             Node.trimSpaces(lexer, element);
1440                         }
1441 
1442                         element = lexer.cloneNode(element);
1443                         element.start = lexer.lexsize;
1444                         element.end = lexer.lexsize;
1445                         node.insertNodeAfterElement(element);
1446                         continue;
1447                     }
1448                 }
1449 
1450                 if (element.tag == tt.tagDt)
1451                 {
1452                     if (node.tag == tt.tagHr)
1453                     {
1454                         Node dd;
1455 
1456                         if (node.type != Node.START_TAG && node.type != Node.START_END_TAG)
1457                         {
1458                             lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
1459                             continue;
1460                         }
1461 
1462                         lexer.report.warning(lexer, element, node, Report.TAG_NOT_ALLOWED_IN);
1463                         dd = lexer.inferredTag("dd");
1464 
1465                         // insert hr within dd before dt if dt is empty
1466                         if (element.content == null)
1467                         {
1468                             Node.insertNodeBeforeElement(element, dd);
1469                             dd.insertNodeAtEnd(node);
1470                             continue;
1471                         }
1472 
1473                         // split dt and insert hr within dd before 2nd part
1474                         element.insertNodeAfterElement(dd);
1475                         dd.insertNodeAtEnd(node);
1476 
1477                         if (!((mode & Lexer.PREFORMATTED) != 0))
1478                         {
1479                             Node.trimSpaces(lexer, element);
1480                         }
1481 
1482                         element = lexer.cloneNode(element);
1483                         element.start = lexer.lexsize;
1484                         element.end = lexer.lexsize;
1485                         dd.insertNodeAfterElement(element);
1486                         continue;
1487                     }
1488                 }
1489 
1490                 // if this is the end tag for an ancestor element then infer end tag for this element
1491 
1492                 if (node.type == Node.END_TAG)
1493                 {
1494                     for (parent = element.parent; parent != null; parent = parent.parent)
1495                     {
1496                         if (node.tag == parent.tag)
1497                         {
1498                             if (!((element.tag.model & Dict.CM_OPT) != 0) && !element.implicit)
1499                             {
1500                                 lexer.report.warning(lexer, element, node, Report.MISSING_ENDTAG_BEFORE);
1501                             }
1502 
1503                             if (element.tag == tt.tagA)
1504                             {
1505                                 lexer.popInline(element);
1506                             }
1507 
1508                             lexer.ungetToken();
1509 
1510                             if (!((mode & Lexer.PREFORMATTED) != 0))
1511                             {
1512                                 Node.trimSpaces(lexer, element);
1513                             }
1514 
1515                             Node.trimEmptyElement(lexer, element);
1516                             return;
1517                         }
1518                     }
1519                 }
1520 
1521                 // block level tags end this element
1522                 if (!((node.tag.model & Dict.CM_INLINE) != 0))
1523                 {
1524                     if (node.type != Node.START_TAG)
1525                     {
1526                         lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
1527                         continue;
1528                     }
1529 
1530                     if (!((element.tag.model & Dict.CM_OPT) != 0))
1531                     {
1532                         lexer.report.warning(lexer, element, node, Report.MISSING_ENDTAG_BEFORE);
1533                     }
1534 
1535                     if ((node.tag.model & Dict.CM_HEAD) != 0 && !((node.tag.model & Dict.CM_BLOCK) != 0))
1536                     {
1537                         moveToHead(lexer, element, node);
1538                         continue;
1539                     }
1540 
1541                     // prevent anchors from propagating into block tags except for headings h1 to h6
1542 
1543                     if (element.tag == tt.tagA)
1544                     {
1545                         if (node.tag != null && !((node.tag.model & Dict.CM_HEADING) != 0))
1546                         {
1547                             lexer.popInline(element);
1548                         }
1549                         else if (!(element.content != null))
1550                         {
1551                             Node.discardElement(element);
1552                             lexer.ungetToken();
1553                             return;
1554                         }
1555                     }
1556 
1557                     lexer.ungetToken();
1558 
1559                     if (!((mode & Lexer.PREFORMATTED) != 0))
1560                     {
1561                         Node.trimSpaces(lexer, element);
1562                     }
1563 
1564                     Node.trimEmptyElement(lexer, element);
1565                     return;
1566                 }
1567 
1568                 // parse inline element
1569                 if (node.type == Node.START_TAG || node.type == Node.START_END_TAG)
1570                 {
1571                     if (node.implicit)
1572                     {
1573                         lexer.report.warning(lexer, element, node, Report.INSERTING_TAG);
1574                     }
1575 
1576                     // trim white space before <br>
1577                     if (node.tag == tt.tagBr)
1578                     {
1579                         Node.trimSpaces(lexer, element);
1580                     }
1581 
1582                     element.insertNodeAtEnd(node);
1583                     parseTag(lexer, node, mode);
1584                     continue;
1585                 }
1586 
1587                 // discard unexpected tags
1588                 lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
1589                 continue;
1590             }
1591 
1592             if (!((element.tag.model & Dict.CM_OPT) != 0))
1593             {
1594                 lexer.report.warning(lexer, element, node, Report.MISSING_ENDTAG_FOR);
1595             }
1596 
1597             Node.trimEmptyElement(lexer, element);
1598         }
1599     }
1600 
1601     /***
1602      * Parser for LIST.
1603      */
1604     public static class ParseList implements Parser
1605     {
1606 
1607         public void parse(Lexer lexer, Node list, short mode)
1608         {
1609             Node node;
1610             Node parent;
1611             TagTable tt = lexer.configuration.tt;
1612 
1613             if ((list.tag.model & Dict.CM_EMPTY) != 0)
1614             {
1615                 return;
1616             }
1617 
1618             lexer.insert = -1; // defer implicit inline start tags
1619 
1620             while ((node = lexer.getToken(Lexer.IGNORE_WHITESPACE)) != null)
1621             {
1622                 if (node.tag == list.tag && node.type == Node.END_TAG)
1623                 {
1624                     if ((list.tag.model & Dict.CM_OBSOLETE) != 0)
1625                     {
1626                         Node.coerceNode(lexer, list, tt.tagUl);
1627                     }
1628 
1629                     list.closed = true;
1630                     Node.trimEmptyElement(lexer, list);
1631                     return;
1632                 }
1633 
1634                 // deal with comments etc.
1635                 if (Node.insertMisc(list, node))
1636                 {
1637                     continue;
1638                 }
1639 
1640                 if (node.type != Node.TEXT_NODE && node.tag == null)
1641                 {
1642                     lexer.report.warning(lexer, list, node, Report.DISCARDING_UNEXPECTED);
1643                     continue;
1644                 }
1645 
1646                 // if this is the end tag for an ancestor element then infer end tag for this element
1647 
1648                 if (node.type == Node.END_TAG)
1649                 {
1650                     if (node.tag == tt.tagForm)
1651                     {
1652                         badForm(lexer);
1653                         lexer.report.warning(lexer, list, node, Report.DISCARDING_UNEXPECTED);
1654                         continue;
1655                     }
1656 
1657                     if (node.tag != null && (node.tag.model & Dict.CM_INLINE) != 0)
1658                     {
1659                         lexer.report.warning(lexer, list, node, Report.DISCARDING_UNEXPECTED);
1660                         lexer.popInline(node);
1661                         continue;
1662                     }
1663 
1664                     for (parent = list.parent; parent != null; parent = parent.parent)
1665                     {
1666                         if (node.tag == parent.tag)
1667                         {
1668                             lexer.report.warning(lexer, list, node, Report.MISSING_ENDTAG_BEFORE);
1669                             lexer.ungetToken();
1670 
1671                             if ((list.tag.model & Dict.CM_OBSOLETE) != 0)
1672                             {
1673                                 Node.coerceNode(lexer, list, tt.tagUl);
1674                             }
1675 
1676                             Node.trimEmptyElement(lexer, list);
1677                             return;
1678                         }
1679                     }
1680 
1681                     lexer.report.warning(lexer, list, node, Report.DISCARDING_UNEXPECTED);
1682                     continue;
1683                 }
1684 
1685                 if (node.tag != tt.tagLi)
1686                 {
1687                     lexer.ungetToken();
1688 
1689                     if (node.tag != null && (node.tag.model & Dict.CM_BLOCK) != 0 && lexer.excludeBlocks)
1690                     {
1691                         lexer.report.warning(lexer, list, node, Report.MISSING_ENDTAG_BEFORE);
1692                         Node.trimEmptyElement(lexer, list);
1693                         return;
1694                     }
1695 
1696                     node = lexer.inferredTag("li");
1697                     node.addAttribute("style", "list-style: none");
1698                     lexer.report.warning(lexer, list, node, Report.MISSING_STARTTAG);
1699                 }
1700 
1701                 // node should be <LI>
1702                 list.insertNodeAtEnd(node);
1703                 parseTag(lexer, node, Lexer.IGNORE_WHITESPACE);
1704             }
1705 
1706             if ((list.tag.model & Dict.CM_OBSOLETE) != 0)
1707             {
1708                 Node.coerceNode(lexer, list, tt.tagUl);
1709             }
1710 
1711             lexer.report.warning(lexer, list, node, Report.MISSING_ENDTAG_FOR);
1712             Node.trimEmptyElement(lexer, list);
1713         }
1714 
1715     }
1716 
1717     /***
1718      * Parser for empty elements.
1719      */
1720     public static class ParseEmpty implements Parser
1721     {
1722 
1723         /***
1724          * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
1725          */
1726         public void parse(Lexer lexer, Node element, short mode)
1727         {
1728             if (lexer.isvoyager)
1729             {
1730                 Node node = lexer.getToken(mode);
1731                 if (node != null && !(node.type == Node.END_TAG && node.tag == element.tag))
1732                 {
1733                     lexer.report.warning(lexer, element, node, Report.ELEMENT_NOT_EMPTY);
1734                     lexer.ungetToken();
1735                 }
1736             }
1737         }
1738     }
1739 
1740     /***
1741      * Parser for DEFLIST.
1742      */
1743     public static class ParseDefList implements Parser
1744     {
1745 
1746         /***
1747          * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
1748          */
1749         public void parse(Lexer lexer, Node list, short mode)
1750         {
1751             Node node, parent;
1752             TagTable tt = lexer.configuration.tt;
1753 
1754             if ((list.tag.model & Dict.CM_EMPTY) != 0)
1755             {
1756                 return;
1757             }
1758 
1759             lexer.insert = -1; // defer implicit inline start tags
1760 
1761             while ((node = lexer.getToken(Lexer.IGNORE_WHITESPACE)) != null)
1762             {
1763                 if (node.tag == list.tag && node.type == Node.END_TAG)
1764                 {
1765                     list.closed = true;
1766                     Node.trimEmptyElement(lexer, list);
1767                     return;
1768                 }
1769 
1770                 // deal with comments etc.
1771                 if (Node.insertMisc(list, node))
1772                 {
1773                     continue;
1774                 }
1775 
1776                 if (node.type == Node.TEXT_NODE)
1777                 {
1778                     lexer.ungetToken();
1779                     node = lexer.inferredTag("dt");
1780                     lexer.report.warning(lexer, list, node, Report.MISSING_STARTTAG);
1781                 }
1782 
1783                 if (node.tag == null)
1784                 {
1785                     lexer.report.warning(lexer, list, node, Report.DISCARDING_UNEXPECTED);
1786                     continue;
1787                 }
1788 
1789                 // if this is the end tag for an ancestor element then infer end tag for this element
1790 
1791                 if (node.type == Node.END_TAG)
1792                 {
1793                     if (node.tag == tt.tagForm)
1794                     {
1795                         badForm(lexer);
1796                         lexer.report.warning(lexer, list, node, Report.DISCARDING_UNEXPECTED);
1797                         continue;
1798                     }
1799 
1800                     for (parent = list.parent; parent != null; parent = parent.parent)
1801                     {
1802                         if (node.tag == parent.tag)
1803                         {
1804                             lexer.report.warning(lexer, list, node, Report.MISSING_ENDTAG_BEFORE);
1805 
1806                             lexer.ungetToken();
1807                             Node.trimEmptyElement(lexer, list);
1808                             return;
1809                         }
1810                     }
1811                 }
1812 
1813                 // center in a dt or a dl breaks the dl list in two
1814                 if (node.tag == tt.tagCenter)
1815                 {
1816                     if (list.content != null)
1817                     {
1818                         list.insertNodeAfterElement(node);
1819                     }
1820                     else
1821                     {
1822                         // trim empty dl list
1823                         Node.insertNodeBeforeElement(list, node);
1824 
1825                         // #540296 tidy dumps with empty definition list
1826                         Node.discardElement(list);
1827                     }
1828 
1829                     // and parse contents of center
1830                     parseTag(lexer, node, mode);
1831 
1832                     // now create a new dl element
1833                     list = lexer.inferredTag("dl");
1834                     node.insertNodeAfterElement(list);
1835                     continue;
1836                 }
1837 
1838                 if (!(node.tag == tt.tagDt || node.tag == tt.tagDd))
1839                 {
1840                     lexer.ungetToken();
1841 
1842                     if (!((node.tag.model & (Dict.CM_BLOCK | Dict.CM_INLINE)) != 0))
1843                     {
1844                         lexer.report.warning(lexer, list, node, Report.TAG_NOT_ALLOWED_IN);
1845                         Node.trimEmptyElement(lexer, list);
1846                         return;
1847                     }
1848 
1849                     // if DD appeared directly in BODY then exclude blocks
1850                     if (!((node.tag.model & Dict.CM_INLINE) != 0) && lexer.excludeBlocks)
1851                     {
1852                         Node.trimEmptyElement(lexer, list);
1853                         return;
1854                     }
1855 
1856                     node = lexer.inferredTag("dd");
1857                     lexer.report.warning(lexer, list, node, Report.MISSING_STARTTAG);
1858                 }
1859 
1860                 if (node.type == Node.END_TAG)
1861                 {
1862                     lexer.report.warning(lexer, list, node, Report.DISCARDING_UNEXPECTED);
1863                     continue;
1864                 }
1865 
1866                 // node should be <DT> or <DD>
1867                 list.insertNodeAtEnd(node);
1868                 parseTag(lexer, node, Lexer.IGNORE_WHITESPACE);
1869             }
1870 
1871             lexer.report.warning(lexer, list, node, Report.MISSING_ENDTAG_FOR);
1872             Node.trimEmptyElement(lexer, list);
1873         }
1874 
1875     }
1876 
1877     /***
1878      * Parser for PRE.
1879      */
1880     public static class ParsePre implements Parser
1881     {
1882 
1883         /***
1884          * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
1885          */
1886         public void parse(Lexer lexer, Node pre, short mode)
1887         {
1888             Node node;
1889             TagTable tt = lexer.configuration.tt;
1890 
1891             if ((pre.tag.model & Dict.CM_EMPTY) != 0)
1892             {
1893                 return;
1894             }
1895 
1896             if ((pre.tag.model & Dict.CM_OBSOLETE) != 0)
1897             {
1898                 Node.coerceNode(lexer, pre, tt.tagPre);
1899             }
1900 
1901             lexer.inlineDup(null); // tell lexer to insert inlines if needed
1902 
1903             while ((node = lexer.getToken(Lexer.PREFORMATTED)) != null)
1904             {
1905                 if (node.tag == pre.tag && node.type == Node.END_TAG)
1906                 {
1907                     Node.trimSpaces(lexer, pre);
1908                     pre.closed = true;
1909                     Node.trimEmptyElement(lexer, pre);
1910                     return;
1911                 }
1912 
1913                 if (node.tag == tt.tagHtml)
1914                 {
1915                     if (node.type == Node.START_TAG || node.type == Node.START_END_TAG)
1916                     {
1917                         lexer.report.warning(lexer, pre, node, Report.DISCARDING_UNEXPECTED);
1918                     }
1919 
1920                     continue;
1921                 }
1922 
1923                 if (node.type == Node.TEXT_NODE)
1924                 {
1925                     // if first check for inital newline
1926                     if (pre.content == null)
1927                     {
1928                         if (node.textarray[node.start] == (byte) '\n')
1929                         {
1930                             ++node.start;
1931                         }
1932 
1933                         if (node.start >= node.end)
1934                         {
1935                             continue;
1936                         }
1937                     }
1938 
1939                     pre.insertNodeAtEnd(node);
1940                     continue;
1941                 }
1942 
1943                 // deal with comments etc.
1944                 if (Node.insertMisc(pre, node))
1945                 {
1946                     continue;
1947                 }
1948 
1949                 // strip unexpected tags
1950                 if (!lexer.preContent(node))
1951                 {
1952                     Node newnode;
1953 
1954                     lexer.report.warning(lexer, pre, node, Report.UNESCAPED_ELEMENT);
1955                     newnode = Node.escapeTag(lexer, node);
1956                     pre.insertNodeAtEnd(newnode);
1957                     continue;
1958                 }
1959 
1960                 if (node.tag == tt.tagP)
1961                 {
1962                     if (node.type == Node.START_TAG)
1963                     {
1964                         lexer.report.warning(lexer, pre, node, Report.USING_BR_INPLACE_OF);
1965 
1966                         // trim white space before <p> in <pre>
1967                         Node.trimSpaces(lexer, pre);
1968 
1969                         // coerce both <p> and </p> to <br>
1970                         Node.coerceNode(lexer, node, tt.tagBr);
1971                         pre.insertNodeAtEnd(node);
1972                     }
1973                     else
1974                     {
1975                         lexer.report.warning(lexer, pre, node, Report.DISCARDING_UNEXPECTED);
1976                     }
1977                     continue;
1978                 }
1979 
1980                 if (node.type == Node.START_TAG || node.type == Node.START_END_TAG)
1981                 {
1982                     // trim white space before <br>
1983                     if (node.tag == tt.tagBr)
1984                     {
1985                         Node.trimSpaces(lexer, pre);
1986                     }
1987 
1988                     pre.insertNodeAtEnd(node);
1989                     parseTag(lexer, node, Lexer.PREFORMATTED);
1990                     continue;
1991                 }
1992 
1993                 // discard unexpected tags
1994                 lexer.report.warning(lexer, pre, node, Report.DISCARDING_UNEXPECTED);
1995             }
1996 
1997             lexer.report.warning(lexer, pre, node, Report.MISSING_ENDTAG_FOR);
1998             Node.trimEmptyElement(lexer, pre);
1999         }
2000 
2001     }
2002 
2003     /***
2004      * Parser for block elements.
2005      */
2006     public static class ParseBlock implements Parser
2007     {
2008 
2009         /***
2010          * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
2011          */
2012         public void parse(Lexer lexer, Node element, short mode)
2013         {
2014             // element is node created by the lexer upon seeing the start tag, or by the parser when the start tag is
2015             // inferred.
2016             Node node, parent;
2017             boolean checkstack;
2018             int istackbase = 0;
2019             TagTable tt = lexer.configuration.tt;
2020 
2021             checkstack = true;
2022 
2023             if ((element.tag.model & Dict.CM_EMPTY) != 0)
2024             {
2025                 return;
2026             }
2027 
2028             if (element.tag == tt.tagForm && element.isDescendantOf(tt.tagForm))
2029             {
2030                 lexer.report.warning(lexer, element, null, Report.ILLEGAL_NESTING);
2031             }
2032 
2033             // InlineDup() asks the lexer to insert inline emphasis tags currently pushed on the istack, but take care
2034             // to avoid propagating inline emphasis inside OBJECT or APPLET. For these elements a fresh inline stack
2035             // context is created and disposed of upon reaching the end of the element. They thus behave like table
2036             // cells in this respect.
2037 
2038             if ((element.tag.model & Dict.CM_OBJECT) != 0)
2039             {
2040                 istackbase = lexer.istackbase;
2041                 lexer.istackbase = lexer.istack.size();
2042             }
2043 
2044             if (!((element.tag.model & Dict.CM_MIXED) != 0))
2045             {
2046                 lexer.inlineDup(null);
2047             }
2048 
2049             mode = Lexer.IGNORE_WHITESPACE;
2050 
2051             while ((node = lexer.getToken(mode)) != null)
2052             {
2053                 // end tag for this element
2054                 if (node.type == Node.END_TAG
2055                     && node.tag != null
2056                     && (node.tag == element.tag || element.was == node.tag))
2057                 {
2058 
2059                     if ((element.tag.model & Dict.CM_OBJECT) != 0)
2060                     {
2061                         // pop inline stack
2062                         while (lexer.istack.size() > lexer.istackbase)
2063                         {
2064                             lexer.popInline(null);
2065                         }
2066                         lexer.istackbase = istackbase;
2067                     }
2068 
2069                     element.closed = true;
2070                     Node.trimSpaces(lexer, element);
2071                     Node.trimEmptyElement(lexer, element);
2072                     return;
2073                 }
2074 
2075                 if (node.tag == tt.tagHtml || node.tag == tt.tagHead || node.tag == tt.tagBody)
2076                 {
2077                     if (node.type == Node.START_TAG || node.type == Node.START_END_TAG)
2078                     {
2079                         lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
2080                     }
2081 
2082                     continue;
2083                 }
2084 
2085                 if (node.type == Node.END_TAG)
2086                 {
2087                     if (node.tag == null)
2088                     {
2089                         lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
2090 
2091                         continue;
2092                     }
2093                     else if (node.tag == tt.tagBr)
2094                     {
2095                         node.type = Node.START_TAG;
2096                     }
2097                     else if (node.tag == tt.tagP)
2098                     {
2099                         Node.coerceNode(lexer, node, tt.tagBr);
2100                         element.insertNodeAtEnd(node);
2101                         node = lexer.inferredTag("br");
2102                     }
2103                     else
2104                     {
2105                         // if this is the end tag for an ancestor element then infer end tag for this element
2106 
2107                         for (parent = element.parent; parent != null; parent = parent.parent)
2108                         {
2109                             if (node.tag == parent.tag)
2110                             {
2111                                 if (!((element.tag.model & Dict.CM_OPT) != 0))
2112                                 {
2113                                     lexer.report.warning(lexer, element, node, Report.MISSING_ENDTAG_BEFORE);
2114                                 }
2115 
2116                                 lexer.ungetToken();
2117 
2118                                 if ((element.tag.model & Dict.CM_OBJECT) != 0)
2119                                 {
2120                                     // pop inline stack
2121                                     while (lexer.istack.size() > lexer.istackbase)
2122                                     {
2123                                         lexer.popInline(null);
2124                                     }
2125                                     lexer.istackbase = istackbase;
2126                                 }
2127 
2128                                 Node.trimSpaces(lexer, element);
2129                                 Node.trimEmptyElement(lexer, element);
2130                                 return;
2131                             }
2132                         }
2133                         // special case </tr> etc. for stuff moved in front of table
2134                         if (lexer.exiled && node.tag.model != 0 && (node.tag.model & Dict.CM_TABLE) != 0)
2135                         {
2136                             lexer.ungetToken();
2137                             Node.trimSpaces(lexer, element);
2138                             Node.trimEmptyElement(lexer, element);
2139                             return;
2140                         }
2141                     }
2142                 }
2143 
2144                 // mixed content model permits text
2145                 if (node.type == Node.TEXT_NODE)
2146                 {
2147                     boolean iswhitenode = false;
2148 
2149                     if (node.type == Node.TEXT_NODE
2150                         && node.end <= node.start + 1
2151                         && lexer.lexbuf[node.start] == (byte) ' ')
2152                     {
2153                         iswhitenode = true;
2154                     }
2155 
2156                     if (lexer.configuration.encloseBlockText && !iswhitenode)
2157                     {
2158                         lexer.ungetToken();
2159                         node = lexer.inferredTag("p");
2160                         element.insertNodeAtEnd(node);
2161                         parseTag(lexer, node, Lexer.MIXED_CONTENT);
2162                         continue;
2163                     }
2164 
2165                     if (checkstack)
2166                     {
2167                         checkstack = false;
2168 
2169                         if (!((element.tag.model & Dict.CM_MIXED) != 0))
2170                         {
2171                             if (lexer.inlineDup(node) > 0)
2172                             {
2173                                 continue;
2174                             }
2175                         }
2176                     }
2177 
2178                     element.insertNodeAtEnd(node);
2179                     mode = Lexer.MIXED_CONTENT;
2180 
2181                     // HTML4 strict doesn't allow mixed content for elements with %block; as their content model
2182                     // But only body, map, blockquote, form and noscript have content model %block;
2183                     if (element.tag == tt.tagBody
2184                         || element.tag == tt.tagMap
2185                         || element.tag == tt.tagBlockquote
2186                         || element.tag == tt.tagForm
2187                         || element.tag == tt.tagNoscript)
2188                     {
2189                         lexer.constrainVersion(~Dict.VERS_HTML40_STRICT);
2190                     }
2191                     continue;
2192                 }
2193 
2194                 if (Node.insertMisc(element, node))
2195                 {
2196                     continue;
2197                 }
2198 
2199                 // allow PARAM elements?
2200                 if (node.tag == tt.tagParam)
2201                 {
2202                     if (((element.tag.model & Dict.CM_PARAM) != 0)
2203                         && (node.type == Node.START_TAG || node.type == Node.START_END_TAG))
2204                     {
2205                         element.insertNodeAtEnd(node);
2206                         continue;
2207                     }
2208 
2209                     // otherwise discard it
2210                     lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
2211                     continue;
2212                 }
2213 
2214                 // allow AREA elements?
2215                 if (node.tag == tt.tagArea)
2216                 {
2217                     if ((element.tag == tt.tagMap) && (node.type == Node.START_TAG || node.type == Node.START_END_TAG))
2218                     {
2219                         element.insertNodeAtEnd(node);
2220                         continue;
2221                     }
2222 
2223                     // otherwise discard it
2224                     lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
2225                     continue;
2226                 }
2227 
2228                 // ignore unknown start/end tags
2229                 if (node.tag == null)
2230                 {
2231                     lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
2232                     continue;
2233                 }
2234 
2235                 // Allow Dict.CM_INLINE elements here. Allow Dict.CM_BLOCK elements here unless lexer.excludeBlocks is
2236                 // yes. LI and DD are special cased. Otherwise infer end tag for this element.
2237 
2238                 if (!((node.tag.model & Dict.CM_INLINE) != 0))
2239                 {
2240                     if (node.type != Node.START_TAG && node.type != Node.START_END_TAG)
2241                     {
2242                         if (node.tag == tt.tagForm)
2243                         {
2244                             badForm(lexer);
2245                         }
2246                         lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
2247                         continue;
2248                     }
2249 
2250                     // #427671 - Fix by Randy Waki - 10 Aug 00
2251                     // If an LI contains an illegal FRAME, FRAMESET, OPTGROUP, or OPTION start tag, discard the start
2252                     // tag and let the subsequent content get parsed as content of the enclosing LI. This seems to
2253                     // mimic IE and Netscape, and avoids an infinite loop: without this check, ParseBlock (which is
2254                     // parsing the LI's content) and ParseList (which is parsing the LI's parent's content) repeatedly
2255                     // defer to each other to parse the illegal start tag, each time inferring a missing </li> or <li>
2256                     // respectively. NOTE: This check is a bit fragile. It specifically checks for the four tags that
2257                     // happen to weave their way through the current series of tests performed by ParseBlock and
2258                     // ParseList to trigger the infinite loop.
2259 
2260                     if (element.tag == tt.tagLi)
2261                     {
2262                         if (node.tag == tt.tagFrame
2263                             || node.tag == tt.tagFrameset
2264                             || node.tag == tt.tagOptgroup
2265                             || node.tag == tt.tagOption)
2266                         {
2267                             lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
2268                             continue;
2269                         }
2270                     }
2271 
2272                     if (element.tag == tt.tagTd || element.tag == tt.tagTh)
2273                     {
2274                         // if parent is a table cell, avoid inferring the end of the cell
2275 
2276                         if ((node.tag.model & Dict.CM_HEAD) != 0)
2277                         {
2278                             moveToHead(lexer, element, node);
2279                             continue;
2280                         }
2281 
2282                         if ((node.tag.model & Dict.CM_LIST) != 0)
2283                         {
2284                             lexer.ungetToken();
2285                             node = lexer.inferredTag("ul");
2286                             node.addClass("noindent");
2287                             lexer.excludeBlocks = true;
2288                         }
2289                         else if ((node.tag.model & Dict.CM_DEFLIST) != 0)
2290                         {
2291                             lexer.ungetToken();
2292                             node = lexer.inferredTag("dl");
2293                             lexer.excludeBlocks = true;
2294                         }
2295 
2296                         // infer end of current table cell
2297                         if (!((node.tag.model & Dict.CM_BLOCK) != 0))
2298                         {
2299                             lexer.ungetToken();
2300                             Node.trimSpaces(lexer, element);
2301                             Node.trimEmptyElement(lexer, element);
2302                             return;
2303                         }
2304                     }
2305                     else if ((node.tag.model & Dict.CM_BLOCK) != 0)
2306                     {
2307                         if (lexer.excludeBlocks)
2308                         {
2309                             if (!((element.tag.model & Dict.CM_OPT) != 0))
2310                             {
2311                                 lexer.report.warning(lexer, element, node, Report.MISSING_ENDTAG_BEFORE);
2312                             }
2313 
2314                             lexer.ungetToken();
2315 
2316                             if ((element.tag.model & Dict.CM_OBJECT) != 0)
2317                             {
2318                                 lexer.istackbase = istackbase;
2319                             }
2320 
2321                             Node.trimSpaces(lexer, element);
2322                             Node.trimEmptyElement(lexer, element);
2323                             return;
2324                         }
2325                     }
2326                     else
2327                     {
2328                         // things like list items
2329 
2330                         if ((node.tag.model & Dict.CM_HEAD) != 0)
2331                         {
2332                             moveToHead(lexer, element, node);
2333                             continue;
2334                         }
2335 
2336                         // special case where a form start tag occurs in a tr and is followed by td or th
2337                         if (element.tag == tt.tagForm && element.parent.tag == tt.tagTd && element.parent.implicit)
2338                         {
2339                             if (node.tag == tt.tagTd)
2340                             {
2341                                 lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
2342                                 continue;
2343                             }
2344 
2345                             if (node.tag == tt.tagTh)
2346                             {
2347                                 lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
2348                                 node = element.parent;
2349                                 node.element = "th";
2350                                 node.tag = tt.tagTh;
2351                                 continue;
2352                             }
2353                         }
2354 
2355                         if (!((element.tag.model & Dict.CM_OPT) != 0) && !element.implicit)
2356                         {
2357                             lexer.report.warning(lexer, element, node, Report.MISSING_ENDTAG_BEFORE);
2358                         }
2359 
2360                         lexer.ungetToken();
2361 
2362                         if ((node.tag.model & Dict.CM_LIST) != 0)
2363                         {
2364                             if (element.parent != null
2365                                 && element.parent.tag != null
2366                                 && element.parent.tag.getParser() == LIST)
2367                             {
2368                                 Node.trimSpaces(lexer, element);
2369                                 Node.trimEmptyElement(lexer, element);
2370                                 return;
2371                             }
2372 
2373                             node = lexer.inferredTag("ul");
2374                             node.addClass("noindent");
2375                         }
2376                         else if ((node.tag.model & Dict.CM_DEFLIST) != 0)
2377                         {
2378                             if (element.parent.tag == tt.tagDl)
2379                             {
2380                                 Node.trimSpaces(lexer, element);
2381                                 Node.trimEmptyElement(lexer, element);
2382                                 return;
2383                             }
2384 
2385                             node = lexer.inferredTag("dl");
2386                         }
2387                         else if ((node.tag.model & Dict.CM_TABLE) != 0 || (node.tag.model & Dict.CM_ROW) != 0)
2388                         {
2389                             node = lexer.inferredTag("table");
2390                         }
2391                         else if ((element.tag.model & Dict.CM_OBJECT) != 0)
2392                         {
2393                             // pop inline stack
2394                             while (lexer.istack.size() > lexer.istackbase)
2395                             {
2396                                 lexer.popInline(null);
2397                             }
2398                             lexer.istackbase = istackbase;
2399                             Node.trimSpaces(lexer, element);
2400                             Node.trimEmptyElement(lexer, element);
2401                             return;
2402 
2403                         }
2404                         else
2405                         {
2406                             Node.trimSpaces(lexer, element);
2407                             Node.trimEmptyElement(lexer, element);
2408                             return;
2409                         }
2410                     }
2411                 }
2412 
2413                 // parse known element
2414                 if (node.type == Node.START_TAG || node.type == Node.START_END_TAG)
2415                 {
2416                     if (TidyUtils.toBoolean(node.tag.model & Dict.CM_INLINE))
2417                     {
2418                         // DSR - 27Apr02 ensure we wrap anchors and other inline content
2419                         if (lexer.configuration.encloseBlockText)
2420                         {
2421                             lexer.ungetToken();
2422                             node = lexer.inferredTag("p");
2423                             element.insertNodeAtEnd(node);
2424                             parseTag(lexer, node, Lexer.MIXED_CONTENT);
2425                             continue;
2426                         }
2427 
2428                         if (checkstack && !node.implicit)
2429                         {
2430                             checkstack = false;
2431 
2432                             // #431731 - fix by Randy Waki 25 Dec 00
2433                             if (!TidyUtils.toBoolean(element.tag.model & Dict.CM_MIXED))
2434                             {
2435                                 if (lexer.inlineDup(node) > 0)
2436                                 {
2437                                     continue;
2438                                 }
2439                             }
2440                         }
2441 
2442                         mode = Lexer.MIXED_CONTENT;
2443                     }
2444                     else
2445                     {
2446                         checkstack = true;
2447                         mode = Lexer.IGNORE_WHITESPACE;
2448                     }
2449 
2450                     // trim white space before <br>
2451                     if (node.tag == tt.tagBr)
2452                     {
2453                         Node.trimSpaces(lexer, element);
2454                     }
2455 
2456                     element.insertNodeAtEnd(node);
2457 
2458                     if (node.implicit)
2459                     {
2460                         lexer.report.warning(lexer, element, node, Report.INSERTING_TAG);
2461                     }
2462 
2463                     parseTag(lexer, node, Lexer.IGNORE_WHITESPACE // Lexer.MixedContent
2464                     );
2465                     continue;
2466                 }
2467 
2468                 // discard unexpected tags
2469                 if (node.type == Node.END_TAG)
2470                 {
2471                     lexer.popInline(node); // if inline end tag
2472                 }
2473 
2474                 lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
2475                 continue;
2476             }
2477 
2478             if (!((element.tag.model & Dict.CM_OPT) != 0))
2479             {
2480                 lexer.report.warning(lexer, element, node, Report.MISSING_ENDTAG_FOR);
2481             }
2482 
2483             if ((element.tag.model & Dict.CM_OBJECT) != 0)
2484             {
2485                 // pop inline stack
2486                 while (lexer.istack.size() > lexer.istackbase)
2487                 {
2488                     lexer.popInline(null);
2489                 }
2490                 lexer.istackbase = istackbase;
2491             }
2492 
2493             Node.trimSpaces(lexer, element);
2494             Node.trimEmptyElement(lexer, element);
2495         }
2496 
2497     }
2498 
2499     /**
2500      * Parser for TABLE. Consumes tokens until the table is closed; stray TD, TH and TABLE start tags get an inferred TR, text and block/inline elements are inserted before the table, and head elements are moved to the head.
2501      */
2502     public static class ParseTableTag implements Parser
2503     {
2504 
2505         /** Parses the content of <code>table</code>. The <code>mode</code> argument is not read: tokens are always fetched with <code>Lexer.IGNORE_WHITESPACE</code>.
2506          * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
2507          */
2508         public void parse(Lexer lexer, Node table, short mode)
2509         {
2510             Node node, parent;
2511             int istackbase;
2512             TagTable tt = lexer.configuration.tt;
2513 
2514             lexer.deferDup();
2515             istackbase = lexer.istackbase; // saved so that every exit path below can restore it
2516             lexer.istackbase = lexer.istack.size();
2517 
2518             while ((node = lexer.getToken(Lexer.IGNORE_WHITESPACE)) != null)
2519             {
2520                 if (node.tag == table.tag && node.type == Node.END_TAG)
2521                 {
2522                     lexer.istackbase = istackbase;
2523                     table.closed = true;
2524                     Node.trimEmptyElement(lexer, table);
2525                     return;
2526                 }
2527 
2528                 // deal with comments etc.
2529                 if (Node.insertMisc(table, node))
2530                 {
2531                     continue;
2532                 }
2533 
2534                 // discard unknown tags
2535                 if (node.tag == null && node.type != Node.TEXT_NODE)
2536                 {
2537                     lexer.report.warning(lexer, table, node, Report.DISCARDING_UNEXPECTED);
2538                     continue;
2539                 }
2540 
2541                 // if TD, TH or TABLE then infer <TR>; text, inline or block content is inserted before the table;
2542                 // head content is moved to the head
2543                 if (node.type != Node.END_TAG)
2544                 {
2545                     if (node.tag == tt.tagTd || node.tag == tt.tagTh || node.tag == tt.tagTable)
2546                     {
2547                         lexer.ungetToken();
2548                         node = lexer.inferredTag("tr");
2549                         lexer.report.warning(lexer, table, node, Report.MISSING_STARTTAG);
2550                     }
2551                     else if (node.type == Node.TEXT_NODE || (node.tag.model & (Dict.CM_BLOCK | Dict.CM_INLINE)) != 0)
2552                     {
2553                         Node.insertNodeBeforeElement(table, node);
2554                         lexer.report.warning(lexer, table, node, Report.TAG_NOT_ALLOWED_IN);
2555                         lexer.exiled = true;
2556 
2557                         if (!(node.type == Node.TEXT_NODE)) // #427662 - was (!node.type == TextNode) - fix by Young
2558                         {
2559                             parseTag(lexer, node, Lexer.IGNORE_WHITESPACE);
2560                         }
2561 
2562                         lexer.exiled = false;
2563                         continue;
2564                     }
2565                     else if ((node.tag.model & Dict.CM_HEAD) != 0)
2566                     {
2567                         moveToHead(lexer, table, node);
2568                         continue;
2569                     }
2570                 }
2571 
2572                 // if this is the end tag for an ancestor element then infer end tag for this element
2573 
2574                 if (node.type == Node.END_TAG)
2575                 {
2576                     if (node.tag == tt.tagForm)
2577                     {
2578                         // badForm is called only for a stray </form>, the same rule ParseRowGroup and ParseRow apply
2579                         badForm(lexer);
2580                     }
2581 
2582                     // discard unexpected form, block, inline, table and row end tags
2583                     if (node.tag == tt.tagForm
2584                         || (node.tag != null
2585                             && (node.tag.model & (Dict.CM_BLOCK | Dict.CM_INLINE | Dict.CM_TABLE | Dict.CM_ROW)) != 0))
2586                     {
2587                         lexer.report.warning(lexer, table, node, Report.DISCARDING_UNEXPECTED);
2588                         continue;
2589                     }
2590 
2591                     for (parent = table.parent; parent != null; parent = parent.parent)
2592                     {
2593                         if (node.tag == parent.tag)
2594                         {
2595                             lexer.report.warning(lexer, table, node, Report.MISSING_ENDTAG_BEFORE);
2596                             lexer.ungetToken();
2597                             lexer.istackbase = istackbase;
2598                             Node.trimEmptyElement(lexer, table);
2599                             return;
2600                         }
2601                     }
2602                 }
2603                 // a tag without CM_TABLE in its model ends the table: push it back, report it and return
2604                 if (!((node.tag.model & Dict.CM_TABLE) != 0))
2605                 {
2606                     lexer.ungetToken();
2607                     lexer.report.warning(lexer, table, node, Report.TAG_NOT_ALLOWED_IN);
2608                     lexer.istackbase = istackbase;
2609                     Node.trimEmptyElement(lexer, table);
2610                     return;
2611                 }
2612 
2613                 if (node.type == Node.START_TAG || node.type == Node.START_END_TAG)
2614                 {
2615                     table.insertNodeAtEnd(node);
2616 
2617                     parseTag(lexer, node, Lexer.IGNORE_WHITESPACE);
2618                     continue;
2619                 }
2620 
2621                 // discard unexpected text nodes and end tags
2622                 lexer.report.warning(lexer, table, node, Report.DISCARDING_UNEXPECTED);
2623             }
2624             // getToken returned null (node is null here): the table was never closed
2625             lexer.report.warning(lexer, table, node, Report.MISSING_ENDTAG_FOR);
2626             Node.trimEmptyElement(lexer, table);
2627             lexer.istackbase = istackbase;
2628         }
2629 
2630     }
2631 
2632     /**
2633      * Parser for COLGROUP. Only COL children are accepted; text, other known tags and ancestor end tags end the group.
2634      */
2635     public static class ParseColGroup implements Parser
2636     {
2637 
2638         /**
2639          * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
2640          */
2641         public void parse(Lexer lexer, Node colgroup, short mode)
2642         {
2643             final TagTable tags = lexer.configuration.tt;
2644             Node token;
2645 
2646             // nothing to parse for an element whose content model is empty
2647             if ((colgroup.tag.model & Dict.CM_EMPTY) != 0)
2648             {
2649                 return;
2650             }
2651 
2652             while ((token = lexer.getToken(Lexer.IGNORE_WHITESPACE)) != null)
2653             {
2654                 if (token.type == Node.END_TAG)
2655                 {
2656                     // the colgroup's own end tag closes it
2657                     if (token.tag == colgroup.tag)
2658                     {
2659                         colgroup.closed = true;
2660                         return;
2661                     }
2662 
2663                     // a stray </form> is reported and thrown away
2664                     if (token.tag == tags.tagForm)
2665                     {
2666                         badForm(lexer);
2667                         lexer.report.warning(lexer, colgroup, token, Report.DISCARDING_UNEXPECTED);
2668                         continue;
2669                     }
2670 
2671                     // if this is the end tag for an ancestor element then infer end tag for this element
2672                     Node ancestor = colgroup.parent;
2673                     while (ancestor != null && ancestor.tag != token.tag)
2674                     {
2675                         ancestor = ancestor.parent;
2676                     }
2677 
2678                     if (ancestor != null)
2679                     {
2680                         lexer.ungetToken();
2681                         return;
2682                     }
2683                 }
2684 
2685                 // text ends the colgroup: push it back and stop
2686                 if (token.type == Node.TEXT_NODE)
2687                 {
2688                     lexer.ungetToken();
2689                     return;
2690                 }
2691 
2692                 // deal with comments etc.
2693                 if (Node.insertMisc(colgroup, token))
2694                 {
2695                     continue;
2696                 }
2697 
2698                 // a known tag other than COL ends the colgroup: push it back and stop
2699                 if (token.tag != null && token.tag != tags.tagCol)
2700                 {
2701                     lexer.ungetToken();
2702                     return;
2703                 }
2704 
2705                 // unknown tags and stray </col> end tags are discarded
2706                 if (token.tag == null || token.type == Node.END_TAG)
2707                 {
2708                     lexer.report.warning(lexer, colgroup, token, Report.DISCARDING_UNEXPECTED);
2709                     continue;
2710                 }
2711 
2712                 // token should be <COL>
2713                 colgroup.insertNodeAtEnd(token);
2714                 parseTag(lexer, token, Lexer.IGNORE_WHITESPACE);
2715             }
2716         }
2717 
2718     }
2719 
2720     /**
2721      * Parser for ROWGROUP (THEAD, TFOOT or TBODY). Collects TR children, infers a TR for stray cells and other start tags, and returns when another row group, the table's end tag or an ancestor's end tag is seen.
2722      */
2723     public static class ParseRowGroup implements Parser
2724     {
2725 
2726         /** Parses the content of <code>rowgroup</code>. The <code>mode</code> argument is not read: tokens are always fetched with <code>Lexer.IGNORE_WHITESPACE</code>.
2727          * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
2728          */
2729         public void parse(Lexer lexer, Node rowgroup, short mode)
2730         {
2731             Node node, parent;
2732             TagTable tt = lexer.configuration.tt;
2733             // nothing to parse for an element whose content model is empty
2734             if ((rowgroup.tag.model & Dict.CM_EMPTY) != 0)
2735             {
2736                 return;
2737             }
2738 
2739             while ((node = lexer.getToken(Lexer.IGNORE_WHITESPACE)) != null)
2740             {
2741                 if (node.tag == rowgroup.tag)
2742                 {
2743                     if (node.type == Node.END_TAG)
2744                     {
2745                         rowgroup.closed = true;
2746                         Node.trimEmptyElement(lexer, rowgroup);
2747                         return;
2748                     }
2749                     // a non-end tag of the same kind: push it back and return without marking this group closed
2750                     lexer.ungetToken();
2751                     return;
2752                 }
2753 
2754                 // if </table> infer end tag
2755                 if (node.tag == tt.tagTable && node.type == Node.END_TAG)
2756                 {
2757                     lexer.ungetToken();
2758                     Node.trimEmptyElement(lexer, rowgroup);
2759                     return;
2760                 }
2761 
2762                 // deal with comments etc.
2763                 if (Node.insertMisc(rowgroup, node))
2764                 {
2765                     continue;
2766                 }
2767 
2768                 // discard unknown tags
2769                 if (node.tag == null && node.type != Node.TEXT_NODE)
2770                 {
2771                     lexer.report.warning(lexer, rowgroup, node, Report.DISCARDING_UNEXPECTED);
2772                     continue;
2773                 }
2774 
2775                 // if TD or TH then infer <TR>; if text, inline or block then move it before the table;
2776                 // if head content then move it to the head
2777 
2778                 if (node.type != Node.END_TAG)
2779                 {
2780                     if (node.tag == tt.tagTd || node.tag == tt.tagTh)
2781                     {
2782                         lexer.ungetToken();
2783                         node = lexer.inferredTag("tr");
2784                         lexer.report.warning(lexer, rowgroup, node, Report.MISSING_STARTTAG);
2785                     }
2786                     else if (node.type == Node.TEXT_NODE || (node.tag.model & (Dict.CM_BLOCK | Dict.CM_INLINE)) != 0)
2787                     {
2788                         Node.moveBeforeTable(rowgroup, node, tt);
2789                         lexer.report.warning(lexer, rowgroup, node, Report.TAG_NOT_ALLOWED_IN);
2790                         lexer.exiled = true;
2791                         // elements are parsed with lexer.exiled set; text nodes are not parsed further
2792                         // #427662 was (!node.type == TextNode) fix by Young 04 Aug 00
2793                         if (node.type != Node.TEXT_NODE)
2794                         {
2795                             parseTag(lexer, node, Lexer.IGNORE_WHITESPACE);
2796                         }
2797 
2798                         lexer.exiled = false;
2799                         continue;
2800                     }
2801                     else if ((node.tag.model & Dict.CM_HEAD) != 0)
2802                     {
2803                         lexer.report.warning(lexer, rowgroup, node, Report.TAG_NOT_ALLOWED_IN);
2804                         moveToHead(lexer, rowgroup, node);
2805                         continue;
2806                     }
2807                 }
2808 
2809                 // if this is the end tag for ancestor element then infer end tag for this element
2810 
2811                 if (node.type == Node.END_TAG)
2812                 {
2813                     // stray form, block or inline end tags are discarded; badForm is called only for </form>
2814                     if (node.tag == tt.tagForm
2815                         || (node.tag != null && (node.tag.model & (Dict.CM_BLOCK | Dict.CM_INLINE)) != 0))
2816                     {
2817                         if (node.tag == tt.tagForm)
2818                         {
2819                             badForm(lexer);
2820                         }
2821                         lexer.report.warning(lexer, rowgroup, node, Report.DISCARDING_UNEXPECTED);
2822                         continue;
2823                     }
2824                     // stray </tr>, </td> and </th> are discarded as well
2825                     if (node.tag == tt.tagTr || node.tag == tt.tagTd || node.tag == tt.tagTh)
2826                     {
2827                         lexer.report.warning(lexer, rowgroup, node, Report.DISCARDING_UNEXPECTED);
2828                         continue;
2829                     }
2830                     // an end tag matching any ancestor ends this row group: push it back and return
2831                     for (parent = rowgroup.parent; parent != null; parent = parent.parent)
2832                     {
2833                         if (node.tag == parent.tag)
2834                         {
2835                             lexer.ungetToken();
2836                             Node.trimEmptyElement(lexer, rowgroup);
2837                             return;
2838                         }
2839                     }
2840 
2841                 }
2842 
2843                 // if THEAD, TFOOT or TBODY then implied end tag
2844                 // (a non-end tag is pushed back before returning; an end tag is consumed)
2845                 if ((node.tag.model & Dict.CM_ROWGRP) != 0)
2846                 {
2847                     if (node.type != Node.END_TAG)
2848                     {
2849                         lexer.ungetToken();
2850                     }
2851 
2852                     Node.trimEmptyElement(lexer, rowgroup);
2853                     return;
2854                 }
2855                 // any end tag that reaches this point is unexpected
2856                 if (node.type == Node.END_TAG)
2857                 {
2858                     lexer.report.warning(lexer, rowgroup, node, Report.DISCARDING_UNEXPECTED);
2859                     continue;
2860                 }
2861                 // any remaining tag other than TR gets an inferred TR; the token is pushed back after the TR is created
2862                 if (!(node.tag == tt.tagTr))
2863                 {
2864                     node = lexer.inferredTag("tr");
2865                     lexer.report.warning(lexer, rowgroup, node, Report.MISSING_STARTTAG);
2866                     lexer.ungetToken();
2867                 }
2868 
2869                 // node should be <TR>
2870                 rowgroup.insertNodeAtEnd(node);
2871                 parseTag(lexer, node, Lexer.IGNORE_WHITESPACE);
2872             }
2873             Node.trimEmptyElement(lexer, rowgroup);
2874         }
2875     }
2876 
2877     /**
2878      * Parser for ROW (TR). Collects TD and TH cells, wraps a FORM start tag in an inferred TD, moves text and block/inline content before the table, and returns when a new row, a row group or an ancestor's end tag is seen.
2879      */
2880     public static class ParseRow implements Parser
2881     {
2882 
2883         /** Parses the content of <code>row</code>. The <code>mode</code> argument is not read: tokens are always fetched with <code>Lexer.IGNORE_WHITESPACE</code>.
2884          * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
2885          */
2886         public void parse(Lexer lexer, Node row, short mode)
2887         {
2888             Node node, parent;
2889             boolean excludeState;
2890             TagTable tt = lexer.configuration.tt;
2891             // nothing to parse for an element whose content model is empty
2892             if ((row.tag.model & Dict.CM_EMPTY) != 0)
2893             {
2894                 return;
2895             }
2896 
2897             while ((node = lexer.getToken(Lexer.IGNORE_WHITESPACE)) != null)
2898             {
2899                 if (node.tag == row.tag)
2900                 {
2901                     if (node.type == Node.END_TAG)
2902                     {
2903                         row.closed = true;
2904                         Node.fixEmptyRow(lexer, row);
2905                         return;
2906                     }
2907                     // a non-end tag of the same kind ends this row: push it back and return without marking the row closed
2908                     lexer.ungetToken();
2909                     Node.fixEmptyRow(lexer, row);
2910                     return;
2911                 }
2912 
2913                 // if this is the end tag for an ancestor element then infer end tag for this element
2914                 if (node.type == Node.END_TAG)
2915                 {
2916                     if (node.tag == tt.tagForm
2917                         || (node.tag != null && (node.tag.model & (Dict.CM_BLOCK | Dict.CM_INLINE)) != 0))
2918                     {
2919                         if (node.tag == tt.tagForm)
2920                         {
2921                             badForm(lexer);
2922                         }
2923                         lexer.report.warning(lexer, row, node, Report.DISCARDING_UNEXPECTED);
2924                         continue;
2925                     }
2926                     // stray </td> and </th> are discarded
2927                     if (node.tag == tt.tagTd || node.tag == tt.tagTh)
2928                     {
2929                         lexer.report.warning(lexer, row, node, Report.DISCARDING_UNEXPECTED);
2930                         continue;
2931                     }
2932                     // an end tag matching any ancestor ends the row: push it back and return
2933                     for (parent = row.parent; parent != null; parent = parent.parent)
2934                     {
2935                         if (node.tag == parent.tag)
2936                         {
2937                             lexer.ungetToken();
2938                             Node.trimEmptyElement(lexer, row);
2939                             return;
2940                         }
2941                     }
2942                 }
2943 
2944                 // deal with comments etc.
2945                 if (Node.insertMisc(row, node))
2946                 {
2947                     continue;
2948                 }
2949 
2950                 // discard unknown tags
2951                 if (node.tag == null && node.type != Node.TEXT_NODE)
2952                 {
2953                     lexer.report.warning(lexer, row, node, Report.DISCARDING_UNEXPECTED);
2954                     continue;
2955                 }
2956 
2957                 // discard unexpected <table> element
2958                 if (node.tag == tt.tagTable)
2959                 {
2960                     lexer.report.warning(lexer, row, node, Report.DISCARDING_UNEXPECTED);
2961                     continue;
2962                 }
2963 
2964                 // THEAD, TFOOT or TBODY
2965                 if (node.tag != null && (node.tag.model & Dict.CM_ROWGRP) != 0)
2966                 {
2967                     lexer.ungetToken();
2968                     Node.trimEmptyElement(lexer, row);
2969                     return;
2970                 }
2971                 // any end tag that reaches this point is unexpected
2972                 if (node.type == Node.END_TAG)
2973                 {
2974                     lexer.report.warning(lexer, row, node, Report.DISCARDING_UNEXPECTED);
2975                     continue;
2976                 }
2977 
2978                 // a FORM tag gets an inferred <TD>; text, inline or block content is moved before the table;
2979                 // head content is moved to the head (the END_TAG test below is always true: end tags were discarded above)
2980                 if (node.type != Node.END_TAG)
2981                 {
2982                     if (node.tag == tt.tagForm)
2983                     {
2984                         lexer.ungetToken();
2985                         node = lexer.inferredTag("td");
2986                         lexer.report.warning(lexer, row, node, Report.MISSING_STARTTAG);
2987                     }
2988                     else if (node.type == Node.TEXT_NODE || (node.tag.model & (Dict.CM_BLOCK | Dict.CM_INLINE)) != 0)
2989                     {
2990                         Node.moveBeforeTable(row, node, tt);
2991                         lexer.report.warning(lexer, row, node, Report.TAG_NOT_ALLOWED_IN);
2992                         lexer.exiled = true;
2993                         // elements are parsed with lexer.exiled set; text nodes are not parsed further
2994                         if (node.type != Node.TEXT_NODE)
2995                         {
2996                             parseTag(lexer, node, Lexer.IGNORE_WHITESPACE);
2997                         }
2998 
2999                         lexer.exiled = false;
3000                         continue;
3001                     }
3002                     else if ((node.tag.model & Dict.CM_HEAD) != 0)
3003                     {
3004                         lexer.report.warning(lexer, row, node, Report.TAG_NOT_ALLOWED_IN);
3005                         moveToHead(lexer, row, node);
3006                         continue;
3007                     }
3008                 }
3009                 // anything that is neither TD nor TH is reported and dropped
3010                 if (!(node.tag == tt.tagTd || node.tag == tt.tagTh))
3011                 {
3012                     lexer.report.warning(lexer, row, node, Report.TAG_NOT_ALLOWED_IN);
3013                     continue;
3014                 }
3015 
3016                 // node should be <TD> or <TH>
3017                 row.insertNodeAtEnd(node);
3018                 excludeState = lexer.excludeBlocks; // remembered so it can be restored after the cell is parsed
3019                 lexer.excludeBlocks = false;
3020                 parseTag(lexer, node, Lexer.IGNORE_WHITESPACE);
3021                 lexer.excludeBlocks = excludeState;
3022 
3023                 // pop inline stack
3024 
3025                 while (lexer.istack.size() > lexer.istackbase)
3026                 {
3027                     lexer.popInline(null);
3028                 }
3029             }
3030             // getToken returned null: no more tokens
3031             Node.trimEmptyElement(lexer, row);
3032         }
3033 
3034     }
3035 
3036     /**
3037      * Parser for NOFRAMES. Records the use of NOFRAMES in <code>lexer.badAccess</code>, parses an explicit BODY child or infers one for other content, and stops at FRAME or FRAMESET tags.
3038      */
3039     public static class ParseNoFrames implements Parser
3040     {
3041 
3042         /** Parses the content of <code>noframes</code>. The incoming <code>mode</code> is overwritten with <code>Lexer.IGNORE_WHITESPACE</code> before any token is read.
3043          * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
3044          */
3045         public void parse(Lexer lexer, Node noframes, short mode)
3046         {
3047             Node node;
3048             TagTable tt = lexer.configuration.tt;
3049 
3050             lexer.badAccess |= Report.USING_NOFRAMES;
3051             mode = Lexer.IGNORE_WHITESPACE;
3052 
3053             while ((node = lexer.getToken(mode)) != null)
3054             {
3055                 if (node.tag == noframes.tag && node.type == Node.END_TAG)
3056                 {
3057                     noframes.closed = true;
3058                     Node.trimSpaces(lexer, noframes);
3059                     return;
3060                 }
3061 
3062                 // a FRAME or FRAMESET tag ends the noframes content
3063                 if ((node.tag == tt.tagFrame || node.tag == tt.tagFrameset))
3064                 {
3065                     Node.trimSpaces(lexer, noframes);
3066                     // fix for [539369]
3067                     if (node.type == Node.END_TAG)
3068                     {
3069                         lexer.report.warning(lexer, noframes, node, Report.DISCARDING_UNEXPECTED);
3070                         // Throw it away
3071                     }
3072                     else
3073                     {
3074                         lexer.report.warning(lexer, noframes, node, Report.MISSING_ENDTAG_BEFORE);
3075                         lexer.ungetToken();
3076                     }
3077                     return;
3078                 }
3079 
3080                 if (node.tag == tt.tagHtml)
3081                 {
3082                     if (node.type == Node.START_TAG || node.type == Node.START_END_TAG)
3083                     {
3084                         lexer.report.warning(lexer, noframes, node, Report.DISCARDING_UNEXPECTED);
3085                     }
3086                     continue;
3087                 }
3088 
3089                 // deal with comments etc.
3090                 if (Node.insertMisc(noframes, node))
3091                 {
3092                     continue;
3093                 }
3094 
3095                 if (node.tag == tt.tagBody && node.type == Node.START_TAG)
3096                 {
3097                     boolean seenbody = lexer.seenEndBody;
3098                     noframes.insertNodeAtEnd(node);
3099                     parseTag(lexer, node, Lexer.IGNORE_WHITESPACE); // MixedContent
3100                     if (seenbody)
3101                     {
3102                         Node.coerceNode(lexer, node, tt.tagDiv);
3103                         moveNodeToBody(lexer, node);
3104                     }
3105                     continue;
3106                 }
3107 
3108                 // implicit body element inferred
3109                 if (node.type == Node.TEXT_NODE || (node.tag != null && node.type != Node.END_TAG))
3110                 {
3111                     if (lexer.seenEndBody)
3112                     {
3113                         Node body = lexer.root.findBody(tt);
3114                         if (body == null)
3115                         {
3116                             // no body element to append to: discard the token instead of dereferencing null
3117                             lexer.report.warning(lexer, noframes, node, Report.DISCARDING_UNEXPECTED);
3118                             continue;
3119                         }
3120 
3121                         if (node.type == Node.TEXT_NODE)
3122                         {
3123                             lexer.ungetToken();
3124                             node = lexer.inferredTag("p");
3125                             lexer.report.warning(lexer, noframes, node, Report.CONTENT_AFTER_BODY);
3126                         }
3127                         body.insertNodeAtEnd(node);
3128                     }
3129                     else
3130                     {
3131                         lexer.ungetToken();
3132                         node = lexer.inferredTag("body");
3133                         if (lexer.configuration.xmlOut)
3134                         {
3135                             lexer.report.warning(lexer, noframes, node, Report.INSERTING_TAG);
3136                         }
3137                         noframes.insertNodeAtEnd(node);
3138                     }
3139                     parseTag(lexer, node, Lexer.IGNORE_WHITESPACE); // MixedContent
3140                     continue;
3141                 }
3142                 // discard unexpected end tags and unknown tags
3143                 lexer.report.warning(lexer, noframes, node, Report.DISCARDING_UNEXPECTED);
3144             }
3145 
3146             lexer.report.warning(lexer, noframes, node, Report.MISSING_ENDTAG_FOR);
3147         }
3148 
3149     }
3150 
3151     /**
3152      * Parser for SELECT. Keeps OPTION, OPTGROUP and SCRIPT start tags as children and discards every other tag.
3153      */
3154     public static class ParseSelect implements Parser
3155     {
3156 
3157         /**
3158          * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
3159          */
3160         public void parse(Lexer lexer, Node field, short mode)
3161         {
3162             final TagTable tags = lexer.configuration.tt;
3163             Node token;
3164 
3165             lexer.insert = -1; // defer implicit inline start tags
3166 
3167             while ((token = lexer.getToken(Lexer.IGNORE_WHITESPACE)) != null)
3168             {
3169                 if (token.type == Node.END_TAG && token.tag == field.tag)
3170                 {
3171                     field.closed = true;
3172                     Node.trimSpaces(lexer, field);
3173                     return;
3174                 }
3175 
3176                 // deal with comments etc.
3177                 if (Node.insertMisc(field, token))
3178                 {
3179                     continue;
3180                 }
3181 
3182                 boolean childTag = token.tag == tags.tagOption || token.tag == tags.tagOptgroup || token.tag == tags.tagScript;
3183                 if (token.type != Node.START_TAG || !childTag)
3184                 {
3185                     // discard unexpected tags
3186                     lexer.report.warning(lexer, field, token, Report.DISCARDING_UNEXPECTED);
3187                     continue;
3188                 }
3189 
3190                 field.insertNodeAtEnd(token);
3191                 parseTag(lexer, token, Lexer.IGNORE_WHITESPACE);
3192             }
3193             // getToken returned null before the end tag was seen; token is null here
3194             lexer.report.warning(lexer, field, token, Report.MISSING_ENDTAG_FOR);
3195         }
3196 
3197     }
3198 
3199     /***
3200      * Parser for text nodes.
3201      */
3202     public static class ParseText implements Parser
3203     {
3204 
3205         /***
3206          * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
3207          */
3208         public void parse(Lexer lexer, Node field, short mode)
3209         {
3210             Node node;
3211             TagTable tt = lexer.configuration.tt;
3212 
3213             lexer.insert = -1; // defer implicit inline start tags
3214 
3215             if (field.tag == tt.tagTextarea)
3216             {
3217                 mode = Lexer.PREFORMATTED;
3218             }
3219             else
3220             {
3221                 mode = Lexer.MIXED_CONTENT; // kludge for font tags
3222             }
3223 
3224             while ((node = lexer.getToken(mode)) != null)
3225             {
3226                 if (node.tag == field.tag && node.type == Node.END_TAG)
3227                 {
3228                     field.closed = true;
3229                     Node.trimSpaces(lexer, field);
3230                     return;
3231                 }
3232 
3233                 // deal with comments etc.
3234                 if (Node.insertMisc(field, node))
3235                 {
3236                     continue;
3237                 }
3238 
3239                 if (node.type == Node.TEXT_NODE)
3240                 {
3241                     // only called for 1st child
3242                     if (field.content == null && !((mode & Lexer.PREFORMATTED) != 0))
3243                     {
3244                         Node.trimSpaces(lexer, field);
3245                     }
3246 
3247                     if (node.start >= node.end)
3248                     {
3249                         continue;
3250                     }
3251 
3252                     field.insertNodeAtEnd(node);
3253                     continue;
3254                 }
3255 
3256                 // for textarea should all cases of < and & be escaped?
3257                 // discard inline tags e.g. font
3258                 if (node.tag != null
3259                     && ((node.tag.model & Dict.CM_INLINE) != 0)
3260                     && (node.tag.model & Dict.CM_FIELD) == 0) // #487283 - fix by Lee Passey 25 Jan 02
3261                 {
3262                     lexer.report.warning(lexer, field, node, Report.DISCARDING_UNEXPECTED);
3263                     continue;
3264                 }
3265 
3266                 // terminate element on other tags
3267                 if (!((field.tag.model & Dict.CM_OPT) != 0))
3268                 {
3269                     lexer.report.warning(lexer, field, node, Report.MISSING_ENDTAG_BEFORE);
3270                 }
3271 
3272                 lexer.ungetToken();
3273                 Node.trimSpaces(lexer, field);
3274                 return;
3275             }
3276 
3277             if (!((field.tag.model & Dict.CM_OPT) != 0))
3278             {
3279                 lexer.report.warning(lexer, field, node, Report.MISSING_ENDTAG_FOR);
3280             }
3281         }
3282 
3283     }
3284 
3285     /***
3286      * Parser for OPTGROUP.
3287      */
3288     public static class ParseOptGroup implements Parser
3289     {
3290 
3291         /***
3292          * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
3293          */
3294         public void parse(Lexer lexer, Node field, short mode)
3295         {
3296             Node node;
3297             TagTable tt = lexer.configuration.tt;
3298 
3299             lexer.insert = -1; // defer implicit inline start tags
3300 
3301             while ((node = lexer.getToken(Lexer.IGNORE_WHITESPACE)) != null)
3302             {
3303                 if (node.tag == field.tag && node.type == Node.END_TAG)
3304                 {
3305                     field.closed = true;
3306                     Node.trimSpaces(lexer, field);
3307                     return;
3308                 }
3309 
3310                 // deal with comments etc.
3311                 if (Node.insertMisc(field, node))
3312                 {
3313                     continue;
3314                 }
3315 
3316                 if (node.type == Node.START_TAG && (node.tag == tt.tagOption || node.tag == tt.tagOptgroup))
3317                 {
3318                     if (node.tag == tt.tagOptgroup)
3319                     {
3320                         lexer.report.warning(lexer, field, node, Report.CANT_BE_NESTED);
3321                     }
3322 
3323                     field.insertNodeAtEnd(node);
3324                     parseTag(lexer, node, Lexer.MIXED_CONTENT);
3325                     continue;
3326                 }
3327 
3328                 // discard unexpected tags
3329                 lexer.report.warning(lexer, field, node, Report.DISCARDING_UNEXPECTED);
3330             }
3331         }
3332 
3333     }
3334 
3335     /***
3336      * HTML is the top level element.
3337      */
3338     public static Node parseDocument(Lexer lexer)
3339     {
3340         Node node, document, html;
3341         Node doctype = null;
3342         TagTable tt = lexer.configuration.tt;
3343 
3344         document = lexer.newNode();
3345         document.type = Node.ROOT_NODE;
3346 
3347         lexer.root = document;
3348 
3349         while ((node = lexer.getToken(Lexer.IGNORE_WHITESPACE)) != null)
3350         {
3351             // deal with comments etc.
3352             if (Node.insertMisc(document, node))
3353             {
3354                 continue;
3355             }
3356 
3357             if (node.type == Node.DOCTYPE_TAG)
3358             {
3359                 if (doctype == null)
3360                 {
3361                     document.insertNodeAtEnd(node);
3362                     doctype = node;
3363                 }
3364                 else
3365                 {
3366                     lexer.report.warning(lexer, document, node, Report.DISCARDING_UNEXPECTED);
3367                 }
3368                 continue;
3369             }
3370 
3371             if (node.type == Node.END_TAG)
3372             {
3373                 lexer.report.warning(lexer, document, node, Report.DISCARDING_UNEXPECTED); //TODO?
3374                 continue;
3375             }
3376 
3377             if (node.type != Node.START_TAG || node.tag != tt.tagHtml)
3378             {
3379                 lexer.ungetToken();
3380                 html = lexer.inferredTag("html");
3381             }
3382             else
3383             {
3384                 html = node;
3385             }
3386 
3387             if (document.findDocType() == null && !lexer.configuration.bodyOnly)
3388             {
3389                 lexer.report.warning(lexer, null, null, Report.MISSING_DOCTYPE);
3390             }
3391 
3392             document.insertNodeAtEnd(html);
3393             HTML.parse(lexer, html, (short) 0); // TODO?
3394             break;
3395         }
3396 
3397         return document;
3398     }
3399 
3400     /***
3401      * Indicates whether or not whitespace should be preserved for this element. If an <code>xml:space</code>
3402      * attribute is found, then if the attribute value is <code>preserve</code>, returns <code>true</code>. For
3403      * any other value, returns <code>false</code>. If an <code>xml:space</code> attribute was <em>not</em>
3404      * found, then the following element names result in a return value of <code>true:
3405      *  pre, script, style,</code> and
3406      * <code>xsl:text</code>. Finally, if a <code>TagTable</code> was passed in and the element appears as the
3407      * "pre" element in the <code>TagTable</code>, then <code>true</code> will be returned. Otherwise,
3408      * <code>false</code> is returned.
3409      * @param element The <code>Node</code> to test to see if whitespace should be preserved.
3410      * @param tt The <code>TagTable</code> to test for the <code>getNodePre()</code> function. This may be
3411      * <code>null</code>, in which case this test is bypassed.
3412      * @return <code>true</code> or <code>false</code>, as explained above.
3413      */
3414     public static boolean XMLPreserveWhiteSpace(Node element, TagTable tt)
3415     {
3416         AttVal attribute;
3417 
3418         // search attributes for xml:space
3419         for (attribute = element.attributes; attribute != null; attribute = attribute.next)
3420         {
3421             if (attribute.attribute.equals("xml:space"))
3422             {
3423                 if (attribute.value.equals("preserve"))
3424                 {
3425                     return true;
3426                 }
3427 
3428                 return false;
3429             }
3430         }
3431 
3432         if (element.element == null) // Debian Bug #137124. Fix based on suggestion by Cesar Eduardo Barros 06 Mar 02
3433         {
3434             return false;
3435         }
3436 
3437         // kludge for html docs without explicit xml:space attribute
3438         if ("pre".equalsIgnoreCase(element.element)
3439             || "script".equalsIgnoreCase(element.element)
3440             || "style".equalsIgnoreCase(element.element))
3441         {
3442             return true;
3443         }
3444 
3445         if ((tt != null) && (tt.findParser(element) == PRE))
3446         {
3447             return true;
3448         }
3449 
3450         // kludge for XSL docs
3451         if ("xsl:text".equalsIgnoreCase(element.element))
3452         {
3453             return true;
3454         }
3455 
3456         return false;
3457     }
3458 
3459     /***
3460      * XML documents.
3461      */
3462     public static void parseXMLElement(Lexer lexer, Node element, short mode)
3463     {
3464         Node node;
3465 
3466         // if node is pre or has xml:space="preserve" then do so
3467 
3468         if (XMLPreserveWhiteSpace(element, lexer.configuration.tt))
3469         {
3470             mode = Lexer.PREFORMATTED;
3471         }
3472 
3473         while ((node = lexer.getToken(mode)) != null)
3474         {
3475             if (node.type == Node.END_TAG && node.element.equals(element.element))
3476             {
3477                 element.closed = true;
3478                 break;
3479             }
3480 
3481             // discard unexpected end tags
3482             if (node.type == Node.END_TAG)
3483             {
3484                 lexer.report.error(lexer, element, node, Report.UNEXPECTED_ENDTAG);
3485                 continue;
3486             }
3487 
3488             // parse content on seeing start tag
3489             if (node.type == Node.START_TAG)
3490             {
3491                 parseXMLElement(lexer, node, mode);
3492             }
3493 
3494             element.insertNodeAtEnd(node);
3495         }
3496 
3497         // if first child is text then trim initial space and delete text node if it is empty.
3498 
3499         node = element.content;
3500 
3501         if (node != null && node.type == Node.TEXT_NODE && mode != Lexer.PREFORMATTED)
3502         {
3503             if (node.textarray[node.start] == (byte) ' ')
3504             {
3505                 node.start++;
3506 
3507                 if (node.start >= node.end)
3508                 {
3509                     Node.discardElement(node);
3510                 }
3511             }
3512         }
3513 
3514         // if last child is text then trim final space and delete the text node if it is empty
3515 
3516         node = element.last;
3517 
3518         if (node != null && node.type == Node.TEXT_NODE && mode != Lexer.PREFORMATTED)
3519         {
3520             if (node.textarray[node.end - 1] == (byte) ' ')
3521             {
3522                 node.end--;
3523 
3524                 if (node.start >= node.end)
3525                 {
3526                     Node.discardElement(node);
3527                 }
3528             }
3529         }
3530     }
3531 
3532     public static Node parseXMLDocument(Lexer lexer)
3533     {
3534         Node node, document, doctype;
3535 
3536         document = lexer.newNode();
3537         document.type = Node.ROOT_NODE;
3538         doctype = null;
3539         lexer.configuration.xmlTags = true;
3540 
3541         while ((node = lexer.getToken(Lexer.IGNORE_WHITESPACE)) != null)
3542         {
3543             // discard unexpected end tags
3544             if (node.type == Node.END_TAG)
3545             {
3546                 lexer.report.warning(lexer, null, node, Report.UNEXPECTED_ENDTAG);
3547                 continue;
3548             }
3549 
3550             // deal with comments etc.
3551             if (Node.insertMisc(document, node))
3552             {
3553                 continue;
3554             }
3555 
3556             if (node.type == Node.DOCTYPE_TAG)
3557             {
3558                 if (doctype == null)
3559                 {
3560                     document.insertNodeAtEnd(node);
3561                     doctype = node;
3562                 }
3563                 else
3564                 {
3565                     lexer.report.warning(lexer, document, node, Report.DISCARDING_UNEXPECTED); // TODO
3566                 }
3567                 continue;
3568             }
3569 
3570             if (node.type == Node.START_END_TAG)
3571             {
3572                 document.insertNodeAtEnd(node);
3573                 continue;
3574             }
3575 
3576             // if start tag then parse element's content
3577             if (node.type == Node.START_TAG)
3578             {
3579                 document.insertNodeAtEnd(node);
3580                 parseXMLElement(lexer, node, Lexer.IGNORE_WHITESPACE);
3581             }
3582 
3583         }
3584 
3585         if (doctype != null && !lexer.checkDocTypeKeyWords(doctype))
3586         {
3587             lexer.report.warning(lexer, doctype, null, Report.DTYPE_NOT_UPPER_CASE);
3588         }
3589 
3590         // ensure presence of initial <?XML version="1.0"?>
3591         if (lexer.configuration.xmlPi)
3592         {
3593             lexer.fixXmlDecl(document);
3594         }
3595 
3596         return document;
3597     }
3598 
3599     /***
3600      * errors in positioning of form start or end tags generally require human intervention to fix.
3601      */
3602     static void badForm(Lexer lexer)
3603     {
3604         lexer.badForm = 1;
3605         lexer.errors++;
3606     }
3607 
3608 }