View Javadoc

1   /*
2    *  Java HTML Tidy - JTidy
3    *  HTML parser and pretty printer
4    *
5    *  Copyright (c) 1998-2000 World Wide Web Consortium (Massachusetts
6    *  Institute of Technology, Institut National de Recherche en
7    *  Informatique et en Automatique, Keio University). All Rights
8    *  Reserved.
9    *
10   *  Contributing Author(s):
11   *
12   *     Dave Raggett <dsr@w3.org>
13   *     Andy Quick <ac.quick@sympatico.ca> (translation to Java)
14   *     Gary L Peskin <garyp@firstech.com> (Java development)
15   *     Sami Lempinen <sami@lempinen.net> (release management)
16   *     Fabrizio Giustina <fgiust at users.sourceforge.net>
17   *
18   *  The contributing author(s) would like to thank all those who
19   *  helped with testing, bug fixes, and patience.  This wouldn't
20   *  have been possible without all of you.
21   *
22   *  COPYRIGHT NOTICE:
23   *
24   *  This software and documentation is provided "as is," and
25   *  the copyright holders and contributing author(s) make no
26   *  representations or warranties, express or implied, including
27   *  but not limited to, warranties of merchantability or fitness
28   *  for any particular purpose or that the use of the software or
29   *  documentation will not infringe any third party patents,
30   *  copyrights, trademarks or other rights.
31   *
32   *  The copyright holders and contributing author(s) will not be
33   *  liable for any direct, indirect, special or consequential damages
34   *  arising out of any use of the software or documentation, even if
35   *  advised of the possibility of such damage.
36   *
37   *  Permission is hereby granted to use, copy, modify, and distribute
38   *  this source code, or portions hereof, documentation and executables,
39   *  for any purpose, without fee, subject to the following restrictions:
40   *
41   *  1. The origin of this source code must not be misrepresented.
42   *  2. Altered versions must be plainly marked as such and must
43   *     not be misrepresented as being the original source.
44   *  3. This Copyright notice may not be removed or altered from any
45   *     source or altered source distribution.
46   *
47   *  The copyright holders and contributing author(s) specifically
48   *  permit, without fee, and encourage the use of this source code
49   *  as a component for supporting the Hypertext Markup Language in
50   *  commercial products. If you use this source code in a product,
51   *  acknowledgment is not required but would be appreciated.
52   *
53   */
54  package org.w3c.tidy;
55  
56  import java.io.PrintWriter;
57  import java.util.List;
58  import java.util.Stack;
59  import java.util.Vector;
60  
61  
/**
 * Lexer for html parser.
 * <p>
 * Given an input stream it returns a sequence of tokens: <code>getToken</code> gets the next token and
 * <code>ungetToken</code> provides one level of undo.
 * </p>
 * <p>
 * Tags include an attribute list, held as a linked list of attribute/value nodes where each node has two strings.
 * Entities are replaced in attribute values. White space is compacted unless the lexer is in preformatted mode:
 * leading white space is discarded and subsequent white space sequences are compacted to single space chars. If
 * XmlTags is not set, tag names and attribute names are folded to lower case.
 * </p>
 * <p>
 * Not yet done: doctype subset and marked sections.
 * </p>
 * @author Dave Raggett <a href="mailto:dsr@w3.org">dsr@w3.org </a>
 * @author Andy Quick <a href="mailto:ac.quick@sympatico.ca">ac.quick@sympatico.ca </a> (translation to Java)
 * @author Fabrizio Giustina
 * @version $Revision: 1.93 $ ($Author: fgiust $)
 */
77  public class Lexer
78  {
79  
    /**
     * state: ignore whitespace.
     */
    public static final short IGNORE_WHITESPACE = 0;

    /**
     * state: mixed content.
     */
    public static final short MIXED_CONTENT = 1;

    /**
     * state: preformatted. <code>parseEntity</code> tests its mode argument against this value with a bitwise AND.
     */
    public static final short PREFORMATTED = 2;

    /**
     * state: ignore markup.
     * <p>
     * NOTE(review): the value 3 also has the PREFORMATTED bit (2) set, so a <code>mode &amp; PREFORMATTED</code> test is
     * true for this mode as well - confirm this is intended.
     * </p>
     */
    public static final short IGNORE_MARKUP = 3;
99  
    /**
     * URI for XHTML 1.0 transitional DTD.
     */
    private static final String VOYAGER_LOOSE = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd";

    /**
     * URI for XHTML 1.0 strict DTD.
     */
    private static final String VOYAGER_STRICT = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd";

    /**
     * URI for XHTML 1.0 frameset DTD.
     */
    private static final String VOYAGER_FRAMESET = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd";

    /**
     * URI for XHTML 1.1.
     */
    private static final String VOYAGER_11 = "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd";

    // URI for XHTML Basic 1.0 (declaration currently disabled).
    // private static final String VOYAGER_BASIC = "http://www.w3.org/TR/xhtml-basic/xhtml-basic10.dtd";

    /**
     * xhtml namespace.
     */
    private static final String XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml";
128 
    /**
     * lists all the known versions.
     * <p>
     * Each entry pairs an HTML version name with its XHTML ("voyager") name, a DTD URI and a <code>Dict</code> version
     * code. <code>htmlVersionName()</code> scans the table in order and stops at the first entry whose code matches, so
     * for a code that appears more than once only the first entry is ever returned by that method.
     * </p>
     * <p>
     * NOTE(review): the XHTML 1.1 entry passes VOYAGER_STRICT as its URI although a VOYAGER_11 constant is declared -
     * confirm whether VOYAGER_11 was intended.
     * </p>
     */
    private static final Lexer.W3CVersionInfo[] W3CVERSION = {
        new W3CVersionInfo("HTML 4.01", "XHTML 1.0 Strict", VOYAGER_STRICT, Dict.VERS_HTML40_STRICT),
        new W3CVersionInfo("HTML 4.01 Transitional", "XHTML 1.0 Transitional", VOYAGER_LOOSE, Dict.VERS_HTML40_LOOSE),
        new W3CVersionInfo("HTML 4.01 Frameset", "XHTML 1.0 Frameset", VOYAGER_FRAMESET, Dict.VERS_FRAMESET),
        new W3CVersionInfo("HTML 4.0", "XHTML 1.0 Strict", VOYAGER_STRICT, Dict.VERS_HTML40_STRICT),
        new W3CVersionInfo("HTML 4.0 Transitional", "XHTML 1.0 Transitional", VOYAGER_LOOSE, Dict.VERS_HTML40_LOOSE),
        new W3CVersionInfo("HTML 4.0 Frameset", "XHTML 1.0 Frameset", VOYAGER_FRAMESET, Dict.VERS_FRAMESET),
        new W3CVersionInfo("HTML 3.2", "XHTML 1.0 Transitional", VOYAGER_LOOSE, Dict.VERS_HTML32),
        new W3CVersionInfo("HTML 3.2 Final", "XHTML 1.0 Transitional", VOYAGER_LOOSE, Dict.VERS_HTML32),
        new W3CVersionInfo("HTML 3.2 Draft", "XHTML 1.0 Transitional", VOYAGER_LOOSE, Dict.VERS_HTML32),
        new W3CVersionInfo("HTML 2.0", "XHTML 1.0 Strict", VOYAGER_STRICT, Dict.VERS_HTML20),
        new W3CVersionInfo("HTML 4.01", "XHTML 1.1", VOYAGER_STRICT, Dict.VERS_XHTML11)};
144 
    /**
     * getToken state: content. This is the state a new Lexer starts in.
     */
    private static final short LEX_CONTENT = 0;

    /**
     * getToken state: gt.
     */
    private static final short LEX_GT = 1;

    /**
     * getToken state: endtag.
     */
    private static final short LEX_ENDTAG = 2;

    /**
     * getToken state: start tag.
     */
    private static final short LEX_STARTTAG = 3;

    /**
     * getToken state: comment.
     */
    private static final short LEX_COMMENT = 4;

    /**
     * getToken state: doctype.
     */
    private static final short LEX_DOCTYPE = 5;

    /**
     * getToken state: procinstr.
     */
    private static final short LEX_PROCINSTR = 6;

    // note: the value 7 is not assigned to any state

    /**
     * getToken state: cdata.
     */
    private static final short LEX_CDATA = 8;

    /**
     * getToken state: section.
     */
    private static final short LEX_SECTION = 9;

    /**
     * getToken state: asp.
     */
    private static final short LEX_ASP = 10;

    /**
     * getToken state: jste.
     */
    private static final short LEX_JSTE = 11;

    /**
     * getToken state: php.
     */
    private static final short LEX_PHP = 12;

    /**
     * getToken state: xml declaration.
     */
    private static final short LEX_XMLDECL = 13;
209 
    /**
     * file stream the lexer reads its chars from.
     */
    protected StreamIn in;

    /**
     * error output stream.
     */
    protected PrintWriter errout;

    /**
     * for accessibility errors.
     */
    protected short badAccess;

    /**
     * for bad style errors.
     */
    protected short badLayout;

    /**
     * for bad char encodings.
     */
    protected short badChars;

    /**
     * for mismatched/mispositioned form tags.
     */
    protected short badForm;

    /**
     * count of warnings in this document.
     */
    protected short warnings;

    /**
     * count of errors.
     */
    protected short errors;

    /**
     * lines seen. Starts at 1; also used as the line of the position reported with an error.
     */
    protected int lines;

    /**
     * at start of current token. Starts at 1; also used as the column of the position reported with an error.
     */
    protected int columns;

    /**
     * used to collapse contiguous white space.
     */
    protected boolean waswhite;

    /**
     * true after token has been pushed back.
     */
    protected boolean pushed;

    /**
     * when space is moved after end tag.
     */
    protected boolean insertspace;

    /**
     * Netscape compatibility.
     */
    protected boolean excludeBlocks;

    /**
     * true if moved out of table.
     */
    protected boolean exiled;

    /**
     * true if xmlns attribute on html element.
     */
    protected boolean isvoyager;

    /**
     * bit vector of HTML versions. Initialised to <code>Dict.VERS_ALL | Dict.VERS_PROPRIETARY</code>.
     */
    protected short versions;

    /**
     * version as given by doctype (if any). Initialised to <code>Dict.VERS_UNKNOWN</code>.
     */
    protected int doctype;

    /**
     * set if html or PUBLIC is missing.
     */
    protected boolean badDoctype;

    /**
     * start of current node (offset into <code>lexbuf</code>).
     */
    protected int txtstart;

    /**
     * end of current node (offset into <code>lexbuf</code>).
     */
    protected int txtend;

    /**
     * state of lexer's finite state machine (one of the LEX_* constants).
     */
    protected short state;

    /**
     * current node.
     */
    protected Node token;

    /**
     * Lexer character buffer parse tree nodes span onto this buffer which contains the concatenated text contents of
     * all of the elements. Lexsize must be reset for each file. Byte buffer of UTF-8 chars. The array is allocated
     * lazily and replaced by a larger one when full, see <code>addByte</code>.
     */
    protected byte[] lexbuf;

    /**
     * allocated: capacity of <code>lexbuf</code> in bytes.
     */
    protected int lexlength;

    /**
     * used: number of bytes of <code>lexbuf</code> in use, i.e. the offset at which the next byte is stored.
     */
    protected int lexsize;

    /**
     * Inline stack for compatibility with Mosaic. For deferring text node.
     */
    protected Node inode;

    /**
     * for inferring inline tags. Initialised to -1.
     */
    protected int insert;

    /**
     * stack.
     */
    protected Stack istack;

    /**
     * start of frame.
     */
    protected int istackbase;

    /**
     * used for cleaning up presentation markup.
     */
    protected Style styles;

    /**
     * configuration.
     */
    protected Configuration configuration;

    /**
     * already seen end body tag?
     */
    protected boolean seenEndBody;

    /**
     * already seen end html tag?
     */
    protected boolean seenEndHtml;

    /**
     * report, used to notify entity and encoding errors.
     */
    protected Report report;

    /**
     * Root node is saved here.
     */
    protected Node root;

    /**
     * node list: every node created or cloned through this lexer is registered here, so that its text array can be
     * re-pointed when <code>lexbuf</code> is reallocated.
     */
    private List nodeList;
395 
396     /***
397      * Instantiates a new Lexer.
398      * @param in StreamIn
399      * @param configuration configuation instance
400      * @param report report instance, for reporting errors
401      */
402     public Lexer(StreamIn in, Configuration configuration, Report report)
403     {
404         this.report = report;
405         this.in = in;
406         this.lines = 1;
407         this.columns = 1;
408         this.state = LEX_CONTENT;
409         this.versions = (Dict.VERS_ALL | Dict.VERS_PROPRIETARY);
410         this.doctype = Dict.VERS_UNKNOWN;
411         this.insert = -1;
412         this.istack = new Stack();
413         this.configuration = configuration;
414         this.nodeList = new Vector();
415     }
416 
417     /***
418      * Creates a new node and add it to nodelist.
419      * @return Node
420      */
421     public Node newNode()
422     {
423         Node node = new Node();
424         this.nodeList.add(node);
425         return node;
426     }
427 
428     /***
429      * Creates a new node and add it to nodelist.
430      * @param type node type: Node.ROOT_NODE | Node.DOCTYPE_TAG | Node.COMMENT_TAG | Node.PROC_INS_TAG | Node.TEXT_NODE |
431      * Node.START_TAG | Node.END_TAG | Node.START_END_TAG | Node.CDATA_TAG | Node.SECTION_TAG | Node. ASP_TAG |
432      * Node.JSTE_TAG | Node.PHP_TAG | Node.XML_DECL
433      * @param textarray array of bytes contained in the Node
434      * @param start start position
435      * @param end end position
436      * @return Node
437      */
438     public Node newNode(short type, byte[] textarray, int start, int end)
439     {
440         Node node = new Node(type, textarray, start, end);
441         this.nodeList.add(node);
442         return node;
443     }
444 
445     /***
446      * Creates a new node and add it to nodelist.
447      * @param type node type: Node.ROOT_NODE | Node.DOCTYPE_TAG | Node.COMMENT_TAG | Node.PROC_INS_TAG | Node.TEXT_NODE |
448      * Node.START_TAG | Node.END_TAG | Node.START_END_TAG | Node.CDATA_TAG | Node.SECTION_TAG | Node. ASP_TAG |
449      * Node.JSTE_TAG | Node.PHP_TAG | Node.XML_DECL
450      * @param textarray array of bytes contained in the Node
451      * @param start start position
452      * @param end end position
453      * @param element tag name
454      * @return Node
455      */
456     public Node newNode(short type, byte[] textarray, int start, int end, String element)
457     {
458         Node node = new Node(type, textarray, start, end, element, this.configuration.tt);
459         this.nodeList.add(node);
460         return node;
461     }
462 
463     /***
464      * Clones a node and add it to node list.
465      * @param node Node
466      * @return cloned Node
467      */
468     public Node cloneNode(Node node)
469     {
470         Node cnode = (Node) node.clone();
471         this.nodeList.add(cnode);
472         for (AttVal att = cnode.attributes; att != null; att = att.next)
473         {
474             if (att.asp != null)
475             {
476                 this.nodeList.add(att.asp);
477             }
478             if (att.php != null)
479             {
480                 this.nodeList.add(att.php);
481             }
482         }
483         return cnode;
484     }
485 
486     /***
487      * Clones an attribute value and add eventual asp or php node to node list.
488      * @param attrs original AttVal
489      * @return cloned AttVal
490      */
491     public AttVal cloneAttributes(AttVal attrs)
492     {
493         AttVal cattrs = (AttVal) attrs.clone();
494         for (AttVal att = cattrs; att != null; att = att.next)
495         {
496             if (att.asp != null)
497             {
498                 this.nodeList.add(att.asp);
499             }
500             if (att.php != null)
501             {
502                 this.nodeList.add(att.php);
503             }
504         }
505         return cattrs;
506     }
507 
508     /***
509      * Update <code>oldtextarray</code> in the current nodes.
510      * @param oldtextarray previous text array
511      * @param newtextarray new text array
512      */
513     protected void updateNodeTextArrays(byte[] oldtextarray, byte[] newtextarray)
514     {
515         Node node;
516         for (int i = 0; i < this.nodeList.size(); i++)
517         {
518             node = (Node) (this.nodeList.get(i));
519             if (node.textarray == oldtextarray)
520             {
521                 node.textarray = newtextarray;
522             }
523         }
524     }
525 
526     /***
527      * Adds a new line node. Used for creating preformatted text from Word2000.
528      * @return new line node
529      */
530     public Node newLineNode()
531     {
532         Node node = newNode();
533 
534         node.textarray = this.lexbuf;
535         node.start = this.lexsize;
536         addCharToLexer('\n');
537         node.end = this.lexsize;
538         return node;
539     }
540 
541     /***
542      * Has end of input stream been reached?
543      * @return <code>true</code> if end of input stream been reached
544      */
545     public boolean endOfInput()
546     {
547         return this.in.isEndOfStream();
548     }
549 
550     /***
551      * Adds a byte to lexer buffer.
552      * @param c byte to add
553      */
554     public void addByte(int c)
555     {
556         if (this.lexsize + 1 >= this.lexlength)
557         {
558             while (this.lexsize + 1 >= this.lexlength)
559             {
560                 if (this.lexlength == 0)
561                 {
562                     this.lexlength = 8192;
563                 }
564                 else
565                 {
566                     this.lexlength = this.lexlength * 2;
567                 }
568             }
569 
570             byte[] temp = this.lexbuf;
571             this.lexbuf = new byte[this.lexlength];
572             if (temp != null)
573             {
574                 System.arraycopy(temp, 0, this.lexbuf, 0, temp.length);
575                 updateNodeTextArrays(temp, this.lexbuf);
576             }
577         }
578 
579         this.lexbuf[this.lexsize++] = (byte) c;
580         this.lexbuf[this.lexsize] = (byte) '\0'; // debug
581     }
582 
583     /***
584      * Substitute the last char in buffer.
585      * @param c new char
586      */
587     public void changeChar(byte c)
588     {
589         if (this.lexsize > 0)
590         {
591             this.lexbuf[this.lexsize - 1] = c;
592         }
593     }
594 
595     /***
596      * Store char c as UTF-8 encoded byte stream.
597      * @param c char to store
598      */
599     public void addCharToLexer(int c)
600     {
601         // Allow only valid XML characters. See: http://www.w3.org/TR/2004/REC-xml-20040204/#NT-Char
602         // Fix by Pablo Mayrgundter 17-08-2004
603 
604         if ((this.configuration.xmlOut || this.configuration.xHTML) // only for xml output
605             && !((c >= 0x20 && c <= 0xD7FF) // Check the common-case first.
606                 || c == 0x9 || c == 0xA || c == 0xD // Then white-space.
607                 || (c >= 0xE000 && c <= 0xFFFD) // Then high-range unicode.
608             || (c >= 0x10000 && c <= 0x10FFFF)))
609         {
610             return;
611         }
612 
613         int i = 0;
614         int[] count = new int[]{0};
615         byte[] buf = new byte[10]; // unsigned char
616 
617         boolean err = EncodingUtils.encodeCharToUTF8Bytes(c, buf, null, count);
618         if (err)
619         {
620             // replacement char 0xFFFD encoded as UTF-8
621             buf[0] = (byte) 0xEF;
622             buf[1] = (byte) 0xBF;
623             buf[2] = (byte) 0xBD;
624             count[0] = 3;
625         }
626 
627         for (i = 0; i < count[0]; i++)
628         {
629             addByte(buf[i]); // uint
630         }
631 
632     }
633 
634     /***
635      * Adds a string to lexer buffer.
636      * @param str String to add
637      */
638     public void addStringToLexer(String str)
639     {
640         for (int i = 0; i < str.length(); i++)
641         {
642             addCharToLexer(str.charAt(i));
643         }
644     }
645 
646     /***
647      * Parse an html entity.
648      * @param mode mode
649      */
650     public void parseEntity(short mode)
651     {
652         // No longer attempts to insert missing ';' for unknown
653         // entities unless one was present already, since this
654         // gives unexpected results.
655         // 
656         // For example: <a href="something.htm?foo&bar&fred">
657         // was tidied to: <a href="something.htm?foo&amp;bar;&amp;fred;">
658         // rather than: <a href="something.htm?foo&amp;bar&amp;fred">
659         // 
660         // My thanks for Maurice Buxton for spotting this.
661         // 
662         // Also Randy Waki pointed out the following case for the
663         // 04 Aug 00 version (bug #433012):
664         // 
665         // For example: <a href="something.htm?id=1&lang=en">
666         // was tidied to: <a href="something.htm?id=1&lang;=en">
667         // rather than: <a href="something.htm?id=1&amp;lang=en">
668         //
669         // where "lang" is a known entity (#9001), but browsers would
670         // misinterpret "&lang;" because it had a value > 256.
671         //
672         // So the case of an apparently known entity with a value > 256 and
673         // missing a semicolon is handled specially.
674         //
675         // "ParseEntity" is also a bit of a misnomer - it handles entities and
676         // numeric character references. Invalid NCR's are now reported.
677 
678         int start;
679         boolean first = true;
680         boolean semicolon = false;
681         int c, ch, startcol;
682         String str;
683 
684         start = this.lexsize - 1; // to start at "&"
685         startcol = this.in.getCurcol() - 1;
686 
687         while ((c = this.in.readChar()) != StreamIn.END_OF_STREAM)
688         {
689             if (c == ';')
690             {
691                 semicolon = true;
692                 break;
693             }
694 
695             if (first && c == '#')
696             {
697                 // #431953 - start RJ
698                 if (!this.configuration.ncr
699                     || this.configuration.getInCharEncoding() == Configuration.BIG5
700                     || this.configuration.getInCharEncoding() == Configuration.SHIFTJIS)
701                 {
702                     this.in.ungetChar(c);
703                     return;
704                 }
705                 // #431953 - end RJ
706 
707                 addCharToLexer(c);
708                 first = false;
709                 continue;
710             }
711 
712             first = false;
713 
714             if (TidyUtils.isNamechar((char) c))
715             {
716                 addCharToLexer(c);
717                 continue;
718             }
719 
720             // otherwise put it back
721             this.in.ungetChar(c);
722             break;
723         }
724 
725         str = TidyUtils.getString(this.lexbuf, start, this.lexsize - start);
726 
727         if ("&apos".equals(str) && !configuration.xmlOut && !this.isvoyager && !configuration.xHTML)
728         {
729             report.entityError(this, Report.APOS_UNDEFINED, str, 39);
730         }
731 
732         ch = EntityTable.getDefaultEntityTable().entityCode(str);
733 
734         // drops invalid numeric entities from XML mode. Fix by Pablo Mayrgundter 17-08-2004
735         // if ((this.configuration.xmlOut || this.configuration.xHTML) // only for xml output
736         // && !((ch >= 0x20 && ch <= 0xD7FF) // Check the common-case first.
737         // || ch == 0x9 || ch == 0xA || ch == 0xD // Then white-space.
738         // || (ch >= 0xE000 && ch <= 0xFFFD)))
739         // {
740         // this.lexsize = start;
741         // return;
742         // }
743 
744         // deal with unrecognized or invalid entities
745         // #433012 - fix by Randy Waki 17 Feb 01
746         // report invalid NCR's - Terry Teague 01 Sep 01
747         if (ch <= 0 || (ch >= 256 && c != ';'))
748         {
749             // set error position just before offending character
750             this.lines = this.in.getCurline();
751             this.columns = startcol;
752 
753             if (this.lexsize > start + 1)
754             {
755                 if (ch >= 128 && ch <= 159)
756                 {
757                     // invalid numeric character reference
758                     int c1 = 0;
759 
760                     if (configuration.replacementCharEncoding == Configuration.WIN1252)
761                     {
762                         c1 = EncodingUtils.decodeWin1252(ch);
763                     }
764                     else if (configuration.replacementCharEncoding == Configuration.MACROMAN)
765                     {
766                         c1 = EncodingUtils.decodeMacRoman(ch);
767                     }
768 
769                     // "or" DISCARDED_CHAR with the other errors if discarding char; otherwise default is replacing
770 
771                     int replaceMode = c1 != 0 ? Report.REPLACED_CHAR : Report.DISCARDED_CHAR;
772 
773                     if (c != ';') /* issue warning if not terminated by ';' */
774                     {
775                         report.entityError(this, Report.MISSING_SEMICOLON_NCR, str, c);
776                     }
777 
778                     report.encodingError(this, (short) (Report.INVALID_NCR | replaceMode), ch);
779 
780                     if (c1 != 0)
781                     {
782                         // make the replacement
783                         this.lexsize = start;
784                         addCharToLexer(c1);
785                         semicolon = false;
786                     }
787                     else
788                     {
789                         /* discard */
790                         this.lexsize = start;
791                         semicolon = false;
792                     }
793 
794                 }
795                 else
796                 {
797                     report.entityError(this, Report.UNKNOWN_ENTITY, str, ch);
798                 }
799 
800                 if (semicolon)
801                 {
802                     addCharToLexer(';');
803                 }
804             }
805             else
806             {
807                 // naked &
808                 report.entityError(this, Report.UNESCAPED_AMPERSAND, str, ch);
809             }
810         }
811         else
812         {
813             // issue warning if not terminated by ';'
814             if (c != ';')
815             {
816                 // set error position just before offending character
817                 this.lines = this.in.getCurline();
818                 this.columns = startcol;
819                 report.entityError(this, Report.MISSING_SEMICOLON, str, c);
820             }
821 
822             this.lexsize = start;
823 
824             if (ch == 160 && TidyUtils.toBoolean(mode & PREFORMATTED))
825             {
826                 ch = ' ';
827             }
828 
829             addCharToLexer(ch);
830 
831             if (ch == '&' && !this.configuration.quoteAmpersand)
832             {
833                 addCharToLexer('a');
834                 addCharToLexer('m');
835                 addCharToLexer('p');
836                 addCharToLexer(';');
837             }
838         }
839     }
840 
841     /***
842      * Parses a tag name.
843      * @return first char after the tag name
844      */
845     public char parseTagName()
846     {
847         int c;
848 
849         // fold case of first char in buffer
850         c = this.lexbuf[this.txtstart];
851 
852         if (!this.configuration.xmlTags && TidyUtils.isUpper((char) c))
853         {
854             c = TidyUtils.toLower((char) c);
855             this.lexbuf[this.txtstart] = (byte) c;
856         }
857 
858         while ((c = this.in.readChar()) != StreamIn.END_OF_STREAM)
859         {
860             if (!TidyUtils.isNamechar((char) c))
861             {
862                 break;
863             }
864 
865             // fold case of subsequent chars
866             if (!this.configuration.xmlTags && TidyUtils.isUpper((char) c))
867             {
868                 c = TidyUtils.toLower((char) c);
869             }
870 
871             addCharToLexer(c);
872         }
873 
874         this.txtend = this.lexsize;
875         return (char) c;
876     }
877 
878     /***
879      * calls addCharToLexer for any char in the string.
880      * @param str input String
881      */
882     public void addStringLiteral(String str)
883     {
884         int len = str.length();
885         for (int i = 0; i < len; i++)
886         {
887             addCharToLexer(str.charAt(i));
888         }
889     }
890 
891     /***
892      * calls addCharToLexer for any char in the string till len is reached.
893      * @param str input String
894      * @param len length of the substring to be added
895      */
896     void addStringLiteralLen(String str, int len)
897     {
898         int strlen = str.length();
899         if (strlen < len)
900         {
901             len = strlen;
902         }
903         for (int i = 0; i < len; i++)
904         {
905             addCharToLexer(str.charAt(i));
906         }
907     }
908 
909     /***
910      * Choose what version to use for new doctype.
911      * @return html version constant
912      */
913     public short htmlVersion()
914     {
915         if (TidyUtils.toBoolean(versions & Dict.VERS_HTML20))
916         {
917             return Dict.VERS_HTML20;
918         }
919 
920         if (!(this.configuration.xmlOut | this.configuration.xmlTags | this.isvoyager)
921             && TidyUtils.toBoolean(versions & Dict.VERS_HTML32))
922         {
923             return Dict.VERS_HTML32;
924         }
925         if (TidyUtils.toBoolean(versions & Dict.VERS_XHTML11))
926         {
927             return Dict.VERS_XHTML11;
928         }
929         if (TidyUtils.toBoolean(versions & Dict.VERS_HTML40_STRICT))
930         {
931             return Dict.VERS_HTML40_STRICT;
932         }
933 
934         if (TidyUtils.toBoolean(versions & Dict.VERS_HTML40_LOOSE))
935         {
936             return Dict.VERS_HTML40_LOOSE;
937         }
938 
939         if (TidyUtils.toBoolean(versions & Dict.VERS_FRAMESET))
940         {
941             return Dict.VERS_FRAMESET;
942         }
943 
944         return Dict.VERS_UNKNOWN;
945     }
946 
947     /***
948      * Choose what version to use for new doctype.
949      * @return html version name
950      */
951     public String htmlVersionName()
952     {
953         short guessed;
954         int j;
955 
956         guessed = apparentVersion();
957 
958         for (j = 0; j < W3CVERSION.length; ++j)
959         {
960             if (guessed == W3CVERSION[j].code)
961             {
962                 if (this.isvoyager)
963                 {
964                     return W3CVERSION[j].voyagerName;
965                 }
966 
967                 return W3CVERSION[j].name;
968             }
969         }
970 
971         return null;
972     }
973 
974     /***
975      * Add meta element for Tidy. If the meta tag is already present, update release date.
976      * @param root root node
977      * @return <code>true</code> if the tag has been added
978      */
979     public boolean addGenerator(Node root)
980     {
981         AttVal attval;
982         Node node;
983         Node head = root.findHEAD(this.configuration.tt);
984 
985         if (head != null)
986         {
987             String meta = "HTML Tidy for Java (vers. " + Report.RELEASE_DATE_STRING + "), see www.w3.org";
988 
989             for (node = head.content; node != null; node = node.next)
990             {
991                 if (node.tag == this.configuration.tt.tagMeta)
992                 {
993                     attval = node.getAttrByName("name");
994 
995                     if (attval != null && attval.value != null && "generator".equalsIgnoreCase(attval.value))
996                     {
997                         attval = node.getAttrByName("content");
998 
999                         if (attval != null
1000                             && attval.value != null
1001                             && attval.value.length() >= 9
1002                             && "HTML Tidy".equalsIgnoreCase(attval.value.substring(0, 9)))
1003                         {
1004                             attval.value = meta;
1005                             return false;
1006                         }
1007                     }
1008                 }
1009             }
1010 
1011             node = this.inferredTag("meta");
1012             node.addAttribute("content", meta);
1013             node.addAttribute("name", "generator");
1014             head.insertNodeAtStart(node);
1015             return true;
1016         }
1017 
1018         return false;
1019     }
1020 
1021     /***
1022      * Check system keywords (keywords should be uppercase).
1023      * @param doctype doctype node
1024      * @return true if doctype keywords are all uppercase
1025      */
1026     public boolean checkDocTypeKeyWords(Node doctype)
1027     {
1028         int len = doctype.end - doctype.start;
1029         String s = TidyUtils.getString(this.lexbuf, doctype.start, len);
1030 
1031         return !(TidyUtils.findBadSubString("SYSTEM", s, len)
1032             || TidyUtils.findBadSubString("PUBLIC", s, len)
1033             || TidyUtils.findBadSubString("//DTD", s, len)
1034             || TidyUtils.findBadSubString("//W3C", s, len) || TidyUtils.findBadSubString("//EN", s, len));
1035     }
1036 
1037     /***
1038      * Examine DOCTYPE to identify version.
1039      * @param doctype doctype node
1040      * @return version code
1041      */
1042     public short findGivenVersion(Node doctype)
1043     {
1044         String p, s;
1045         int i, j;
1046         int len;
1047         String str1;
1048         String str2;
1049 
1050         // if root tag for doctype isn't html give up now
1051         str1 = TidyUtils.getString(this.lexbuf, doctype.start, 5);
1052         if (!"html ".equalsIgnoreCase(str1))
1053         {
1054             return 0;
1055         }
1056 
1057         if (!checkDocTypeKeyWords(doctype))
1058         {
1059             report.warning(this, doctype, null, Report.DTYPE_NOT_UPPER_CASE);
1060         }
1061 
1062         // give up if all we are given is the system id for the doctype
1063         str1 = TidyUtils.getString(this.lexbuf, doctype.start + 5, 7);
1064         if ("SYSTEM ".equalsIgnoreCase(str1))
1065         {
1066             // but at least ensure the case is correct
1067             if (!str1.substring(0, 6).equals("SYSTEM"))
1068             {
1069                 System.arraycopy(TidyUtils.getBytes("SYSTEM"), 0, this.lexbuf, doctype.start + 5, 6);
1070             }
1071             return 0; // unrecognized
1072         }
1073 
1074         if ("PUBLIC ".equalsIgnoreCase(str1))
1075         {
1076             if (!str1.substring(0, 6).equals("PUBLIC"))
1077             {
1078                 System.arraycopy(TidyUtils.getBytes("PUBLIC "), 0, this.lexbuf, doctype.start + 5, 6);
1079             }
1080         }
1081         else
1082         {
1083             this.badDoctype = true;
1084         }
1085 
1086         for (i = doctype.start; i < doctype.end; ++i)
1087         {
1088             if (this.lexbuf[i] == (byte) '"')
1089             {
1090                 str1 = TidyUtils.getString(this.lexbuf, i + 1, 12);
1091                 str2 = TidyUtils.getString(this.lexbuf, i + 1, 13);
1092                 if (str1.equals("-//W3C//DTD "))
1093                 {
1094                     // compute length of identifier e.g. "HTML 4.0 Transitional"
1095                     for (j = i + 13; j < doctype.end && this.lexbuf[j] != (byte) '/'; ++j)
1096                     {
1097                         //
1098                     }
1099                     len = j - i - 13;
1100                     p = TidyUtils.getString(this.lexbuf, i + 13, len);
1101 
1102                     for (j = 1; j < W3CVERSION.length; ++j)
1103                     {
1104                         s = W3CVERSION[j].name;
1105                         if (len == s.length() && s.equals(p))
1106                         {
1107                             return W3CVERSION[j].code;
1108                         }
1109                     }
1110 
1111                     // else unrecognized version
1112                 }
1113                 else if (str2.equals("-//IETF//DTD "))
1114                 {
1115                     // compute length of identifier e.g. "HTML 2.0"
1116                     for (j = i + 14; j < doctype.end && this.lexbuf[j] != (byte) '/'; ++j)
1117                     {
1118                         //
1119                     }
1120                     len = j - i - 14;
1121 
1122                     p = TidyUtils.getString(this.lexbuf, i + 14, len);
1123                     s = W3CVERSION[0].name;
1124                     if (len == s.length() && s.equals(p))
1125                     {
1126                         return W3CVERSION[0].code;
1127                     }
1128 
1129                     // else unrecognized version
1130                 }
1131                 break;
1132             }
1133         }
1134 
1135         return 0;
1136     }
1137 
1138     /***
1139      * Fix xhtml namespace.
1140      * @param root root Node
1141      * @param profile current profile
1142      */
1143     public void fixHTMLNameSpace(Node root, String profile)
1144     {
1145         Node node;
1146         AttVal attr;
1147 
1148         node = root.content;
1149         while (node != null && node.tag != this.configuration.tt.tagHtml)
1150         {
1151             node = node.next;
1152         }
1153 
1154         if (node != null)
1155         {
1156 
1157             for (attr = node.attributes; attr != null; attr = attr.next)
1158             {
1159                 if (attr.attribute.equals("xmlns"))
1160                 {
1161                     break;
1162                 }
1163 
1164             }
1165 
1166             if (attr != null)
1167             {
1168                 if (!attr.value.equals(profile))
1169                 {
1170                     report.warning(this, node, null, Report.INCONSISTENT_NAMESPACE);
1171                     attr.value = profile;
1172                 }
1173             }
1174             else
1175             {
1176                 attr = new AttVal(node.attributes, null, '"', "xmlns", profile);
1177                 attr.dict = AttributeTable.getDefaultAttributeTable().findAttribute(attr);
1178                 node.attributes = attr;
1179             }
1180         }
1181     }
1182 
1183     /***
1184      * Put DOCTYPE declaration between the &lt:?xml version "1.0" ... ?&gt; declaration, if any, and the
1185      * <code>html</code> tag. Should also work for any comments, etc. that may precede the <code>html</code> tag.
1186      * @param root root node
1187      * @return new doctype node
1188      */
1189     Node newXhtmlDocTypeNode(Node root)
1190     {
1191         Node html = root.findHTML(this.configuration.tt);
1192         if (html == null)
1193         {
1194             return null;
1195         }
1196 
1197         Node newdoctype = newNode();
1198         newdoctype.setType(Node.DOCTYPE_TAG);
1199         newdoctype.next = html;
1200         newdoctype.parent = root;
1201         newdoctype.prev = null;
1202 
1203         if (html == root.content)
1204         {
1205             // No <?xml ... ?> declaration.
1206             root.content.prev = newdoctype;
1207             root.content = newdoctype;
1208             newdoctype.prev = null;
1209         }
1210         else
1211         {
1212             // we have an <?xml ... ?> declaration.
1213             newdoctype.prev = html.prev;
1214             newdoctype.prev.next = newdoctype;
1215         }
1216         html.prev = newdoctype;
1217         return newdoctype;
1218     }
1219 
1220     /***
1221      * Adds a new xhtml doctype to the document.
1222      * @param root root node
1223      * @return <code>true</code> if a doctype has been added
1224      */
1225     public boolean setXHTMLDocType(Node root)
1226     {
1227         String fpi = " ";
1228         String sysid = "";
1229         String namespace = XHTML_NAMESPACE;
1230         String dtdsub = null;
1231         Node doctype;
1232         int dtdlen = 0;
1233 
1234         doctype = root.findDocType();
1235 
1236         fixHTMLNameSpace(root, namespace); // #427839 - fix by Evan Lenz 05 Sep 00
1237 
1238         if (this.configuration.docTypeMode == Configuration.DOCTYPE_OMIT)
1239         {
1240             if (doctype != null)
1241             {
1242                 Node.discardElement(doctype);
1243             }
1244             return true;
1245         }
1246 
1247         if (this.configuration.docTypeMode == Configuration.DOCTYPE_AUTO)
1248         {
1249             // see what flavor of XHTML this document matches
1250             if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML40_STRICT))
1251             {
1252                 // use XHTML strict
1253                 fpi = "-//W3C//DTD XHTML 1.0 Strict//EN";
1254                 sysid = VOYAGER_STRICT;
1255             }
1256             else if (TidyUtils.toBoolean(this.versions & Dict.VERS_FRAMESET))
1257             {
1258                 // use XHTML frames
1259                 fpi = "-//W3C//DTD XHTML 1.0 Frameset//EN";
1260                 sysid = VOYAGER_FRAMESET;
1261             }
1262             else if (TidyUtils.toBoolean(this.versions & Dict.VERS_LOOSE))
1263             {
1264                 fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN";
1265                 sysid = VOYAGER_LOOSE;
1266             }
1267             else if (TidyUtils.toBoolean(this.versions & Dict.VERS_XHTML11))
1268             {
1269                 // use XHTML 1.1
1270                 fpi = "-//W3C//DTD XHTML 1.1//EN";
1271                 sysid = VOYAGER_11;
1272             }
1273             else
1274             {
1275                 // proprietary
1276                 fpi = null;
1277                 sysid = "";
1278                 if (doctype != null)// #473490 - fix by Bjšrn Hšhrmann 10 Oct 01
1279                 {
1280                     Node.discardElement(doctype);
1281                 }
1282             }
1283         }
1284         else if (this.configuration.docTypeMode == Configuration.DOCTYPE_STRICT)
1285         {
1286             fpi = "-//W3C//DTD XHTML 1.0 Strict//EN";
1287             sysid = VOYAGER_STRICT;
1288         }
1289         else if (this.configuration.docTypeMode == Configuration.DOCTYPE_LOOSE)
1290         {
1291             fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN";
1292             sysid = VOYAGER_LOOSE;
1293         }
1294 
1295         if (this.configuration.docTypeMode == Configuration.DOCTYPE_USER && this.configuration.docTypeStr != null)
1296         {
1297             fpi = this.configuration.docTypeStr;
1298             sysid = "";
1299         }
1300 
1301         if (fpi == null)
1302         {
1303             return false;
1304         }
1305 
1306         if (doctype != null)
1307         {
1308             // Look for internal DTD subset
1309             if (configuration.xHTML || configuration.xmlOut)
1310             {
1311 
1312                 int len = doctype.end - doctype.start + 1;
1313                 String start = TidyUtils.getString(this.lexbuf, doctype.start, len);
1314 
1315                 int dtdbeg = start.indexOf('[');
1316                 if (dtdbeg >= 0)
1317                 {
1318                     int dtdend = start.substring(dtdbeg).indexOf(']');
1319                     if (dtdend >= 0)
1320                     {
1321                         dtdlen = dtdend + 1;
1322                         dtdsub = start.substring(dtdbeg);
1323                     }
1324                 }
1325             }
1326         }
1327         else
1328         {
1329             if ((doctype = newXhtmlDocTypeNode(root)) == null)
1330             {
1331                 return false;
1332             }
1333         }
1334 
1335         this.txtstart = this.lexsize;
1336         this.txtend = this.lexsize;
1337 
1338         // add public identifier
1339         addStringLiteral("html PUBLIC ");
1340 
1341         // check if the fpi is quoted or not
1342         if (fpi.charAt(0) == '"')
1343         {
1344             addStringLiteral(fpi);
1345         }
1346         else
1347         {
1348             addStringLiteral("\"");
1349             addStringLiteral(fpi);
1350             addStringLiteral("\"");
1351         }
1352 
1353         if (this.configuration.wraplen != 0 && sysid.length() + 6 >= this.configuration.wraplen)
1354         {
1355             addStringLiteral("\n\"");
1356         }
1357         else
1358         {
1359             // FG: don't wrap
1360             addStringLiteral(" \"");
1361         }
1362 
1363         // add system identifier
1364         addStringLiteral(sysid);
1365         addStringLiteral("\"");
1366 
1367         if (dtdlen > 0 && dtdsub != null)
1368         {
1369             addCharToLexer(' ');
1370             addStringLiteralLen(dtdsub, dtdlen);
1371         }
1372 
1373         this.txtend = this.lexsize;
1374 
1375         int length = this.txtend - this.txtstart;
1376         doctype.textarray = new byte[length];
1377 
1378         System.arraycopy(this.lexbuf, this.txtstart, doctype.textarray, 0, length);
1379         doctype.start = 0;
1380         doctype.end = length;
1381 
1382         return false;
1383     }
1384 
1385     /***
1386      * Return the html version used in document.
1387      * @return version code
1388      */
1389     public short apparentVersion()
1390     {
1391         switch (this.doctype)
1392         {
1393             case Dict.VERS_UNKNOWN :
1394                 return htmlVersion();
1395 
1396             case Dict.VERS_HTML20 :
1397                 if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML20))
1398                 {
1399                     return Dict.VERS_HTML20;
1400                 }
1401 
1402                 break;
1403 
1404             case Dict.VERS_HTML32 :
1405                 if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML32))
1406                 {
1407                     return Dict.VERS_HTML32;
1408                 }
1409 
1410                 break; // to replace old version by new
1411 
1412             case Dict.VERS_HTML40_STRICT :
1413                 if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML40_STRICT))
1414                 {
1415                     return Dict.VERS_HTML40_STRICT;
1416                 }
1417 
1418                 break;
1419 
1420             case Dict.VERS_HTML40_LOOSE :
1421                 if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML40_LOOSE))
1422                 {
1423                     return Dict.VERS_HTML40_LOOSE;
1424                 }
1425 
1426                 break; // to replace old version by new
1427 
1428             case Dict.VERS_FRAMESET :
1429                 if (TidyUtils.toBoolean(this.versions & Dict.VERS_FRAMESET))
1430                 {
1431                     return Dict.VERS_FRAMESET;
1432                 }
1433 
1434                 break;
1435 
1436             case Dict.VERS_XHTML11 :
1437                 if (TidyUtils.toBoolean(this.versions & Dict.VERS_XHTML11))
1438                 {
1439                     return Dict.VERS_XHTML11;
1440                 }
1441 
1442                 break;
1443             default :
1444                 // should never reach here
1445                 break;
1446         }
1447 
1448         // kludge to avoid error appearing at end of file
1449         // it would be better to note the actual position
1450         // when first encountering the doctype declaration
1451 
1452         this.lines = 1;
1453         this.columns = 1;
1454 
1455         report.warning(this, null, null, Report.INCONSISTENT_VERSION);
1456         return this.htmlVersion();
1457     }
1458 
1459     /***
1460      * Fixup doctype if missing.
1461      * @param root root node
1462      * @return <code>false</code> if current version has not been identified
1463      */
1464     public boolean fixDocType(Node root)
1465     {
1466         Node doctype;
1467         int guessed = Dict.VERS_HTML40_STRICT, i;
1468 
1469         if (this.badDoctype)
1470         {
1471             report.warning(this, null, null, Report.MALFORMED_DOCTYPE);
1472         }
1473 
1474         doctype = root.findDocType();
1475 
1476         if (this.configuration.docTypeMode == Configuration.DOCTYPE_OMIT)
1477         {
1478             if (doctype != null)
1479             {
1480                 Node.discardElement(doctype);
1481             }
1482             return true;
1483         }
1484 
1485         if (this.configuration.xmlOut)
1486         {
1487             return true;
1488         }
1489 
1490         if (this.configuration.docTypeMode == Configuration.DOCTYPE_STRICT)
1491         {
1492             Node.discardElement(doctype);
1493             doctype = null;
1494             guessed = Dict.VERS_HTML40_STRICT;
1495         }
1496         else if (this.configuration.docTypeMode == Configuration.DOCTYPE_LOOSE)
1497         {
1498             Node.discardElement(doctype);
1499             doctype = null;
1500             guessed = Dict.VERS_HTML40_LOOSE;
1501         }
1502         else if (this.configuration.docTypeMode == Configuration.DOCTYPE_AUTO)
1503         {
1504             if (doctype != null)
1505             {
1506                 if (this.doctype == Dict.VERS_UNKNOWN)
1507                 {
1508                     return false;
1509                 }
1510 
1511                 switch (this.doctype)
1512                 {
1513                     case Dict.VERS_UNKNOWN :
1514                         return false;
1515 
1516                     case Dict.VERS_HTML20 :
1517                         if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML20))
1518                         {
1519                             return true;
1520                         }
1521 
1522                         break; // to replace old version by new
1523 
1524                     case Dict.VERS_HTML32 :
1525                         if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML32))
1526                         {
1527                             return true;
1528                         }
1529 
1530                         break; // to replace old version by new
1531 
1532                     case Dict.VERS_HTML40_STRICT :
1533                         if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML40_STRICT))
1534                         {
1535                             return true;
1536                         }
1537 
1538                         break; // to replace old version by new
1539 
1540                     case Dict.VERS_HTML40_LOOSE :
1541                         if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML40_LOOSE))
1542                         {
1543                             return true;
1544                         }
1545 
1546                         break; // to replace old version by new
1547 
1548                     case Dict.VERS_FRAMESET :
1549                         if (TidyUtils.toBoolean(this.versions & Dict.VERS_FRAMESET))
1550                         {
1551                             return true;
1552                         }
1553 
1554                         break; // to replace old version by new
1555 
1556                     case Dict.VERS_XHTML11 :
1557                         if (TidyUtils.toBoolean(this.versions & Dict.VERS_XHTML11))
1558                         {
1559                             return true;
1560                         }
1561 
1562                         break; // to replace old version by new
1563                     default :
1564                         // should never reach here
1565                         break;
1566                 }
1567 
1568                 // INCONSISTENT_VERSION warning is now issued by ApparentVersion()
1569             }
1570 
1571             // choose new doctype
1572             guessed = htmlVersion();
1573         }
1574 
1575         if (guessed == Dict.VERS_UNKNOWN)
1576         {
1577             return false;
1578         }
1579 
1580         // for XML use the Voyager system identifier
1581         if (this.configuration.xmlOut || this.configuration.xmlTags || this.isvoyager)
1582         {
1583             if (doctype != null)
1584             {
1585                 Node.discardElement(doctype);
1586             }
1587 
1588             fixHTMLNameSpace(root, XHTML_NAMESPACE);
1589 
1590             // Namespace is the same for all XHTML variants
1591             // Also, don't return yet. Still need to add DOCTYPE declaration.
1592             //
1593             // for (i = 0; i < W3CVersion.length; ++i)
1594             // {
1595             // if (guessed == W3CVersion[i].code)
1596             // {
1597             // fixHTMLNameSpace(root, W3CVersion[i].profile);
1598             // break;
1599             // }
1600             // }
1601             // return true;
1602         }
1603 
1604         if (doctype == null)
1605         {
1606             if ((doctype = newXhtmlDocTypeNode(root)) == null)
1607             {
1608                 return false;
1609             }
1610         }
1611 
1612         this.txtstart = this.lexsize;
1613         this.txtend = this.lexsize;
1614 
1615         // use the appropriate public identifier
1616         addStringLiteral("html PUBLIC ");
1617 
1618         if (this.configuration.docTypeMode == Configuration.DOCTYPE_USER
1619             && this.configuration.docTypeStr != null
1620             && this.configuration.docTypeStr.length() > 0)
1621         {
1622             // check if the fpi is quoted or not
1623             if (this.configuration.docTypeStr.charAt(0) == '"')
1624             {
1625                 addStringLiteral(this.configuration.docTypeStr);
1626             }
1627             else
1628             {
1629                 addStringLiteral("\""); // #431889 - fix by Dave Bryan 04 Jan 2001
1630                 addStringLiteral(this.configuration.docTypeStr);
1631                 addStringLiteral("\""); // #431889 - fix by Dave Bryan 04 Jan 2001
1632             }
1633         }
1634         else if (guessed == Dict.VERS_HTML20)
1635         {
1636             addStringLiteral("\"-//IETF//DTD HTML 2.0//EN\"");
1637         }
1638         else
1639         {
1640             addStringLiteral("\"-//W3C//DTD ");
1641 
1642             for (i = 0; i < W3CVERSION.length; ++i)
1643             {
1644                 if (guessed == W3CVERSION[i].code)
1645                 {
1646                     addStringLiteral(W3CVERSION[i].name);
1647                     break;
1648                 }
1649             }
1650 
1651             addStringLiteral("//EN\"");
1652         }
1653 
1654         this.txtend = this.lexsize;
1655 
1656         int length = this.txtend - this.txtstart;
1657         doctype.textarray = new byte[length];
1658 
1659         System.arraycopy(this.lexbuf, this.txtstart, doctype.textarray, 0, length);
1660         doctype.start = 0;
1661         doctype.end = length;
1662 
1663         return true;
1664     }
1665 
1666     /***
1667      * Ensure XML document starts with <code>&lt;?XML version="1.0"?&gt;</code>. Add encoding attribute if not using
1668      * ASCII or UTF-8 output.
1669      * @param root root node
1670      * @return always true
1671      */
1672     public boolean fixXmlDecl(Node root)
1673     {
1674         Node xml;
1675         AttVal version;
1676         AttVal encoding;
1677 
1678         if (root.content != null && root.content.type == Node.XML_DECL)
1679         {
1680             xml = root.content;
1681         }
1682         else
1683         {
1684             xml = newNode(Node.XML_DECL, this.lexbuf, 0, 0);
1685             xml.next = root.content;
1686 
1687             if (root.content != null)
1688             {
1689                 root.content.prev = xml;
1690                 xml.next = root.content;
1691             }
1692 
1693             root.content = xml;
1694         }
1695 
1696         version = xml.getAttrByName("version");
1697         encoding = xml.getAttrByName("encoding");
1698 
1699         // We need to insert a check if declared encoding and output encoding mismatch
1700         // and fix the Xml declaration accordingly!!!
1701         if (encoding == null && this.configuration.getOutCharEncoding() != Configuration.UTF8)
1702         {
1703             if (this.configuration.getOutCharEncoding() == Configuration.LATIN1)
1704             {
1705                 xml.addAttribute("encoding", "iso-8859-1");
1706             }
1707             if (this.configuration.getOutCharEncoding() == Configuration.ISO2022)
1708             {
1709                 xml.addAttribute("encoding", "iso-2022");
1710             }
1711         }
1712 
1713         if (version == null)
1714         {
1715             xml.addAttribute("version", "1.0");
1716         }
1717 
1718         return true;
1719     }
1720 
1721     /***
1722      * Generates and inserts a new node.
1723      * @param name tag name
1724      * @return generated node
1725      */
1726     public Node inferredTag(String name)
1727     {
1728         Node node;
1729 
1730         node = newNode(Node.START_TAG, this.lexbuf, this.txtstart, this.txtend, name);
1731         node.implicit = true;
1732         return node;
1733     }
1734 
1735     /***
1736      * Create a text node for the contents of a CDATA element like style or script which ends with &lt;/foo> for some
1737      * foo.
1738      * @param container container node
1739      * @return cdata node
1740      */
1741     public Node getCDATA(Node container)
1742     {
1743         int c, lastc, start, len, i;
1744         int qt = 0;
1745         int esc = 0;
1746         String str;
1747         boolean endtag = false;
1748         boolean begtag = false;
1749 
1750         if (container.isJavaScript())
1751         {
1752             esc = '//';
1753         }
1754 
1755         this.lines = this.in.getCurline();
1756         this.columns = this.in.getCurcol();
1757         this.waswhite = false;
1758         this.txtstart = this.lexsize;
1759         this.txtend = this.lexsize;
1760 
1761         lastc = '\0';
1762         start = -1;
1763 
1764         while ((c = this.in.readChar()) != StreamIn.END_OF_STREAM)
1765         {
1766             // treat \r\n as \n and \r as \n
1767             if (qt > 0)
1768             {
1769                 // #598860 script parsing fails with quote chars
1770                 // A quoted string is ended by the quotation character, or end of line
1771                 if ((c == '\r' || c == '\n' || c == qt) && (!TidyUtils.toBoolean(esc) || lastc != esc))
1772                 {
1773                     qt = 0;
1774                 }
1775                 else if (c == '/' && lastc == '<')
1776                 {
1777                     start = this.lexsize + 1; // to first letter
1778                 }
1779 
1780                 else if (c == '>' && start >= 0)
1781                 {
1782                     len = this.lexsize - start;
1783 
1784                     this.lines = this.in.getCurline();
1785                     this.columns = this.in.getCurcol() - 3;
1786 
1787                     report.warning(this, null, null, Report.BAD_CDATA_CONTENT);
1788 
1789                     // if javascript insert backslash before /
1790                     if (TidyUtils.toBoolean(esc))
1791                     {
1792                         for (i = this.lexsize; i > start - 1; --i)
1793                         {
1794                             this.lexbuf[i] = this.lexbuf[i - 1];
1795                         }
1796 
1797                         this.lexbuf[start - 1] = (byte) esc;
1798                         this.lexsize++;
1799                     }
1800 
1801                     start = -1;
1802                 }
1803             }
1804             else if (TidyUtils.isQuote(c) && (!TidyUtils.toBoolean(esc) || lastc != esc))
1805             {
1806                 qt = c;
1807             }
1808             else if (c == '<')
1809             {
1810                 start = this.lexsize + 1; // to first letter
1811                 endtag = false;
1812                 begtag = true;
1813             }
1814             else if (c == '!' && lastc == '<') // Cancel start tag
1815             {
1816                 start = -1;
1817                 endtag = false;
1818                 begtag = false;
1819             }
1820             else if (c == '/' && lastc == '<')
1821             {
1822                 start = this.lexsize + 1; // to first letter
1823                 endtag = true;
1824                 begtag = false;
1825             }
1826             else if (c == '>' && start >= 0) // End of begin or end tag
1827             {
1828                 int decr = 2;
1829 
1830                 if (endtag && ((len = this.lexsize - start) == container.element.length()))
1831                 {
1832 
1833                     str = TidyUtils.getString(this.lexbuf, start, len);
1834                     if (container.element.equalsIgnoreCase(str))
1835                     {
1836                         this.txtend = start - decr;
1837                         this.lexsize = start - decr; // #433857 - fix by Huajun Zeng 26 Apr 01
1838                         break;
1839                     }
1840                 }
1841 
1842                 // Unquoted markup will end SCRIPT or STYLE elements
1843 
1844                 this.lines = this.in.getCurline();
1845                 this.columns = this.in.getCurcol() - 3;
1846 
1847                 report.warning(this, null, null, Report.BAD_CDATA_CONTENT);
1848                 if (begtag)
1849                 {
1850                     decr = 1;
1851                 }
1852                 this.txtend = start - decr;
1853                 this.lexsize = start - decr;
1854                 break;
1855             }
1856             // #427844 - fix by Markus Hoenicka 21 Oct 00
1857             else if (c == '\r')
1858             {
1859                 if (begtag || endtag)
1860                 {
1861                     continue; // discard whitespace in endtag
1862                 }
1863 
1864                 c = this.in.readChar();
1865 
1866                 if (c != '\n')
1867                 {
1868                     this.in.ungetChar(c);
1869                 }
1870 
1871                 c = '\n';
1872 
1873             }
1874             else if ((c == '\n' || c == '\t' || c == ' ') && (begtag || endtag))
1875             {
1876                 continue; // discard whitespace in endtag
1877             }
1878 
1879             addCharToLexer(c);
1880             this.txtend = this.lexsize;
1881             lastc = c;
1882         }
1883 
1884         if (c == StreamIn.END_OF_STREAM)
1885         {
1886             report.warning(this, container, null, Report.MISSING_ENDTAG_FOR);
1887         }
1888 
1889         if (this.txtend > this.txtstart)
1890         {
1891             this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend);
1892             return this.token;
1893         }
1894 
1895         return null;
1896     }
1897 
1898     /**
1899      * Flags the current token as pushed back, so the next <code>getToken</code> call returns it again rather than
1900      * reading further input; a pushed text node is held back while inline elements are still pending insertion.
1901      */
1902     public void ungetToken()
1903     {
1904         pushed = true;
1905     }
1906 
1907     /***
1908      * Gets a token.
1909      * @param mode one of the following:
1910      * <ul>
1911      * <li><code>MixedContent</code>-- for elements which don't accept PCDATA</li>
1912      * <li><code>Preformatted</code>-- white space preserved as is</li>
1913      * <li><code>IgnoreMarkup</code>-- for CDATA elements such as script, style</li>
1914      * </ul>
1915      * @return next Node
1916      */
1917     public Node getToken(short mode)
1918     {
1919         int c = 0;
1920         int badcomment = 0;
1921         // pass by reference
1922         boolean[] isempty = new boolean[1];
1923         boolean inDTDSubset = false;
1924         AttVal attributes = null;
1925 
1926         if (this.pushed)
1927         {
1928             // duplicate inlines in preference to pushed text nodes when appropriate
1929             if (this.token.type != Node.TEXT_NODE || (this.insert == -1 && this.inode == null))
1930             {
1931                 this.pushed = false;
1932                 return this.token;
1933             }
1934         }
1935 
1936         // at start of block elements, unclosed inline
1937         if (this.insert != -1 || this.inode != null)
1938         {
1939             return insertedToken();
1940         }
1941 
1942         this.lines = this.in.getCurline();
1943         this.columns = this.in.getCurcol();
1944         this.waswhite = false;
1945 
1946         this.txtstart = this.lexsize;
1947         this.txtend = this.lexsize;
1948 
1949         while ((c = this.in.readChar()) != StreamIn.END_OF_STREAM)
1950         {
1951             // FG fix for [427846] different from tidy
1952             // if (this.insertspace && (!TidyUtils.toBoolean(mode & IGNORE_WHITESPACE)))
1953             if (this.insertspace && mode != IGNORE_WHITESPACE)
1954             {
1955                 addCharToLexer(' ');
1956             }
1957             if (this.insertspace && (!TidyUtils.toBoolean(mode & IGNORE_WHITESPACE)))
1958             {
1959                 this.waswhite = true;
1960                 this.insertspace = false;
1961             }
1962 
1963             // treat \r\n as \n and \r as \n
1964             if (c == '\r')
1965             {
1966                 c = this.in.readChar();
1967 
1968                 if (c != '\n')
1969                 {
1970                     this.in.ungetChar(c);
1971                 }
1972 
1973                 c = '\n';
1974             }
1975 
1976             addCharToLexer(c);
1977 
1978             switch (this.state)
1979             {
1980                 case LEX_CONTENT :
1981                     // element content
1982 
1983                     // Discard white space if appropriate.
1984                     // Its cheaper to do this here rather than in parser methods for elements that
1985                     // don't have mixed content.
1986                     if (TidyUtils.isWhite((char) c) && (mode == IGNORE_WHITESPACE) && this.lexsize == this.txtstart + 1)
1987                     {
1988                         --this.lexsize;
1989                         this.waswhite = false;
1990                         this.lines = this.in.getCurline();
1991                         this.columns = this.in.getCurcol();
1992                         continue;
1993                     }
1994 
1995                     if (c == '<')
1996                     {
1997                         this.state = LEX_GT;
1998                         continue;
1999                     }
2000 
2001                     if (TidyUtils.isWhite((char) c))
2002                     {
2003                         // was previous char white?
2004                         if (this.waswhite)
2005                         {
2006                             if (mode != PREFORMATTED && mode != IGNORE_MARKUP)
2007                             {
2008                                 --this.lexsize;
2009                                 this.lines = this.in.getCurline();
2010                                 this.columns = this.in.getCurcol();
2011                             }
2012                         }
2013                         else
2014                         {
2015                             // prev char wasn't white
2016                             this.waswhite = true;
2017 
2018                             if (mode != PREFORMATTED && mode != IGNORE_MARKUP && c != ' ')
2019                             {
2020                                 changeChar((byte) ' ');
2021                             }
2022                         }
2023 
2024                         continue;
2025                     }
2026                     else if (c == '&' && mode != IGNORE_MARKUP)
2027                     {
2028                         parseEntity(mode);
2029                     }
2030 
2031                     // this is needed to avoid trimming trailing whitespace
2032                     if (mode == IGNORE_WHITESPACE)
2033                     {
2034                         mode = MIXED_CONTENT;
2035                     }
2036 
2037                     this.waswhite = false;
2038                     continue;
2039 
2040                 case LEX_GT :
2041                     // <
2042 
2043                     // check for endtag
2044                     if (c == '/')
2045                     {
2046                         c = this.in.readChar();
2047                         if (c == StreamIn.END_OF_STREAM)
2048                         {
2049                             this.in.ungetChar(c);
2050                             continue;
2051                         }
2052 
2053                         addCharToLexer(c);
2054 
2055                         if (TidyUtils.isLetter((char) c))
2056                         {
2057                             this.lexsize -= 3;
2058                             this.txtend = this.lexsize;
2059                             this.in.ungetChar(c);
2060                             this.state = LEX_ENDTAG;
2061                             this.lexbuf[this.lexsize] = (byte) '\0'; // debug
2062 
2063                             // changed from
2064                             // this.in.curcol -= 2;
2065                             this.columns -= 2;
2066 
2067                             // if some text before the </ return it now
2068                             if (this.txtend > this.txtstart)
2069                             {
2070                                 // trim space char before end tag
2071                                 if (mode == IGNORE_WHITESPACE && this.lexbuf[this.lexsize - 1] == (byte) ' ')
2072                                 {
2073                                     this.lexsize -= 1;
2074                                     this.txtend = this.lexsize;
2075                                 }
2076 
2077                                 this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend);
2078                                 return this.token;
2079                             }
2080 
2081                             continue; // no text so keep going
2082                         }
2083 
2084                         // otherwise treat as CDATA
2085                         this.waswhite = false;
2086                         this.state = LEX_CONTENT;
2087                         continue;
2088                     }
2089 
2090                     if (mode == IGNORE_MARKUP)
2091                     {
2092                         // otherwise treat as CDATA
2093                         this.waswhite = false;
2094                         this.state = LEX_CONTENT;
2095                         continue;
2096                     }
2097 
2098                     // look out for comments, doctype or marked sections this isn't quite right, but its getting there
2099                     if (c == '!')
2100                     {
2101                         c = this.in.readChar();
2102 
2103                         if (c == '-')
2104                         {
2105                             c = this.in.readChar();
2106 
2107                             if (c == '-')
2108                             {
2109                                 this.state = LEX_COMMENT; // comment
2110                                 this.lexsize -= 2;
2111                                 this.txtend = this.lexsize;
2112 
2113                                 // if some text before < return it now
2114                                 if (this.txtend > this.txtstart)
2115                                 {
2116                                     this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend);
2117                                     return this.token;
2118                                 }
2119 
2120                                 this.txtstart = this.lexsize;
2121                                 continue;
2122                             }
2123 
2124                             report.warning(this, null, null, Report.MALFORMED_COMMENT);
2125                         }
2126                         else if (c == 'd' || c == 'D')
2127                         {
2128                             this.state = LEX_DOCTYPE; // doctype
2129                             this.lexsize -= 2;
2130                             this.txtend = this.lexsize;
2131                             mode = IGNORE_WHITESPACE;
2132 
2133                             // skip until white space or '>'
2134 
2135                             for (;;)
2136                             {
2137                                 c = this.in.readChar();
2138 
2139                                 if (c == StreamIn.END_OF_STREAM || c == '>')
2140                                 {
2141                                     this.in.ungetChar(c);
2142                                     break;
2143                                 }
2144 
2145                                 if (!TidyUtils.isWhite((char) c))
2146                                 {
2147                                     continue;
2148                                 }
2149 
2150                                 // and skip to end of whitespace
2151 
2152                                 for (;;)
2153                                 {
2154                                     c = this.in.readChar();
2155 
2156                                     if (c == StreamIn.END_OF_STREAM || c == '>')
2157                                     {
2158                                         this.in.ungetChar(c);
2159                                         break;
2160                                     }
2161 
2162                                     if (TidyUtils.isWhite((char) c))
2163                                     {
2164                                         continue;
2165                                     }
2166 
2167                                     this.in.ungetChar(c);
2168                                     break;
2169                                 }
2170 
2171                                 break;
2172                             }
2173 
2174                             // if some text before < return it now
2175                             if (this.txtend > this.txtstart)
2176                             {
2177                                 this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend);
2178                                 return this.token;
2179                             }
2180 
2181                             this.txtstart = this.lexsize;
2182                             continue;
2183                         }
2184                         else if (c == '[')
2185                         {
2186                             // Word 2000 embeds <![if ...]> ... <![endif]> sequences
2187                             this.lexsize -= 2;
2188                             this.state = LEX_SECTION;
2189                             this.txtend = this.lexsize;
2190 
2191                             // if some text before < return it now
2192                             if (this.txtend > this.txtstart)
2193                             {
2194                                 this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend);
2195                                 return this.token;
2196                             }
2197 
2198                             this.txtstart = this.lexsize;
2199                             continue;
2200                         }
2201 
2202                         // otherwise swallow chars up to and including next '>'
2203                         while (true)
2204                         {
2205                             c = this.in.readChar();
2206                             if (c == '>')
2207                             {
2208                                 break;
2209                             }
2210                             if (c == -1)
2211                             {
2212                                 this.in.ungetChar(c);
2213                                 break;
2214                             }
2215                         }
2216 
2217                         this.lexsize -= 2;
2218                         this.lexbuf[this.lexsize] = (byte) '\0';
2219                         this.state = LEX_CONTENT;
2220                         continue;
2221                     }
2222 
2223                     // processing instructions
2224 
2225                     if (c == '?')
2226                     {
2227                         this.lexsize -= 2;
2228                         this.state = LEX_PROCINSTR;
2229                         this.txtend = this.lexsize;
2230 
2231                         // if some text before < return it now
2232                         if (this.txtend > this.txtstart)
2233                         {
2234                             this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend);
2235                             return this.token;
2236                         }
2237 
2238                         this.txtstart = this.lexsize;
2239                         continue;
2240                     }
2241 
2242                     // Microsoft ASP's e.g. <% ... server-code ... %>
2243                     if (c == '%')
2244                     {
2245                         this.lexsize -= 2;
2246                         this.state = LEX_ASP;
2247                         this.txtend = this.lexsize;
2248 
2249                         // if some text before < return it now
2250                         if (this.txtend > this.txtstart)
2251                         {
2252                             this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend);
2253                             return this.token;
2254                         }
2255 
2256                         this.txtstart = this.lexsize;
2257                         continue;
2258                     }
2259 
2260                     // Netscapes JSTE e.g. <# ... server-code ... #>
2261                     if (c == '#')
2262                     {
2263                         this.lexsize -= 2;
2264                         this.state = LEX_JSTE;
2265                         this.txtend = this.lexsize;
2266 
2267                         // if some text before < return it now
2268                         if (this.txtend > this.txtstart)
2269                         {
2270                             this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend);
2271                             return this.token;
2272                         }
2273 
2274                         this.txtstart = this.lexsize;
2275                         continue;
2276                     }
2277 
2278                     // check for start tag
2279                     if (TidyUtils.isLetter((char) c))
2280                     {
2281                         this.in.ungetChar(c); // push back letter
2282                         this.lexsize -= 2; // discard " <" + letter
2283                         this.txtend = this.lexsize;
2284                         this.state = LEX_STARTTAG; // ready to read tag name
2285 
2286                         // if some text before < return it now
2287                         if (this.txtend > this.txtstart)
2288                         {
2289                             this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend);
2290                             return this.token;
2291                         }
2292 
2293                         continue; // no text so keep going
2294                     }
2295 
2296                     // otherwise treat as CDATA
2297                     this.state = LEX_CONTENT;
2298                     this.waswhite = false;
2299                     continue;
2300 
2301                 case LEX_ENDTAG :
2302                     // </letter
2303                     this.txtstart = this.lexsize - 1;
2304 
2305                     // changed from
2306                     // this.in.curcol -= 2;
2307                     this.columns -= 2;
2308 
2309                     c = parseTagName();
2310                     this.token = newNode(Node.END_TAG, // create endtag token
2311                         this.lexbuf, this.txtstart, this.txtend, TidyUtils.getString(
2312                             this.lexbuf,
2313                             this.txtstart,
2314                             this.txtend - this.txtstart));
2315                     this.lexsize = this.txtstart;
2316                     this.txtend = this.txtstart;
2317 
2318                     // skip to '>'
2319                     while (c != '>')
2320                     {
2321                         c = this.in.readChar();
2322 
2323                         if (c == StreamIn.END_OF_STREAM)
2324                         {
2325                             break;
2326                         }
2327                     }
2328 
2329                     if (c == StreamIn.END_OF_STREAM)
2330                     {
2331                         this.in.ungetChar(c);
2332                         continue;
2333                     }
2334 
2335                     this.state = LEX_CONTENT;
2336                     this.waswhite = false;
2337                     return this.token; // the endtag token
2338 
2339                 case LEX_STARTTAG :
2340                     // first letter of tagname
2341                     this.txtstart = this.lexsize - 1; // set txtstart to first letter
2342                     c = parseTagName();
2343                     isempty[0] = false;
2344                     attributes = null;
2345                     this.token = newNode(
2346                         (isempty[0] ? Node.START_END_TAG : Node.START_TAG),
2347                         this.lexbuf,
2348                         this.txtstart,
2349                         this.txtend,
2350                         TidyUtils.getString(this.lexbuf, this.txtstart, this.txtend - this.txtstart));
2351 
2352                     // parse attributes, consuming closing ">"
2353                     if (c != '>')
2354                     {
2355                         if (c == '/')
2356                         {
2357                             this.in.ungetChar(c);
2358                         }
2359 
2360                         attributes = parseAttrs(isempty);
2361                     }
2362 
2363                     if (isempty[0])
2364                     {
2365                         this.token.type = Node.START_END_TAG;
2366                     }
2367 
2368                     this.token.attributes = attributes;
2369                     this.lexsize = this.txtstart;
2370                     this.txtend = this.txtstart;
2371 
2372                     // swallow newline following start tag
2373                     // special check needed for CRLF sequence
2374                     // this doesn't apply to empty elements
2375                     // nor to preformatted content that needs escaping
2376 
2377                     if (
2378 
2379                     (mode != PREFORMATTED || preContent(this.token))
2380                         && (this.token.expectsContent() || this.token.tag == this.configuration.tt.tagBr))
2381                     {
2382 
2383                         c = this.in.readChar();
2384 
2385                         if (c == '\r')
2386                         {
2387                             c = this.in.readChar();
2388 
2389                             if (c != '\n')
2390                             {
2391                                 this.in.ungetChar(c);
2392                             }
2393                         }
2394                         else if (c != '\n' && c != '\f')
2395                         {
2396                             this.in.ungetChar(c);
2397                         }
2398 
2399                         this.waswhite = true; // to swallow leading whitespace
2400                     }
2401                     else
2402                     {
2403                         this.waswhite = false;
2404                     }
2405 
2406                     this.state = LEX_CONTENT;
2407 
2408                     if (this.token.tag == null)
2409                     {
2410                         report.error(this, null, this.token, Report.UNKNOWN_ELEMENT);
2411                     }
2412                     else if (!this.configuration.xmlTags)
2413                     {
2414                         constrainVersion(this.token.tag.versions);
2415 
2416                         if (TidyUtils.toBoolean(this.token.tag.versions & Dict.VERS_PROPRIETARY))
2417                         {
2418                             // #427810 - fix by Gary Deschaines 24 May 00
2419                             if (this.configuration.makeClean && (this.token.tag != this.configuration.tt.tagNobr && //
2420                                 this.token.tag != this.configuration.tt.tagWbr))
2421                             {
2422                                 report.warning(this, null, this.token, Report.PROPRIETARY_ELEMENT);
2423                             }
2424                             // #427810 - fix by Terry Teague 2 Jul 01
2425                             else if (!this.configuration.makeClean)
2426                             {
2427                                 report.warning(this, null, this.token, Report.PROPRIETARY_ELEMENT);
2428                             }
2429                         }
2430 
2431                         if (this.token.tag.getChkattrs() != null)
2432                         {
2433                             this.token.tag.getChkattrs().check(this, this.token);
2434                         }
2435                         else
2436                         {
2437                             this.token.checkAttributes(this);
2438                         }
2439 
2440                         // should this be called before attribute checks?
2441                         this.token.repairDuplicateAttributes(this);
2442 
2443                     }
2444 
2445                     return this.token; // return start tag
2446 
2447                 case LEX_COMMENT :
2448                     // seen <!-- so look for -->
2449 
2450                     if (c != '-')
2451                     {
2452                         continue;
2453                     }
2454 
2455                     c = this.in.readChar();
2456                     addCharToLexer(c);
2457 
2458                     if (c != '-')
2459                     {
2460                         continue;
2461                     }
2462 
2463                     end_comment : while (true)
2464                     {
2465                         c = this.in.readChar();
2466 
2467                         if (c == '>')
2468                         {
2469                             if (badcomment != 0)
2470                             {
2471                                 report.warning(this, null, null, Report.MALFORMED_COMMENT);
2472                             }
2473 
2474                             this.txtend = this.lexsize - 2; // AQ 8Jul2000
2475                             this.lexbuf[this.lexsize] = (byte) '\0';
2476                             this.state = LEX_CONTENT;
2477                             this.waswhite = false;
2478                             this.token = newNode(Node.COMMENT_TAG, this.lexbuf, this.txtstart, this.txtend);
2479 
2480                             // now look for a line break
2481 
2482                             c = this.in.readChar();
2483 
2484                             if (c == '\r')
2485                             {
2486                                 c = this.in.readChar();
2487 
2488                                 if (c != '\n')
2489                                 {
2490                                     this.token.linebreak = true;
2491                                 }
2492                             }
2493 
2494                             if (c == '\n')
2495                             {
2496                                 this.token.linebreak = true;
2497                             }
2498                             else
2499                             {
2500                                 this.in.ungetChar(c);
2501                             }
2502 
2503                             return this.token;
2504                         }
2505 
2506                         // note position of first such error in the comment
2507                         if (badcomment == 0)
2508                         {
2509                             this.lines = this.in.getCurline();
2510                             this.columns = this.in.getCurcol() - 3;
2511                         }
2512 
2513                         badcomment++;
2514                         if (this.configuration.fixComments)
2515                         {
2516                             this.lexbuf[this.lexsize - 2] = (byte) '=';
2517                         }
2518 
2519                         addCharToLexer(c);
2520 
2521                         // if '-' then look for '>' to end the comment
2522                         if (c != '-')
2523                         {
2524                             break end_comment;
2525                         }
2526 
2527                     }
2528                     // otherwise continue to look for -->
2529                     this.lexbuf[this.lexsize - 2] = (byte) '=';
2530                     continue;
2531 
2532                 case LEX_DOCTYPE :
2533                     // seen <!d so look for '> ' munging whitespace
2534 
2535                     if (TidyUtils.isWhite((char) c))
2536                     {
2537                         if (this.waswhite)
2538                         {
2539                             this.lexsize -= 1;
2540                         }
2541 
2542                         this.waswhite = true;
2543                     }
2544                     else
2545                     {
2546                         this.waswhite = false;
2547                     }
2548 
2549                     if (inDTDSubset)
2550                     {
2551                         if (c == ']')
2552                         {
2553                             inDTDSubset = false;
2554                         }
2555                     }
2556                     else if (c == '[')
2557                     {
2558                         inDTDSubset = true;
2559                     }
2560                     if (inDTDSubset || c != '>')
2561                     {
2562                         continue;
2563                     }
2564 
2565                     this.lexsize -= 1;
2566                     this.txtend = this.lexsize;
2567                     this.lexbuf[this.lexsize] = (byte) '\0';
2568                     this.state = LEX_CONTENT;
2569                     this.waswhite = false;
2570                     this.token = newNode(Node.DOCTYPE_TAG, this.lexbuf, this.txtstart, this.txtend);
2571                     // make a note of the version named by the doctype
2572                     this.doctype = findGivenVersion(this.token);
2573                     return this.token;
2574 
2575                 case LEX_PROCINSTR :
2576                     // seen <? so look for '> '
2577                     // check for PHP preprocessor instructions <?php ... ?>
2578 
2579                     if (this.lexsize - this.txtstart == 3)
2580                     {
2581                         if ((TidyUtils.getString(this.lexbuf, this.txtstart, 3)).equals("php"))
2582                         {
2583                             this.state = LEX_PHP;
2584                             continue;
2585                         }
2586                     }
2587 
2588                     if (this.lexsize - this.txtstart == 4)
2589                     {
2590                         if ((TidyUtils.getString(this.lexbuf, this.txtstart, 3)).equals("xml")
2591                             && TidyUtils.isWhite((char) this.lexbuf[this.txtstart + 3]))
2592                         {
2593                             this.state = LEX_XMLDECL;
2594                             attributes = null;
2595                             continue;
2596                         }
2597                     }
2598 
2599                     if (this.configuration.xmlPIs) // insist on ?> as terminator
2600                     {
2601                         if (c != '?')
2602                         {
2603                             continue;
2604                         }
2605 
2606                         // now look for '>'
2607                         c = this.in.readChar();
2608 
2609                         if (c == StreamIn.END_OF_STREAM)
2610                         {
2611                             report.warning(this, null, null, Report.UNEXPECTED_END_OF_FILE);
2612                             this.in.ungetChar(c);
2613                             continue;
2614                         }
2615 
2616                         addCharToLexer(c);
2617                     }
2618 
2619                     if (c != '>')
2620                     {
2621                         continue;
2622                     }
2623 
2624                     this.lexsize -= 1;
2625                     this.txtend = this.lexsize;
2626                     this.lexbuf[this.lexsize] = (byte) '\0';
2627                     this.state = LEX_CONTENT;
2628                     this.waswhite = false;
2629                     this.token = newNode(Node.PROC_INS_TAG, this.lexbuf, this.txtstart, this.txtend);
2630                     return this.token;
2631 
2632                 case LEX_ASP :
2633                     // seen <% so look for "%> "
2634                     if (c != '%')
2635                     {
2636                         continue;
2637                     }
2638 
2639                     // now look for '>'
2640                     c = this.in.readChar();
2641 
2642                     if (c != '>')
2643                     {
2644                         this.in.ungetChar(c);
2645                         continue;
2646                     }
2647 
2648                     this.lexsize -= 1;
2649                     this.txtend = this.lexsize;
2650                     this.lexbuf[this.lexsize] = (byte) '\0';
2651                     this.state = LEX_CONTENT;
2652                     this.waswhite = false;
2653                     this.token = newNode(Node.ASP_TAG, this.lexbuf, this.txtstart, this.txtend);
2654                     return this.token;
2655 
2656                 case LEX_JSTE :
2657                     // seen <# so look for "#> "
2658                     if (c != '#')
2659                     {
2660                         continue;
2661                     }
2662 
2663                     // now look for '>'
2664                     c = this.in.readChar();
2665 
2666                     if (c != '>')
2667                     {
2668                         this.in.ungetChar(c);
2669                         continue;
2670                     }
2671 
2672                     this.lexsize -= 1;
2673                     this.txtend = this.lexsize;
2674                     this.lexbuf[this.lexsize] = (byte) '\0';
2675                     this.state = LEX_CONTENT;
2676                     this.waswhite = false;
2677                     this.token = newNode(Node.JSTE_TAG, this.lexbuf, this.txtstart, this.txtend);
2678                     return this.token;
2679 
2680                 case LEX_PHP :
2681                     // seen " <?php" so look for "?> "
2682                     if (c != '?')
2683                     {
2684                         continue;
2685                     }
2686 
2687                     // now look for '>'
2688                     c = this.in.readChar();
2689 
2690                     if (c != '>')
2691                     {
2692                         this.in.ungetChar(c);
2693                         continue;
2694                     }
2695 
2696                     this.lexsize -= 1;
2697                     this.txtend = this.lexsize;
2698                     this.lexbuf[this.lexsize] = (byte) '\0';
2699                     this.state = LEX_CONTENT;
2700                     this.waswhite = false;
2701                     this.token = newNode(Node.PHP_TAG, this.lexbuf, this.txtstart, this.txtend);
2702                     return this.token;
2703 
2704                 case LEX_XMLDECL : // seen "<?xml" so look for "?>"
2705 
2706                     if (TidyUtils.isWhite((char) c) && c != '?')
2707                     {
2708                         continue;
2709                     }
2710 
2711                     // get pseudo-attribute
2712                     if (c != '?')
2713                     {
2714                         String name;
2715                         Node[] asp = new Node[1];
2716                         Node[] php = new Node[1];
2717                         AttVal av = new AttVal();
2718                         int[] pdelim = new int[1];
2719                         isempty[0] = false;
2720 
2721                         this.in.ungetChar(c);
2722 
2723                         name = this.parseAttribute(isempty, asp, php);
2724                         av.attribute = name;
2725 
2726                         av.value = this.parseValue(name, true, isempty, pdelim);
2727                         av.delim = pdelim[0];
2728                         av.next = attributes;
2729 
2730                         attributes = av;
2731                         // continue;
2732                     }
2733 
2734                     // now look for '>'
2735                     c = this.in.readChar();
2736 
2737                     if (c != '>')
2738                     {
2739                         this.in.ungetChar(c);
2740                         continue;
2741                     }
2742                     this.lexsize -= 1;
2743                     this.txtend = this.txtstart;
2744                     this.lexbuf[this.txtend] = '\0';
2745                     this.state = LEX_CONTENT;
2746                     this.waswhite = false;
2747                     this.token = newNode(Node.XML_DECL, this.lexbuf, this.txtstart, this.txtend);
2748                     this.token.attributes = attributes;
2749                     return this.token;
2750 
2751                 case LEX_SECTION :
2752                     // seen " <![" so look for "]> "
2753                     if (c == '[')
2754                     {
2755                         if (this.lexsize == (this.txtstart + 6)
2756                             && (TidyUtils.getString(this.lexbuf, this.txtstart, 6)).equals("CDATA["))
2757                         {
2758                             this.state = LEX_CDATA;
2759                             this.lexsize -= 6;
2760                             continue;
2761                         }
2762                     }
2763 
2764                     if (c != ']')
2765                     {
2766                         continue;
2767                     }
2768 
2769                     // now look for '>'
2770                     c = this.in.readChar();
2771 
2772                     if (c != '>')
2773                     {
2774                         this.in.ungetChar(c);
2775                         continue;
2776                     }
2777 
2778                     this.lexsize -= 1;
2779                     this.txtend = this.lexsize;
2780                     this.lexbuf[this.lexsize] = (byte) '\0';
2781                     this.state = LEX_CONTENT;
2782                     this.waswhite = false;
2783                     this.token = newNode(Node.SECTION_TAG, this.lexbuf, this.txtstart, this.txtend);
2784                     return this.token;
2785 
2786                 case LEX_CDATA :
2787                     // seen " <![CDATA[" so look for "]]> "
2788                     if (c != ']')
2789                     {
2790                         continue;
2791                     }
2792 
2793                     // now look for ']'
2794                     c = this.in.readChar();
2795 
2796                     if (c != ']')
2797                     {
2798                         this.in.ungetChar(c);
2799                         continue;
2800                     }
2801 
2802                     // now look for '>'
2803                     c = this.in.readChar();
2804 
2805                     if (c != '>')
2806                     {
2807                         this.in.ungetChar(c);
2808                         continue;
2809                     }
2810 
2811                     this.lexsize -= 1;
2812                     this.txtend = this.lexsize;
2813                     this.lexbuf[this.lexsize] = (byte) '\0';
2814                     this.state = LEX_CONTENT;
2815                     this.waswhite = false;
2816                     this.token = newNode(Node.CDATA_TAG, this.lexbuf, this.txtstart, this.txtend);
2817                     return this.token;
2818 
2819                 default :
2820                     // should never reach here
2821                     break;
2822             }
2823         }
2824 
2825         if (this.state == LEX_CONTENT) // text string
2826         {
2827             this.txtend = this.lexsize;
2828 
2829             if (this.txtend > this.txtstart)
2830             {
2831                 this.in.ungetChar(c);
2832 
2833                 if (this.lexbuf[this.lexsize - 1] == (byte) ' ')
2834                 {
2835                     this.lexsize -= 1;
2836                     this.txtend = this.lexsize;
2837                 }
2838 
2839                 this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend);
2840                 return this.token;
2841             }
2842         }
2843         else if (this.state == LEX_COMMENT) // comment
2844         {
2845             if (c == StreamIn.END_OF_STREAM)
2846             {
2847                 report.warning(this, null, null, Report.MALFORMED_COMMENT);
2848             }
2849 
2850             this.txtend = this.lexsize;
2851             this.lexbuf[this.lexsize] = (byte) '\0';
2852             this.state = LEX_CONTENT;
2853             this.waswhite = false;
2854             this.token = newNode(Node.COMMENT_TAG, this.lexbuf, this.txtstart, this.txtend);
2855             return this.token;
2856         }
2857 
2858         return null;
2859     }
2860 
2861     /***
2862      * parser for ASP within start tags Some people use ASP for to customize attributes Tidy isn't really well suited to
2863      * dealing with ASP This is a workaround for attributes, but won't deal with the case where the ASP is used to
2864      * tailor the attribute value. Here is an example of a work around for using ASP in attribute values:
2865      * <code>href='<%=rsSchool.Fields("ID").Value%>'</code> where the ASP that generates the attribute value is
2866      * masked from Tidy by the quotemarks.
2867      * @return parsed Node
2868      */
2869     public Node parseAsp()
2870     {
2871         int c;
2872         Node asp = null;
2873 
2874         this.txtstart = this.lexsize;
2875 
2876         while ((c = this.in.readChar()) != StreamIn.END_OF_STREAM)
2877         {
2878 
2879             addCharToLexer(c);
2880 
2881             if (c != '%')
2882             {
2883                 continue;
2884             }
2885 
2886             if ((c = this.in.readChar()) == StreamIn.END_OF_STREAM)
2887             {
2888                 break;
2889             }
2890             addCharToLexer(c);
2891 
2892             if (c == '>')
2893             {
2894                 break;
2895             }
2896         }
2897 
2898         this.lexsize -= 2;
2899         this.txtend = this.lexsize;
2900 
2901         if (this.txtend > this.txtstart)
2902         {
2903             asp = newNode(Node.ASP_TAG, this.lexbuf, this.txtstart, this.txtend);
2904         }
2905 
2906         this.txtstart = this.txtend;
2907         return asp;
2908     }
2909 
2910     /***
2911      * PHP is like ASP but is based upon XML processing instructions, e.g. <code>&lt;?php ... ?&gt;</code>.
2912      * @return parsed Node
2913      */
2914     public Node parsePhp()
2915     {
2916         int c;
2917         Node php = null;
2918 
2919         this.txtstart = this.lexsize;
2920 
2921         while ((c = this.in.readChar()) != StreamIn.END_OF_STREAM)
2922         {
2923             addCharToLexer(c);
2924 
2925             if (c != '?')
2926             {
2927                 continue;
2928             }
2929 
2930             if ((c = this.in.readChar()) == StreamIn.END_OF_STREAM)
2931             {
2932                 break;
2933             }
2934             addCharToLexer(c);
2935 
2936             if (c == '>')
2937             {
2938                 break;
2939             }
2940         }
2941 
2942         this.lexsize -= 2;
2943         this.txtend = this.lexsize;
2944 
2945         if (this.txtend > this.txtstart)
2946         {
2947             php = newNode(Node.PHP_TAG, this.lexbuf, this.txtstart, this.txtend);
2948         }
2949 
2950         this.txtstart = this.txtend;
2951         return php;
2952     }
2953 
2954     /***
2955      * consumes the '>' terminating start tags.
2956      * @param isempty flag is passed as array so it can be modified
2957      * @param asp asp Node, passed as array so it can be modified
2958      * @param php php Node, passed as array so it can be modified
2959      * @return parsed attribute
2960      */
2961     public String parseAttribute(boolean[] isempty, Node[] asp, Node[] php)
2962     {
2963         int start = 0;
2964         String attr;
2965         int c = 0;
2966         int lastc = 0;
2967 
2968         asp[0] = null; // clear asp pointer
2969         php[0] = null; // clear php pointer
2970         // skip white space before the attribute
2971 
2972         for (;;)
2973         {
2974             c = this.in.readChar();
2975 
2976             if (c == '/')
2977             {
2978                 c = this.in.readChar();
2979 
2980                 if (c == '>')
2981                 {
2982                     isempty[0] = true;
2983                     return null;
2984                 }
2985 
2986                 this.in.ungetChar(c);
2987                 c = '/';
2988                 break;
2989             }
2990 
2991             if (c == '>')
2992             {
2993                 return null;
2994             }
2995 
2996             if (c == '<')
2997             {
2998                 c = this.in.readChar();
2999 
3000                 if (c == '%')
3001                 {
3002                     asp[0] = parseAsp();
3003                     return null;
3004                 }
3005                 else if (c == '?')
3006                 {
3007                     php[0] = parsePhp();
3008                     return null;
3009                 }
3010 
3011                 this.in.ungetChar(c);
3012                 if (this.state != LEX_XMLDECL) // FG fix for 532535
3013                 {
3014                     this.in.ungetChar('<'); // fix for 433360
3015                 }
3016                 report.attrError(this, this.token, null, Report.UNEXPECTED_GT);
3017                 return null;
3018             }
3019 
3020             if (c == '=')
3021             {
3022                 report.attrError(this, this.token, null, Report.UNEXPECTED_EQUALSIGN);
3023                 continue;
3024             }
3025 
3026             if (c == '"' || c == '\'')
3027             {
3028                 report.attrError(this, this.token, null, Report.UNEXPECTED_QUOTEMARK);
3029                 continue;
3030             }
3031 
3032             if (c == StreamIn.END_OF_STREAM)
3033             {
3034                 report.attrError(this, this.token, null, Report.UNEXPECTED_END_OF_FILE);
3035                 this.in.ungetChar(c);
3036                 return null;
3037             }
3038 
3039             if (!TidyUtils.isWhite((char) c))
3040             {
3041                 break;
3042             }
3043         }
3044 
3045         start = this.lexsize;
3046         lastc = c;
3047 
3048         for (;;)
3049         {
3050             // but push back '=' for parseValue()
3051             if (c == '=' || c == '>')
3052             {
3053                 this.in.ungetChar(c);
3054                 break;
3055             }
3056 
3057             if (c == '<' || c == StreamIn.END_OF_STREAM)
3058             {
3059                 this.in.ungetChar(c);
3060                 break;
3061             }
3062             if (lastc == '-' && (c == '"' || c == '\''))
3063             {
3064                 this.lexsize--;
3065                 this.in.ungetChar(c);
3066                 break;
3067             }
3068             if (TidyUtils.isWhite((char) c))
3069             {
3070                 break;
3071             }
3072 
3073             // what should be done about non-namechar characters?
3074             // currently these are incorporated into the attr name
3075 
3076             if (!this.configuration.xmlTags && TidyUtils.isUpper((char) c))
3077             {
3078                 c = TidyUtils.toLower((char) c);
3079             }
3080 
3081             // ++len; #427672 - handle attribute names with multibyte chars - fix by Randy Waki - 10 Aug 00
3082             addCharToLexer(c);
3083 
3084             lastc = c;
3085             c = this.in.readChar();
3086         }
3087 
3088         // #427672 - handle attribute names with multibyte chars - fix by Randy Waki - 10 Aug 00
3089         int len = this.lexsize - start;
3090         attr = (len > 0 ? TidyUtils.getString(this.lexbuf, start, len) : null);
3091         this.lexsize = start;
3092 
3093         return attr;
3094     }
3095 
3096     /***
3097      * Invoked when &lt; is seen in place of attribute value but terminates on whitespace if not ASP, PHP or Tango this
3098      * routine recognizes ' and " quoted strings.
3099      * @return delimiter
3100      */
3101     public int parseServerInstruction()
3102     {
3103         int c, delim = '"';
3104         boolean isrule = false;
3105 
3106         c = this.in.readChar();
3107         addCharToLexer(c);
3108 
3109         // check for ASP, PHP or Tango
3110         if (c == '%' || c == '?' || c == '@')
3111         {
3112             isrule = true;
3113         }
3114 
3115         for (;;)
3116         {
3117             c = this.in.readChar();
3118 
3119             if (c == StreamIn.END_OF_STREAM)
3120             {
3121                 break;
3122             }
3123 
3124             if (c == '>')
3125             {
3126                 if (isrule)
3127                 {
3128                     addCharToLexer(c);
3129                 }
3130                 else
3131                 {
3132                     this.in.ungetChar(c);
3133                 }
3134 
3135                 break;
3136             }
3137 
3138             // if not recognized as ASP, PHP or Tango
3139             // then also finish value on whitespace
3140             if (!isrule)
3141             {
3142                 if (TidyUtils.isWhite((char) c))
3143                 {
3144                     break;
3145                 }
3146             }
3147 
3148             addCharToLexer(c);
3149 
3150             if (c == '"')
3151             {
3152                 do
3153                 {
3154                     c = this.in.readChar();
3155 
3156                     if (endOfInput()) // #427840 - fix by Terry Teague 30 Jun 01
3157                     {
3158                         report.attrError(this, this.token, null, Report.UNEXPECTED_END_OF_FILE);
3159                         this.in.ungetChar(c);
3160                         return 0;
3161                     }
3162                     if (c == '>') // #427840 - fix by Terry Teague 30 Jun 01
3163                     {
3164                         this.in.ungetChar(c);
3165                         report.attrError(this, this.token, null, Report.UNEXPECTED_GT);
3166                         return 0;
3167                     }
3168 
3169                     addCharToLexer(c);
3170                 }
3171                 while (c != '"');
3172                 delim = '\'';
3173                 continue;
3174             }
3175 
3176             if (c == '\'')
3177             {
3178                 do
3179                 {
3180                     c = this.in.readChar();
3181 
3182                     if (endOfInput()) // #427840 - fix by Terry Teague 30 Jun 01
3183                     {
3184                         report.attrError(this, this.token, null, Report.UNEXPECTED_END_OF_FILE);
3185                         this.in.ungetChar(c);
3186                         return 0;
3187                     }
3188                     if (c == '>') // #427840 - fix by Terry Teague 30 Jun 01
3189                     {
3190                         this.in.ungetChar(c);
3191                         report.attrError(this, this.token, null, Report.UNEXPECTED_GT);
3192                         return 0;
3193                     }
3194 
3195                     addCharToLexer(c);
3196                 }
3197                 while (c != '\'');
3198             }
3199         }
3200 
3201         return delim;
3202     }
3203 
3204     /***
3205      * Parse an attribute value.
3206      * @param name attribute name
3207      * @param foldCase fold case?
3208      * @param isempty is attribute empty? Passed as an array reference to allow modification
3209      * @param pdelim delimiter, passed as an array reference to allow modification
3210      * @return parsed value
3211      */
3212     public String parseValue(String name, boolean foldCase, boolean[] isempty, int[] pdelim)
3213     {
3214         // values start with "=" or " = " etc.
3215         // doesn't consume the ">" at end of start tag
3216 
3217         int len = 0;
3218         int start;
3219         boolean seenGt = false;
3220         boolean munge = true;
3221         int c = 0;
3222         int lastc, delim, quotewarning;
3223         String value;
3224 
3225         delim = 0;
3226         pdelim[0] = '"';
3227 
3228         // Henry Zrepa reports that some folk are using the embed element with script attributes where newlines are
3229         // significant and must be preserved
3230 
3231         if (this.configuration.literalAttribs)
3232         {
3233             munge = false;
3234         }
3235 
3236         // skip white space before the '='
3237         while (true)
3238         {
3239             c = this.in.readChar();
3240 
3241             if (c == StreamIn.END_OF_STREAM)
3242             {
3243                 this.in.ungetChar(c);
3244                 break;
3245             }
3246 
3247             if (!TidyUtils.isWhite((char) c))
3248             {
3249                 break;
3250             }
3251         }
3252 
3253         // c should be '=' if there is a value other legal possibilities are white space, '/' and '>'
3254 
3255         if (c != '=' && c != '"' && c != '\'')
3256         {
3257             this.in.ungetChar(c);
3258             return null;
3259         }
3260 
3261         // skip white space after '='
3262 
3263         while (true)
3264         {
3265             c = this.in.readChar();
3266 
3267             if (c == StreamIn.END_OF_STREAM)
3268             {
3269                 this.in.ungetChar(c);
3270                 break;
3271             }
3272 
3273             if (!TidyUtils.isWhite((char) c))
3274             {
3275                 break;
3276             }
3277         }
3278 
3279         // check for quote marks
3280 
3281         if (c == '"' || c == '\'')
3282         {
3283             delim = c;
3284         }
3285         else if (c == '<')
3286         {
3287             start = this.lexsize;
3288             addCharToLexer(c);
3289             pdelim[0] = parseServerInstruction();
3290             len = this.lexsize - start;
3291             this.lexsize = start;
3292             return (len > 0 ? TidyUtils.getString(this.lexbuf, start, len) : null);
3293         }
3294         else
3295         {
3296             this.in.ungetChar(c);
3297         }
3298 
3299         // and read the value string check for quote mark if needed
3300 
3301         quotewarning = 0;
3302         start = this.lexsize;
3303         c = '\0';
3304 
3305         while (true)
3306         {
3307             lastc = c; // track last character
3308             c = this.in.readChar();
3309 
3310             if (c == StreamIn.END_OF_STREAM)
3311             {
3312                 report.attrError(this, this.token, null, Report.UNEXPECTED_END_OF_FILE);
3313                 this.in.ungetChar(c);
3314                 break;
3315             }
3316 
3317             if (delim == (char) 0)
3318             {
3319                 if (c == '>')
3320                 {
3321                     this.in.ungetChar(c);
3322                     break;
3323                 }
3324 
3325                 if (c == '"' || c == '\'')
3326                 {
3327                     report.attrError(this, this.token, null, Report.UNEXPECTED_QUOTEMARK);
3328                     break;
3329                 }
3330 
3331                 if (c == '<')
3332                 {
3333                     this.in.ungetChar(c); // fix for 433360
3334                     c = '>';
3335                     this.in.ungetChar(c);
3336                     report.attrError(this, this.token, null, Report.UNEXPECTED_GT);
3337                     break;
3338                 }
3339 
3340                 // For cases like <br clear=all/> need to avoid treating /> as part of the attribute value, however
3341                 // care is needed to avoid so treating <a href=http://www.acme.com /> in this way, which would map the
3342                 // <a> tag to <a href="http://www.acme.com"/>
3343 
3344                 if (c == '/')
3345                 {
3346                     // peek ahead in case of />
3347                     c = this.in.readChar();
3348 
3349                     if (c == '>' && !AttributeTable.getDefaultAttributeTable().isUrl(name))
3350                     {
3351                         isempty[0] = true;
3352                         this.in.ungetChar(c);
3353                         break;
3354                     }
3355 
3356                     // unget peeked char
3357                     this.in.ungetChar(c);
3358                     c = '/';
3359                 }
3360             }
3361             else
3362             {
3363                 // delim is '\'' or '"'
3364                 if (c == delim)
3365                 {
3366                     break;
3367                 }
3368 
3369                 // treat CRLF, CR and LF as single line break
3370 
3371                 if (c == '\r')
3372                 {
3373                     c = this.in.readChar();
3374                     if (c != '\n')
3375                     {
3376                         this.in.ungetChar(c);
3377                     }
3378 
3379                     c = '\n';
3380                 }
3381 
3382                 if (c == '\n' || c == '<' || c == '>')
3383                 {
3384                     ++quotewarning;
3385                 }
3386 
3387                 if (c == '>')
3388                 {
3389                     seenGt = true;
3390                 }
3391             }
3392 
3393             if (c == '&')
3394             {
3395                 // no entities in ID attributes
3396                 if ("id".equalsIgnoreCase(name))
3397                 {
3398                     report.attrError(this, null, null, Report.ENTITY_IN_ID);
3399                     continue;
3400                 }
3401 
3402                 addCharToLexer(c);
3403                 parseEntity((short) 0);
3404                 continue;
3405 
3406             }
3407 
3408             // kludge for JavaScript attribute values with line continuations in string literals
3409 
3410             if (c == '//')
3411             {
3412                 c = this.in.readChar();
3413 
3414                 if (c != '\n')
3415                 {
3416                     this.in.ungetChar(c);
3417                     c = '//';
3418                 }
3419             }
3420 
3421             if (TidyUtils.isWhite((char) c))
3422             {
3423                 if (delim == (char) 0)
3424                 {
3425                     break;
3426                 }
3427 
3428                 if (munge)
3429                 {
3430                     // discard line breaks in quoted URLs
3431                     // #438650 - fix by Randy Waki
3432                     if (c == '\n' && AttributeTable.getDefaultAttributeTable().isUrl(name))
3433                     {
3434                         // warn that we discard this newline
3435                         report.attrError(this, this.token, null, Report.NEWLINE_IN_URI);
3436                         continue;
3437                     }
3438 
3439                     c = ' ';
3440 
3441                     if (lastc == ' ')
3442                     {
3443                         continue;
3444                     }
3445                 }
3446             }
3447             else if (foldCase && TidyUtils.isUpper((char) c))
3448             {
3449                 c = TidyUtils.toLower((char) c);
3450             }
3451 
3452             addCharToLexer(c);
3453         }
3454 
3455         if (quotewarning > 10 && seenGt && munge)
3456         {
3457             // there is almost certainly a missing trailing quote mark as we have see too many newlines, < or >
3458             // characters. an exception is made for Javascript attributes and the javascript URL scheme which may
3459             // legitimately include < and >, and for attributes starting with "<xml " as generated by Microsoft Office.
3460 
3461             if (!AttributeTable.getDefaultAttributeTable().isScript(name)
3462                 && !(AttributeTable.getDefaultAttributeTable().isUrl(name) && "javascript:".equals(TidyUtils.getString(
3463                     this.lexbuf,
3464                     start,
3465                     11)))
3466                 && !"<xml ".equals(TidyUtils.getString(this.lexbuf, start, 5))) // #500236 - fix by Klaus Johannes Rusch
3467             // 06 Jan 02
3468             {
3469                 report.error(this, null, null, Report.SUSPECTED_MISSING_QUOTE);
3470             }
3471         }
3472 
3473         len = this.lexsize - start;
3474         this.lexsize = start;
3475 
3476         if (len > 0 || delim != 0)
3477         {
3478             // ignore leading and trailing white space for all but title, alt, value and prompts attributes unless
3479             // --literal-attributes is set to yes
3480             // #994841 - Whitespace is removed from value attributes
3481 
3482             if (munge && !TidyUtils.isInValuesIgnoreCase(new String[]{"alt", "title", "value", "prompt"}, name))
3483             {
3484                 while (TidyUtils.isWhite((char) this.lexbuf[start + len - 1]))
3485                 {
3486                     --len;
3487                 }
3488 
3489                 while (TidyUtils.isWhite((char) this.lexbuf[start]) && start < len)
3490                 {
3491                     ++start;
3492                     --len;
3493                 }
3494             }
3495 
3496             value = TidyUtils.getString(this.lexbuf, start, len);
3497         }
3498         else
3499         {
3500             value = null;
3501         }
3502 
3503         // note delimiter if given
3504         if (delim != 0)
3505         {
3506             pdelim[0] = delim;
3507         }
3508         else
3509         {
3510             pdelim[0] = '"';
3511         }
3512 
3513         return value;
3514     }
3515 
3516     /***
3517      * Check if attr is a valid name.
3518      * @param attr String to check, must be non-null
3519      * @return <code>true</code> if attr is a valid name.
3520      */
3521     public static boolean isValidAttrName(String attr)
3522     {
3523         char c;
3524         int i;
3525 
3526         // first character should be a letter
3527         c = attr.charAt(0);
3528 
3529         if (!TidyUtils.isLetter(c))
3530         {
3531             return false;
3532         }
3533 
3534         // remaining characters should be namechars
3535         for (i = 1; i < attr.length(); i++)
3536         {
3537             c = attr.charAt(i);
3538 
3539             if (TidyUtils.isNamechar(c))
3540             {
3541                 continue;
3542             }
3543 
3544             return false;
3545         }
3546 
3547         return true;
3548     }
3549 
3550     /***
3551      * In CSS1, selectors can contain only the characters A-Z, 0-9, and Unicode characters 161-255, plus dash (-); they
3552      * cannot start with a dash or a digit; they can also contain escaped characters and any Unicode character as a
3553      * numeric code (see next item). The backslash followed by at most four hexadecimal digits (0..9A..F) stands for the
3554      * Unicode character with that number. Any character except a hexadecimal digit can be escaped to remove its special
3555      * meaning, by putting a backslash in front.
3556      * @param buf css selector name
3557      * @return <code>true</code> if the given string is a valid css1 selector name
3558      */
3559     public static boolean isCSS1Selector(String buf)
3560     {
3561         if (buf == null)
3562         {
3563             return false;
3564         }
3565 
3566         // #508936 - CSS class naming for -clean option
3567         boolean valid = true;
3568         int esclen = 0;
3569         char c;
3570         int pos;
3571 
3572         for (pos = 0; valid && pos < buf.length(); ++pos)
3573         {
3574             c = buf.charAt(pos);
3575             if (c == '//')
3576             {
3577                 esclen = 1; // ab\555\444 is 4 chars {'a', 'b', \555, \444}
3578             }
3579             else if (Character.isDigit(c))
3580             {
3581                 // Digit not 1st, unless escaped (Max length "\112F")
3582                 if (esclen > 0)
3583                 {
3584                     valid = (++esclen < 6);
3585                 }
3586                 if (valid)
3587                 {
3588                     valid = (pos > 0 || esclen > 0);
3589                 }
3590             }
3591             else
3592             {
3593                 valid = (esclen > 0 // Escaped? Anything goes.
3594                     || (pos > 0 && c == '-') // Dash cannot be 1st char
3595                     || Character.isLetter(c) // a-z, A-Z anywhere
3596                 || (c >= 161 && c <= 255)); // Unicode 161-255 anywhere
3597                 esclen = 0;
3598             }
3599         }
3600         return valid;
3601     }
3602 
3603     /***
3604      * Parse tag attributes.
3605      * @param isempty is tag empty?
3606      * @return parsed attribute/value list
3607      */
3608     public AttVal parseAttrs(boolean[] isempty)
3609     {
3610         AttVal av, list;
3611         String attribute, value;
3612         int[] delim = new int[1];
3613         Node[] asp = new Node[1];
3614         Node[] php = new Node[1];
3615 
3616         list = null;
3617 
3618         while (!endOfInput())
3619         {
3620             attribute = parseAttribute(isempty, asp, php);
3621 
3622             if (attribute == null)
3623             {
3624                 // check if attributes are created by ASP markup
3625                 if (asp[0] != null)
3626                 {
3627                     av = new AttVal(list, null, asp[0], null, '\0', null, null);
3628                     list = av;
3629                     continue;
3630                 }
3631 
3632                 // check if attributes are created by PHP markup
3633                 if (php[0] != null)
3634                 {
3635                     av = new AttVal(list, null, null, php[0], '\0', null, null);
3636                     list = av;
3637                     continue;
3638                 }
3639 
3640                 break;
3641             }
3642 
3643             value = parseValue(attribute, false, isempty, delim);
3644 
3645             if (attribute != null && isValidAttrName(attribute))
3646             {
3647                 av = new AttVal(list, null, null, null, delim[0], attribute, value);
3648                 av.dict = AttributeTable.getDefaultAttributeTable().findAttribute(av);
3649                 list = av;
3650             }
3651             else
3652             {
3653                 av = new AttVal(null, null, null, null, 0, attribute, value);
3654 
3655                 // #427664 - fix by Gary Peskin 04 Aug 00; other fixes by Dave Raggett
3656                 if (value != null)
3657                 {
3658                     report.attrError(this, this.token, av, Report.BAD_ATTRIBUTE_VALUE);
3659                 }
3660                 else if (TidyUtils.lastChar(attribute) == '"')
3661                 {
3662                     report.attrError(this, this.token, av, Report.MISSING_QUOTEMARK);
3663                 }
3664                 else
3665                 {
3666                     report.attrError(this, this.token, av, Report.UNKNOWN_ATTRIBUTE);
3667                 }
3668             }
3669         }
3670 
3671         return list;
3672     }
3673 
3674     /***
3675      * Push a copy of an inline node onto stack but don't push if implicit or OBJECT or APPLET (implicit tags are ones
3676      * generated from the istack) One issue arises with pushing inlines when the tag is already pushed. For instance:
3677      * <code>&lt;p>&lt;em> text &lt;p>&lt;em> more text</code> Shouldn't be mapped to
3678      * <code>&lt;p>&lt;em> text &lt;/em>&lt;/p>&lt;p>&lt;em>&lt;em> more text &lt;/em>&lt;/em></code>
3679      * @param node Node to be pushed
3680      */
3681     public void pushInline(Node node)
3682     {
3683         IStack is;
3684 
3685         if (node.implicit)
3686         {
3687             return;
3688         }
3689 
3690         if (node.tag == null)
3691         {
3692             return;
3693         }
3694 
3695         if (!TidyUtils.toBoolean(node.tag.model & Dict.CM_INLINE))
3696         {
3697             return;
3698         }
3699 
3700         if (TidyUtils.toBoolean(node.tag.model & Dict.CM_OBJECT))
3701         {
3702             return;
3703         }
3704 
3705         if (node.tag != this.configuration.tt.tagFont && isPushed(node))
3706         {
3707             return;
3708         }
3709 
3710         // make sure there is enough space for the stack
3711         is = new IStack();
3712         is.tag = node.tag;
3713         is.element = node.element;
3714         if (node.attributes != null)
3715         {
3716             is.attributes = cloneAttributes(node.attributes);
3717         }
3718         this.istack.push(is);
3719     }
3720 
3721     /***
3722      * Pop a copy of an inline node from the stack.
3723      * @param node Node to be popped
3724      */
3725     public void popInline(Node node)
3726     {
3727         IStack is;
3728 
3729         if (node != null)
3730         {
3731 
3732             if (node.tag == null)
3733             {
3734                 return;
3735             }
3736 
3737             if (!TidyUtils.toBoolean(node.tag.model & Dict.CM_INLINE))
3738             {
3739                 return;
3740             }
3741 
3742             if (TidyUtils.toBoolean(node.tag.model & Dict.CM_OBJECT))
3743             {
3744                 return;
3745             }
3746 
3747             // if node is </a> then pop until we find an <a>
3748             if (node.tag == this.configuration.tt.tagA)
3749             {
3750 
3751                 while (this.istack.size() > 0)
3752                 {
3753                     is = (IStack) this.istack.pop();
3754                     if (is.tag == this.configuration.tt.tagA)
3755                     {
3756                         break;
3757                     }
3758                 }
3759 
3760                 if (this.insert >= this.istack.size())
3761                 {
3762                     this.insert = -1;
3763                 }
3764                 return;
3765             }
3766         }
3767 
3768         if (this.istack.size() > 0)
3769         {
3770             is = (IStack) this.istack.pop();
3771             if (this.insert >= this.istack.size())
3772             {
3773                 this.insert = -1;
3774             }
3775         }
3776     }
3777 
3778     /***
3779      * Is the node in the stack?
3780      * @param node Node
3781      * @return <code>true</code> is the node is found in the stack
3782      */
3783     public boolean isPushed(Node node)
3784     {
3785         int i;
3786         IStack is;
3787 
3788         for (i = this.istack.size() - 1; i >= 0; --i)
3789         {
3790             is = (IStack) this.istack.elementAt(i);
3791             if (is.tag == node.tag)
3792             {
3793                 return true;
3794             }
3795         }
3796 
3797         return false;
3798     }
3799 
3800     /***
3801      * This has the effect of inserting "missing" inline elements around the contents of blocklevel elements such as P,
3802      * TD, TH, DIV, PRE etc. This procedure is called at the start of ParseBlock. When the inline stack is not empty, as
3803      * will be the case in: <code>&lt;i>&lt;h1>italic heading&lt;/h1>&lt;/i></code> which is then treated as
3804      * equivalent to <code>&lt;h1>&lt;i>italic heading&lt;/i>&lt;/h1></code> This is implemented by setting the lexer
3805      * into a mode where it gets tokens from the inline stack rather than from the input stream.
3806      * @param node original node
3807      * @return stack size
3808      */
3809     public int inlineDup(Node node)
3810     {
3811         int n;
3812 
3813         n = this.istack.size() - this.istackbase;
3814         if (n > 0)
3815         {
3816             this.insert = this.istackbase;
3817             this.inode = node;
3818         }
3819 
3820         return n;
3821     }
3822 
3823     /***
3824      * @return
3825      */
3826     public Node insertedToken()
3827     {
3828         Node node;
3829         IStack is;
3830         int n;
3831 
3832         // this will only be null if inode != null
3833         if (this.insert == -1)
3834         {
3835             node = this.inode;
3836             this.inode = null;
3837             return node;
3838         }
3839 
3840         // is this is the "latest" node then update the position, otherwise use current values
3841         if (this.inode == null)
3842         {
3843             this.lines = this.in.getCurline();
3844             this.columns = this.in.getCurcol();
3845         }
3846 
3847         node = newNode(Node.START_TAG, this.lexbuf, this.txtstart, this.txtend);
3848 
3849         // GLP: Bugfix 126261. Remove when this change is fixed in istack.c in the original Tidy
3850         node.implicit = true;
3851         is = (IStack) this.istack.elementAt(this.insert);
3852         node.element = is.element;
3853         node.tag = is.tag;
3854         if (is.attributes != null)
3855         {
3856             node.attributes = cloneAttributes(is.attributes);
3857         }
3858 
3859         // advance lexer to next item on the stack
3860         n = this.insert;
3861 
3862         // and recover state if we have reached the end
3863         if (++n < this.istack.size())
3864         {
3865             this.insert = n;
3866         }
3867         else
3868         {
3869             this.insert = -1;
3870         }
3871 
3872         return node;
3873     }
3874 
3875     /***
3876      * Can the given element be removed?
3877      * @param element node
3878      * @return <code>true</code> if he element can be removed
3879      */
3880     public boolean canPrune(Node element)
3881     {
3882         if (element.type == Node.TEXT_NODE)
3883         {
3884             return true;
3885         }
3886 
3887         if (element.content != null)
3888         {
3889             return false;
3890         }
3891 
3892         if (element.tag == this.configuration.tt.tagA && element.attributes != null)
3893         {
3894             return false;
3895         }
3896 
3897         if (element.tag == this.configuration.tt.tagP && !this.configuration.dropEmptyParas)
3898         {
3899             return false;
3900         }
3901 
3902         if (element.tag == null)
3903         {
3904             return false;
3905         }
3906 
3907         if (TidyUtils.toBoolean(element.tag.model & Dict.CM_ROW))
3908         {
3909             return false;
3910         }
3911 
3912         if (TidyUtils.toBoolean(element.tag.model & Dict.CM_EMPTY))
3913         {
3914             return false;
3915         }
3916 
3917         if (element.tag == this.configuration.tt.tagApplet)
3918         {
3919             return false;
3920         }
3921 
3922         if (element.tag == this.configuration.tt.tagObject)
3923         {
3924             return false;
3925         }
3926 
3927         if (element.tag == this.configuration.tt.tagScript && element.getAttrByName("src") != null)
3928         {
3929             return false;
3930         }
3931 
3932         // #540555 Empty title tag is trimmed
3933         if (element.tag == this.configuration.tt.tagTitle)
3934         {
3935             return false;
3936         }
3937 
3938         // #433359 - fix by Randy Waki 12 Mar 01 - Empty iframe is trimmed
3939         if (element.tag == this.configuration.tt.tagIframe)
3940         {
3941             return false;
3942         }
3943 
3944         if (element.getAttrByName("id") != null || element.getAttrByName("name") != null)
3945         {
3946             return false;
3947         }
3948 
3949         return true;
3950     }
3951 
3952     /***
3953      * duplicate name attribute as an id and check if id and name match.
3954      * @param node Node to check for name/it attributes
3955      */
3956     public void fixId(Node node)
3957     {
3958         AttVal name = node.getAttrByName("name");
3959         AttVal id = node.getAttrByName("id");
3960 
3961         if (name != null)
3962         {
3963             if (id != null)
3964             {
3965                 if (id.value != null && !id.value.equals(name.value))
3966                 {
3967                     report.attrError(this, node, name, Report.ID_NAME_MISMATCH);
3968                 }
3969             }
3970             else if (this.configuration.xmlOut)
3971             {
3972                 node.addAttribute("id", name.value);
3973             }
3974         }
3975     }
3976 
3977     /***
3978      * Defer duplicates when entering a table or other element where the inlines shouldn't be duplicated.
3979      */
3980     public void deferDup()
3981     {
3982         this.insert = -1;
3983         this.inode = null;
3984     }
3985 
3986     /***
3987      * Constraint the html version in the document to the given one. Everything is allowed in proprietary version of
3988      * HTML this is handled here rather than in the tag/attr dicts.
3989      * @param vers html version code
3990      */
3991     void constrainVersion(int vers)
3992     {
3993         this.versions &= (vers | Dict.VERS_PROPRIETARY);
3994     }
3995 
3996     /***
3997      * Is content acceptable for pre elements?
3998      * @param node content
3999      * @return <code>true</code> if node is acceptable in pre elements
4000      */
4001     protected boolean preContent(Node node)
4002     {
4003         // p is coerced to br's
4004         if (node.tag == this.configuration.tt.tagP)
4005         {
4006             return true;
4007         }
4008 
4009         if (node.tag == null
4010             || node.tag == this.configuration.tt.tagP
4011             || !TidyUtils.toBoolean(node.tag.model & (Dict.CM_INLINE | Dict.CM_NEW)))
4012         {
4013             return false;
4014         }
4015         return true;
4016     }
4017 
4018     /***
4019      * document type.
4020      */
4021     private static class W3CVersionInfo
4022     {
4023 
4024         /***
4025          * name.
4026          */
4027         String name;
4028 
4029         /***
4030          * voyager name.
4031          */
4032         String voyagerName;
4033 
4034         /***
4035          * profile.
4036          */
4037         String profile;
4038 
4039         /***
4040          * code.
4041          */
4042         short code;
4043 
4044         /***
4045          * Instantiates a new W3CVersionInfo.
4046          * @param name version name
4047          * @param voyagerName voyager (xhtml) name
4048          * @param profile VOYAGER_STRICT | VOYAGER_LOOSE | VOYAGER_FRAMESET
4049          * @param code unique code for this version info
4050          */
4051         public W3CVersionInfo(String name, String voyagerName, String profile, short code)
4052         {
4053             this.name = name;
4054             this.voyagerName = voyagerName;
4055             this.profile = profile;
4056             this.code = code;
4057         }
4058     }
4059 
4060 }