View Javadoc

1   /*
2    *  Java HTML Tidy - JTidy
3    *  HTML parser and pretty printer
4    *
5    *  Copyright (c) 1998-2000 World Wide Web Consortium (Massachusetts
6    *  Institute of Technology, Institut National de Recherche en
7    *  Informatique et en Automatique, Keio University). All Rights
8    *  Reserved.
9    *
10   *  Contributing Author(s):
11   *
12   *     Dave Raggett <dsr@w3.org>
13   *     Andy Quick <ac.quick@sympatico.ca> (translation to Java)
14   *     Gary L Peskin <garyp@firstech.com> (Java development)
15   *     Sami Lempinen <sami@lempinen.net> (release management)
16   *     Fabrizio Giustina <fgiust at users.sourceforge.net>
17   *
18   *  The contributing author(s) would like to thank all those who
19   *  helped with testing, bug fixes, and patience.  This wouldn't
20   *  have been possible without all of you.
21   *
22   *  COPYRIGHT NOTICE:
23   *
24   *  This software and documentation is provided "as is," and
25   *  the copyright holders and contributing author(s) make no
26   *  representations or warranties, express or implied, including
27   *  but not limited to, warranties of merchantability or fitness
28   *  for any particular purpose or that the use of the software or
29   *  documentation will not infringe any third party patents,
30   *  copyrights, trademarks or other rights.
31   *
32   *  The copyright holders and contributing author(s) will not be
33   *  liable for any direct, indirect, special or consequential damages
34   *  arising out of any use of the software or documentation, even if
35   *  advised of the possibility of such damage.
36   *
37   *  Permission is hereby granted to use, copy, modify, and distribute
38   *  this source code, or portions hereof, documentation and executables,
39   *  for any purpose, without fee, subject to the following restrictions:
40   *
41   *  1. The origin of this source code must not be misrepresented.
42   *  2. Altered versions must be plainly marked as such and must
43   *     not be misrepresented as being the original source.
44   *  3. This Copyright notice may not be removed or altered from any
45   *     source or altered source distribution.
46   *
47   *  The copyright holders and contributing author(s) specifically
48   *  permit, without fee, and encourage the use of this source code
49   *  as a component for supporting the Hypertext Markup Language in
50   *  commercial products. If you use this source code in a product,
51   *  acknowledgment is not required but would be appreciated.
52   *
53   */
54  package org.w3c.tidy;
55  
56  import java.io.PrintWriter;
57  import java.util.List;
58  import java.util.Stack;
59  import java.util.Vector;
60  
61  
62  /**
63   * Lexer for html parser.
64   * <p>
 * Given a file stream fp it returns a sequence of tokens. GetToken(fp) gets the next token; UngetToken(fp) provides
 * one level of undo. The tags include an attribute list: a linked list of attribute/value nodes, where each node has
 * two null-terminated strings. Entities are replaced in attribute values. White space is compacted if not in
 * preformatted mode: leading white space is discarded and subsequent white space sequences are compacted to single
 * space chars. If XmlTags is no then tag names and attribute names are folded to lower case.
 * Not yet done: doctype subset and marked sections.
71   * </p>
72   * @author Dave Raggett <a href="mailto:dsr@w3.org">dsr@w3.org </a>
73   * @author Andy Quick <a href="mailto:ac.quick@sympatico.ca">ac.quick@sympatico.ca </a> (translation to Java)
74   * @author Fabrizio Giustina
75   * @version $Revision: 807 $ ($Author: fgiust $)
76   */
77  public class Lexer
78  {
79  
80      /**
81       * state: ignore whitespace.
82       */
83      public static final short IGNORE_WHITESPACE = 0;
84  
85      /**
86       * state: mixed content.
87       */
88      public static final short MIXED_CONTENT = 1;
89  
90      /**
91       * state: preformatted.
92       */
93      public static final short PREFORMATTED = 2;
94  
95      /**
96       * state: ignore markup.
97       */
98      public static final short IGNORE_MARKUP = 3;
99  
100     /**
101      * URI for XHTML 1.0 transitional DTD.
102      */
103     private static final String VOYAGER_LOOSE = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd";
104 
105     /**
106      * URI for XHTML 1.0 strict DTD.
107      */
108     private static final String VOYAGER_STRICT = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd";
109 
110     /**
111      * URI for XHTML 1.0 frameset DTD.
112      */
113     private static final String VOYAGER_FRAMESET = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd";
114 
115     /**
116      * URI for XHTML 1.1.
117      */
118     private static final String VOYAGER_11 = "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd";
119 
120     /**
121      * URI for XHTML Basic 1.0.
122      */
123     // private static final String VOYAGER_BASIC = "http://www.w3.org/TR/xhtml-basic/xhtml-basic10.dtd";
124     /**
125      * xhtml namespace.
126      */
127     private static final String XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml";
128 
129     /**
130      * lists all the known versions.
131      */
132     private static final Lexer.W3CVersionInfo[] W3CVERSION = {
133         new W3CVersionInfo("HTML 4.01", "XHTML 1.0 Strict", VOYAGER_STRICT, Dict.VERS_HTML40_STRICT),
134         new W3CVersionInfo("HTML 4.01 Transitional", "XHTML 1.0 Transitional", VOYAGER_LOOSE, Dict.VERS_HTML40_LOOSE),
135         new W3CVersionInfo("HTML 4.01 Frameset", "XHTML 1.0 Frameset", VOYAGER_FRAMESET, Dict.VERS_FRAMESET),
136         new W3CVersionInfo("HTML 4.0", "XHTML 1.0 Strict", VOYAGER_STRICT, Dict.VERS_HTML40_STRICT),
137         new W3CVersionInfo("HTML 4.0 Transitional", "XHTML 1.0 Transitional", VOYAGER_LOOSE, Dict.VERS_HTML40_LOOSE),
138         new W3CVersionInfo("HTML 4.0 Frameset", "XHTML 1.0 Frameset", VOYAGER_FRAMESET, Dict.VERS_FRAMESET),
139         new W3CVersionInfo("HTML 3.2", "XHTML 1.0 Transitional", VOYAGER_LOOSE, Dict.VERS_HTML32),
140         new W3CVersionInfo("HTML 3.2 Final", "XHTML 1.0 Transitional", VOYAGER_LOOSE, Dict.VERS_HTML32),
141         new W3CVersionInfo("HTML 3.2 Draft", "XHTML 1.0 Transitional", VOYAGER_LOOSE, Dict.VERS_HTML32),
142         new W3CVersionInfo("HTML 2.0", "XHTML 1.0 Strict", VOYAGER_STRICT, Dict.VERS_HTML20),
143         new W3CVersionInfo("HTML 4.01", "XHTML 1.1", VOYAGER_STRICT, Dict.VERS_XHTML11)};
144 
145     /**
146      * getToken state: content.
147      */
148     private static final short LEX_CONTENT = 0;
149 
150     /**
151      * getToken state: gt.
152      */
153     private static final short LEX_GT = 1;
154 
155     /**
156      * getToken state: endtag.
157      */
158     private static final short LEX_ENDTAG = 2;
159 
160     /**
161      * getToken state: start tag.
162      */
163     private static final short LEX_STARTTAG = 3;
164 
165     /**
166      * getToken state: comment.
167      */
168     private static final short LEX_COMMENT = 4;
169 
170     /**
171      * getToken state: doctype.
172      */
173     private static final short LEX_DOCTYPE = 5;
174 
175     /**
176      * getToken state: procinstr.
177      */
178     private static final short LEX_PROCINSTR = 6;
179 
180     /**
181      * getToken state: cdata.
182      */
183     private static final short LEX_CDATA = 8;
184 
185     /**
186      * getToken state: section.
187      */
188     private static final short LEX_SECTION = 9;
189 
190     /**
191      * getToken state: asp.
192      */
193     private static final short LEX_ASP = 10;
194 
195     /**
196      * getToken state: jste.
197      */
198     private static final short LEX_JSTE = 11;
199 
200     /**
201      * getToken state: php.
202      */
203     private static final short LEX_PHP = 12;
204 
205     /**
206      * getToken state: xml declaration.
207      */
208     private static final short LEX_XMLDECL = 13;
209 
210     /**
211      * file stream.
212      */
213     protected StreamIn in;
214 
215     /**
216      * error output stream.
217      */
218     protected PrintWriter errout;
219 
220     /**
221      * for accessibility errors.
222      */
223     protected short badAccess;
224 
225     /**
226      * for bad style errors.
227      */
228     protected short badLayout;
229 
230     /**
231      * for bad char encodings.
232      */
233     protected short badChars;
234 
235     /**
236      * for mismatched/mispositioned form tags.
237      */
238     protected short badForm;
239 
240     /**
241      * count of warnings in this document.
242      */
243     protected short warnings;
244 
245     /**
246      * count of errors.
247      */
248     protected short errors;
249 
250     /**
251      * lines seen.
252      */
253     protected int lines;
254 
255     /**
256      * at start of current token.
257      */
258     protected int columns;
259 
260     /**
261      * used to collapse contiguous white space.
262      */
263     protected boolean waswhite;
264 
265     /**
266      * true after token has been pushed back.
267      */
268     protected boolean pushed;
269 
270     /**
271      * when space is moved after end tag.
272      */
273     protected boolean insertspace;
274 
275     /**
276      * Netscape compatibility.
277      */
278     protected boolean excludeBlocks;
279 
280     /**
281      * true if moved out of table.
282      */
283     protected boolean exiled;
284 
285     /**
286      * true if xmlns attribute on html element.
287      */
288     protected boolean isvoyager;
289 
290     /**
291      * bit vector of HTML versions.
292      */
293     protected short versions;
294 
295     /**
296      * version as given by doctype (if any).
297      */
298     protected int doctype;
299 
300     /**
301      * set if html or PUBLIC is missing.
302      */
303     protected boolean badDoctype;
304 
305     /**
306      * start of current node.
307      */
308     protected int txtstart;
309 
310     /**
311      * end of current node.
312      */
313     protected int txtend;
314 
315     /**
316      * state of lexer's finite state machine.
317      */
318     protected short state;
319 
320     /**
321      * current node.
322      */
323     protected Node token;
324 
325     /**
326      * Lexer character buffer parse tree nodes span onto this buffer which contains the concatenated text contents of
327      * all of the elements. Lexsize must be reset for each file. Byte buffer of UTF-8 chars.
328      */
329     protected byte[] lexbuf;
330 
331     /**
332      * allocated.
333      */
334     protected int lexlength;
335 
336     /**
337      * used.
338      */
339     protected int lexsize;
340 
341     /**
342      * Inline stack for compatibility with Mosaic. For deferring text node.
343      */
344     protected Node inode;
345 
346     /**
347      * for inferring inline tags.
348      */
349     protected int insert;
350 
351     /**
352      * stack.
353      */
354     protected Stack istack;
355 
356     /**
357      * start of frame.
358      */
359     protected int istackbase;
360 
361     /**
362      * used for cleaning up presentation markup.
363      */
364     protected Style styles;
365 
366     /**
367      * configuration.
368      */
369     protected Configuration configuration;
370 
371     /**
372      * already seen end body tag?
373      */
374     protected boolean seenEndBody;
375 
376     /**
377      * already seen end html tag?
378      */
379     protected boolean seenEndHtml;
380 
381     /**
382      * report.
383      */
384     protected Report report;
385 
386     /**
387      * Root node is saved here.
388      */
389     protected Node root;
390 
391     /**
392      * node list.
393      */
394     private List nodeList;
395 
/**
 * Creates a lexer reading from the given input stream.
 * <p>
 * The line and column counters start at 1, the state machine starts in <code>LEX_CONTENT</code>,
 * <code>versions</code> starts as <code>Dict.VERS_ALL | Dict.VERS_PROPRIETARY</code>, <code>doctype</code> as
 * <code>Dict.VERS_UNKNOWN</code> and <code>insert</code> as -1. The inline stack and the node list start empty.
 * </p>
 * @param in StreamIn to read characters from
 * @param configuration configuration instance
 * @param report report instance, for reporting errors
 */
public Lexer(StreamIn in, Configuration configuration, Report report)
{
    // collaborators handed in by the caller
    this.in = in;
    this.configuration = configuration;
    this.report = report;

    // position tracking and state machine
    this.lines = 1;
    this.columns = 1;
    this.state = LEX_CONTENT;

    // version detection
    this.versions = (Dict.VERS_ALL | Dict.VERS_PROPRIETARY);
    this.doctype = Dict.VERS_UNKNOWN;

    // inline stack and node bookkeeping
    this.insert = -1;
    this.istack = new Stack();
    this.nodeList = new Vector();
}
416 
417     /**
418      * Creates a new node and add it to nodelist.
419      * @return Node
420      */
421     public Node newNode()
422     {
423         Node node = new Node();
424         this.nodeList.add(node);
425         return node;
426     }
427 
428     /**
429      * Creates a new node and add it to nodelist.
430      * @param type node type: Node.ROOT_NODE | Node.DOCTYPE_TAG | Node.COMMENT_TAG | Node.PROC_INS_TAG | Node.TEXT_NODE |
431      * Node.START_TAG | Node.END_TAG | Node.START_END_TAG | Node.CDATA_TAG | Node.SECTION_TAG | Node. ASP_TAG |
432      * Node.JSTE_TAG | Node.PHP_TAG | Node.XML_DECL
433      * @param textarray array of bytes contained in the Node
434      * @param start start position
435      * @param end end position
436      * @return Node
437      */
438     public Node newNode(short type, byte[] textarray, int start, int end)
439     {
440         Node node = new Node(type, textarray, start, end);
441         this.nodeList.add(node);
442         return node;
443     }
444 
445     /**
446      * Creates a new node and add it to nodelist.
447      * @param type node type: Node.ROOT_NODE | Node.DOCTYPE_TAG | Node.COMMENT_TAG | Node.PROC_INS_TAG | Node.TEXT_NODE |
448      * Node.START_TAG | Node.END_TAG | Node.START_END_TAG | Node.CDATA_TAG | Node.SECTION_TAG | Node. ASP_TAG |
449      * Node.JSTE_TAG | Node.PHP_TAG | Node.XML_DECL
450      * @param textarray array of bytes contained in the Node
451      * @param start start position
452      * @param end end position
453      * @param element tag name
454      * @return Node
455      */
456     public Node newNode(short type, byte[] textarray, int start, int end, String element)
457     {
458         Node node = new Node(type, textarray, start, end, element, this.configuration.tt);
459         this.nodeList.add(node);
460         return node;
461     }
462 
463     /**
464      * Clones a node and add it to node list.
465      * @param node Node
466      * @return cloned Node
467      */
468     public Node cloneNode(Node node)
469     {
470         Node cnode = (Node) node.clone();
471         this.nodeList.add(cnode);
472         for (AttVal att = cnode.attributes; att != null; att = att.next)
473         {
474             if (att.asp != null)
475             {
476                 this.nodeList.add(att.asp);
477             }
478             if (att.php != null)
479             {
480                 this.nodeList.add(att.php);
481             }
482         }
483         return cnode;
484     }
485 
486     /**
487      * Clones an attribute value and add eventual asp or php node to node list.
488      * @param attrs original AttVal
489      * @return cloned AttVal
490      */
491     public AttVal cloneAttributes(AttVal attrs)
492     {
493         AttVal cattrs = (AttVal) attrs.clone();
494         for (AttVal att = cattrs; att != null; att = att.next)
495         {
496             if (att.asp != null)
497             {
498                 this.nodeList.add(att.asp);
499             }
500             if (att.php != null)
501             {
502                 this.nodeList.add(att.php);
503             }
504         }
505         return cattrs;
506     }
507 
508     /**
509      * Update <code>oldtextarray</code> in the current nodes.
510      * @param oldtextarray previous text array
511      * @param newtextarray new text array
512      */
513     protected void updateNodeTextArrays(byte[] oldtextarray, byte[] newtextarray)
514     {
515         Node node;
516         for (int i = 0; i < this.nodeList.size(); i++)
517         {
518             node = (Node) (this.nodeList.get(i));
519             if (node.textarray == oldtextarray)
520             {
521                 node.textarray = newtextarray;
522             }
523         }
524     }
525 
526     /**
527      * Adds a new line node. Used for creating preformatted text from Word2000.
528      * @return new line node
529      */
530     public Node newLineNode()
531     {
532         Node node = newNode();
533 
534         node.textarray = this.lexbuf;
535         node.start = this.lexsize;
536         addCharToLexer('\n');
537         node.end = this.lexsize;
538         return node;
539     }
540 
541     /**
542      * Has end of input stream been reached?
543      * @return <code>true</code> if end of input stream been reached
544      */
545     public boolean endOfInput()
546     {
547         return this.in.isEndOfStream();
548     }
549 
550     /**
551      * Adds a byte to lexer buffer.
552      * @param c byte to add
553      */
554     public void addByte(int c)
555     {
556         if (this.lexsize + 1 >= this.lexlength)
557         {
558             while (this.lexsize + 1 >= this.lexlength)
559             {
560                 if (this.lexlength == 0)
561                 {
562                     this.lexlength = 8192;
563                 }
564                 else
565                 {
566                     this.lexlength = this.lexlength * 2;
567                 }
568             }
569 
570             byte[] temp = this.lexbuf;
571             this.lexbuf = new byte[this.lexlength];
572             if (temp != null)
573             {
574                 System.arraycopy(temp, 0, this.lexbuf, 0, temp.length);
575                 updateNodeTextArrays(temp, this.lexbuf);
576             }
577         }
578 
579         this.lexbuf[this.lexsize++] = (byte) c;
580         this.lexbuf[this.lexsize] = (byte) '\0'; // debug
581     }
582 
583     /**
584      * Substitute the last char in buffer.
585      * @param c new char
586      */
587     public void changeChar(byte c)
588     {
589         if (this.lexsize > 0)
590         {
591             this.lexbuf[this.lexsize - 1] = c;
592         }
593     }
594 
595     /**
596      * Store char c as UTF-8 encoded byte stream.
597      * @param c char to store
598      */
599     public void addCharToLexer(int c)
600     {
601         // Allow only valid XML characters. See: http://www.w3.org/TR/2004/REC-xml-20040204/#NT-Char
602         // Fix by Pablo Mayrgundter 17-08-2004
603 
604         if ((this.configuration.xmlOut || this.configuration.xHTML) // only for xml output
605             && !((c >= 0x20 && c <= 0xD7FF) // Check the common-case first.
606                 || c == 0x9
607                 || c == 0xA
608                 || c == 0xD // Then white-space.
609                 || (c >= 0xE000 && c <= 0xFFFD) // Then high-range unicode.
610             || (c >= 0x10000 && c <= 0x10FFFF)))
611         {
612             return;
613         }
614 
615         int i = 0;
616         int[] count = new int[]{0};
617         byte[] buf = new byte[10]; // unsigned char
618 
619         boolean err = EncodingUtils.encodeCharToUTF8Bytes(c, buf, null, count);
620         if (err)
621         {
622             // replacement char 0xFFFD encoded as UTF-8
623             buf[0] = (byte) 0xEF;
624             buf[1] = (byte) 0xBF;
625             buf[2] = (byte) 0xBD;
626             count[0] = 3;
627         }
628 
629         for (i = 0; i < count[0]; i++)
630         {
631             addByte(buf[i]); // uint
632         }
633 
634     }
635 
636     /**
637      * Adds a string to lexer buffer.
638      * @param str String to add
639      */
640     public void addStringToLexer(String str)
641     {
642         for (int i = 0; i < str.length(); i++)
643         {
644             addCharToLexer(str.charAt(i));
645         }
646     }
647 
648     /**
649      * Parse an html entity.
650      * @param mode mode
651      */
652     public void parseEntity(short mode)
653     {
654         // No longer attempts to insert missing ';' for unknown
655         // entities unless one was present already, since this
656         // gives unexpected results.
657         //
658         // For example: <a href="something.htm?foo&bar&fred">
659         // was tidied to: <a href="something.htm?foo&amp;bar;&amp;fred;">
660         // rather than: <a href="something.htm?foo&amp;bar&amp;fred">
661         //
662         // My thanks for Maurice Buxton for spotting this.
663         //
664         // Also Randy Waki pointed out the following case for the
665         // 04 Aug 00 version (bug #433012):
666         //
667         // For example: <a href="something.htm?id=1&lang=en">
668         // was tidied to: <a href="something.htm?id=1&lang;=en">
669         // rather than: <a href="something.htm?id=1&amp;lang=en">
670         //
671         // where "lang" is a known entity (#9001), but browsers would
672         // misinterpret "&lang;" because it had a value > 256.
673         //
674         // So the case of an apparently known entity with a value > 256 and
675         // missing a semicolon is handled specially.
676         //
677         // "ParseEntity" is also a bit of a misnomer - it handles entities and
678         // numeric character references. Invalid NCR's are now reported.
679 
680         int start;
681         boolean first = true;
682         boolean semicolon = false;
683         int c, ch, startcol;
684         String str;
685 
686         start = this.lexsize - 1; // to start at "&"
687         startcol = this.in.getCurcol() - 1;
688 
689         while ((c = this.in.readChar()) != StreamIn.END_OF_STREAM)
690         {
691             if (c == ';')
692             {
693                 semicolon = true;
694                 break;
695             }
696 
697             if (first && c == '#')
698             {
699                 // #431953 - start RJ
700                 if (!this.configuration.ncr
701                     || "BIG5".equals(this.configuration.getInCharEncodingName())
702                     || "SHIFTJIS".equals(this.configuration.getInCharEncodingName()))
703                 {
704                     this.in.ungetChar(c);
705                     return;
706                 }
707                 // #431953 - end RJ
708 
709                 addCharToLexer(c);
710                 first = false;
711                 continue;
712             }
713 
714             first = false;
715 
716             if (TidyUtils.isNamechar((char) c))
717             {
718                 addCharToLexer(c);
719                 continue;
720             }
721 
722             // otherwise put it back
723             this.in.ungetChar(c);
724             break;
725         }
726 
727         str = TidyUtils.getString(this.lexbuf, start, this.lexsize - start);
728 
729         if ("&apos".equals(str) && !configuration.xmlOut && !this.isvoyager && !configuration.xHTML)
730         {
731             report.entityError(this, Report.APOS_UNDEFINED, str, 39);
732         }
733 
734         ch = EntityTable.getDefaultEntityTable().entityCode(str);
735 
736         // drops invalid numeric entities from XML mode. Fix by Pablo Mayrgundter 17-08-2004
737         // if ((this.configuration.xmlOut || this.configuration.xHTML) // only for xml output
738         // && !((ch >= 0x20 && ch <= 0xD7FF) // Check the common-case first.
739         // || ch == 0x9 || ch == 0xA || ch == 0xD // Then white-space.
740         // || (ch >= 0xE000 && ch <= 0xFFFD)))
741         // {
742         // this.lexsize = start;
743         // return;
744         // }
745 
746         // deal with unrecognized or invalid entities
747         // #433012 - fix by Randy Waki 17 Feb 01
748         // report invalid NCR's - Terry Teague 01 Sep 01
749         if (ch <= 0 || (ch >= 256 && c != ';'))
750         {
751             // set error position just before offending character
752             this.lines = this.in.getCurline();
753             this.columns = startcol;
754 
755             if (this.lexsize > start + 1)
756             {
757                 if (ch >= 128 && ch <= 159)
758                 {
759                     // invalid numeric character reference
760                     int c1 = 0;
761 
762                     if ("WIN1252".equals(configuration.replacementCharEncoding))
763                     {
764                         c1 = EncodingUtils.decodeWin1252(ch);
765                     }
766                     else if ("MACROMAN".equals(configuration.replacementCharEncoding))
767                     {
768                         c1 = EncodingUtils.decodeMacRoman(ch);
769                     }
770 
771                     // "or" DISCARDED_CHAR with the other errors if discarding char; otherwise default is replacing
772 
773                     int replaceMode = c1 != 0 ? Report.REPLACED_CHAR : Report.DISCARDED_CHAR;
774 
775                     if (c != ';') /* issue warning if not terminated by ';' */
776                     {
777                         report.entityError(this, Report.MISSING_SEMICOLON_NCR, str, c);
778                     }
779 
780                     report.encodingError(this, (short) (Report.INVALID_NCR | replaceMode), ch);
781 
782                     if (c1 != 0)
783                     {
784                         // make the replacement
785                         this.lexsize = start;
786                         addCharToLexer(c1);
787                         semicolon = false;
788                     }
789                     else
790                     {
791                         /* discard */
792                         this.lexsize = start;
793                         semicolon = false;
794                     }
795 
796                 }
797                 else
798                 {
799                     report.entityError(this, Report.UNKNOWN_ENTITY, str, ch);
800                 }
801 
802                 if (semicolon)
803                 {
804                     addCharToLexer(';');
805                 }
806             }
807             else
808             {
809                 // naked &
810                 report.entityError(this, Report.UNESCAPED_AMPERSAND, str, ch);
811             }
812         }
813         else
814         {
815             // issue warning if not terminated by ';'
816             if (c != ';')
817             {
818                 // set error position just before offending character
819                 this.lines = this.in.getCurline();
820                 this.columns = startcol;
821                 report.entityError(this, Report.MISSING_SEMICOLON, str, c);
822             }
823 
824             this.lexsize = start;
825 
826             if (ch == 160 && TidyUtils.toBoolean(mode & PREFORMATTED))
827             {
828                 ch = ' ';
829             }
830 
831             addCharToLexer(ch);
832 
833             if (ch == '&' && !this.configuration.quoteAmpersand)
834             {
835                 addCharToLexer('a');
836                 addCharToLexer('m');
837                 addCharToLexer('p');
838                 addCharToLexer(';');
839             }
840         }
841     }
842 
843     /**
844      * Parses a tag name.
845      * @return first char after the tag name
846      */
847     public char parseTagName()
848     {
849         int c;
850 
851         // fold case of first char in buffer
852         c = this.lexbuf[this.txtstart];
853 
854         if (!this.configuration.xmlTags && TidyUtils.isUpper((char) c))
855         {
856             c = TidyUtils.toLower((char) c);
857             this.lexbuf[this.txtstart] = (byte) c;
858         }
859 
860         while ((c = this.in.readChar()) != StreamIn.END_OF_STREAM)
861         {
862             if (!TidyUtils.isNamechar((char) c))
863             {
864                 break;
865             }
866 
867             // fold case of subsequent chars
868             if (!this.configuration.xmlTags && TidyUtils.isUpper((char) c))
869             {
870                 c = TidyUtils.toLower((char) c);
871             }
872 
873             addCharToLexer(c);
874         }
875 
876         this.txtend = this.lexsize;
877         return (char) c;
878     }
879 
880     /**
881      * calls addCharToLexer for any char in the string.
882      * @param str input String
883      */
884     public void addStringLiteral(String str)
885     {
886         int len = str.length();
887         for (int i = 0; i < len; i++)
888         {
889             addCharToLexer(str.charAt(i));
890         }
891     }
892 
893     /**
894      * calls addCharToLexer for any char in the string till len is reached.
895      * @param str input String
896      * @param len length of the substring to be added
897      */
898     void addStringLiteralLen(String str, int len)
899     {
900         int strlen = str.length();
901         if (strlen < len)
902         {
903             len = strlen;
904         }
905         for (int i = 0; i < len; i++)
906         {
907             addCharToLexer(str.charAt(i));
908         }
909     }
910 
911     /**
912      * Choose what version to use for new doctype.
913      * @return html version constant
914      */
915     public short htmlVersion()
916     {
917         if (TidyUtils.toBoolean(versions & Dict.VERS_HTML20))
918         {
919             return Dict.VERS_HTML20;
920         }
921 
922         if (!(this.configuration.xmlOut | this.configuration.xmlTags | this.isvoyager)
923             && TidyUtils.toBoolean(versions & Dict.VERS_HTML32))
924         {
925             return Dict.VERS_HTML32;
926         }
927         if (TidyUtils.toBoolean(versions & Dict.VERS_XHTML11))
928         {
929             return Dict.VERS_XHTML11;
930         }
931         if (TidyUtils.toBoolean(versions & Dict.VERS_HTML40_STRICT))
932         {
933             return Dict.VERS_HTML40_STRICT;
934         }
935 
936         if (TidyUtils.toBoolean(versions & Dict.VERS_HTML40_LOOSE))
937         {
938             return Dict.VERS_HTML40_LOOSE;
939         }
940 
941         if (TidyUtils.toBoolean(versions & Dict.VERS_FRAMESET))
942         {
943             return Dict.VERS_FRAMESET;
944         }
945 
946         return Dict.VERS_UNKNOWN;
947     }
948 
949     /**
950      * Choose what version to use for new doctype.
951      * @return html version name
952      */
953     public String htmlVersionName()
954     {
955         short guessed;
956         int j;
957 
958         guessed = apparentVersion();
959 
960         for (j = 0; j < W3CVERSION.length; ++j)
961         {
962             if (guessed == W3CVERSION[j].code)
963             {
964                 if (this.isvoyager)
965                 {
966                     return W3CVERSION[j].voyagerName;
967                 }
968 
969                 return W3CVERSION[j].name;
970             }
971         }
972 
973         return null;
974     }
975 
976     /**
977      * Add meta element for Tidy. If the meta tag is already present, update release date.
978      * @param root root node
979      * @return <code>true</code> if the tag has been added
980      */
981     public boolean addGenerator(Node root)
982     {
983         AttVal attval;
984         Node node;
985         Node head = root.findHEAD(this.configuration.tt);
986 
987         if (head != null)
988         {
989             String meta = "HTML Tidy for Java (vers. " + Report.RELEASE_DATE_STRING + "), see www.w3.org";
990 
991             for (node = head.content; node != null; node = node.next)
992             {
993                 if (node.tag == this.configuration.tt.tagMeta)
994                 {
995                     attval = node.getAttrByName("name");
996 
997                     if (attval != null && attval.value != null && "generator".equalsIgnoreCase(attval.value))
998                     {
999                         attval = node.getAttrByName("content");
1000 
1001                         if (attval != null
1002                             && attval.value != null
1003                             && attval.value.length() >= 9
1004                             && "HTML Tidy".equalsIgnoreCase(attval.value.substring(0, 9)))
1005                         {
1006                             attval.value = meta;
1007                             return false;
1008                         }
1009                     }
1010                 }
1011             }
1012 
1013             node = this.inferredTag("meta");
1014             node.addAttribute("content", meta);
1015             node.addAttribute("name", "generator");
1016             head.insertNodeAtStart(node);
1017             return true;
1018         }
1019 
1020         return false;
1021     }
1022 
1023     /**
1024      * Check system keywords (keywords should be uppercase).
1025      * @param doctype doctype node
1026      * @return true if doctype keywords are all uppercase
1027      */
1028     public boolean checkDocTypeKeyWords(Node doctype)
1029     {
1030         int len = doctype.end - doctype.start;
1031         String s = TidyUtils.getString(this.lexbuf, doctype.start, len);
1032 
1033         return !(TidyUtils.findBadSubString("SYSTEM", s, len)
1034             || TidyUtils.findBadSubString("PUBLIC", s, len)
1035             || TidyUtils.findBadSubString("//DTD", s, len)
1036             || TidyUtils.findBadSubString("//W3C", s, len) || TidyUtils.findBadSubString("//EN", s, len));
1037     }
1038 
1039     /**
1040      * Examine DOCTYPE to identify version.
1041      * @param doctype doctype node
1042      * @return version code
1043      */
1044     public short findGivenVersion(Node doctype)
1045     {
1046         String p, s;
1047         int i, j;
1048         int len;
1049         String str1;
1050         String str2;
1051 
1052         // if root tag for doctype isn't html give up now
1053         str1 = TidyUtils.getString(this.lexbuf, doctype.start, 5);
1054         if (!"html ".equalsIgnoreCase(str1))
1055         {
1056             return 0;
1057         }
1058 
1059         if (!checkDocTypeKeyWords(doctype))
1060         {
1061             report.warning(this, doctype, null, Report.DTYPE_NOT_UPPER_CASE);
1062         }
1063 
1064         // give up if all we are given is the system id for the doctype
1065         str1 = TidyUtils.getString(this.lexbuf, doctype.start + 5, 7);
1066         if ("SYSTEM ".equalsIgnoreCase(str1))
1067         {
1068             // but at least ensure the case is correct
1069             if (!str1.substring(0, 6).equals("SYSTEM"))
1070             {
1071                 System.arraycopy(TidyUtils.getBytes("SYSTEM"), 0, this.lexbuf, doctype.start + 5, 6);
1072             }
1073             return 0; // unrecognized
1074         }
1075 
1076         if ("PUBLIC ".equalsIgnoreCase(str1))
1077         {
1078             if (!str1.substring(0, 6).equals("PUBLIC"))
1079             {
1080                 System.arraycopy(TidyUtils.getBytes("PUBLIC "), 0, this.lexbuf, doctype.start + 5, 6);
1081             }
1082         }
1083         else
1084         {
1085             this.badDoctype = true;
1086         }
1087 
1088         for (i = doctype.start; i < doctype.end; ++i)
1089         {
1090             if (this.lexbuf[i] == (byte) '"')
1091             {
1092                 str1 = TidyUtils.getString(this.lexbuf, i + 1, 12);
1093                 str2 = TidyUtils.getString(this.lexbuf, i + 1, 13);
1094                 if (str1.equals("-//W3C//DTD "))
1095                 {
1096                     // compute length of identifier e.g. "HTML 4.0 Transitional"
1097                     for (j = i + 13; j < doctype.end && this.lexbuf[j] != (byte) '/'; ++j)
1098                     {
1099                         //
1100                     }
1101                     len = j - i - 13;
1102                     p = TidyUtils.getString(this.lexbuf, i + 13, len);
1103 
1104                     for (j = 1; j < W3CVERSION.length; ++j)
1105                     {
1106                         s = W3CVERSION[j].name;
1107                         if (len == s.length() && s.equals(p))
1108                         {
1109                             return W3CVERSION[j].code;
1110                         }
1111                     }
1112 
1113                     // else unrecognized version
1114                 }
1115                 else if (str2.equals("-//IETF//DTD "))
1116                 {
1117                     // compute length of identifier e.g. "HTML 2.0"
1118                     for (j = i + 14; j < doctype.end && this.lexbuf[j] != (byte) '/'; ++j)
1119                     {
1120                         //
1121                     }
1122                     len = j - i - 14;
1123 
1124                     p = TidyUtils.getString(this.lexbuf, i + 14, len);
1125                     s = W3CVERSION[0].name;
1126                     if (len == s.length() && s.equals(p))
1127                     {
1128                         return W3CVERSION[0].code;
1129                     }
1130 
1131                     // else unrecognized version
1132                 }
1133                 break;
1134             }
1135         }
1136 
1137         return 0;
1138     }
1139 
1140     /**
1141      * Fix xhtml namespace.
1142      * @param root root Node
1143      * @param profile current profile
1144      */
1145     public void fixHTMLNameSpace(Node root, String profile)
1146     {
1147         Node node;
1148         AttVal attr;
1149 
1150         node = root.content;
1151         while (node != null && node.tag != this.configuration.tt.tagHtml)
1152         {
1153             node = node.next;
1154         }
1155 
1156         if (node != null)
1157         {
1158 
1159             for (attr = node.attributes; attr != null; attr = attr.next)
1160             {
1161                 if (attr.attribute.equals("xmlns"))
1162                 {
1163                     break;
1164                 }
1165 
1166             }
1167 
1168             if (attr != null)
1169             {
1170                 if (!attr.value.equals(profile))
1171                 {
1172                     report.warning(this, node, null, Report.INCONSISTENT_NAMESPACE);
1173                     attr.value = profile;
1174                 }
1175             }
1176             else
1177             {
1178                 attr = new AttVal(node.attributes, null, '"', "xmlns", profile);
1179                 attr.dict = AttributeTable.getDefaultAttributeTable().findAttribute(attr);
1180                 node.attributes = attr;
1181             }
1182         }
1183     }
1184 
1185     /**
1186      * Put DOCTYPE declaration between the &lt:?xml version "1.0" ... ?&gt; declaration, if any, and the
1187      * <code>html</code> tag. Should also work for any comments, etc. that may precede the <code>html</code> tag.
1188      * @param root root node
1189      * @return new doctype node
1190      */
1191     Node newXhtmlDocTypeNode(Node root)
1192     {
1193         Node html = root.findHTML(this.configuration.tt);
1194         if (html == null)
1195         {
1196             return null;
1197         }
1198 
1199         Node newdoctype = newNode();
1200         newdoctype.setType(Node.DOCTYPE_TAG);
1201         newdoctype.next = html;
1202         newdoctype.parent = root;
1203         newdoctype.prev = null;
1204 
1205         if (html == root.content)
1206         {
1207             // No <?xml ... ?> declaration.
1208             root.content.prev = newdoctype;
1209             root.content = newdoctype;
1210             newdoctype.prev = null;
1211         }
1212         else
1213         {
1214             // we have an <?xml ... ?> declaration.
1215             newdoctype.prev = html.prev;
1216             newdoctype.prev.next = newdoctype;
1217         }
1218         html.prev = newdoctype;
1219         return newdoctype;
1220     }
1221 
1222     /**
1223      * Adds a new xhtml doctype to the document.
1224      * @param root root node
1225      * @return <code>true</code> if a doctype has been added
1226      */
1227     public boolean setXHTMLDocType(Node root)
1228     {
1229         String fpi = " ";
1230         String sysid = "";
1231         String namespace = XHTML_NAMESPACE;
1232         String dtdsub = null;
1233         Node doctype;
1234         int dtdlen = 0;
1235 
1236         doctype = root.findDocType();
1237 
1238         fixHTMLNameSpace(root, namespace); // #427839 - fix by Evan Lenz 05 Sep 00
1239 
1240         if (this.configuration.docTypeMode == Configuration.DOCTYPE_OMIT)
1241         {
1242             if (doctype != null)
1243             {
1244                 Node.discardElement(doctype);
1245             }
1246             return true;
1247         }
1248 
1249         if (this.configuration.docTypeMode == Configuration.DOCTYPE_AUTO)
1250         {
1251             // see what flavor of XHTML this document matches
1252             if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML40_STRICT))
1253             {
1254                 // use XHTML strict
1255                 fpi = "-//W3C//DTD XHTML 1.0 Strict//EN";
1256                 sysid = VOYAGER_STRICT;
1257             }
1258             else if (TidyUtils.toBoolean(this.versions & Dict.VERS_FRAMESET))
1259             {
1260                 // use XHTML frames
1261                 fpi = "-//W3C//DTD XHTML 1.0 Frameset//EN";
1262                 sysid = VOYAGER_FRAMESET;
1263             }
1264             else if (TidyUtils.toBoolean(this.versions & Dict.VERS_LOOSE))
1265             {
1266                 fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN";
1267                 sysid = VOYAGER_LOOSE;
1268             }
1269             else if (TidyUtils.toBoolean(this.versions & Dict.VERS_XHTML11))
1270             {
1271                 // use XHTML 1.1
1272                 fpi = "-//W3C//DTD XHTML 1.1//EN";
1273                 sysid = VOYAGER_11;
1274             }
1275             else
1276             {
1277                 // proprietary
1278                 fpi = null;
1279                 sysid = "";
1280                 if (doctype != null)// #473490 - fix by Bjšrn Hšhrmann 10 Oct 01
1281                 {
1282                     Node.discardElement(doctype);
1283                 }
1284             }
1285         }
1286         else if (this.configuration.docTypeMode == Configuration.DOCTYPE_STRICT)
1287         {
1288             fpi = "-//W3C//DTD XHTML 1.0 Strict//EN";
1289             sysid = VOYAGER_STRICT;
1290         }
1291         else if (this.configuration.docTypeMode == Configuration.DOCTYPE_LOOSE)
1292         {
1293             fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN";
1294             sysid = VOYAGER_LOOSE;
1295         }
1296 
1297         if (this.configuration.docTypeMode == Configuration.DOCTYPE_USER && this.configuration.docTypeStr != null)
1298         {
1299             fpi = this.configuration.docTypeStr;
1300             sysid = "";
1301         }
1302 
1303         if (fpi == null)
1304         {
1305             return false;
1306         }
1307 
1308         if (doctype != null)
1309         {
1310             // Look for internal DTD subset
1311             if (configuration.xHTML || configuration.xmlOut)
1312             {
1313 
1314                 int len = doctype.end - doctype.start + 1;
1315                 String start = TidyUtils.getString(this.lexbuf, doctype.start, len);
1316 
1317                 int dtdbeg = start.indexOf('[');
1318                 if (dtdbeg >= 0)
1319                 {
1320                     int dtdend = start.substring(dtdbeg).indexOf(']');
1321                     if (dtdend >= 0)
1322                     {
1323                         dtdlen = dtdend + 1;
1324                         dtdsub = start.substring(dtdbeg);
1325                     }
1326                 }
1327             }
1328         }
1329         else
1330         {
1331             if ((doctype = newXhtmlDocTypeNode(root)) == null)
1332             {
1333                 return false;
1334             }
1335         }
1336 
1337         this.txtstart = this.lexsize;
1338         this.txtend = this.lexsize;
1339 
1340         // add public identifier
1341         addStringLiteral("html PUBLIC ");
1342 
1343         // check if the fpi is quoted or not
1344         if (fpi.charAt(0) == '"')
1345         {
1346             addStringLiteral(fpi);
1347         }
1348         else
1349         {
1350             addStringLiteral("\"");
1351             addStringLiteral(fpi);
1352             addStringLiteral("\"");
1353         }
1354 
1355         if (this.configuration.wraplen != 0 && sysid.length() + 6 >= this.configuration.wraplen)
1356         {
1357             addStringLiteral("\n\"");
1358         }
1359         else
1360         {
1361             // FG: don't wrap
1362             addStringLiteral(" \"");
1363         }
1364 
1365         // add system identifier
1366         addStringLiteral(sysid);
1367         addStringLiteral("\"");
1368 
1369         if (dtdlen > 0 && dtdsub != null)
1370         {
1371             addCharToLexer(' ');
1372             addStringLiteralLen(dtdsub, dtdlen);
1373         }
1374 
1375         this.txtend = this.lexsize;
1376 
1377         int length = this.txtend - this.txtstart;
1378         doctype.textarray = new byte[length];
1379 
1380         System.arraycopy(this.lexbuf, this.txtstart, doctype.textarray, 0, length);
1381         doctype.start = 0;
1382         doctype.end = length;
1383 
1384         return false;
1385     }
1386 
1387     /**
1388      * Return the html version used in document.
1389      * @return version code
1390      */
1391     public short apparentVersion()
1392     {
1393         switch (this.doctype)
1394         {
1395             case Dict.VERS_UNKNOWN :
1396                 return htmlVersion();
1397 
1398             case Dict.VERS_HTML20 :
1399                 if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML20))
1400                 {
1401                     return Dict.VERS_HTML20;
1402                 }
1403 
1404                 break;
1405 
1406             case Dict.VERS_HTML32 :
1407                 if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML32))
1408                 {
1409                     return Dict.VERS_HTML32;
1410                 }
1411 
1412                 break; // to replace old version by new
1413 
1414             case Dict.VERS_HTML40_STRICT :
1415                 if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML40_STRICT))
1416                 {
1417                     return Dict.VERS_HTML40_STRICT;
1418                 }
1419 
1420                 break;
1421 
1422             case Dict.VERS_HTML40_LOOSE :
1423                 if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML40_LOOSE))
1424                 {
1425                     return Dict.VERS_HTML40_LOOSE;
1426                 }
1427 
1428                 break; // to replace old version by new
1429 
1430             case Dict.VERS_FRAMESET :
1431                 if (TidyUtils.toBoolean(this.versions & Dict.VERS_FRAMESET))
1432                 {
1433                     return Dict.VERS_FRAMESET;
1434                 }
1435 
1436                 break;
1437 
1438             case Dict.VERS_XHTML11 :
1439                 if (TidyUtils.toBoolean(this.versions & Dict.VERS_XHTML11))
1440                 {
1441                     return Dict.VERS_XHTML11;
1442                 }
1443 
1444                 break;
1445             default :
1446                 // should never reach here
1447                 break;
1448         }
1449 
1450         // kludge to avoid error appearing at end of file
1451         // it would be better to note the actual position
1452         // when first encountering the doctype declaration
1453 
1454         this.lines = 1;
1455         this.columns = 1;
1456 
1457         report.warning(this, null, null, Report.INCONSISTENT_VERSION);
1458         return this.htmlVersion();
1459     }
1460 
1461     /**
1462      * Fixup doctype if missing.
1463      * @param root root node
1464      * @return <code>false</code> if current version has not been identified
1465      */
1466     public boolean fixDocType(Node root)
1467     {
1468         Node doctype;
1469         int guessed = Dict.VERS_HTML40_STRICT, i;
1470 
1471         if (this.badDoctype)
1472         {
1473             report.warning(this, null, null, Report.MALFORMED_DOCTYPE);
1474         }
1475 
1476         doctype = root.findDocType();
1477 
1478         if (this.configuration.docTypeMode == Configuration.DOCTYPE_OMIT)
1479         {
1480             if (doctype != null)
1481             {
1482                 Node.discardElement(doctype);
1483             }
1484             return true;
1485         }
1486 
1487         if (this.configuration.xmlOut)
1488         {
1489             return true;
1490         }
1491 
1492         if (this.configuration.docTypeMode == Configuration.DOCTYPE_STRICT)
1493         {
1494             Node.discardElement(doctype);
1495             doctype = null;
1496             guessed = Dict.VERS_HTML40_STRICT;
1497         }
1498         else if (this.configuration.docTypeMode == Configuration.DOCTYPE_LOOSE)
1499         {
1500             Node.discardElement(doctype);
1501             doctype = null;
1502             guessed = Dict.VERS_HTML40_LOOSE;
1503         }
1504         else if (this.configuration.docTypeMode == Configuration.DOCTYPE_AUTO)
1505         {
1506             if (doctype != null)
1507             {
1508                 if (this.doctype == Dict.VERS_UNKNOWN)
1509                 {
1510                     return false;
1511                 }
1512 
1513                 switch (this.doctype)
1514                 {
1515                     case Dict.VERS_UNKNOWN :
1516                         return false;
1517 
1518                     case Dict.VERS_HTML20 :
1519                         if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML20))
1520                         {
1521                             return true;
1522                         }
1523 
1524                         break; // to replace old version by new
1525 
1526                     case Dict.VERS_HTML32 :
1527                         if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML32))
1528                         {
1529                             return true;
1530                         }
1531 
1532                         break; // to replace old version by new
1533 
1534                     case Dict.VERS_HTML40_STRICT :
1535                         if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML40_STRICT))
1536                         {
1537                             return true;
1538                         }
1539 
1540                         break; // to replace old version by new
1541 
1542                     case Dict.VERS_HTML40_LOOSE :
1543                         if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML40_LOOSE))
1544                         {
1545                             return true;
1546                         }
1547 
1548                         break; // to replace old version by new
1549 
1550                     case Dict.VERS_FRAMESET :
1551                         if (TidyUtils.toBoolean(this.versions & Dict.VERS_FRAMESET))
1552                         {
1553                             return true;
1554                         }
1555 
1556                         break; // to replace old version by new
1557 
1558                     case Dict.VERS_XHTML11 :
1559                         if (TidyUtils.toBoolean(this.versions & Dict.VERS_XHTML11))
1560                         {
1561                             return true;
1562                         }
1563 
1564                         break; // to replace old version by new
1565                     default :
1566                         // should never reach here
1567                         break;
1568                 }
1569 
1570                 // INCONSISTENT_VERSION warning is now issued by ApparentVersion()
1571             }
1572 
1573             // choose new doctype
1574             guessed = htmlVersion();
1575         }
1576 
1577         if (guessed == Dict.VERS_UNKNOWN)
1578         {
1579             return false;
1580         }
1581 
1582         // for XML use the Voyager system identifier
1583         if (this.configuration.xmlOut || this.configuration.xmlTags || this.isvoyager)
1584         {
1585             if (doctype != null)
1586             {
1587                 Node.discardElement(doctype);
1588             }
1589 
1590             fixHTMLNameSpace(root, XHTML_NAMESPACE);
1591 
1592             // Namespace is the same for all XHTML variants
1593             // Also, don't return yet. Still need to add DOCTYPE declaration.
1594             //
1595             // for (i = 0; i < W3CVersion.length; ++i)
1596             // {
1597             // if (guessed == W3CVersion[i].code)
1598             // {
1599             // fixHTMLNameSpace(root, W3CVersion[i].profile);
1600             // break;
1601             // }
1602             // }
1603             // return true;
1604         }
1605 
1606         if (doctype == null)
1607         {
1608             if ((doctype = newXhtmlDocTypeNode(root)) == null)
1609             {
1610                 return false;
1611             }
1612         }
1613 
1614         this.txtstart = this.lexsize;
1615         this.txtend = this.lexsize;
1616 
1617         // use the appropriate public identifier
1618         addStringLiteral("html PUBLIC ");
1619 
1620         if (this.configuration.docTypeMode == Configuration.DOCTYPE_USER
1621             && this.configuration.docTypeStr != null
1622             && this.configuration.docTypeStr.length() > 0)
1623         {
1624             // check if the fpi is quoted or not
1625             if (this.configuration.docTypeStr.charAt(0) == '"')
1626             {
1627                 addStringLiteral(this.configuration.docTypeStr);
1628             }
1629             else
1630             {
1631                 addStringLiteral("\""); // #431889 - fix by Dave Bryan 04 Jan 2001
1632                 addStringLiteral(this.configuration.docTypeStr);
1633                 addStringLiteral("\""); // #431889 - fix by Dave Bryan 04 Jan 2001
1634             }
1635         }
1636         else if (guessed == Dict.VERS_HTML20)
1637         {
1638             addStringLiteral("\"-//IETF//DTD HTML 2.0//EN\"");
1639         }
1640         else
1641         {
1642             addStringLiteral("\"-//W3C//DTD ");
1643 
1644             for (i = 0; i < W3CVERSION.length; ++i)
1645             {
1646                 if (guessed == W3CVERSION[i].code)
1647                 {
1648                     addStringLiteral(W3CVERSION[i].name);
1649                     break;
1650                 }
1651             }
1652 
1653             addStringLiteral("//EN\"");
1654         }
1655 
1656         this.txtend = this.lexsize;
1657 
1658         int length = this.txtend - this.txtstart;
1659         doctype.textarray = new byte[length];
1660 
1661         System.arraycopy(this.lexbuf, this.txtstart, doctype.textarray, 0, length);
1662         doctype.start = 0;
1663         doctype.end = length;
1664 
1665         return true;
1666     }
1667 
1668     /**
1669      * Ensure XML document starts with <code>&lt;?XML version="1.0"?&gt;</code>. Add encoding attribute if not using
1670      * ASCII or UTF-8 output.
1671      * @param root root node
1672      * @return always true
1673      */
1674     public boolean fixXmlDecl(Node root)
1675     {
1676         Node xml;
1677         AttVal version;
1678         AttVal encoding;
1679 
1680         if (root.content != null && root.content.type == Node.XML_DECL)
1681         {
1682             xml = root.content;
1683         }
1684         else
1685         {
1686             xml = newNode(Node.XML_DECL, this.lexbuf, 0, 0);
1687             xml.next = root.content;
1688 
1689             if (root.content != null)
1690             {
1691                 root.content.prev = xml;
1692                 xml.next = root.content;
1693             }
1694 
1695             root.content = xml;
1696         }
1697 
1698         version = xml.getAttrByName("version");
1699         encoding = xml.getAttrByName("encoding");
1700 
1701         // We need to insert a check if declared encoding and output encoding mismatch
1702         // and fix the Xml declaration accordingly!!!
1703         if (encoding == null && !"UTF8".equals(this.configuration.getOutCharEncodingName()))
1704         {
1705             if ("ISO8859_1".equals(this.configuration.getOutCharEncodingName()))
1706             {
1707                 xml.addAttribute("encoding", "iso-8859-1");
1708             }
1709             if ("ISO2022".equals(this.configuration.getOutCharEncodingName()))
1710             {
1711                 xml.addAttribute("encoding", "iso-2022");
1712             }
1713         }
1714 
1715         if (version == null)
1716         {
1717             xml.addAttribute("version", "1.0");
1718         }
1719 
1720         return true;
1721     }
1722 
1723     /**
1724      * Generates and inserts a new node.
1725      * @param name tag name
1726      * @return generated node
1727      */
1728     public Node inferredTag(String name)
1729     {
1730         Node node;
1731 
1732         node = newNode(Node.START_TAG, this.lexbuf, this.txtstart, this.txtend, name);
1733         node.implicit = true;
1734         return node;
1735     }
1736 
1737     /**
1738      * Create a text node for the contents of a CDATA element like style or script which ends with &lt;/foo> for some
1739      * foo.
1740      * @param container container node
1741      * @return cdata node
1742      */
1743     public Node getCDATA(Node container)
1744     {
1745         int c, lastc, start, len, i;
1746         int qt = 0;
1747         int esc = 0;
1748         String str;
1749         boolean endtag = false;
1750         boolean begtag = false;
1751 
1752         if (container.isJavaScript())
1753         {
1754             esc = '\\';
1755         }
1756 
1757         this.lines = this.in.getCurline();
1758         this.columns = this.in.getCurcol();
1759         this.waswhite = false;
1760         this.txtstart = this.lexsize;
1761         this.txtend = this.lexsize;
1762 
1763         lastc = '\0';
1764         start = -1;
1765 
1766         while ((c = this.in.readChar()) != StreamIn.END_OF_STREAM)
1767         {
1768             // treat \r\n as \n and \r as \n
1769             if (qt > 0)
1770             {
1771                 // #598860 script parsing fails with quote chars
1772                 // A quoted string is ended by the quotation character, or end of line
1773                 if ((c == '\r' || c == '\n' || c == qt) && (!TidyUtils.toBoolean(esc) || lastc != esc))
1774                 {
1775                     qt = 0;
1776                 }
1777                 else if (c == '/' && lastc == '<')
1778                 {
1779                     start = this.lexsize + 1; // to first letter
1780                 }
1781 
1782                 else if (c == '>' && start >= 0)
1783                 {
1784                     len = this.lexsize - start;
1785 
1786                     this.lines = this.in.getCurline();
1787                     this.columns = this.in.getCurcol() - 3;
1788 
1789                     report.warning(this, null, null, Report.BAD_CDATA_CONTENT);
1790 
1791                     // if javascript insert backslash before /
1792                     if (TidyUtils.toBoolean(esc))
1793                     {
1794                         for (i = this.lexsize; i > start - 1; --i)
1795                         {
1796                             this.lexbuf[i] = this.lexbuf[i - 1];
1797                         }
1798 
1799                         this.lexbuf[start - 1] = (byte) esc;
1800                         this.lexsize++;
1801                     }
1802 
1803                     start = -1;
1804                 }
1805             }
1806             else if (TidyUtils.isQuote(c) && (!TidyUtils.toBoolean(esc) || lastc != esc))
1807             {
1808                 qt = c;
1809             }
1810             else if (c == '<')
1811             {
1812                 start = this.lexsize + 1; // to first letter
1813                 endtag = false;
1814                 begtag = true;
1815             }
1816             else if (c == '!' && lastc == '<') // Cancel start tag
1817             {
1818                 start = -1;
1819                 endtag = false;
1820                 begtag = false;
1821             }
1822             else if (c == '/' && lastc == '<')
1823             {
1824                 start = this.lexsize + 1; // to first letter
1825                 endtag = true;
1826                 begtag = false;
1827             }
1828             else if (c == '>' && start >= 0) // End of begin or end tag
1829             {
1830                 int decr = 2;
1831 
1832                 if (endtag && ((len = this.lexsize - start) == container.element.length()))
1833                 {
1834 
1835                     str = TidyUtils.getString(this.lexbuf, start, len);
1836                     if (container.element.equalsIgnoreCase(str))
1837                     {
1838                         this.txtend = start - decr;
1839                         this.lexsize = start - decr; // #433857 - fix by Huajun Zeng 26 Apr 01
1840                         break;
1841                     }
1842                 }
1843 
1844                 // Unquoted markup will end SCRIPT or STYLE elements
1845 
1846                 this.lines = this.in.getCurline();
1847                 this.columns = this.in.getCurcol() - 3;
1848 
1849                 report.warning(this, null, null, Report.BAD_CDATA_CONTENT);
1850                 if (begtag)
1851                 {
1852                     decr = 1;
1853                 }
1854                 this.txtend = start - decr;
1855                 this.lexsize = start - decr;
1856                 break;
1857             }
1858             // #427844 - fix by Markus Hoenicka 21 Oct 00
1859             else if (c == '\r')
1860             {
1861                 if (begtag || endtag)
1862                 {
1863                     continue; // discard whitespace in endtag
1864                 }
1865 
1866                 c = this.in.readChar();
1867 
1868                 if (c != '\n')
1869                 {
1870                     this.in.ungetChar(c);
1871                 }
1872 
1873                 c = '\n';
1874 
1875             }
1876             else if ((c == '\n' || c == '\t' || c == ' ') && (begtag || endtag))
1877             {
1878                 continue; // discard whitespace in endtag
1879             }
1880 
1881             addCharToLexer(c);
1882             this.txtend = this.lexsize;
1883             lastc = c;
1884         }
1885 
1886         if (c == StreamIn.END_OF_STREAM)
1887         {
1888             report.warning(this, container, null, Report.MISSING_ENDTAG_FOR);
1889         }
1890 
1891         if (this.txtend > this.txtstart)
1892         {
1893             this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend);
1894             return this.token;
1895         }
1896 
1897         return null;
1898     }
1899 
1900     /**
1901      * Pushes the current token back by setting the <code>pushed</code> flag; <code>getToken</code> checks this
1902      * flag on entry and may then return the already stored <code>token</code> again instead of reading a new one.
1903      */
1904     public void ungetToken()
1905     {
1906         this.pushed = true;
1907     }
1908 
1909     /**
1910      * Gets a token.
1911      * @param mode one of the following:
1912      * <ul>
1913      * <li><code>MixedContent</code>-- for elements which don't accept PCDATA</li>
1914      * <li><code>Preformatted</code>-- white space preserved as is</li>
1915      * <li><code>IgnoreMarkup</code>-- for CDATA elements such as script, style</li>
1916      * </ul>
1917      * @return next Node
1918      */
1919     public Node getToken(short mode)
1920     {
1921         int c = 0;
1922         int badcomment = 0;
1923         // pass by reference
1924         boolean[] isempty = new boolean[1];
1925         boolean inDTDSubset = false;
1926         AttVal attributes = null;
1927 
1928         if (this.pushed)
1929         {
1930             // duplicate inlines in preference to pushed text nodes when appropriate
1931             if (this.token.type != Node.TEXT_NODE || (this.insert == -1 && this.inode == null))
1932             {
1933                 this.pushed = false;
1934                 return this.token;
1935             }
1936         }
1937 
1938         // at start of block elements, unclosed inline
1939         if (this.insert != -1 || this.inode != null)
1940         {
1941             return insertedToken();
1942         }
1943 
1944         this.lines = this.in.getCurline();
1945         this.columns = this.in.getCurcol();
1946         this.waswhite = false;
1947 
1948         this.txtstart = this.lexsize;
1949         this.txtend = this.lexsize;
1950 
1951         while ((c = this.in.readChar()) != StreamIn.END_OF_STREAM)
1952         {
1953             // FG fix for [427846] different from tidy
1954             // if (this.insertspace && (!TidyUtils.toBoolean(mode & IGNORE_WHITESPACE)))
1955             if (this.insertspace && mode != IGNORE_WHITESPACE)
1956             {
1957                 addCharToLexer(' ');
1958             }
1959             if (this.insertspace && (!TidyUtils.toBoolean(mode & IGNORE_WHITESPACE)))
1960             {
1961                 this.waswhite = true;
1962                 this.insertspace = false;
1963             }
1964 
1965             // treat \r\n as \n and \r as \n
1966             if (c == '\r')
1967             {
1968                 c = this.in.readChar();
1969 
1970                 if (c != '\n')
1971                 {
1972                     this.in.ungetChar(c);
1973                 }
1974 
1975                 c = '\n';
1976             }
1977 
1978             addCharToLexer(c);
1979 
1980             switch (this.state)
1981             {
1982                 case LEX_CONTENT :
1983                     // element content
1984 
1985                     // Discard white space if appropriate.
1986                     // Its cheaper to do this here rather than in parser methods for elements that
1987                     // don't have mixed content.
1988                     if (TidyUtils.isWhite((char) c) && (mode == IGNORE_WHITESPACE) && this.lexsize == this.txtstart + 1)
1989                     {
1990                         --this.lexsize;
1991                         this.waswhite = false;
1992                         this.lines = this.in.getCurline();
1993                         this.columns = this.in.getCurcol();
1994                         continue;
1995                     }
1996 
1997                     if (c == '<')
1998                     {
1999                         this.state = LEX_GT;
2000                         continue;
2001                     }
2002 
2003                     if (TidyUtils.isWhite((char) c))
2004                     {
2005                         // was previous char white?
2006                         if (this.waswhite)
2007                         {
2008                             if (mode != PREFORMATTED && mode != IGNORE_MARKUP)
2009                             {
2010                                 --this.lexsize;
2011                                 this.lines = this.in.getCurline();
2012                                 this.columns = this.in.getCurcol();
2013                             }
2014                         }
2015                         else
2016                         {
2017                             // prev char wasn't white
2018                             this.waswhite = true;
2019 
2020                             if (mode != PREFORMATTED && mode != IGNORE_MARKUP && c != ' ')
2021                             {
2022                                 changeChar((byte) ' ');
2023                             }
2024                         }
2025 
2026                         continue;
2027                     }
2028                     else if (c == '&' && mode != IGNORE_MARKUP)
2029                     {
2030                         parseEntity(mode);
2031                     }
2032 
2033                     // this is needed to avoid trimming trailing whitespace
2034                     if (mode == IGNORE_WHITESPACE)
2035                     {
2036                         mode = MIXED_CONTENT;
2037                     }
2038 
2039                     this.waswhite = false;
2040                     continue;
2041 
2042                 case LEX_GT :
2043                     // <
2044 
2045                     // check for endtag
2046                     if (c == '/')
2047                     {
2048                         c = this.in.readChar();
2049                         if (c == StreamIn.END_OF_STREAM)
2050                         {
2051                             this.in.ungetChar(c);
2052                             continue;
2053                         }
2054 
2055                         addCharToLexer(c);
2056 
2057                         if (TidyUtils.isLetter((char) c))
2058                         {
2059                             this.lexsize -= 3;
2060                             this.txtend = this.lexsize;
2061                             this.in.ungetChar(c);
2062                             this.state = LEX_ENDTAG;
2063                             this.lexbuf[this.lexsize] = (byte) '\0'; // debug
2064 
2065                             // changed from
2066                             // this.in.curcol -= 2;
2067                             this.columns -= 2;
2068 
2069                             // if some text before the </ return it now
2070                             if (this.txtend > this.txtstart)
2071                             {
2072                                 // trim space char before end tag
2073                                 if (mode == IGNORE_WHITESPACE && this.lexbuf[this.lexsize - 1] == (byte) ' ')
2074                                 {
2075                                     this.lexsize -= 1;
2076                                     this.txtend = this.lexsize;
2077                                 }
2078 
2079                                 this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend);
2080                                 return this.token;
2081                             }
2082 
2083                             continue; // no text so keep going
2084                         }
2085 
2086                         // otherwise treat as CDATA
2087                         this.waswhite = false;
2088                         this.state = LEX_CONTENT;
2089                         continue;
2090                     }
2091 
2092                     if (mode == IGNORE_MARKUP)
2093                     {
2094                         // otherwise treat as CDATA
2095                         this.waswhite = false;
2096                         this.state = LEX_CONTENT;
2097                         continue;
2098                     }
2099 
2100                     // look out for comments, doctype or marked sections this isn't quite right, but its getting there
2101                     if (c == '!')
2102                     {
2103                         c = this.in.readChar();
2104 
2105                         if (c == '-')
2106                         {
2107                             c = this.in.readChar();
2108 
2109                             if (c == '-')
2110                             {
2111                                 this.state = LEX_COMMENT; // comment
2112                                 this.lexsize -= 2;
2113                                 this.txtend = this.lexsize;
2114 
2115                                 // if some text before < return it now
2116                                 if (this.txtend > this.txtstart)
2117                                 {
2118                                     this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend);
2119                                     return this.token;
2120                                 }
2121 
2122                                 this.txtstart = this.lexsize;
2123                                 continue;
2124                             }
2125 
2126                             report.warning(this, null, null, Report.MALFORMED_COMMENT);
2127                         }
2128                         else if (c == 'd' || c == 'D')
2129                         {
2130                             this.state = LEX_DOCTYPE; // doctype
2131                             this.lexsize -= 2;
2132                             this.txtend = this.lexsize;
2133                             mode = IGNORE_WHITESPACE;
2134 
2135                             // skip until white space or '>'
2136 
2137                             for (;;)
2138                             {
2139                                 c = this.in.readChar();
2140 
2141                                 if (c == StreamIn.END_OF_STREAM || c == '>')
2142                                 {
2143                                     this.in.ungetChar(c);
2144                                     break;
2145                                 }
2146 
2147                                 if (!TidyUtils.isWhite((char) c))
2148                                 {
2149                                     continue;
2150                                 }
2151 
2152                                 // and skip to end of whitespace
2153 
2154                                 for (;;)
2155                                 {
2156                                     c = this.in.readChar();
2157 
2158                                     if (c == StreamIn.END_OF_STREAM || c == '>')
2159                                     {
2160                                         this.in.ungetChar(c);
2161                                         break;
2162                                     }
2163 
2164                                     if (TidyUtils.isWhite((char) c))
2165                                     {
2166                                         continue;
2167                                     }
2168 
2169                                     this.in.ungetChar(c);
2170                                     break;
2171                                 }
2172 
2173                                 break;
2174                             }
2175 
2176                             // if some text before < return it now
2177                             if (this.txtend > this.txtstart)
2178                             {
2179                                 this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend);
2180                                 return this.token;
2181                             }
2182 
2183                             this.txtstart = this.lexsize;
2184                             continue;
2185                         }
2186                         else if (c == '[')
2187                         {
2188                             // Word 2000 embeds <![if ...]> ... <![endif]> sequences
2189                             this.lexsize -= 2;
2190                             this.state = LEX_SECTION;
2191                             this.txtend = this.lexsize;
2192 
2193                             // if some text before < return it now
2194                             if (this.txtend > this.txtstart)
2195                             {
2196                                 this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend);
2197                                 return this.token;
2198                             }
2199 
2200                             this.txtstart = this.lexsize;
2201                             continue;
2202                         }
2203 
2204                         // otherwise swallow chars up to and including next '>'
2205                         while (true)
2206                         {
2207                             c = this.in.readChar();
2208                             if (c == '>')
2209                             {
2210                                 break;
2211                             }
2212                             if (c == -1)
2213                             {
2214                                 this.in.ungetChar(c);
2215                                 break;
2216                             }
2217                         }
2218 
2219                         this.lexsize -= 2;
2220                         this.lexbuf[this.lexsize] = (byte) '\0';
2221                         this.state = LEX_CONTENT;
2222                         continue;
2223                     }
2224 
2225                     // processing instructions
2226 
2227                     if (c == '?')
2228                     {
2229                         this.lexsize -= 2;
2230                         this.state = LEX_PROCINSTR;
2231                         this.txtend = this.lexsize;
2232 
2233                         // if some text before < return it now
2234                         if (this.txtend > this.txtstart)
2235                         {
2236                             this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend);
2237                             return this.token;
2238                         }
2239 
2240                         this.txtstart = this.lexsize;
2241                         continue;
2242                     }
2243 
2244                     // Microsoft ASP's e.g. <% ... server-code ... %>
2245                     if (c == '%')
2246                     {
2247                         this.lexsize -= 2;
2248                         this.state = LEX_ASP;
2249                         this.txtend = this.lexsize;
2250 
2251                         // if some text before < return it now
2252                         if (this.txtend > this.txtstart)
2253                         {
2254                             this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend);
2255                             return this.token;
2256                         }
2257 
2258                         this.txtstart = this.lexsize;
2259                         continue;
2260                     }
2261 
2262                     // Netscapes JSTE e.g. <# ... server-code ... #>
2263                     if (c == '#')
2264                     {
2265                         this.lexsize -= 2;
2266                         this.state = LEX_JSTE;
2267                         this.txtend = this.lexsize;
2268 
2269                         // if some text before < return it now
2270                         if (this.txtend > this.txtstart)
2271                         {
2272                             this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend);
2273                             return this.token;
2274                         }
2275 
2276                         this.txtstart = this.lexsize;
2277                         continue;
2278                     }
2279 
2280                     // check for start tag
2281                     if (TidyUtils.isLetter((char) c))
2282                     {
2283                         this.in.ungetChar(c); // push back letter
2284                         this.lexsize -= 2; // discard " <" + letter
2285                         this.txtend = this.lexsize;
2286                         this.state = LEX_STARTTAG; // ready to read tag name
2287 
2288                         // if some text before < return it now
2289                         if (this.txtend > this.txtstart)
2290                         {
2291                             this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend);
2292                             return this.token;
2293                         }
2294 
2295                         continue; // no text so keep going
2296                     }
2297 
2298                     // otherwise treat as CDATA
2299                     this.state = LEX_CONTENT;
2300                     this.waswhite = false;
2301                     continue;
2302 
2303                 case LEX_ENDTAG :
2304                     // </letter
2305                     this.txtstart = this.lexsize - 1;
2306 
2307                     // changed from
2308                     // this.in.curcol -= 2;
2309                     this.columns -= 2;
2310 
2311                     c = parseTagName();
2312                     this.token = newNode(Node.END_TAG, // create endtag token
2313                         this.lexbuf,
2314                         this.txtstart,
2315                         this.txtend,
2316                         TidyUtils.getString(this.lexbuf, this.txtstart, this.txtend - this.txtstart));
2317                     this.lexsize = this.txtstart;
2318                     this.txtend = this.txtstart;
2319 
2320                     // skip to '>'
2321                     while (c != '>')
2322                     {
2323                         c = this.in.readChar();
2324 
2325                         if (c == StreamIn.END_OF_STREAM)
2326                         {
2327                             break;
2328                         }
2329                     }
2330 
2331                     if (c == StreamIn.END_OF_STREAM)
2332                     {
2333                         this.in.ungetChar(c);
2334                         continue;
2335                     }
2336 
2337                     this.state = LEX_CONTENT;
2338                     this.waswhite = false;
2339                     return this.token; // the endtag token
2340 
2341                 case LEX_STARTTAG :
2342                     // first letter of tagname
2343                     this.txtstart = this.lexsize - 1; // set txtstart to first letter
2344                     c = parseTagName();
2345                     isempty[0] = false;
2346                     attributes = null;
2347                     this.token = newNode(
2348                         (isempty[0] ? Node.START_END_TAG : Node.START_TAG),
2349                         this.lexbuf,
2350                         this.txtstart,
2351                         this.txtend,
2352                         TidyUtils.getString(this.lexbuf, this.txtstart, this.txtend - this.txtstart));
2353 
2354                     // parse attributes, consuming closing ">"
2355                     if (c != '>')
2356                     {
2357                         if (c == '/')
2358                         {
2359                             this.in.ungetChar(c);
2360                         }
2361 
2362                         attributes = parseAttrs(isempty);
2363                     }
2364 
2365                     if (isempty[0])
2366                     {
2367                         this.token.type = Node.START_END_TAG;
2368                     }
2369 
2370                     this.token.attributes = attributes;
2371                     this.lexsize = this.txtstart;
2372                     this.txtend = this.txtstart;
2373 
2374                     // swallow newline following start tag
2375                     // special check needed for CRLF sequence
2376                     // this doesn't apply to empty elements
2377                     // nor to preformatted content that needs escaping
2378 
2379                     if (
2380 
2381                     (mode != PREFORMATTED || preContent(this.token))
2382                         && (this.token.expectsContent() || this.token.tag == this.configuration.tt.tagBr))
2383                     {
2384 
2385                         c = this.in.readChar();
2386 
2387                         if (c == '\r')
2388                         {
2389                             c = this.in.readChar();
2390 
2391                             if (c != '\n')
2392                             {
2393                                 this.in.ungetChar(c);
2394                             }
2395                         }
2396                         else if (c != '\n' && c != '\f')
2397                         {
2398                             this.in.ungetChar(c);
2399                         }
2400 
2401                         this.waswhite = true; // to swallow leading whitespace
2402                     }
2403                     else
2404                     {
2405                         this.waswhite = false;
2406                     }
2407 
2408                     this.state = LEX_CONTENT;
2409 
2410                     if (this.token.tag == null)
2411                     {
2412                         report.error(this, null, this.token, Report.UNKNOWN_ELEMENT);
2413                     }
2414                     else if (!this.configuration.xmlTags)
2415                     {
2416                         constrainVersion(this.token.tag.versions);
2417 
2418                         if (TidyUtils.toBoolean(this.token.tag.versions & Dict.VERS_PROPRIETARY))
2419                         {
2420                             // #427810 - fix by Gary Deschaines 24 May 00
2421                             if (this.configuration.makeClean && (this.token.tag != this.configuration.tt.tagNobr && //
2422                                 this.token.tag != this.configuration.tt.tagWbr))
2423                             {
2424                                 report.warning(this, null, this.token, Report.PROPRIETARY_ELEMENT);
2425                             }
2426                             // #427810 - fix by Terry Teague 2 Jul 01
2427                             else if (!this.configuration.makeClean)
2428                             {
2429                                 report.warning(this, null, this.token, Report.PROPRIETARY_ELEMENT);
2430                             }
2431                         }
2432 
2433                         if (this.token.tag.getChkattrs() != null)
2434                         {
2435                             this.token.tag.getChkattrs().check(this, this.token);
2436                         }
2437                         else
2438                         {
2439                             this.token.checkAttributes(this);
2440                         }
2441 
2442                         // should this be called before attribute checks?
2443                         this.token.repairDuplicateAttributes(this);
2444 
2445                     }
2446 
2447                     return this.token; // return start tag
2448 
2449                 case LEX_COMMENT :
2450                     // seen <!-- so look for -->
2451 
2452                     if (c != '-')
2453                     {
2454                         continue;
2455                     }
2456 
2457                     c = this.in.readChar();
2458                     addCharToLexer(c);
2459 
2460                     if (c != '-')
2461                     {
2462                         continue;
2463                     }
2464 
2465                     end_comment : while (true)
2466                     {
2467                         c = this.in.readChar();
2468 
2469                         if (c == '>')
2470                         {
2471                             if (badcomment != 0)
2472                             {
2473                                 report.warning(this, null, null, Report.MALFORMED_COMMENT);
2474                             }
2475 
2476                             this.txtend = this.lexsize - 2; // AQ 8Jul2000
2477                             this.lexbuf[this.lexsize] = (byte) '\0';
2478                             this.state = LEX_CONTENT;
2479                             this.waswhite = false;
2480                             this.token = newNode(Node.COMMENT_TAG, this.lexbuf, this.txtstart, this.txtend);
2481 
2482                             // now look for a line break
2483 
2484                             c = this.in.readChar();
2485 
2486                             if (c == '\r')
2487                             {
2488                                 c = this.in.readChar();
2489 
2490                                 if (c != '\n')
2491                                 {
2492                                     this.token.linebreak = true;
2493                                 }
2494                             }
2495 
2496                             if (c == '\n')
2497                             {
2498                                 this.token.linebreak = true;
2499                             }
2500                             else
2501                             {
2502                                 this.in.ungetChar(c);
2503                             }
2504 
2505                             return this.token;
2506                         }
2507 
2508                         // note position of first such error in the comment
2509                         if (badcomment == 0)
2510                         {
2511                             this.lines = this.in.getCurline();
2512                             this.columns = this.in.getCurcol() - 3;
2513                         }
2514 
2515                         badcomment++;
2516                         if (this.configuration.fixComments)
2517                         {
2518                             this.lexbuf[this.lexsize - 2] = (byte) '=';
2519                         }
2520 
2521                         addCharToLexer(c);
2522 
2523                         // if '-' then look for '>' to end the comment
2524                         if (c != '-')
2525                         {
2526                             break end_comment;
2527                         }
2528 
2529                     }
2530                     // otherwise continue to look for -->
2531                     this.lexbuf[this.lexsize - 2] = (byte) '=';
2532                     continue;
2533 
2534                 case LEX_DOCTYPE :
2535                     // seen <!d so look for '> ' munging whitespace
2536 
2537                     if (TidyUtils.isWhite((char) c))
2538                     {
2539                         if (this.waswhite)
2540                         {
2541                             this.lexsize -= 1;
2542                         }
2543 
2544                         this.waswhite = true;
2545                     }
2546                     else
2547                     {
2548                         this.waswhite = false;
2549                     }
2550 
2551                     if (inDTDSubset)
2552                     {
2553                         if (c == ']')
2554                         {
2555                             inDTDSubset = false;
2556                         }
2557                     }
2558                     else if (c == '[')
2559                     {
2560                         inDTDSubset = true;
2561                     }
2562                     if (inDTDSubset || c != '>')
2563                     {
2564                         continue;
2565                     }
2566 
2567                     this.lexsize -= 1;
2568                     this.txtend = this.lexsize;
2569                     this.lexbuf[this.lexsize] = (byte) '\0';
2570                     this.state = LEX_CONTENT;
2571                     this.waswhite = false;
2572                     this.token = newNode(Node.DOCTYPE_TAG, this.lexbuf, this.txtstart, this.txtend);
2573                     // make a note of the version named by the doctype
2574                     this.doctype = findGivenVersion(this.token);
2575                     return this.token;
2576 
2577                 case LEX_PROCINSTR :
2578                     // seen <? so look for '> '
2579                     // check for PHP preprocessor instructions <?php ... ?>
2580 
2581                     if (this.lexsize - this.txtstart == 3)
2582                     {
2583                         if ((TidyUtils.getString(this.lexbuf, this.txtstart, 3)).equals("php"))
2584                         {
2585                             this.state = LEX_PHP;
2586                             continue;
2587                         }
2588                     }
2589 
2590                     if (this.lexsize - this.txtstart == 4)
2591                     {
2592                         if ((TidyUtils.getString(this.lexbuf, this.txtstart, 3)).equals("xml")
2593                             && TidyUtils.isWhite((char) this.lexbuf[this.txtstart + 3]))
2594                         {
2595                             this.state = LEX_XMLDECL;
2596                             attributes = null;
2597                             continue;
2598                         }
2599                     }
2600 
2601                     if (this.configuration.xmlPIs) // insist on ?> as terminator
2602                     {
2603                         if (c != '?')
2604                         {
2605                             continue;
2606                         }
2607 
2608                         // now look for '>'
2609                         c = this.in.readChar();
2610 
2611                         if (c == StreamIn.END_OF_STREAM)
2612                         {
2613                             report.warning(this, null, null, Report.UNEXPECTED_END_OF_FILE);
2614                             this.in.ungetChar(c);
2615                             continue;
2616                         }
2617 
2618                         addCharToLexer(c);
2619                     }
2620 
2621                     if (c != '>')
2622                     {
2623                         continue;
2624                     }
2625 
2626                     this.lexsize -= 1;
2627                     this.txtend = this.lexsize;
2628                     this.lexbuf[this.lexsize] = (byte) '\0';
2629                     this.state = LEX_CONTENT;
2630                     this.waswhite = false;
2631                     this.token = newNode(Node.PROC_INS_TAG, this.lexbuf, this.txtstart, this.txtend);
2632                     return this.token;
2633 
2634                 case LEX_ASP :
2635                     // seen <% so look for "%> "
2636                     if (c != '%')
2637                     {
2638                         continue;
2639                     }
2640 
2641                     // now look for '>'
2642                     c = this.in.readChar();
2643 
2644                     if (c != '>')
2645                     {
2646                         this.in.ungetChar(c);
2647                         continue;
2648                     }
2649 
2650                     this.lexsize -= 1;
2651                     this.txtend = this.lexsize;
2652                     this.lexbuf[this.lexsize] = (byte) '\0';
2653                     this.state = LEX_CONTENT;
2654                     this.waswhite = false;
2655                     this.token = newNode(Node.ASP_TAG, this.lexbuf, this.txtstart, this.txtend);
2656                     return this.token;
2657 
2658                 case LEX_JSTE :
2659                     // seen <# so look for "#> "
2660                     if (c != '#')
2661                     {
2662                         continue;
2663                     }
2664 
2665                     // now look for '>'
2666                     c = this.in.readChar();
2667 
2668                     if (c != '>')
2669                     {
2670                         this.in.ungetChar(c);
2671                         continue;
2672                     }
2673 
2674                     this.lexsize -= 1;
2675                     this.txtend = this.lexsize;
2676                     this.lexbuf[this.lexsize] = (byte) '\0';
2677                     this.state = LEX_CONTENT;
2678                     this.waswhite = false;
2679                     this.token = newNode(Node.JSTE_TAG, this.lexbuf, this.txtstart, this.txtend);
2680                     return this.token;
2681 
2682                 case LEX_PHP :
2683                     // seen " <?php" so look for "?> "
2684                     if (c != '?')
2685                     {
2686                         continue;
2687                     }
2688 
2689                     // now look for '>'
2690                     c = this.in.readChar();
2691 
2692                     if (c != '>')
2693                     {
2694                         this.in.ungetChar(c);
2695                         continue;
2696                     }
2697 
2698                     this.lexsize -= 1;
2699                     this.txtend = this.lexsize;
2700                     this.lexbuf[this.lexsize] = (byte) '\0';
2701                     this.state = LEX_CONTENT;
2702                     this.waswhite = false;
2703                     this.token = newNode(Node.PHP_TAG, this.lexbuf, this.txtstart, this.txtend);
2704                     return this.token;
2705 
2706                 case LEX_XMLDECL : // seen "<?xml" so look for "?>"
2707 
2708                     if (TidyUtils.isWhite((char) c) && c != '?')
2709                     {
2710                         continue;
2711                     }
2712 
2713                     // get pseudo-attribute
2714                     if (c != '?')
2715                     {
2716                         String name;
2717                         Node[] asp = new Node[1];
2718                         Node[] php = new Node[1];
2719                         AttVal av = new AttVal();
2720                         int[] pdelim = new int[1];
2721                         isempty[0] = false;
2722 
2723                         this.in.ungetChar(c);
2724 
2725                         name = this.parseAttribute(isempty, asp, php);
2726                         av.attribute = name;
2727 
2728                         av.value = this.parseValue(name, true, isempty, pdelim);
2729                         av.delim = pdelim[0];
2730                         av.next = attributes;
2731 
2732                         attributes = av;
2733                         // continue;
2734                     }
2735 
2736                     // now look for '>'
2737                     c = this.in.readChar();
2738 
2739                     if (c != '>')
2740                     {
2741                         this.in.ungetChar(c);
2742                         continue;
2743                     }
2744                     this.lexsize -= 1;
2745                     this.txtend = this.txtstart;
2746                     this.lexbuf[this.txtend] = '\0';
2747                     this.state = LEX_CONTENT;
2748                     this.waswhite = false;
2749                     this.token = newNode(Node.XML_DECL, this.lexbuf, this.txtstart, this.txtend);
2750                     this.token.attributes = attributes;
2751                     return this.token;
2752 
2753                 case LEX_SECTION :
2754                     // seen " <![" so look for "]> "
2755                     if (c == '[')
2756                     {
2757                         if (this.lexsize == (this.txtstart + 6)
2758                             && (TidyUtils.getString(this.lexbuf, this.txtstart, 6)).equals("CDATA["))
2759                         {
2760                             this.state = LEX_CDATA;
2761                             this.lexsize -= 6;
2762                             continue;
2763                         }
2764                     }
2765 
2766                     if (c != ']')
2767                     {
2768                         continue;
2769                     }
2770 
2771                     // now look for '>'
2772                     c = this.in.readChar();
2773 
2774                     if (c != '>')
2775                     {
2776                         this.in.ungetChar(c);
2777                         continue;
2778                     }
2779 
2780                     this.lexsize -= 1;
2781                     this.txtend = this.lexsize;
2782                     this.lexbuf[this.lexsize] = (byte) '\0';
2783                     this.state = LEX_CONTENT;
2784                     this.waswhite = false;
2785                     this.token = newNode(Node.SECTION_TAG, this.lexbuf, this.txtstart, this.txtend);
2786                     return this.token;
2787 
2788                 case LEX_CDATA :
2789                     // seen " <![CDATA[" so look for "]]> "
2790                     if (c != ']')
2791                     {
2792                         continue;
2793                     }
2794 
2795                     // now look for ']'
2796                     c = this.in.readChar();
2797 
2798                     if (c != ']')
2799                     {
2800                         this.in.ungetChar(c);
2801                         continue;
2802                     }
2803 
2804                     // now look for '>'
2805                     c = this.in.readChar();
2806 
2807                     if (c != '>')
2808                     {
2809                         this.in.ungetChar(c);
2810                         continue;
2811                     }
2812 
2813                     this.lexsize -= 1;
2814                     this.txtend = this.lexsize;
2815                     this.lexbuf[this.lexsize] = (byte) '\0';
2816                     this.state = LEX_CONTENT;
2817                     this.waswhite = false;
2818                     this.token = newNode(Node.CDATA_TAG, this.lexbuf, this.txtstart, this.txtend);
2819                     return this.token;
2820 
2821                 default :
2822                     // should never reach here
2823                     break;
2824             }
2825         }
2826 
2827         if (this.state == LEX_CONTENT) // text string
2828         {
2829             this.txtend = this.lexsize;
2830 
2831             if (this.txtend > this.txtstart)
2832             {
2833                 this.in.ungetChar(c);
2834 
2835                 if (this.lexbuf[this.lexsize - 1] == (byte) ' ')
2836                 {
2837                     this.lexsize -= 1;
2838                     this.txtend = this.lexsize;
2839                 }
2840 
2841                 this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend);
2842                 return this.token;
2843             }
2844         }
2845         else if (this.state == LEX_COMMENT) // comment
2846         {
2847             if (c == StreamIn.END_OF_STREAM)
2848             {
2849                 report.warning(this, null, null, Report.MALFORMED_COMMENT);
2850             }
2851 
2852             this.txtend = this.lexsize;
2853             this.lexbuf[this.lexsize] = (byte) '\0';
2854             this.state = LEX_CONTENT;
2855             this.waswhite = false;
2856             this.token = newNode(Node.COMMENT_TAG, this.lexbuf, this.txtstart, this.txtend);
2857             return this.token;
2858         }
2859 
2860         return null;
2861     }
2862 
2863     /**
2864      * parser for ASP within start tags Some people use ASP for to customize attributes Tidy isn't really well suited to
2865      * dealing with ASP This is a workaround for attributes, but won't deal with the case where the ASP is used to
2866      * tailor the attribute value. Here is an example of a work around for using ASP in attribute values:
2867      * <code>href='<%=rsSchool.Fields("ID").Value%>'</code> where the ASP that generates the attribute value is
2868      * masked from Tidy by the quotemarks.
2869      * @return parsed Node
2870      */
2871     public Node parseAsp()
2872     {
2873         int c;
2874         Node asp = null;
2875 
2876         this.txtstart = this.lexsize;
2877 
2878         while ((c = this.in.readChar()) != StreamIn.END_OF_STREAM)
2879         {
2880 
2881             addCharToLexer(c);
2882 
2883             if (c != '%')
2884             {
2885                 continue;
2886             }
2887 
2888             if ((c = this.in.readChar()) == StreamIn.END_OF_STREAM)
2889             {
2890                 break;
2891             }
2892             addCharToLexer(c);
2893 
2894             if (c == '>')
2895             {
2896                 break;
2897             }
2898         }
2899 
2900         this.lexsize -= 2;
2901         this.txtend = this.lexsize;
2902 
2903         if (this.txtend > this.txtstart)
2904         {
2905             asp = newNode(Node.ASP_TAG, this.lexbuf, this.txtstart, this.txtend);
2906         }
2907 
2908         this.txtstart = this.txtend;
2909         return asp;
2910     }
2911 
2912     /**
2913      * PHP is like ASP but is based upon XML processing instructions, e.g. <code>&lt;?php ... ?&gt;</code>.
2914      * @return parsed Node
2915      */
2916     public Node parsePhp()
2917     {
2918         int c;
2919         Node php = null;
2920 
2921         this.txtstart = this.lexsize;
2922 
2923         while ((c = this.in.readChar()) != StreamIn.END_OF_STREAM)
2924         {
2925             addCharToLexer(c);
2926 
2927             if (c != '?')
2928             {
2929                 continue;
2930             }
2931 
2932             if ((c = this.in.readChar()) == StreamIn.END_OF_STREAM)
2933             {
2934                 break;
2935             }
2936             addCharToLexer(c);
2937 
2938             if (c == '>')
2939             {
2940                 break;
2941             }
2942         }
2943 
2944         this.lexsize -= 2;
2945         this.txtend = this.lexsize;
2946 
2947         if (this.txtend > this.txtstart)
2948         {
2949             php = newNode(Node.PHP_TAG, this.lexbuf, this.txtstart, this.txtend);
2950         }
2951 
2952         this.txtstart = this.txtend;
2953         return php;
2954     }
2955 
2956     /**
2957      * consumes the '>' terminating start tags.
2958      * @param isempty flag is passed as array so it can be modified
2959      * @param asp asp Node, passed as array so it can be modified
2960      * @param php php Node, passed as array so it can be modified
2961      * @return parsed attribute
2962      */
2963     public String parseAttribute(boolean[] isempty, Node[] asp, Node[] php)
2964     {
2965         int start = 0;
2966         String attr;
2967         int c = 0;
2968         int lastc = 0;
2969 
2970         asp[0] = null; // clear asp pointer
2971         php[0] = null; // clear php pointer
2972         // skip white space before the attribute
2973 
2974         for (;;)
2975         {
2976             c = this.in.readChar();
2977 
2978             if (c == '/')
2979             {
2980                 c = this.in.readChar();
2981 
2982                 if (c == '>')
2983                 {
2984                     isempty[0] = true;
2985                     return null;
2986                 }
2987 
2988                 this.in.ungetChar(c);
2989                 c = '/';
2990                 break;
2991             }
2992 
2993             if (c == '>')
2994             {
2995                 return null;
2996             }
2997 
2998             if (c == '<')
2999             {
3000                 c = this.in.readChar();
3001 
3002                 if (c == '%')
3003                 {
3004                     asp[0] = parseAsp();
3005                     return null;
3006                 }
3007                 else if (c == '?')
3008                 {
3009                     php[0] = parsePhp();
3010                     return null;
3011                 }
3012 
3013                 this.in.ungetChar(c);
3014                 if (this.state != LEX_XMLDECL) // FG fix for 532535
3015                 {
3016                     this.in.ungetChar('<'); // fix for 433360
3017                 }
3018                 report.attrError(this, this.token, null, Report.UNEXPECTED_GT);
3019                 return null;
3020             }
3021 
3022             if (c == '=')
3023             {
3024                 report.attrError(this, this.token, null, Report.UNEXPECTED_EQUALSIGN);
3025                 continue;
3026             }
3027 
3028             if (c == '"' || c == '\'')
3029             {
3030                 report.attrError(this, this.token, null, Report.UNEXPECTED_QUOTEMARK);
3031                 continue;
3032             }
3033 
3034             if (c == StreamIn.END_OF_STREAM)
3035             {
3036                 report.attrError(this, this.token, null, Report.UNEXPECTED_END_OF_FILE);
3037                 this.in.ungetChar(c);
3038                 return null;
3039             }
3040 
3041             if (!TidyUtils.isWhite((char) c))
3042             {
3043                 break;
3044             }
3045         }
3046 
3047         start = this.lexsize;
3048         lastc = c;
3049 
3050         for (;;)
3051         {
3052             // but push back '=' for parseValue()
3053             if (c == '=' || c == '>')
3054             {
3055                 this.in.ungetChar(c);
3056                 break;
3057             }
3058 
3059             if (c == '<' || c == StreamIn.END_OF_STREAM)
3060             {
3061                 this.in.ungetChar(c);
3062                 break;
3063             }
3064             if (lastc == '-' && (c == '"' || c == '\''))
3065             {
3066                 this.lexsize--;
3067                 this.in.ungetChar(c);
3068                 break;
3069             }
3070             if (TidyUtils.isWhite((char) c))
3071             {
3072                 break;
3073             }
3074 
3075             // what should be done about non-namechar characters?
3076             // currently these are incorporated into the attr name
3077 
3078             if (!this.configuration.xmlTags && TidyUtils.isUpper((char) c))
3079             {
3080                 c = TidyUtils.toLower((char) c);
3081             }
3082 
3083             // ++len; #427672 - handle attribute names with multibyte chars - fix by Randy Waki - 10 Aug 00
3084             addCharToLexer(c);
3085 
3086             lastc = c;
3087             c = this.in.readChar();
3088         }
3089 
3090         // #427672 - handle attribute names with multibyte chars - fix by Randy Waki - 10 Aug 00
3091         int len = this.lexsize - start;
3092         attr = (len > 0 ? TidyUtils.getString(this.lexbuf, start, len) : null);
3093         this.lexsize = start;
3094 
3095         return attr;
3096     }
3097 
3098     /**
3099      * Invoked when &lt; is seen in place of attribute value but terminates on whitespace if not ASP, PHP or Tango this
3100      * routine recognizes ' and " quoted strings.
3101      * @return delimiter
3102      */
3103     public int parseServerInstruction()
3104     {
3105         int c, delim = '"';
3106         boolean isrule = false;
3107 
3108         c = this.in.readChar();
3109         addCharToLexer(c);
3110 
3111         // check for ASP, PHP or Tango
3112         if (c == '%' || c == '?' || c == '@')
3113         {
3114             isrule = true;
3115         }
3116 
3117         for (;;)
3118         {
3119             c = this.in.readChar();
3120 
3121             if (c == StreamIn.END_OF_STREAM)
3122             {
3123                 break;
3124             }
3125 
3126             if (c == '>')
3127             {
3128                 if (isrule)
3129                 {
3130                     addCharToLexer(c);
3131                 }
3132                 else
3133                 {
3134                     this.in.ungetChar(c);
3135                 }
3136 
3137                 break;
3138             }
3139 
3140             // if not recognized as ASP, PHP or Tango
3141             // then also finish value on whitespace
3142             if (!isrule)
3143             {
3144                 if (TidyUtils.isWhite((char) c))
3145                 {
3146                     break;
3147                 }
3148             }
3149 
3150             addCharToLexer(c);
3151 
3152             if (c == '"')
3153             {
3154                 do
3155                 {
3156                     c = this.in.readChar();
3157 
3158                     if (endOfInput()) // #427840 - fix by Terry Teague 30 Jun 01
3159                     {
3160                         report.attrError(this, this.token, null, Report.UNEXPECTED_END_OF_FILE);
3161                         this.in.ungetChar(c);
3162                         return 0;
3163                     }
3164                     if (c == '>') // #427840 - fix by Terry Teague 30 Jun 01
3165                     {
3166                         this.in.ungetChar(c);
3167                         report.attrError(this, this.token, null, Report.UNEXPECTED_GT);
3168                         return 0;
3169                     }
3170 
3171                     addCharToLexer(c);
3172                 }
3173                 while (c != '"');
3174                 delim = '\'';
3175                 continue;
3176             }
3177 
3178             if (c == '\'')
3179             {
3180                 do
3181                 {
3182                     c = this.in.readChar();
3183 
3184                     if (endOfInput()) // #427840 - fix by Terry Teague 30 Jun 01
3185                     {
3186                         report.attrError(this, this.token, null, Report.UNEXPECTED_END_OF_FILE);
3187                         this.in.ungetChar(c);
3188                         return 0;
3189                     }
3190                     if (c == '>') // #427840 - fix by Terry Teague 30 Jun 01
3191                     {
3192                         this.in.ungetChar(c);
3193                         report.attrError(this, this.token, null, Report.UNEXPECTED_GT);
3194                         return 0;
3195                     }
3196 
3197                     addCharToLexer(c);
3198                 }
3199                 while (c != '\'');
3200             }
3201         }
3202 
3203         return delim;
3204     }
3205 
3206     /**
3207      * Parse an attribute value.
3208      * @param name attribute name
3209      * @param foldCase fold case?
3210      * @param isempty is attribute empty? Passed as an array reference to allow modification
3211      * @param pdelim delimiter, passed as an array reference to allow modification
3212      * @return parsed value
3213      */
3214     public String parseValue(String name, boolean foldCase, boolean[] isempty, int[] pdelim)
3215     {
3216         // values start with "=" or " = " etc.
3217         // doesn't consume the ">" at end of start tag
3218 
3219         int len = 0;
3220         int start;
3221         boolean seenGt = false;
3222         boolean munge = true;
3223         int c = 0;
3224         int lastc, delim, quotewarning;
3225         String value;
3226 
3227         delim = 0;
3228         pdelim[0] = '"';
3229 
3230         // Henry Zrepa reports that some folk are using the embed element with script attributes where newlines are
3231         // significant and must be preserved
3232 
3233         if (this.configuration.literalAttribs)
3234         {
3235             munge = false;
3236         }
3237 
3238         // skip white space before the '='
3239         while (true)
3240         {
3241             c = this.in.readChar();
3242 
3243             if (c == StreamIn.END_OF_STREAM)
3244             {
3245                 this.in.ungetChar(c);
3246                 break;
3247             }
3248 
3249             if (!TidyUtils.isWhite((char) c))
3250             {
3251                 break;
3252             }
3253         }
3254 
3255         // c should be '=' if there is a value other legal possibilities are white space, '/' and '>'
3256 
3257         if (c != '=' && c != '"' && c != '\'')
3258         {
3259             this.in.ungetChar(c);
3260             return null;
3261         }
3262 
3263         // skip white space after '='
3264 
3265         while (true)
3266         {
3267             c = this.in.readChar();
3268 
3269             if (c == StreamIn.END_OF_STREAM)
3270             {
3271                 this.in.ungetChar(c);
3272                 break;
3273             }
3274 
3275             if (!TidyUtils.isWhite((char) c))
3276             {
3277                 break;
3278             }
3279         }
3280 
3281         // check for quote marks
3282 
3283         if (c == '"' || c == '\'')
3284         {
3285             delim = c;
3286         }
3287         else if (c == '<')
3288         {
3289             start = this.lexsize;
3290             addCharToLexer(c);
3291             pdelim[0] = parseServerInstruction();
3292             len = this.lexsize - start;
3293             this.lexsize = start;
3294             return (len > 0 ? TidyUtils.getString(this.lexbuf, start, len) : null);
3295         }
3296         else
3297         {
3298             this.in.ungetChar(c);
3299         }
3300 
3301         // and read the value string check for quote mark if needed
3302 
3303         quotewarning = 0;
3304         start = this.lexsize;
3305         c = '\0';
3306 
3307         while (true)
3308         {
3309             lastc = c; // track last character
3310             c = this.in.readChar();
3311 
3312             if (c == StreamIn.END_OF_STREAM)
3313             {
3314                 report.attrError(this, this.token, null, Report.UNEXPECTED_END_OF_FILE);
3315                 this.in.ungetChar(c);
3316                 break;
3317             }
3318 
3319             if (delim == (char) 0)
3320             {
3321                 if (c == '>')
3322                 {
3323                     this.in.ungetChar(c);
3324                     break;
3325                 }
3326 
3327                 if (c == '"' || c == '\'')
3328                 {
3329                     report.attrError(this, this.token, null, Report.UNEXPECTED_QUOTEMARK);
3330                     break;
3331                 }
3332 
3333                 if (c == '<')
3334                 {
3335                     this.in.ungetChar(c); // fix for 433360
3336                     c = '>';
3337                     this.in.ungetChar(c);
3338                     report.attrError(this, this.token, null, Report.UNEXPECTED_GT);
3339                     break;
3340                 }
3341 
3342                 // For cases like <br clear=all/> need to avoid treating /> as part of the attribute value, however
3343                 // care is needed to avoid so treating <a href=http://www.acme.com /> in this way, which would map the
3344                 // <a> tag to <a href="http://www.acme.com"/>
3345 
3346                 if (c == '/')
3347                 {
3348                     // peek ahead in case of />
3349                     c = this.in.readChar();
3350 
3351                     if (c == '>' && !AttributeTable.getDefaultAttributeTable().isUrl(name))
3352                     {
3353                         isempty[0] = true;
3354                         this.in.ungetChar(c);
3355                         break;
3356                     }
3357 
3358                     // unget peeked char
3359                     this.in.ungetChar(c);
3360                     c = '/';
3361                 }
3362             }
3363             else
3364             {
3365                 // delim is '\'' or '"'
3366                 if (c == delim)
3367                 {
3368                     break;
3369                 }
3370 
3371                 // treat CRLF, CR and LF as single line break
3372 
3373                 if (c == '\r')
3374                 {
3375                     c = this.in.readChar();
3376                     if (c != '\n')
3377                     {
3378                         this.in.ungetChar(c);
3379                     }
3380 
3381                     c = '\n';
3382                 }
3383 
3384                 if (c == '\n' || c == '<' || c == '>')
3385                 {
3386                     ++quotewarning;
3387                 }
3388 
3389                 if (c == '>')
3390                 {
3391                     seenGt = true;
3392                 }
3393             }
3394 
3395             if (c == '&')
3396             {
3397                 // no entities in ID attributes
3398                 if ("id".equalsIgnoreCase(name))
3399                 {
3400                     report.attrError(this, null, null, Report.ENTITY_IN_ID);
3401                     continue;
3402                 }
3403 
3404                 addCharToLexer(c);
3405                 parseEntity((short) 0);
3406                 continue;
3407 
3408             }
3409 
3410             // kludge for JavaScript attribute values with line continuations in string literals
3411 
3412             if (c == '\\')
3413             {
3414                 c = this.in.readChar();
3415 
3416                 if (c != '\n')
3417                 {
3418                     this.in.ungetChar(c);
3419                     c = '\\';
3420                 }
3421             }
3422 
3423             if (TidyUtils.isWhite((char) c))
3424             {
3425                 if (delim == (char) 0)
3426                 {
3427                     break;
3428                 }
3429 
3430                 if (munge)
3431                 {
3432                     // discard line breaks in quoted URLs
3433                     // #438650 - fix by Randy Waki
3434                     if (c == '\n' && AttributeTable.getDefaultAttributeTable().isUrl(name))
3435                     {
3436                         // warn that we discard this newline
3437                         report.attrError(this, this.token, null, Report.NEWLINE_IN_URI);
3438                         continue;
3439                     }
3440 
3441                     c = ' ';
3442 
3443                     if (lastc == ' ')
3444                     {
3445                         continue;
3446                     }
3447                 }
3448             }
3449             else if (foldCase && TidyUtils.isUpper((char) c))
3450             {
3451                 c = TidyUtils.toLower((char) c);
3452             }
3453 
3454             addCharToLexer(c);
3455         }
3456 
3457         if (quotewarning > 10 && seenGt && munge)
3458         {
3459             // there is almost certainly a missing trailing quote mark as we have see too many newlines, < or >
3460             // characters. an exception is made for Javascript attributes and the javascript URL scheme which may
3461             // legitimately include < and >, and for attributes starting with "<xml " as generated by Microsoft Office.
3462 
3463             if (!AttributeTable.getDefaultAttributeTable().isScript(name)
3464                 && !(AttributeTable.getDefaultAttributeTable().isUrl(name) && "javascript:".equals(TidyUtils.getString(
3465                     this.lexbuf,
3466                     start,
3467                     11)))
3468                 && !"<xml ".equals(TidyUtils.getString(this.lexbuf, start, 5))) // #500236 - fix by Klaus Johannes Rusch
3469             // 06 Jan 02
3470             {
3471                 report.error(this, null, null, Report.SUSPECTED_MISSING_QUOTE);
3472             }
3473         }
3474 
3475         len = this.lexsize - start;
3476         this.lexsize = start;
3477 
3478         if (len > 0 || delim != 0)
3479         {
3480             // ignore leading and trailing white space for all but title, alt, value and prompts attributes unless
3481             // --literal-attributes is set to yes
3482             // #994841 - Whitespace is removed from value attributes
3483 
3484             if (munge && !TidyUtils.isInValuesIgnoreCase(new String[]{"alt", "title", "value", "prompt"}, name))
3485             {
3486                 while (TidyUtils.isWhite((char) this.lexbuf[start + len - 1]))
3487                 {
3488                     --len;
3489                 }
3490 
3491                 while (TidyUtils.isWhite((char) this.lexbuf[start]) && start < len)
3492                 {
3493                     ++start;
3494                     --len;
3495                 }
3496             }
3497 
3498             value = TidyUtils.getString(this.lexbuf, start, len);
3499         }
3500         else
3501         {
3502             value = null;
3503         }
3504 
3505         // note delimiter if given
3506         if (delim != 0)
3507         {
3508             pdelim[0] = delim;
3509         }
3510         else
3511         {
3512             pdelim[0] = '"';
3513         }
3514 
3515         return value;
3516     }
3517 
3518     /**
3519      * Check if attr is a valid name.
3520      * @param attr String to check, must be non-null
3521      * @return <code>true</code> if attr is a valid name.
3522      */
3523     public static boolean isValidAttrName(String attr)
3524     {
3525         char c;
3526         int i;
3527 
3528         // first character should be a letter
3529         c = attr.charAt(0);
3530 
3531         if (!TidyUtils.isLetter(c))
3532         {
3533             return false;
3534         }
3535 
3536         // remaining characters should be namechars
3537         for (i = 1; i < attr.length(); i++)
3538         {
3539             c = attr.charAt(i);
3540 
3541             if (TidyUtils.isNamechar(c))
3542             {
3543                 continue;
3544             }
3545 
3546             return false;
3547         }
3548 
3549         return true;
3550     }
3551 
3552     /**
3553      * In CSS1, selectors can contain only the characters A-Z, 0-9, and Unicode characters 161-255, plus dash (-); they
3554      * cannot start with a dash or a digit; they can also contain escaped characters and any Unicode character as a
3555      * numeric code (see next item). The backslash followed by at most four hexadecimal digits (0..9A..F) stands for the
3556      * Unicode character with that number. Any character except a hexadecimal digit can be escaped to remove its special
3557      * meaning, by putting a backslash in front.
3558      * @param buf css selector name
3559      * @return <code>true</code> if the given string is a valid css1 selector name
3560      */
3561     public static boolean isCSS1Selector(String buf)
3562     {
3563         if (buf == null)
3564         {
3565             return false;
3566         }
3567 
3568         // #508936 - CSS class naming for -clean option
3569         boolean valid = true;
3570         int esclen = 0;
3571         char c;
3572         int pos;
3573 
3574         for (pos = 0; valid && pos < buf.length(); ++pos)
3575         {
3576             c = buf.charAt(pos);
3577             if (c == '\\')
3578             {
3579                 esclen = 1; // ab\555\444 is 4 chars {'a', 'b', \555, \444}
3580             }
3581             else if (Character.isDigit(c))
3582             {
3583                 // Digit not 1st, unless escaped (Max length "\112F")
3584                 if (esclen > 0)
3585                 {
3586                     valid = (++esclen < 6);
3587                 }
3588                 if (valid)
3589                 {
3590                     valid = (pos > 0 || esclen > 0);
3591                 }
3592             }
3593             else
3594             {
3595                 valid = (esclen > 0 // Escaped? Anything goes.
3596                     || (pos > 0 && c == '-') // Dash cannot be 1st char
3597                     || Character.isLetter(c) // a-z, A-Z anywhere
3598                 || (c >= 161 && c <= 255)); // Unicode 161-255 anywhere
3599                 esclen = 0;
3600             }
3601         }
3602         return valid;
3603     }
3604 
3605     /**
3606      * Parse tag attributes.
3607      * @param isempty is tag empty?
3608      * @return parsed attribute/value list
3609      */
3610     public AttVal parseAttrs(boolean[] isempty)
3611     {
3612         AttVal av, list;
3613         String attribute, value;
3614         int[] delim = new int[1];
3615         Node[] asp = new Node[1];
3616         Node[] php = new Node[1];
3617 
3618         list = null;
3619 
3620         while (!endOfInput())
3621         {
3622             attribute = parseAttribute(isempty, asp, php);
3623 
3624             if (attribute == null)
3625             {
3626                 // check if attributes are created by ASP markup
3627                 if (asp[0] != null)
3628                 {
3629                     av = new AttVal(list, null, asp[0], null, '\0', null, null);
3630                     list = av;
3631                     continue;
3632                 }
3633 
3634                 // check if attributes are created by PHP markup
3635                 if (php[0] != null)
3636                 {
3637                     av = new AttVal(list, null, null, php[0], '\0', null, null);
3638                     list = av;
3639                     continue;
3640                 }
3641 
3642                 break;
3643             }
3644 
3645             value = parseValue(attribute, false, isempty, delim);
3646 
3647             if (attribute != null && isValidAttrName(attribute))
3648             {
3649                 av = new AttVal(list, null, null, null, delim[0], attribute, value);
3650                 av.dict = AttributeTable.getDefaultAttributeTable().findAttribute(av);
3651                 list = av;
3652             }
3653             else
3654             {
3655                 av = new AttVal(null, null, null, null, 0, attribute, value);
3656 
3657                 // #427664 - fix by Gary Peskin 04 Aug 00; other fixes by Dave Raggett
3658                 if (value != null)
3659                 {
3660                     report.attrError(this, this.token, av, Report.BAD_ATTRIBUTE_VALUE);
3661                 }
3662                 else if (TidyUtils.lastChar(attribute) == '"')
3663                 {
3664                     report.attrError(this, this.token, av, Report.MISSING_QUOTEMARK);
3665                 }
3666                 else
3667                 {
3668                     report.attrError(this, this.token, av, Report.UNKNOWN_ATTRIBUTE);
3669                 }
3670             }
3671         }
3672 
3673         return list;
3674     }
3675 
3676     /**
3677      * Push a copy of an inline node onto stack but don't push if implicit or OBJECT or APPLET (implicit tags are ones
3678      * generated from the istack) One issue arises with pushing inlines when the tag is already pushed. For instance:
3679      * <code>&lt;p>&lt;em> text &lt;p>&lt;em> more text</code> Shouldn't be mapped to
3680      * <code>&lt;p>&lt;em> text &lt;/em>&lt;/p>&lt;p>&lt;em>&lt;em> more text &lt;/em>&lt;/em></code>
3681      * @param node Node to be pushed
3682      */
3683     public void pushInline(Node node)
3684     {
3685         IStack is;
3686 
3687         if (node.implicit)
3688         {
3689             return;
3690         }
3691 
3692         if (node.tag == null)
3693         {
3694             return;
3695         }
3696 
3697         if (!TidyUtils.toBoolean(node.tag.model & Dict.CM_INLINE))
3698         {
3699             return;
3700         }
3701 
3702         if (TidyUtils.toBoolean(node.tag.model & Dict.CM_OBJECT))
3703         {
3704             return;
3705         }
3706 
3707         if (node.tag != this.configuration.tt.tagFont && isPushed(node))
3708         {
3709             return;
3710         }
3711 
3712         // make sure there is enough space for the stack
3713         is = new IStack();
3714         is.tag = node.tag;
3715         is.element = node.element;
3716         if (node.attributes != null)
3717         {
3718             is.attributes = cloneAttributes(node.attributes);
3719         }
3720         this.istack.push(is);
3721     }
3722 
3723     /**
3724      * Pop a copy of an inline node from the stack.
3725      * @param node Node to be popped
3726      */
3727     public void popInline(Node node)
3728     {
3729         IStack is;
3730 
3731         if (node != null)
3732         {
3733 
3734             if (node.tag == null)
3735             {
3736                 return;
3737             }
3738 
3739             if (!TidyUtils.toBoolean(node.tag.model & Dict.CM_INLINE))
3740             {
3741                 return;
3742             }
3743 
3744             if (TidyUtils.toBoolean(node.tag.model & Dict.CM_OBJECT))
3745             {
3746                 return;
3747             }
3748 
3749             // if node is </a> then pop until we find an <a>
3750             if (node.tag == this.configuration.tt.tagA)
3751             {
3752 
3753                 while (this.istack.size() > 0)
3754                 {
3755                     is = (IStack) this.istack.pop();
3756                     if (is.tag == this.configuration.tt.tagA)
3757                     {
3758                         break;
3759                     }
3760                 }
3761 
3762                 if (this.insert >= this.istack.size())
3763                 {
3764                     this.insert = -1;
3765                 }
3766                 return;
3767             }
3768         }
3769 
3770         if (this.istack.size() > 0)
3771         {
3772             is = (IStack) this.istack.pop();
3773             if (this.insert >= this.istack.size())
3774             {
3775                 this.insert = -1;
3776             }
3777         }
3778     }
3779 
3780     /**
3781      * Is the node in the stack?
3782      * @param node Node
3783      * @return <code>true</code> is the node is found in the stack
3784      */
3785     public boolean isPushed(Node node)
3786     {
3787         int i;
3788         IStack is;
3789 
3790         for (i = this.istack.size() - 1; i >= 0; --i)
3791         {
3792             is = (IStack) this.istack.elementAt(i);
3793             if (is.tag == node.tag)
3794             {
3795                 return true;
3796             }
3797         }
3798 
3799         return false;
3800     }
3801 
3802     /**
3803      * This has the effect of inserting "missing" inline elements around the contents of blocklevel elements such as P,
3804      * TD, TH, DIV, PRE etc. This procedure is called at the start of ParseBlock. When the inline stack is not empty, as
3805      * will be the case in: <code>&lt;i>&lt;h1>italic heading&lt;/h1>&lt;/i></code> which is then treated as
3806      * equivalent to <code>&lt;h1>&lt;i>italic heading&lt;/i>&lt;/h1></code> This is implemented by setting the lexer
3807      * into a mode where it gets tokens from the inline stack rather than from the input stream.
3808      * @param node original node
3809      * @return stack size
3810      */
3811     public int inlineDup(Node node)
3812     {
3813         int n;
3814 
3815         n = this.istack.size() - this.istackbase;
3816         if (n > 0)
3817         {
3818             this.insert = this.istackbase;
3819             this.inode = node;
3820         }
3821 
3822         return n;
3823     }
3824 
3825     /**
3826      * @return
3827      */
3828     public Node insertedToken()
3829     {
3830         Node node;
3831         IStack is;
3832         int n;
3833 
3834         // this will only be null if inode != null
3835         if (this.insert == -1)
3836         {
3837             node = this.inode;
3838             this.inode = null;
3839             return node;
3840         }
3841 
3842         // is this is the "latest" node then update the position, otherwise use current values
3843         if (this.inode == null)
3844         {
3845             this.lines = this.in.getCurline();
3846             this.columns = this.in.getCurcol();
3847         }
3848 
3849         node = newNode(Node.START_TAG, this.lexbuf, this.txtstart, this.txtend);
3850 
3851         // GLP: Bugfix 126261. Remove when this change is fixed in istack.c in the original Tidy
3852         node.implicit = true;
3853         is = (IStack) this.istack.elementAt(this.insert);
3854         node.element = is.element;
3855         node.tag = is.tag;
3856         if (is.attributes != null)
3857         {
3858             node.attributes = cloneAttributes(is.attributes);
3859         }
3860 
3861         // advance lexer to next item on the stack
3862         n = this.insert;
3863 
3864         // and recover state if we have reached the end
3865         if (++n < this.istack.size())
3866         {
3867             this.insert = n;
3868         }
3869         else
3870         {
3871             this.insert = -1;
3872         }
3873 
3874         return node;
3875     }
3876 
3877     /**
3878      * Can the given element be removed?
3879      * @param element node
3880      * @return <code>true</code> if he element can be removed
3881      */
3882     public boolean canPrune(Node element)
3883     {
3884         if (element.type == Node.TEXT_NODE)
3885         {
3886             return true;
3887         }
3888 
3889         if (element.content != null)
3890         {
3891             return false;
3892         }
3893 
3894         if (element.tag == this.configuration.tt.tagA && element.attributes != null)
3895         {
3896             return false;
3897         }
3898 
3899         if (element.tag == this.configuration.tt.tagP && !this.configuration.dropEmptyParas)
3900         {
3901             return false;
3902         }
3903 
3904         if (element.tag == null)
3905         {
3906             return false;
3907         }
3908 
3909         if (TidyUtils.toBoolean(element.tag.model & Dict.CM_ROW))
3910         {
3911             return false;
3912         }
3913 
3914         if (TidyUtils.toBoolean(element.tag.model & Dict.CM_EMPTY))
3915         {
3916             return false;
3917         }
3918 
3919         if (element.tag == this.configuration.tt.tagApplet)
3920         {
3921             return false;
3922         }
3923 
3924         if (element.tag == this.configuration.tt.tagObject)
3925         {
3926             return false;
3927         }
3928 
3929         if (element.tag == this.configuration.tt.tagScript && element.getAttrByName("src") != null)
3930         {
3931             return false;
3932         }
3933 
3934         // #540555 Empty title tag is trimmed
3935         if (element.tag == this.configuration.tt.tagTitle)
3936         {
3937             return false;
3938         }
3939 
3940         // #433359 - fix by Randy Waki 12 Mar 01 - Empty iframe is trimmed
3941         if (element.tag == this.configuration.tt.tagIframe)
3942         {
3943             return false;
3944         }
3945 
3946         if (element.getAttrByName("id") != null || element.getAttrByName("name") != null)
3947         {
3948             return false;
3949         }
3950 
3951         return true;
3952     }
3953 
3954     /**
3955      * duplicate name attribute as an id and check if id and name match.
3956      * @param node Node to check for name/it attributes
3957      */
3958     public void fixId(Node node)
3959     {
3960         AttVal name = node.getAttrByName("name");
3961         AttVal id = node.getAttrByName("id");
3962 
3963         if (name != null)
3964         {
3965             if (id != null)
3966             {
3967                 if (id.value != null && !id.value.equals(name.value))
3968                 {
3969                     report.attrError(this, node, name, Report.ID_NAME_MISMATCH);
3970                 }
3971             }
3972             else if (this.configuration.xmlOut)
3973             {
3974                 node.addAttribute("id", name.value);
3975             }
3976         }
3977     }
3978 
3979     /**
3980      * Defer duplicates when entering a table or other element where the inlines shouldn't be duplicated.
3981      */
3982     public void deferDup()
3983     {
3984         this.insert = -1;
3985         this.inode = null;
3986     }
3987 
3988     /**
3989      * Constraint the html version in the document to the given one. Everything is allowed in proprietary version of
3990      * HTML this is handled here rather than in the tag/attr dicts.
3991      * @param vers html version code
3992      */
3993     void constrainVersion(int vers)
3994     {
3995         this.versions &= (vers | Dict.VERS_PROPRIETARY);
3996     }
3997 
3998     /**
3999      * Is content acceptable for pre elements?
4000      * @param node content
4001      * @return <code>true</code> if node is acceptable in pre elements
4002      */
4003     protected boolean preContent(Node node)
4004     {
4005         // p is coerced to br's
4006         if (node.tag == this.configuration.tt.tagP)
4007         {
4008             return true;
4009         }
4010 
4011         if (node.tag == null
4012             || node.tag == this.configuration.tt.tagP
4013             || !TidyUtils.toBoolean(node.tag.model & (Dict.CM_INLINE | Dict.CM_NEW)))
4014         {
4015             return false;
4016         }
4017         return true;
4018     }
4019 
4020     /**
4021      * document type.
4022      */
4023     private static class W3CVersionInfo
4024     {
4025 
4026         /**
4027          * name.
4028          */
4029         String name;
4030 
4031         /**
4032          * voyager name.
4033          */
4034         String voyagerName;
4035 
4036         /**
4037          * profile.
4038          */
4039         String profile;
4040 
4041         /**
4042          * code.
4043          */
4044         short code;
4045 
4046         /**
4047          * Instantiates a new W3CVersionInfo.
4048          * @param name version name
4049          * @param voyagerName voyager (xhtml) name
4050          * @param profile VOYAGER_STRICT | VOYAGER_LOOSE | VOYAGER_FRAMESET
4051          * @param code unique code for this version info
4052          */
4053         public W3CVersionInfo(String name, String voyagerName, String profile, short code)
4054         {
4055             this.name = name;
4056             this.voyagerName = voyagerName;
4057             this.profile = profile;
4058             this.code = code;
4059         }
4060     }
4061 
4062 }