Maven Clover report

Clover coverage report - Maven Clover report

Coverage timestamp: Tue Aug 1 2006 15:09:51 CEST

FRAMES NO FRAMES

file stats:	LOC:	4,062		Methods:	51
	NCLOC:	2,671		Classes:	2

Source file

Conditionals

Statements

Methods

TOTAL

Lexer.java

73.4%

80%

92.2%

78%

1		/*
2		* Java HTML Tidy - JTidy
3		* HTML parser and pretty printer
4		*
5		* Copyright (c) 1998-2000 World Wide Web Consortium (Massachusetts
6		* Institute of Technology, Institut National de Recherche en
7		* Informatique et en Automatique, Keio University). All Rights
8		* Reserved.
9		*
10		* Contributing Author(s):
11		*
12		* Dave Raggett <dsr@w3.org>
13		* Andy Quick <ac.quick@sympatico.ca> (translation to Java)
14		* Gary L Peskin <garyp@firstech.com> (Java development)
15		* Sami Lempinen <sami@lempinen.net> (release management)
16		* Fabrizio Giustina <fgiust at users.sourceforge.net>
17		*
18		* The contributing author(s) would like to thank all those who
19		* helped with testing, bug fixes, and patience. This wouldn't
20		* have been possible without all of you.
21		*
22		* COPYRIGHT NOTICE:
23		*
24		* This software and documentation is provided "as is," and
25		* the copyright holders and contributing author(s) make no
26		* representations or warranties, express or implied, including
27		* but not limited to, warranties of merchantability or fitness
28		* for any particular purpose or that the use of the software or
29		* documentation will not infringe any third party patents,
30		* copyrights, trademarks or other rights.
31		*
32		* The copyright holders and contributing author(s) will not be
33		* liable for any direct, indirect, special or consequential damages
34		* arising out of any use of the software or documentation, even if
35		* advised of the possibility of such damage.
36		*
37		* Permission is hereby granted to use, copy, modify, and distribute
38		* this source code, or portions hereof, documentation and executables,
39		* for any purpose, without fee, subject to the following restrictions:
40		*
41		* 1. The origin of this source code must not be misrepresented.
42		* 2. Altered versions must be plainly marked as such and must
43		* not be misrepresented as being the original source.
44		* 3. This Copyright notice may not be removed or altered from any
45		* source or altered source distribution.
46		*
47		* The copyright holders and contributing author(s) specifically
48		* permit, without fee, and encourage the use of this source code
49		* as a component for supporting the Hypertext Markup Language in
50		* commercial products. If you use this source code in a product,
51		* acknowledgment is not required but would be appreciated.
52		*
53		*/
54		package org.w3c.tidy;
55
56		import java.io.PrintWriter;
57		import java.util.List;
58		import java.util.Stack;
59		import java.util.Vector;
60
61
62		/**
63		* Lexer for html parser.
64		* <p>
65		* Given a file stream fp it returns a sequence of tokens. GetToken(fp) gets the next token UngetToken(fp) provides one
66		* level undo The tags include an attribute list: - linked list of attribute/value nodes - each node has 2
67		* null-terminated strings. - entities are replaced in attribute values white space is compacted if not in preformatted
68		* mode If not in preformatted mode then leading white space is discarded and subsequent white space sequences compacted
69		* to single space chars. If XmlTags is no then Tag names are folded to upper case and attribute names to lower case.
70		* Not yet done: - Doctype subset and marked sections
71		* </p>
72		* @author Dave Raggett <a href="mailto:dsr@w3.org">dsr@w3.org </a>
73		* @author Andy Quick <a href="mailto:ac.quick@sympatico.ca">ac.quick@sympatico.ca </a> (translation to Java)
74		* @author Fabrizio Giustina
75		* @version $Revision: 807 $ ($Author: fgiust $)
76		*/
77		public class Lexer
78		{
79
80		/**
81		* state: ignore whitespace.
82		*/
83		public static final short IGNORE_WHITESPACE = 0;
84
85		/**
86		* state: mixed content.
87		*/
88		public static final short MIXED_CONTENT = 1;
89
90		/**
91		* state: preformatted.
92		*/
93		public static final short PREFORMATTED = 2;
94
95		/**
96		* state: ignore markup.
97		*/
98		public static final short IGNORE_MARKUP = 3;
99
100		/**
101		* URI for XHTML 1.0 transitional DTD.
102		*/
103		private static final String VOYAGER_LOOSE = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd";
104
105		/**
106		* URI for XHTML 1.0 strict DTD.
107		*/
108		private static final String VOYAGER_STRICT = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd";
109
110		/**
111		* URI for XHTML 1.0 frameset DTD.
112		*/
113		private static final String VOYAGER_FRAMESET = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd";
114
115		/**
116		* URI for XHTML 1.1.
117		*/
118		private static final String VOYAGER_11 = "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd";
119
120		/**
121		* URI for XHTML Basic 1.0.
122		*/
123		// private static final String VOYAGER_BASIC = "http://www.w3.org/TR/xhtml-basic/xhtml-basic10.dtd";
124		/**
125		* xhtml namespace.
126		*/
127		private static final String XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml";
128
129		/**
130		* lists all the known versions.
131		*/
132		private static final Lexer.W3CVersionInfo[] W3CVERSION = {
133		new W3CVersionInfo("HTML 4.01", "XHTML 1.0 Strict", VOYAGER_STRICT, Dict.VERS_HTML40_STRICT),
134		new W3CVersionInfo("HTML 4.01 Transitional", "XHTML 1.0 Transitional", VOYAGER_LOOSE, Dict.VERS_HTML40_LOOSE),
135		new W3CVersionInfo("HTML 4.01 Frameset", "XHTML 1.0 Frameset", VOYAGER_FRAMESET, Dict.VERS_FRAMESET),
136		new W3CVersionInfo("HTML 4.0", "XHTML 1.0 Strict", VOYAGER_STRICT, Dict.VERS_HTML40_STRICT),
137		new W3CVersionInfo("HTML 4.0 Transitional", "XHTML 1.0 Transitional", VOYAGER_LOOSE, Dict.VERS_HTML40_LOOSE),
138		new W3CVersionInfo("HTML 4.0 Frameset", "XHTML 1.0 Frameset", VOYAGER_FRAMESET, Dict.VERS_FRAMESET),
139		new W3CVersionInfo("HTML 3.2", "XHTML 1.0 Transitional", VOYAGER_LOOSE, Dict.VERS_HTML32),
140		new W3CVersionInfo("HTML 3.2 Final", "XHTML 1.0 Transitional", VOYAGER_LOOSE, Dict.VERS_HTML32),
141		new W3CVersionInfo("HTML 3.2 Draft", "XHTML 1.0 Transitional", VOYAGER_LOOSE, Dict.VERS_HTML32),
142		new W3CVersionInfo("HTML 2.0", "XHTML 1.0 Strict", VOYAGER_STRICT, Dict.VERS_HTML20),
143		new W3CVersionInfo("HTML 4.01", "XHTML 1.1", VOYAGER_STRICT, Dict.VERS_XHTML11)};
144
145		/**
146		* getToken state: content.
147		*/
148		private static final short LEX_CONTENT = 0;
149
150		/**
151		* getToken state: gt.
152		*/
153		private static final short LEX_GT = 1;
154
155		/**
156		* getToken state: endtag.
157		*/
158		private static final short LEX_ENDTAG = 2;
159
160		/**
161		* getToken state: start tag.
162		*/
163		private static final short LEX_STARTTAG = 3;
164
165		/**
166		* getToken state: comment.
167		*/
168		private static final short LEX_COMMENT = 4;
169
170		/**
171		* getToken state: doctype.
172		*/
173		private static final short LEX_DOCTYPE = 5;
174
175		/**
176		* getToken state: procinstr.
177		*/
178		private static final short LEX_PROCINSTR = 6;
179
180		/**
181		* getToken state: cdata.
182		*/
183		private static final short LEX_CDATA = 8;
184
185		/**
186		* getToken state: section.
187		*/
188		private static final short LEX_SECTION = 9;
189
190		/**
191		* getToken state: asp.
192		*/
193		private static final short LEX_ASP = 10;
194
195		/**
196		* getToken state: jste.
197		*/
198		private static final short LEX_JSTE = 11;
199
200		/**
201		* getToken state: php.
202		*/
203		private static final short LEX_PHP = 12;
204
205		/**
206		* getToken state: xml declaration.
207		*/
208		private static final short LEX_XMLDECL = 13;
209
210		/**
211		* file stream.
212		*/
213		protected StreamIn in;
214
215		/**
216		* error output stream.
217		*/
218		protected PrintWriter errout;
219
220		/**
221		* for accessibility errors.
222		*/
223		protected short badAccess;
224
225		/**
226		* for bad style errors.
227		*/
228		protected short badLayout;
229
230		/**
231		* for bad char encodings.
232		*/
233		protected short badChars;
234
235		/**
236		* for mismatched/mispositioned form tags.
237		*/
238		protected short badForm;
239
240		/**
241		* count of warnings in this document.
242		*/
243		protected short warnings;
244
245		/**
246		* count of errors.
247		*/
248		protected short errors;
249
250		/**
251		* lines seen.
252		*/
253		protected int lines;
254
255		/**
256		* at start of current token.
257		*/
258		protected int columns;
259
260		/**
261		* used to collapse contiguous white space.
262		*/
263		protected boolean waswhite;
264
265		/**
266		* true after token has been pushed back.
267		*/
268		protected boolean pushed;
269
270		/**
271		* when space is moved after end tag.
272		*/
273		protected boolean insertspace;
274
275		/**
276		* Netscape compatibility.
277		*/
278		protected boolean excludeBlocks;
279
280		/**
281		* true if moved out of table.
282		*/
283		protected boolean exiled;
284
285		/**
286		* true if xmlns attribute on html element.
287		*/
288		protected boolean isvoyager;
289
290		/**
291		* bit vector of HTML versions.
292		*/
293		protected short versions;
294
295		/**
296		* version as given by doctype (if any).
297		*/
298		protected int doctype;
299
300		/**
301		* set if html or PUBLIC is missing.
302		*/
303		protected boolean badDoctype;
304
305		/**
306		* start of current node.
307		*/
308		protected int txtstart;
309
310		/**
311		* end of current node.
312		*/
313		protected int txtend;
314
315		/**
316		* state of lexer's finite state machine.
317		*/
318		protected short state;
319
320		/**
321		* current node.
322		*/
323		protected Node token;
324
325		/**
326		* Lexer character buffer parse tree nodes span onto this buffer which contains the concatenated text contents of
327		* all of the elements. Lexsize must be reset for each file. Byte buffer of UTF-8 chars.
328		*/
329		protected byte[] lexbuf;
330
331		/**
332		* allocated.
333		*/
334		protected int lexlength;
335
336		/**
337		* used.
338		*/
339		protected int lexsize;
340
341		/**
342		* Inline stack for compatibility with Mosaic. For deferring text node.
343		*/
344		protected Node inode;
345
346		/**
347		* for inferring inline tags.
348		*/
349		protected int insert;
350
351		/**
352		* stack.
353		*/
354		protected Stack istack;
355
356		/**
357		* start of frame.
358		*/
359		protected int istackbase;
360
361		/**
362		* used for cleaning up presentation markup.
363		*/
364		protected Style styles;
365
366		/**
367		* configuration.
368		*/
369		protected Configuration configuration;
370
371		/**
372		* already seen end body tag?
373		*/
374		protected boolean seenEndBody;
375
376		/**
377		* already seen end html tag?
378		*/
379		protected boolean seenEndHtml;
380
381		/**
382		* report.
383		*/
384		protected Report report;
385
386		/**
387		* Root node is saved here.
388		*/
389		protected Node root;
390
391		/**
392		* node list.
393		*/
394		private List nodeList;
395
/**
 * Builds a lexer that reads its input from the given stream.
 * <p>
 * Position tracking starts at line 1, column 1, the token state machine starts in the content state, the candidate
 * version set is <code>Dict.VERS_ALL | Dict.VERS_PROPRIETARY</code> and the doctype version is
 * <code>Dict.VERS_UNKNOWN</code>. The inline stack and the node list start out empty.
 * </p>
 * @param in stream the lexer reads characters from
 * @param configuration configuration instance
 * @param report report instance, for reporting errors
 */
public Lexer(StreamIn in, Configuration configuration, Report report)
{
    // collaborators
    this.in = in;
    this.configuration = configuration;
    this.report = report;

    // position and state machine
    this.lines = 1;
    this.columns = 1;
    this.state = LEX_CONTENT;

    // version tracking
    this.versions = (Dict.VERS_ALL | Dict.VERS_PROPRIETARY);
    this.doctype = Dict.VERS_UNKNOWN;

    // inline stack and node bookkeeping
    this.insert = -1;
    this.istack = new Stack();
    this.nodeList = new Vector();
}
416
417		/**
418		* Creates a new node and add it to nodelist.
419		* @return Node
420		*/
421	339	public Node newNode()
422		{
423	339	Node node = new Node();
424	339	this.nodeList.add(node);
425	339	return node;
426		}
427
428		/**
429		* Creates a new node and add it to nodelist.
430		* @param type node type: Node.ROOT_NODE \| Node.DOCTYPE_TAG \| Node.COMMENT_TAG \| Node.PROC_INS_TAG \| Node.TEXT_NODE \|
431		* Node.START_TAG \| Node.END_TAG \| Node.START_END_TAG \| Node.CDATA_TAG \| Node.SECTION_TAG \| Node. ASP_TAG \|
432		* Node.JSTE_TAG \| Node.PHP_TAG \| Node.XML_DECL
433		* @param textarray array of bytes contained in the Node
434		* @param start start position
435		* @param end end position
436		* @return Node
437		*/
438	7627	public Node newNode(short type, byte[] textarray, int start, int end)
439		{
440	7627	Node node = new Node(type, textarray, start, end);
441	7627	this.nodeList.add(node);
442	7627	return node;
443		}
444
445		/**
446		* Creates a new node and add it to nodelist.
447		* @param type node type: Node.ROOT_NODE \| Node.DOCTYPE_TAG \| Node.COMMENT_TAG \| Node.PROC_INS_TAG \| Node.TEXT_NODE \|
448		* Node.START_TAG \| Node.END_TAG \| Node.START_END_TAG \| Node.CDATA_TAG \| Node.SECTION_TAG \| Node. ASP_TAG \|
449		* Node.JSTE_TAG \| Node.PHP_TAG \| Node.XML_DECL
450		* @param textarray array of bytes contained in the Node
451		* @param start start position
452		* @param end end position
453		* @param element tag name
454		* @return Node
455		*/
456	13251	public Node newNode(short type, byte[] textarray, int start, int end, String element)
457		{
458	13251	Node node = new Node(type, textarray, start, end, element, this.configuration.tt);
459	13251	this.nodeList.add(node);
460	13251	return node;
461		}
462
463		/**
464		* Clones a node and add it to node list.
465		* @param node Node
466		* @return cloned Node
467		*/
468	4	public Node cloneNode(Node node)
469		{
470	4	Node cnode = (Node) node.clone();
471	4	this.nodeList.add(cnode);
472	4	for (AttVal att = cnode.attributes; att != null; att = att.next)
473		{
474	0	if (att.asp != null)
475		{
476	0	this.nodeList.add(att.asp);
477		}
478	0	if (att.php != null)
479		{
480	0	this.nodeList.add(att.php);
481		}
482		}
483	4	return cnode;
484		}
485
486		/**
487		* Clones an attribute value and add eventual asp or php node to node list.
488		* @param attrs original AttVal
489		* @return cloned AttVal
490		*/
491	1944	public AttVal cloneAttributes(AttVal attrs)
492		{
493	1944	AttVal cattrs = (AttVal) attrs.clone();
494	1944	for (AttVal att = cattrs; att != null; att = att.next)
495		{
496	2526	if (att.asp != null)
497		{
498	0	this.nodeList.add(att.asp);
499		}
500	2526	if (att.php != null)
501		{
502	0	this.nodeList.add(att.php);
503		}
504		}
505	1944	return cattrs;
506		}
507
508		/**
509		* Update <code>oldtextarray</code> in the current nodes.
510		* @param oldtextarray previous text array
511		* @param newtextarray new text array
512		*/
513	10	protected void updateNodeTextArrays(byte[] oldtextarray, byte[] newtextarray)
514		{
515	10	Node node;
516	10	for (int i = 0; i < this.nodeList.size(); i++)
517		{
518	17511	node = (Node) (this.nodeList.get(i));
519	17511	if (node.textarray == oldtextarray)
520		{
521	17499	node.textarray = newtextarray;
522		}
523		}
524		}
525
526		/**
527		* Adds a new line node. Used for creating preformatted text from Word2000.
528		* @return new line node
529		*/
530	0	public Node newLineNode()
531		{
532	0	Node node = newNode();
533
534	0	node.textarray = this.lexbuf;
535	0	node.start = this.lexsize;
536	0	addCharToLexer('\n');
537	0	node.end = this.lexsize;
538	0	return node;
539		}
540
541		/**
542		* Has end of input stream been reached?
543		* @return <code>true</code> if end of input stream been reached
544		*/
545	7927	public boolean endOfInput()
546		{
547	7927	return this.in.isEndOfStream();
548		}
549
550		/**
551		* Adds a byte to lexer buffer.
552		* @param c byte to add
553		*/
554	344463	public void addByte(int c)
555		{
556	344463	if (this.lexsize + 1 >= this.lexlength)
557		{
558	248	while (this.lexsize + 1 >= this.lexlength)
559		{
560	248	if (this.lexlength == 0)
561		{
562	238	this.lexlength = 8192;
563		}
564		else
565		{
566	10	this.lexlength = this.lexlength * 2;
567		}
568		}
569
570	248	byte[] temp = this.lexbuf;
571	248	this.lexbuf = new byte[this.lexlength];
572	248	if (temp != null)
573		{
574	10	System.arraycopy(temp, 0, this.lexbuf, 0, temp.length);
575	10	updateNodeTextArrays(temp, this.lexbuf);
576		}
577		}
578
579	344463	this.lexbuf[this.lexsize++] = (byte) c;
580	344463	this.lexbuf[this.lexsize] = (byte) '\0'; // debug
581		}
582
583		/**
584		* Substitute the last char in buffer.
585		* @param c new char
586		*/
587	935	public void changeChar(byte c)
588		{
589	935	if (this.lexsize > 0)
590		{
591	935	this.lexbuf[this.lexsize - 1] = c;
592		}
593		}
594
595		/**
596		* Store char c as UTF-8 encoded byte stream.
597		* @param c char to store
598		*/
599	331875	public void addCharToLexer(int c)
600		{
601		// Allow only valid XML characters. See: http://www.w3.org/TR/2004/REC-xml-20040204/#NT-Char
602		// Fix by Pablo Mayrgundter 17-08-2004
603
604	331875	if ((this.configuration.xmlOut \|\| this.configuration.xHTML) // only for xml output
605		&& !((c >= 0x20 && c <= 0xD7FF) // Check the common-case first.
606		\|\| c == 0x9
607		\|\| c == 0xA
608		\|\| c == 0xD // Then white-space.
609		\|\| (c >= 0xE000 && c <= 0xFFFD) // Then high-range unicode.
610		\|\| (c >= 0x10000 && c <= 0x10FFFF)))
611		{
612	1	return;
613		}
614
615	331874	int i = 0;
616	331874	int[] count = new int[]{0};
617	331874	byte[] buf = new byte[10]; // unsigned char
618
619	331874	boolean err = EncodingUtils.encodeCharToUTF8Bytes(c, buf, null, count);
620	331874	if (err)
621		{
622		// replacement char 0xFFFD encoded as UTF-8
623	0	buf[0] = (byte) 0xEF;
624	0	buf[1] = (byte) 0xBF;
625	0	buf[2] = (byte) 0xBD;
626	0	count[0] = 3;
627		}
628
629	331874	for (i = 0; i < count[0]; i++)
630		{
631	344345	addByte(buf[i]); // uint
632		}
633
634		}
635
636		/**
637		* Adds a string to lexer buffer.
638		* @param str String to add
639		*/
640	0	public void addStringToLexer(String str)
641		{
642	0	for (int i = 0; i < str.length(); i++)
643		{
644	0	addCharToLexer(str.charAt(i));
645		}
646		}
647
648		/**
649		* Parse an html entity.
650		* @param mode mode
651		*/
652	1777	public void parseEntity(short mode)
653		{
654		// No longer attempts to insert missing ';' for unknown
655		// entities unless one was present already, since this
656		// gives unexpected results.
657		//
658		// For example: <a href="something.htm?foo&bar&fred">
659		// was tidied to: <a href="something.htm?foo&bar;&fred;">
660		// rather than: <a href="something.htm?foo&bar&fred">
661		//
662		// My thanks for Maurice Buxton for spotting this.
663		//
664		// Also Randy Waki pointed out the following case for the
665		// 04 Aug 00 version (bug #433012):
666		//
667		// For example: <a href="something.htm?id=1&lang=en">
668		// was tidied to: <a href="something.htm?id=1&lang;=en">
669		// rather than: <a href="something.htm?id=1&lang=en">
670		//
671		// where "lang" is a known entity (#9001), but browsers would
672		// misinterpret "&lang;" because it had a value > 256.
673		//
674		// So the case of an apparently known entity with a value > 256 and
675		// missing a semicolon is handled specially.
676		//
677		// "ParseEntity" is also a bit of a misnomer - it handles entities and
678		// numeric character references. Invalid NCR's are now reported.
679
680	1777	int start;
681	1777	boolean first = true;
682	1777	boolean semicolon = false;
683	1777	int c, ch, startcol;
684	1777	String str;
685
686	1777	start = this.lexsize - 1; // to start at "&"
687	1777	startcol = this.in.getCurcol() - 1;
688
689	?	while ((c = this.in.readChar()) != StreamIn.END_OF_STREAM)
690		{
691	8667	if (c == ';')
692		{
693	1217	semicolon = true;
694	1217	break;
695		}
696
697	7450	if (first && c == '#')
698		{
699		// #431953 - start RJ
700	313	if (!this.configuration.ncr
701		\|\| "BIG5".equals(this.configuration.getInCharEncodingName())
702		\|\| "SHIFTJIS".equals(this.configuration.getInCharEncodingName()))
703		{
704	0	this.in.ungetChar(c);
705	0	return;
706		}
707		// #431953 - end RJ
708
709	313	addCharToLexer(c);
710	313	first = false;
711	313	continue;
712		}
713
714	7137	first = false;
715
716	7137	if (TidyUtils.isNamechar((char) c))
717		{
718	6577	addCharToLexer(c);
719	6577	continue;
720		}
721
722		// otherwise put it back
723	560	this.in.ungetChar(c);
724	560	break;
725		}
726
727	1777	str = TidyUtils.getString(this.lexbuf, start, this.lexsize - start);
728
729	1777	if ("&apos".equals(str) && !configuration.xmlOut && !this.isvoyager && !configuration.xHTML)
730		{
731	2	report.entityError(this, Report.APOS_UNDEFINED, str, 39);
732		}
733
734	1777	ch = EntityTable.getDefaultEntityTable().entityCode(str);
735
736		// drops invalid numeric entities from XML mode. Fix by Pablo Mayrgundter 17-08-2004
737		// if ((this.configuration.xmlOut \|\| this.configuration.xHTML) // only for xml output
738		// && !((ch >= 0x20 && ch <= 0xD7FF) // Check the common-case first.
739		// \|\| ch == 0x9 \|\| ch == 0xA \|\| ch == 0xD // Then white-space.
740		// \|\| (ch >= 0xE000 && ch <= 0xFFFD)))
741		// {
742		// this.lexsize = start;
743		// return;
744		// }
745
746		// deal with unrecognized or invalid entities
747		// #433012 - fix by Randy Waki 17 Feb 01
748		// report invalid NCR's - Terry Teague 01 Sep 01
749	1777	if (ch <= 0 \|\| (ch >= 256 && c != ';'))
750		{
751		// set error position just before offending character
752	363	this.lines = this.in.getCurline();
753	363	this.columns = startcol;
754
755	363	if (this.lexsize > start + 1)
756		{
757	341	if (ch >= 128 && ch <= 159)
758		{
759		// invalid numeric character reference
760	0	int c1 = 0;
761
762	0	if ("WIN1252".equals(configuration.replacementCharEncoding))
763		{
764	0	c1 = EncodingUtils.decodeWin1252(ch);
765		}
766	0	else if ("MACROMAN".equals(configuration.replacementCharEncoding))
767		{
768	0	c1 = EncodingUtils.decodeMacRoman(ch);
769		}
770
771		// "or" DISCARDED_CHAR with the other errors if discarding char; otherwise default is replacing
772
773	0	int replaceMode = c1 != 0 ? Report.REPLACED_CHAR : Report.DISCARDED_CHAR;
774
775	0	if (c != ';') /* issue warning if not terminated by ';' */
776		{
777	0	report.entityError(this, Report.MISSING_SEMICOLON_NCR, str, c);
778		}
779
780	0	report.encodingError(this, (short) (Report.INVALID_NCR \| replaceMode), ch);
781
782	0	if (c1 != 0)
783		{
784		// make the replacement
785	0	this.lexsize = start;
786	0	addCharToLexer(c1);
787	0	semicolon = false;
788		}
789		else
790		{
791		/* discard */
792	0	this.lexsize = start;
793	0	semicolon = false;
794		}
795
796		}
797		else
798		{
799	341	report.entityError(this, Report.UNKNOWN_ENTITY, str, ch);
800		}
801
802	341	if (semicolon)
803		{
804	4	addCharToLexer(';');
805		}
806		}
807		else
808		{
809		// naked &
810	22	report.entityError(this, Report.UNESCAPED_AMPERSAND, str, ch);
811		}
812		}
813		else
814		{
815		// issue warning if not terminated by ';'
816	1414	if (c != ';')
817		{
818		// set error position just before offending character
819	201	this.lines = this.in.getCurline();
820	201	this.columns = startcol;
821	201	report.entityError(this, Report.MISSING_SEMICOLON, str, c);
822		}
823
824	1414	this.lexsize = start;
825
826	1414	if (ch == 160 && TidyUtils.toBoolean(mode & PREFORMATTED))
827		{
828	0	ch = ' ';
829		}
830
831	1414	addCharToLexer(ch);
832
833	1414	if (ch == '&' && !this.configuration.quoteAmpersand)
834		{
835	0	addCharToLexer('a');
836	0	addCharToLexer('m');
837	0	addCharToLexer('p');
838	0	addCharToLexer(';');
839		}
840		}
841		}
842
843		/**
844		* Parses a tag name.
845		* @return first char after the tag name
846		*/
847	13071	public char parseTagName()
848		{
849	13071	int c;
850
851		// fold case of first char in buffer
852	13071	c = this.lexbuf[this.txtstart];
853
854	13071	if (!this.configuration.xmlTags && TidyUtils.isUpper((char) c))
855		{
856	1476	c = TidyUtils.toLower((char) c);
857	1476	this.lexbuf[this.txtstart] = (byte) c;
858		}
859
860	?	while ((c = this.in.readChar()) != StreamIn.END_OF_STREAM)
861		{
862	32365	if (!TidyUtils.isNamechar((char) c))
863		{
864	13071	break;
865		}
866
867		// fold case of subsequent chars
868	19294	if (!this.configuration.xmlTags && TidyUtils.isUpper((char) c))
869		{
870	2314	c = TidyUtils.toLower((char) c);
871		}
872
873	19294	addCharToLexer(c);
874		}
875
876	13071	this.txtend = this.lexsize;
877	13071	return (char) c;
878		}
879
880		/**
881		* calls addCharToLexer for any char in the string.
882		* @param str input String
883		*/
884	824	public void addStringLiteral(String str)
885		{
886	824	int len = str.length();
887	824	for (int i = 0; i < len; i++)
888		{
889	11105	addCharToLexer(str.charAt(i));
890		}
891		}
892
893		/**
894		* calls addCharToLexer for any char in the string till len is reached.
895		* @param str input String
896		* @param len length of the substring to be added
897		*/
898	1	void addStringLiteralLen(String str, int len)
899		{
900	1	int strlen = str.length();
901	1	if (strlen < len)
902		{
903	0	len = strlen;
904		}
905	1	for (int i = 0; i < len; i++)
906		{
907	60	addCharToLexer(str.charAt(i));
908		}
909		}
910
911		/**
912		* Choose what version to use for new doctype.
913		* @return html version constant
914		*/
915	266	public short htmlVersion()
916		{
917	266	if (TidyUtils.toBoolean(versions & Dict.VERS_HTML20))
918		{
919	51	return Dict.VERS_HTML20;
920		}
921
922	215	if (!(this.configuration.xmlOut \| this.configuration.xmlTags \| this.isvoyager)
923		&& TidyUtils.toBoolean(versions & Dict.VERS_HTML32))
924		{
925	60	return Dict.VERS_HTML32;
926		}
927	155	if (TidyUtils.toBoolean(versions & Dict.VERS_XHTML11))
928		{
929	15	return Dict.VERS_XHTML11;
930		}
931	140	if (TidyUtils.toBoolean(versions & Dict.VERS_HTML40_STRICT))
932		{
933	48	return Dict.VERS_HTML40_STRICT;
934		}
935
936	92	if (TidyUtils.toBoolean(versions & Dict.VERS_HTML40_LOOSE))
937		{
938	45	return Dict.VERS_HTML40_LOOSE;
939		}
940
941	47	if (TidyUtils.toBoolean(versions & Dict.VERS_FRAMESET))
942		{
943	14	return Dict.VERS_FRAMESET;
944		}
945
946	33	return Dict.VERS_UNKNOWN;
947		}
948
949		/**
950		* Choose what version to use for new doctype.
951		* @return html version name
952		*/
953	214	public String htmlVersionName()
954		{
955	214	short guessed;
956	214	int j;
957
958	214	guessed = apparentVersion();
959
960	214	for (j = 0; j < W3CVERSION.length; ++j)
961		{
962	1213	if (guessed == W3CVERSION[j].code)
963		{
964	197	if (this.isvoyager)
965		{
966	62	return W3CVERSION[j].voyagerName;
967		}
968
969	135	return W3CVERSION[j].name;
970		}
971		}
972
973	17	return null;
974		}
975
976		/**
977		* Add meta element for Tidy. If the meta tag is already present, update release date.
978		* @param root root node
979		* @return <code>true</code> if the tag has been added
980		*/
981	15	public boolean addGenerator(Node root)
982		{
983	15	AttVal attval;
984	15	Node node;
985	15	Node head = root.findHEAD(this.configuration.tt);
986
987	15	if (head != null)
988		{
989	15	String meta = "HTML Tidy for Java (vers. " + Report.RELEASE_DATE_STRING + "), see www.w3.org";
990
991	15	for (node = head.content; node != null; node = node.next)
992		{
993	19	if (node.tag == this.configuration.tt.tagMeta)
994		{
995	3	attval = node.getAttrByName("name");
996
997	3	if (attval != null && attval.value != null && "generator".equalsIgnoreCase(attval.value))
998		{
999	1	attval = node.getAttrByName("content");
1000
1001	1	if (attval != null
1002		&& attval.value != null
1003		&& attval.value.length() >= 9
1004		&& "HTML Tidy".equalsIgnoreCase(attval.value.substring(0, 9)))
1005		{
1006	0	attval.value = meta;
1007	0	return false;
1008		}
1009		}
1010		}
1011		}
1012
1013	15	node = this.inferredTag("meta");
1014	15	node.addAttribute("content", meta);
1015	15	node.addAttribute("name", "generator");
1016	15	head.insertNodeAtStart(node);
1017	15	return true;
1018		}
1019
1020	0	return false;
1021		}
1022
1023		/**
1024		* Check system keywords (keywords should be uppercase).
1025		* @param doctype doctype node
1026		* @return true if doctype keywords are all uppercase
1027		*/
1028	118	public boolean checkDocTypeKeyWords(Node doctype)
1029		{
1030	118	int len = doctype.end - doctype.start;
1031	118	String s = TidyUtils.getString(this.lexbuf, doctype.start, len);
1032
1033	118	return !(TidyUtils.findBadSubString("SYSTEM", s, len)
1034		\|\| TidyUtils.findBadSubString("PUBLIC", s, len)
1035		\|\| TidyUtils.findBadSubString("//DTD", s, len)
1036		\|\| TidyUtils.findBadSubString("//W3C", s, len) \|\| TidyUtils.findBadSubString("//EN", s, len));
1037		}
1038
1039		/**
1040		* Examine DOCTYPE to identify version.
1041		* @param doctype doctype node
1042		* @return version code
1043		*/
1044	119	public short findGivenVersion(Node doctype)
1045		{
1046	119	String p, s;
1047	119	int i, j;
1048	119	int len;
1049	119	String str1;
1050	119	String str2;
1051
1052		// if root tag for doctype isn't html give up now
1053	119	str1 = TidyUtils.getString(this.lexbuf, doctype.start, 5);
1054	119	if (!"html ".equalsIgnoreCase(str1))
1055		{
1056	2	return 0;
1057		}
1058
1059	117	if (!checkDocTypeKeyWords(doctype))
1060		{
1061	0	report.warning(this, doctype, null, Report.DTYPE_NOT_UPPER_CASE);
1062		}
1063
1064		// give up if all we are given is the system id for the doctype
1065	117	str1 = TidyUtils.getString(this.lexbuf, doctype.start + 5, 7);
1066	117	if ("SYSTEM ".equalsIgnoreCase(str1))
1067		{
1068		// but at least ensure the case is correct
1069	0	if (!str1.substring(0, 6).equals("SYSTEM"))
1070		{
1071	0	System.arraycopy(TidyUtils.getBytes("SYSTEM"), 0, this.lexbuf, doctype.start + 5, 6);
1072		}
1073	0	return 0; // unrecognized
1074		}
1075
1076	117	if ("PUBLIC ".equalsIgnoreCase(str1))
1077		{
1078	117	if (!str1.substring(0, 6).equals("PUBLIC"))
1079		{
1080	0	System.arraycopy(TidyUtils.getBytes("PUBLIC "), 0, this.lexbuf, doctype.start + 5, 6);
1081		}
1082		}
1083		else
1084		{
1085	0	this.badDoctype = true;
1086		}
1087
1088	1521	for (i = doctype.start; i < doctype.end; ++i)
1089		{
1090	1521	if (this.lexbuf[i] == (byte) '"')
1091		{
1092	117	str1 = TidyUtils.getString(this.lexbuf, i + 1, 12);
1093	117	str2 = TidyUtils.getString(this.lexbuf, i + 1, 13);
1094	117	if (str1.equals("-//W3C//DTD "))
1095		{
1096		// compute length of identifier e.g. "HTML 4.0 Transitional"
1097	115	for (j = i + 13; j < doctype.end && this.lexbuf[j] != (byte) '/'; ++j)
1098		{
1099		//
1100		}
1101	115	len = j - i - 13;
1102	115	p = TidyUtils.getString(this.lexbuf, i + 13, len);
1103
1104	115	for (j = 1; j < W3CVERSION.length; ++j)
1105		{
1106	806	s = W3CVERSION[j].name;
1107	806	if (len == s.length() && s.equals(p))
1108		{
1109	52	return W3CVERSION[j].code;
1110		}
1111		}
1112
1113		// else unrecognized version
1114		}
1115	2	else if (str2.equals("-//IETF//DTD "))
1116		{
1117		// compute length of identifier e.g. "HTML 2.0"
1118	2	for (j = i + 14; j < doctype.end && this.lexbuf[j] != (byte) '/'; ++j)
1119		{
1120		//
1121		}
1122	2	len = j - i - 14;
1123
1124	2	p = TidyUtils.getString(this.lexbuf, i + 14, len);
1125	2	s = W3CVERSION[0].name;
1126	2	if (len == s.length() && s.equals(p))
1127		{
1128	0	return W3CVERSION[0].code;
1129		}
1130
1131		// else unrecognized version
1132		}
1133	65	break;
1134		}
1135		}
1136
1137	65	return 0;
1138		}
1139
1140		/**
1141		* Fix xhtml namespace.
1142		* @param root root Node
1143		* @param profile current profile
1144		*/
1145	79	public void fixHTMLNameSpace(Node root, String profile)
1146		{
1147	79	Node node;
1148	79	AttVal attr;
1149
1150	79	node = root.content;
1151	79	while (node != null && node.tag != this.configuration.tt.tagHtml)
1152		{
1153	83	node = node.next;
1154		}
1155
1156	79	if (node != null)
1157		{
1158
1159	79	for (attr = node.attributes; attr != null; attr = attr.next)
1160		{
1161	74	if (attr.attribute.equals("xmlns"))
1162		{
1163	64	break;
1164		}
1165
1166		}
1167
1168	79	if (attr != null)
1169		{
1170	64	if (!attr.value.equals(profile))
1171		{
1172	0	report.warning(this, node, null, Report.INCONSISTENT_NAMESPACE);
1173	0	attr.value = profile;
1174		}
1175		}
1176		else
1177		{
1178	15	attr = new AttVal(node.attributes, null, '"', "xmlns", profile);
1179	15	attr.dict = AttributeTable.getDefaultAttributeTable().findAttribute(attr);
1180	15	node.attributes = attr;
1181		}
1182		}
1183		}
1184
1185		/**
1186		* Put DOCTYPE declaration between the &lt:?xml version "1.0" ... ?> declaration, if any, and the
1187		* <code>html</code> tag. Should also work for any comments, etc. that may precede the <code>html</code> tag.
1188		* @param root root node
1189		* @return new doctype node
1190		*/
1191	85	Node newXhtmlDocTypeNode(Node root)
1192		{
1193	85	Node html = root.findHTML(this.configuration.tt);
1194	85	if (html == null)
1195		{
1196	0	return null;
1197		}
1198
1199	85	Node newdoctype = newNode();
1200	85	newdoctype.setType(Node.DOCTYPE_TAG);
1201	85	newdoctype.next = html;
1202	85	newdoctype.parent = root;
1203	85	newdoctype.prev = null;
1204
1205	85	if (html == root.content)
1206		{
1207		// No <?xml ... ?> declaration.
1208	79	root.content.prev = newdoctype;
1209	79	root.content = newdoctype;
1210	79	newdoctype.prev = null;
1211		}
1212		else
1213		{
1214		// we have an <?xml ... ?> declaration.
1215	6	newdoctype.prev = html.prev;
1216	6	newdoctype.prev.next = newdoctype;
1217		}
1218	85	html.prev = newdoctype;
1219	85	return newdoctype;
1220		}
1221
1222		/**
1223		* Adds a new xhtml doctype to the document.
1224		* @param root root node
1225		* @return <code>true</code> if a doctype has been added
1226		*/
1227	79	public boolean setXHTMLDocType(Node root)
1228		{
1229	79	String fpi = " ";
1230	79	String sysid = "";
1231	79	String namespace = XHTML_NAMESPACE;
1232	79	String dtdsub = null;
1233	79	Node doctype;
1234	79	int dtdlen = 0;
1235
1236	79	doctype = root.findDocType();
1237
1238	79	fixHTMLNameSpace(root, namespace); // #427839 - fix by Evan Lenz 05 Sep 00
1239
1240	79	if (this.configuration.docTypeMode == Configuration.DOCTYPE_OMIT)
1241		{
1242	2	if (doctype != null)
1243		{
1244	0	Node.discardElement(doctype);
1245		}
1246	2	return true;
1247		}
1248
1249	77	if (this.configuration.docTypeMode == Configuration.DOCTYPE_AUTO)
1250		{
1251		// see what flavor of XHTML this document matches
1252	77	if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML40_STRICT))
1253		{
1254		// use XHTML strict
1255	46	fpi = "-//W3C//DTD XHTML 1.0 Strict//EN";
1256	46	sysid = VOYAGER_STRICT;
1257		}
1258	31	else if (TidyUtils.toBoolean(this.versions & Dict.VERS_FRAMESET))
1259		{
1260		// use XHTML frames
1261	1	fpi = "-//W3C//DTD XHTML 1.0 Frameset//EN";
1262	1	sysid = VOYAGER_FRAMESET;
1263		}
1264	30	else if (TidyUtils.toBoolean(this.versions & Dict.VERS_LOOSE))
1265		{
1266	27	fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN";
1267	27	sysid = VOYAGER_LOOSE;
1268		}
1269	3	else if (TidyUtils.toBoolean(this.versions & Dict.VERS_XHTML11))
1270		{
1271		// use XHTML 1.1
1272	1	fpi = "-//W3C//DTD XHTML 1.1//EN";
1273	1	sysid = VOYAGER_11;
1274		}
1275		else
1276		{
1277		// proprietary
1278	2	fpi = null;
1279	2	sysid = "";
1280	2	if (doctype != null)// #473490 - fix by BjÅ¡rn HÅ¡hrmann 10 Oct 01
1281		{
1282	2	Node.discardElement(doctype);
1283		}
1284		}
1285		}
1286	0	else if (this.configuration.docTypeMode == Configuration.DOCTYPE_STRICT)
1287		{
1288	0	fpi = "-//W3C//DTD XHTML 1.0 Strict//EN";
1289	0	sysid = VOYAGER_STRICT;
1290		}
1291	0	else if (this.configuration.docTypeMode == Configuration.DOCTYPE_LOOSE)
1292		{
1293	0	fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN";
1294	0	sysid = VOYAGER_LOOSE;
1295		}
1296
1297	77	if (this.configuration.docTypeMode == Configuration.DOCTYPE_USER && this.configuration.docTypeStr != null)
1298		{
1299	0	fpi = this.configuration.docTypeStr;
1300	0	sysid = "";
1301		}
1302
1303	77	if (fpi == null)
1304		{
1305	2	return false;
1306		}
1307
1308	75	if (doctype != null)
1309		{
1310		// Look for internal DTD subset
1311	64	if (configuration.xHTML \|\| configuration.xmlOut)
1312		{
1313
1314	64	int len = doctype.end - doctype.start + 1;
1315	64	String start = TidyUtils.getString(this.lexbuf, doctype.start, len);
1316
1317	64	int dtdbeg = start.indexOf('[');
1318	64	if (dtdbeg >= 0)
1319		{
1320	39	int dtdend = start.substring(dtdbeg).indexOf(']');
1321	39	if (dtdend >= 0)
1322		{
1323	1	dtdlen = dtdend + 1;
1324	1	dtdsub = start.substring(dtdbeg);
1325		}
1326		}
1327		}
1328		}
1329		else
1330		{
1331	?	if ((doctype = newXhtmlDocTypeNode(root)) == null)
1332		{
1333	0	return false;
1334		}
1335		}
1336
1337	75	this.txtstart = this.lexsize;
1338	75	this.txtend = this.lexsize;
1339
1340		// add public identifier
1341	75	addStringLiteral("html PUBLIC ");
1342
1343		// check if the fpi is quoted or not
1344	75	if (fpi.charAt(0) == '"')
1345		{
1346	0	addStringLiteral(fpi);
1347		}
1348		else
1349		{
1350	75	addStringLiteral("\"");
1351	75	addStringLiteral(fpi);
1352	75	addStringLiteral("\"");
1353		}
1354
1355	75	if (this.configuration.wraplen != 0 && sysid.length() + 6 >= this.configuration.wraplen)
1356		{
1357	0	addStringLiteral("\n\"");
1358		}
1359		else
1360		{
1361		// FG: don't wrap
1362	75	addStringLiteral(" \"");
1363		}
1364
1365		// add system identifier
1366	75	addStringLiteral(sysid);
1367	75	addStringLiteral("\"");
1368
1369	75	if (dtdlen > 0 && dtdsub != null)
1370		{
1371	1	addCharToLexer(' ');
1372	1	addStringLiteralLen(dtdsub, dtdlen);
1373		}
1374
1375	75	this.txtend = this.lexsize;
1376
1377	75	int length = this.txtend - this.txtstart;
1378	75	doctype.textarray = new byte[length];
1379
1380	75	System.arraycopy(this.lexbuf, this.txtstart, doctype.textarray, 0, length);
1381	75	doctype.start = 0;
1382	75	doctype.end = length;
1383
1384	75	return false;
1385		}
1386
1387		/**
1388		* Return the html version used in document.
1389		* @return version code
1390		*/
1391	214	public short apparentVersion()
1392		{
1393	214	switch (this.doctype)
1394		{
1395	165	case Dict.VERS_UNKNOWN :
1396	165	return htmlVersion();
1397
1398	0	case Dict.VERS_HTML20 :
1399	0	if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML20))
1400		{
1401	0	return Dict.VERS_HTML20;
1402		}
1403
1404	0	break;
1405
1406	6	case Dict.VERS_HTML32 :
1407	6	if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML32))
1408		{
1409	5	return Dict.VERS_HTML32;
1410		}
1411
1412	1	break; // to replace old version by new
1413
1414	2	case Dict.VERS_HTML40_STRICT :
1415	2	if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML40_STRICT))
1416		{
1417	2	return Dict.VERS_HTML40_STRICT;
1418		}
1419
1420	0	break;
1421
1422	34	case Dict.VERS_HTML40_LOOSE :
1423	34	if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML40_LOOSE))
1424		{
1425	29	return Dict.VERS_HTML40_LOOSE;
1426		}
1427
1428	5	break; // to replace old version by new
1429
1430	2	case Dict.VERS_FRAMESET :
1431	2	if (TidyUtils.toBoolean(this.versions & Dict.VERS_FRAMESET))
1432		{
1433	2	return Dict.VERS_FRAMESET;
1434		}
1435
1436	0	break;
1437
1438	5	case Dict.VERS_XHTML11 :
1439	5	if (TidyUtils.toBoolean(this.versions & Dict.VERS_XHTML11))
1440		{
1441	3	return Dict.VERS_XHTML11;
1442		}
1443
1444	2	break;
1445	0	default :
1446		// should never reach here
1447	0	break;
1448		}
1449
1450		// kludge to avoid error appearing at end of file
1451		// it would be better to note the actual position
1452		// when first encountering the doctype declaration
1453
1454	8	this.lines = 1;
1455	8	this.columns = 1;
1456
1457	8	report.warning(this, null, null, Report.INCONSISTENT_VERSION);
1458	8	return this.htmlVersion();
1459		}
1460
1461		/**
1462		* Fixup doctype if missing.
1463		* @param root root node
1464		* @return <code>false</code> if current version has not been identified
1465		*/
1466	141	public boolean fixDocType(Node root)
1467		{
1468	141	Node doctype;
1469	141	int guessed = Dict.VERS_HTML40_STRICT, i;
1470
1471	141	if (this.badDoctype)
1472		{
1473	0	report.warning(this, null, null, Report.MALFORMED_DOCTYPE);
1474		}
1475
1476	141	doctype = root.findDocType();
1477
1478	141	if (this.configuration.docTypeMode == Configuration.DOCTYPE_OMIT)
1479		{
1480	0	if (doctype != null)
1481		{
1482	0	Node.discardElement(doctype);
1483		}
1484	0	return true;
1485		}
1486
1487	141	if (this.configuration.xmlOut)
1488		{
1489	4	return true;
1490		}
1491
1492	137	if (this.configuration.docTypeMode == Configuration.DOCTYPE_STRICT)
1493		{
1494	0	Node.discardElement(doctype);
1495	0	doctype = null;
1496	0	guessed = Dict.VERS_HTML40_STRICT;
1497		}
1498	137	else if (this.configuration.docTypeMode == Configuration.DOCTYPE_LOOSE)
1499		{
1500	0	Node.discardElement(doctype);
1501	0	doctype = null;
1502	0	guessed = Dict.VERS_HTML40_LOOSE;
1503		}
1504	137	else if (this.configuration.docTypeMode == Configuration.DOCTYPE_AUTO)
1505		{
1506	135	if (doctype != null)
1507		{
1508	49	if (this.doctype == Dict.VERS_UNKNOWN)
1509		{
1510	4	return false;
1511		}
1512
1513	45	switch (this.doctype)
1514		{
1515	0	case Dict.VERS_UNKNOWN :
1516	0	return false;
1517
1518	0	case Dict.VERS_HTML20 :
1519	0	if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML20))
1520		{
1521	0	return true;
1522		}
1523
1524	0	break; // to replace old version by new
1525
1526	5	case Dict.VERS_HTML32 :
1527	5	if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML32))
1528		{
1529	5	return true;
1530		}
1531
1532	0	break; // to replace old version by new
1533
1534	2	case Dict.VERS_HTML40_STRICT :
1535	2	if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML40_STRICT))
1536		{
1537	2	return true;
1538		}
1539
1540	0	break; // to replace old version by new
1541
1542	32	case Dict.VERS_HTML40_LOOSE :
1543	32	if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML40_LOOSE))
1544		{
1545	27	return true;
1546		}
1547
1548	5	break; // to replace old version by new
1549
1550	1	case Dict.VERS_FRAMESET :
1551	1	if (TidyUtils.toBoolean(this.versions & Dict.VERS_FRAMESET))
1552		{
1553	1	return true;
1554		}
1555
1556	0	break; // to replace old version by new
1557
1558	5	case Dict.VERS_XHTML11 :
1559	5	if (TidyUtils.toBoolean(this.versions & Dict.VERS_XHTML11))
1560		{
1561	3	return true;
1562		}
1563
1564	2	break; // to replace old version by new
1565	0	default :
1566		// should never reach here
1567	0	break;
1568		}
1569
1570		// INCONSISTENT_VERSION warning is now issued by ApparentVersion()
1571		}
1572
1573		// choose new doctype
1574	93	guessed = htmlVersion();
1575		}
1576
1577	95	if (guessed == Dict.VERS_UNKNOWN)
1578		{
1579	16	return false;
1580		}
1581
1582		// for XML use the Voyager system identifier
1583	79	if (this.configuration.xmlOut \|\| this.configuration.xmlTags \|\| this.isvoyager)
1584		{
1585	0	if (doctype != null)
1586		{
1587	0	Node.discardElement(doctype);
1588		}
1589
1590	0	fixHTMLNameSpace(root, XHTML_NAMESPACE);
1591
1592		// Namespace is the same for all XHTML variants
1593		// Also, don't return yet. Still need to add DOCTYPE declaration.
1594		//
1595		// for (i = 0; i < W3CVersion.length; ++i)
1596		// {
1597		// if (guessed == W3CVersion[i].code)
1598		// {
1599		// fixHTMLNameSpace(root, W3CVersion[i].profile);
1600		// break;
1601		// }
1602		// }
1603		// return true;
1604		}
1605
1606	79	if (doctype == null)
1607		{
1608	?	if ((doctype = newXhtmlDocTypeNode(root)) == null)
1609		{
1610	0	return false;
1611		}
1612		}
1613
1614	79	this.txtstart = this.lexsize;
1615	79	this.txtend = this.lexsize;
1616
1617		// use the appropriate public identifier
1618	79	addStringLiteral("html PUBLIC ");
1619
1620	79	if (this.configuration.docTypeMode == Configuration.DOCTYPE_USER
1621		&& this.configuration.docTypeStr != null
1622		&& this.configuration.docTypeStr.length() > 0)
1623		{
1624		// check if the fpi is quoted or not
1625	2	if (this.configuration.docTypeStr.charAt(0) == '"')
1626		{
1627	2	addStringLiteral(this.configuration.docTypeStr);
1628		}
1629		else
1630		{
1631	0	addStringLiteral("\""); // #431889 - fix by Dave Bryan 04 Jan 2001
1632	0	addStringLiteral(this.configuration.docTypeStr);
1633	0	addStringLiteral("\""); // #431889 - fix by Dave Bryan 04 Jan 2001
1634		}
1635		}
1636	77	else if (guessed == Dict.VERS_HTML20)
1637		{
1638	12	addStringLiteral("\"-//IETF//DTD HTML 2.0//EN\"");
1639		}
1640		else
1641		{
1642	65	addStringLiteral("\"-//W3C//DTD ");
1643
1644	263	for (i = 0; i < W3CVERSION.length; ++i)
1645		{
1646	263	if (guessed == W3CVERSION[i].code)
1647		{
1648	65	addStringLiteral(W3CVERSION[i].name);
1649	65	break;
1650		}
1651		}
1652
1653	65	addStringLiteral("//EN\"");
1654		}
1655
1656	79	this.txtend = this.lexsize;
1657
1658	79	int length = this.txtend - this.txtstart;
1659	79	doctype.textarray = new byte[length];
1660
1661	79	System.arraycopy(this.lexbuf, this.txtstart, doctype.textarray, 0, length);
1662	79	doctype.start = 0;
1663	79	doctype.end = length;
1664
1665	79	return true;
1666		}
1667
1668		/**
1669		* Ensure XML document starts with <code><?XML version="1.0"?></code>. Add encoding attribute if not using
1670		* ASCII or UTF-8 output.
1671		* @param root root node
1672		* @return always true
1673		*/
1674	15	public boolean fixXmlDecl(Node root)
1675		{
1676	15	Node xml;
1677	15	AttVal version;
1678	15	AttVal encoding;
1679
1680	15	if (root.content != null && root.content.type == Node.XML_DECL)
1681		{
1682	2	xml = root.content;
1683		}
1684		else
1685		{
1686	13	xml = newNode(Node.XML_DECL, this.lexbuf, 0, 0);
1687	13	xml.next = root.content;
1688
1689	13	if (root.content != null)
1690		{
1691	13	root.content.prev = xml;
1692	13	xml.next = root.content;
1693		}
1694
1695	13	root.content = xml;
1696		}
1697
1698	15	version = xml.getAttrByName("version");
1699	15	encoding = xml.getAttrByName("encoding");
1700
1701		// We need to insert a check if declared encoding and output encoding mismatch
1702		// and fix the Xml declaration accordingly!!!
1703	15	if (encoding == null && !"UTF8".equals(this.configuration.getOutCharEncodingName()))
1704		{
1705	14	if ("ISO8859_1".equals(this.configuration.getOutCharEncodingName()))
1706		{
1707	5	xml.addAttribute("encoding", "iso-8859-1");
1708		}
1709	14	if ("ISO2022".equals(this.configuration.getOutCharEncodingName()))
1710		{
1711	0	xml.addAttribute("encoding", "iso-2022");
1712		}
1713		}
1714
1715	15	if (version == null)
1716		{
1717	13	xml.addAttribute("version", "1.0");
1718		}
1719
1720	15	return true;
1721		}
1722
1723		/**
1724		* Generates and inserts a new node.
1725		* @param name tag name
1726		* @return generated node
1727		*/
1728	179	public Node inferredTag(String name)
1729		{
1730	179	Node node;
1731
1732	179	node = newNode(Node.START_TAG, this.lexbuf, this.txtstart, this.txtend, name);
1733	179	node.implicit = true;
1734	179	return node;
1735		}
1736
1737		/**
1738		* Create a text node for the contents of a CDATA element like style or script which ends with </foo> for some
1739		* foo. Characters are read from the input and appended to the lexer buffer until the end tag matching
		* <code>container.element</code> is found, or until unquoted markup or the end of the stream is reached.
		* Markup other than the matching end tag is reported with <code>BAD_CDATA_CONTENT</code>; a missing end tag is
		* reported with <code>MISSING_ENDTAG_FOR</code>.
1740		* @param container container node
1741		* @return cdata node, or <code>null</code> if the element has no content
1742		*/
1743	34	public Node getCDATA(Node container)
1744		{
		// start: index in lexbuf of the first letter of a candidate tag name, -1 when no tag is pending
1745	34	int c, lastc, start, len, i;
		// qt: the quote character of the string literal being scanned, 0 when outside a string
1746	34	int qt = 0;
		// esc: escape character honoured inside the content, 0 when there is none
1747	34	int esc = 0;
1748	34	String str;
		// endtag / begtag: set while scanning what looks like an end tag ("</") or a begin tag ("<")
1749	34	boolean endtag = false;
1750	34	boolean begtag = false;
1751
		// only JavaScript content uses backslash as escape character
1752	34	if (container.isJavaScript())
1753		{
1754	11	esc = '\\';
1755		}
1756
1757	34	this.lines = this.in.getCurline();
1758	34	this.columns = this.in.getCurcol();
1759	34	this.waswhite = false;
1760	34	this.txtstart = this.lexsize;
1761	34	this.txtend = this.lexsize;
1762
1763	34	lastc = '\0';
1764	34	start = -1;
1765
1766	?	while ((c = this.in.readChar()) != StreamIn.END_OF_STREAM)
1767		{
1768		// treat \r\n as \n and \r as \n
1769	14128	if (qt > 0)
1770		{
1771		// #598860 script parsing fails with quote chars
1772		// A quoted string is ended by the quotation character, or end of line
		// (unless that character is preceded by the escape character)
1773	1060	if ((c == '\r' \|\| c == '\n' \|\| c == qt) && (!TidyUtils.toBoolean(esc) \|\| lastc != esc))
1774		{
1775	63	qt = 0;
1776		}
		// "</" inside a quoted string: remember where the tag name starts
1777	997	else if (c == '/' && lastc == '<')
1778		{
1779	7	start = this.lexsize + 1; // to first letter
1780		}
1781
		// "</...>" inside a quoted string: warn, but keep scanning
1782	990	else if (c == '>' && start >= 0)
1783		{
1784	7	len = this.lexsize - start;
1785
1786	7	this.lines = this.in.getCurline();
1787	7	this.columns = this.in.getCurcol() - 3;
1788
1789	7	report.warning(this, null, null, Report.BAD_CDATA_CONTENT);
1790
1791		// if javascript insert backslash before /
1792	7	if (TidyUtils.toBoolean(esc))
1793		{
		// shift the buffered bytes from the '/' onwards one position to the right
1794	0	for (i = this.lexsize; i > start - 1; --i)
1795		{
1796	0	this.lexbuf[i] = this.lexbuf[i - 1];
1797		}
1798
1799	0	this.lexbuf[start - 1] = (byte) esc;
1800	0	this.lexsize++;
1801		}
1802
1803	7	start = -1;
1804		}
1805		}
		// an unescaped quote character opens a string literal
1806	13068	else if (TidyUtils.isQuote(c) && (!TidyUtils.toBoolean(esc) \|\| lastc != esc))
1807		{
1808	63	qt = c;
1809		}
		// '<' may open a begin tag
1810	13005	else if (c == '<')
1811		{
1812	55	start = this.lexsize + 1; // to first letter
1813	55	endtag = false;
1814	55	begtag = true;
1815		}
1816	12950	else if (c == '!' && lastc == '<') // Cancel start tag
1817		{
1818	16	start = -1;
1819	16	endtag = false;
1820	16	begtag = false;
1821		}
		// "</" opens an end tag
1822	12934	else if (c == '/' && lastc == '<')
1823		{
1824	28	start = this.lexsize + 1; // to first letter
1825	28	endtag = true;
1826	28	begtag = false;
1827		}
1828	12906	else if (c == '>' && start >= 0) // End of begin or end tag
1829		{
		// number of bytes before the tag name to drop: 2 for "</", 1 for "<"
1830	34	int decr = 2;
1831
		// an end tag whose name has the same length as the container element name may be the closing tag
1832	?	if (endtag && ((len = this.lexsize - start) == container.element.length()))
1833		{
1834
1835	26	str = TidyUtils.getString(this.lexbuf, start, len);
1836	26	if (container.element.equalsIgnoreCase(str))
1837		{
		// matching end tag: cut the text just before "</" and stop
1838	26	this.txtend = start - decr;
1839	26	this.lexsize = start - decr; // #433857 - fix by Huajun Zeng 26 Apr 01
1840	26	break;
1841		}
1842		}
1843
1844		// Unquoted markup will end SCRIPT or STYLE elements
1845
1846	8	this.lines = this.in.getCurline();
1847	8	this.columns = this.in.getCurcol() - 3;
1848
1849	8	report.warning(this, null, null, Report.BAD_CDATA_CONTENT);
1850	8	if (begtag)
1851		{
1852	6	decr = 1;
1853		}
		// cut the text just before the offending tag and stop
1854	8	this.txtend = start - decr;
1855	8	this.lexsize = start - decr;
1856	8	break;
1857		}
1858		// #427844 - fix by Markus Hoenicka 21 Oct 00
1859	12872	else if (c == '\r')
1860		{
1861	0	if (begtag \|\| endtag)
1862		{
1863	0	continue; // discard whitespace in endtag
1864		}
1865
		// normalize "\r\n" and a lone "\r" to "\n"
1866	0	c = this.in.readChar();
1867
1868	0	if (c != '\n')
1869		{
1870	0	this.in.ungetChar(c);
1871		}
1872
1873	0	c = '\n';
1874
1875		}
1876	12872	else if ((c == '\n' \|\| c == '\t' \|\| c == ' ') && (begtag \|\| endtag))
1877		{
1878	235	continue; // discard whitespace in endtag
1879		}
1880
		// keep the character as part of the text
1881	13859	addCharToLexer(c);
1882	13859	this.txtend = this.lexsize;
1883	13859	lastc = c;
1884		}
1885
		// the stream ended before the closing tag was seen
1886	34	if (c == StreamIn.END_OF_STREAM)
1887		{
1888	0	report.warning(this, container, null, Report.MISSING_ENDTAG_FOR);
1889		}
1890
		// only create a text node when some content was collected
1891	34	if (this.txtend > this.txtstart)
1892		{
1893	31	this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend);
1894	31	return this.token;
1895		}
1896
1897	3	return null;
1898		}
1899
1900		/**
1901		*
1902		*
1903		*/
1904	432	public void ungetToken()
1905		{
1906	432	this.pushed = true;
1907		}
1908
1909		/**
1910		* Gets a token.
1911		* @param mode one of the following:
1912		* <ul>
1913		* <li><code>MixedContent</code>-- for elements which don't accept PCDATA</li>
1914		* <li><code>Preformatted</code>-- white spacepreserved as is</li>
1915		* <li><code>IgnoreMarkup</code>-- for CDATA elements such as script, style</li>
1916		* </ul>
1917		* @return next Node
1918		*/
1919	21370	public Node getToken(short mode)
1920		{
1921	21370	int c = 0;
1922	21370	int badcomment = 0;
1923		// pass by reference
1924	21370	boolean[] isempty = new boolean[1];
1925	21370	boolean inDTDSubset = false;
1926	21370	AttVal attributes = null;
1927
1928	21370	if (this.pushed)
1929		{
1930		// duplicate inlines in preference to pushed text nodes when appropriate
1931	430	if (this.token.type != Node.TEXT_NODE \|\| (this.insert == -1 && this.inode == null))
1932		{
1933	430	this.pushed = false;
1934	430	return this.token;
1935		}
1936		}
1937
1938		// at start of block elements, unclosed inline
1939	20940	if (this.insert != -1 \|\| this.inode != null)
1940		{
1941	99	return insertedToken();
1942		}
1943
1944	20841	this.lines = this.in.getCurline();
1945	20841	this.columns = this.in.getCurcol();
1946	20841	this.waswhite = false;
1947
1948	20841	this.txtstart = this.lexsize;
1949	20841	this.txtend = this.lexsize;
1950
1951	?	while ((c = this.in.readChar()) != StreamIn.END_OF_STREAM)
1952		{
1953		// FG fix for [427846] different from tidy
1954		// if (this.insertspace && (!TidyUtils.toBoolean(mode & IGNORE_WHITESPACE)))
1955	186555	if (this.insertspace && mode != IGNORE_WHITESPACE)
1956		{
1957	871	addCharToLexer(' ');
1958		}
1959	186555	if (this.insertspace && (!TidyUtils.toBoolean(mode & IGNORE_WHITESPACE)))
1960		{
1961	878	this.waswhite = true;
1962	878	this.insertspace = false;
1963		}
1964
1965		// treat \r\n as \n and \r as \n
1966	186555	if (c == '\r')
1967		{
1968	8	c = this.in.readChar();
1969
1970	8	if (c != '\n')
1971		{
1972	8	this.in.ungetChar(c);
1973		}
1974
1975	8	c = '\n';
1976		}
1977
1978	186555	addCharToLexer(c);
1979
1980	186555	switch (this.state)
1981		{
1982	130194	case LEX_CONTENT :
1983		// element content
1984
1985		// Discard white space if appropriate.
1986		// Its cheaper to do this here rather than in parser methods for elements that
1987		// don't have mixed content.
1988	130194	if (TidyUtils.isWhite((char) c) && (mode == IGNORE_WHITESPACE) && this.lexsize == this.txtstart + 1)
1989		{
1990	13605	--this.lexsize;
1991	13605	this.waswhite = false;
1992	13605	this.lines = this.in.getCurline();
1993	13605	this.columns = this.in.getCurcol();
1994	13605	continue;
1995		}
1996
1997	116589	if (c == '<')
1998		{
1999	13652	this.state = LEX_GT;
2000	13652	continue;
2001		}
2002
2003	102937	if (TidyUtils.isWhite((char) c))
2004		{
2005		// was previous char white?
2006	22065	if (this.waswhite)
2007		{
2008	11056	if (mode != PREFORMATTED && mode != IGNORE_MARKUP)
2009		{
2010	11025	--this.lexsize;
2011	11025	this.lines = this.in.getCurline();
2012	11025	this.columns = this.in.getCurcol();
2013		}
2014		}
2015		else
2016		{
2017		// prev char wasn't white
2018	11009	this.waswhite = true;
2019
2020	11009	if (mode != PREFORMATTED && mode != IGNORE_MARKUP && c != ' ')
2021		{
2022	935	changeChar((byte) ' ');
2023		}
2024		}
2025
2026	22065	continue;
2027		}
2028	80872	else if (c == '&' && mode != IGNORE_MARKUP)
2029		{
2030	1436	parseEntity(mode);
2031		}
2032
2033		// this is needed to avoid trimming trailing whitespace
2034	80872	if (mode == IGNORE_WHITESPACE)
2035		{
2036	1168	mode = MIXED_CONTENT;
2037		}
2038
2039	80872	this.waswhite = false;
2040	80872	continue;
2041
2042	13652	case LEX_GT :
2043		// <
2044
2045		// check for endtag
2046	13652	if (c == '/')
2047		{
2048	5501	c = this.in.readChar();
2049	5501	if (c == StreamIn.END_OF_STREAM)
2050		{
2051	0	this.in.ungetChar(c);
2052	0	continue;
2053		}
2054
2055	5501	addCharToLexer(c);
2056
2057	5501	if (TidyUtils.isLetter((char) c))
2058		{
2059	5500	this.lexsize -= 3;
2060	5500	this.txtend = this.lexsize;
2061	5500	this.in.ungetChar(c);
2062	5500	this.state = LEX_ENDTAG;
2063	5500	this.lexbuf[this.lexsize] = (byte) '\0'; // debug
2064
2065		// changed from
2066		// this.in.curcol -= 2;
2067	5500	this.columns -= 2;
2068
2069		// if some text before the </ return it now
2070	5500	if (this.txtend > this.txtstart)
2071		{
2072		// trim space char before end tag
2073	3696	if (mode == IGNORE_WHITESPACE && this.lexbuf[this.lexsize - 1] == (byte) ' ')
2074		{
2075	0	this.lexsize -= 1;
2076	0	this.txtend = this.lexsize;
2077		}
2078
2079	3696	this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend);
2080	3696	return this.token;
2081		}
2082
2083	1804	continue; // no text so keep going
2084		}
2085
2086		// otherwise treat as CDATA
2087	1	this.waswhite = false;
2088	1	this.state = LEX_CONTENT;
2089	1	continue;
2090		}
2091
2092	8151	if (mode == IGNORE_MARKUP)
2093		{
2094		// otherwise treat as CDATA
2095	0	this.waswhite = false;
2096	0	this.state = LEX_CONTENT;
2097	0	continue;
2098		}
2099
2100		// look out for comments, doctype or marked sections this isn't quite right, but its getting there
2101	8151	if (c == '!')
2102		{
2103	549	c = this.in.readChar();
2104
2105	549	if (c == '-')
2106		{
2107	381	c = this.in.readChar();
2108
2109	381	if (c == '-')
2110		{
2111	381	this.state = LEX_COMMENT; // comment
2112	381	this.lexsize -= 2;
2113	381	this.txtend = this.lexsize;
2114
2115		// if some text before < return it now
2116	381	if (this.txtend > this.txtstart)
2117		{
2118	285	this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend);
2119	285	return this.token;
2120		}
2121
2122	96	this.txtstart = this.lexsize;
2123	96	continue;
2124		}
2125
2126	0	report.warning(this, null, null, Report.MALFORMED_COMMENT);
2127		}
2128	168	else if (c == 'd' \|\| c == 'D')
2129		{
2130	119	this.state = LEX_DOCTYPE; // doctype
2131	119	this.lexsize -= 2;
2132	119	this.txtend = this.lexsize;
2133	119	mode = IGNORE_WHITESPACE;
2134
2135		// skip until white space or '>'
2136
2137	119	for (;;)
2138		{
2139	830	c = this.in.readChar();
2140
2141	830	if (c == StreamIn.END_OF_STREAM \|\| c == '>')
2142		{
2143	1	this.in.ungetChar(c);
2144	1	break;
2145		}
2146
2147	829	if (!TidyUtils.isWhite((char) c))
2148		{
2149	711	continue;
2150		}
2151
2152		// and skip to end of whitespace
2153
2154	118	for (;;)
2155		{
2156	119	c = this.in.readChar();
2157
2158	119	if (c == StreamIn.END_OF_STREAM \|\| c == '>')
2159		{
2160	0	this.in.ungetChar(c);
2161	0	break;
2162		}
2163
2164	119	if (TidyUtils.isWhite((char) c))
2165		{
2166	1	continue;
2167		}
2168
2169	118	this.in.ungetChar(c);
2170	118	break;
2171		}
2172
2173	118	break;
2174		}
2175
2176		// if some text before < return it now
2177	119	if (this.txtend > this.txtstart)
2178		{
2179	2	this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend);
2180	2	return this.token;
2181		}
2182
2183	117	this.txtstart = this.lexsize;
2184	117	continue;
2185		}
2186	49	else if (c == '[')
2187		{
2188		// Word 2000 embeds <![if ...]> ... <![endif]> sequences
2189	49	this.lexsize -= 2;
2190	49	this.state = LEX_SECTION;
2191	49	this.txtend = this.lexsize;
2192
2193		// if some text before < return it now
2194	49	if (this.txtend > this.txtstart)
2195		{
2196	23	this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend);
2197	23	return this.token;
2198		}
2199
2200	26	this.txtstart = this.lexsize;
2201	26	continue;
2202		}
2203
2204		// otherwise swallow chars up to and including next '>'
2205	0	while (true)
2206		{
2207	0	c = this.in.readChar();
2208	0	if (c == '>')
2209		{
2210	0	break;
2211		}
2212	0	if (c == -1)
2213		{
2214	0	this.in.ungetChar(c);
2215	0	break;
2216		}
2217		}
2218
2219	0	this.lexsize -= 2;
2220	0	this.lexbuf[this.lexsize] = (byte) '\0';
2221	0	this.state = LEX_CONTENT;
2222	0	continue;
2223		}
2224
2225		// processing instructions
2226
2227	7602	if (c == '?')
2228		{
2229	27	this.lexsize -= 2;
2230	27	this.state = LEX_PROCINSTR;
2231	27	this.txtend = this.lexsize;
2232
2233		// if some text before < return it now
2234	27	if (this.txtend > this.txtstart)
2235		{
2236	1	this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend);
2237	1	return this.token;
2238		}
2239
2240	26	this.txtstart = this.lexsize;
2241	26	continue;
2242		}
2243
2244		// Microsoft ASP's e.g. <% ... server-code ... %>
2245	7575	if (c == '%')
2246		{
2247	0	this.lexsize -= 2;
2248	0	this.state = LEX_ASP;
2249	0	this.txtend = this.lexsize;
2250
2251		// if some text before < return it now
2252	0	if (this.txtend > this.txtstart)
2253		{
2254	0	this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend);
2255	0	return this.token;
2256		}
2257
2258	0	this.txtstart = this.lexsize;
2259	0	continue;
2260		}
2261
2262		// Netscapes JSTE e.g. <# ... server-code ... #>
2263	7575	if (c == '#')
2264		{
2265	1	this.lexsize -= 2;
2266	1	this.state = LEX_JSTE;
2267	1	this.txtend = this.lexsize;
2268
2269		// if some text before < return it now
2270	1	if (this.txtend > this.txtstart)
2271		{
2272	1	this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend);
2273	1	return this.token;
2274		}
2275
2276	0	this.txtstart = this.lexsize;
2277	0	continue;
2278		}
2279
2280		// check for start tag
2281	7574	if (TidyUtils.isLetter((char) c))
2282		{
2283	7571	this.in.ungetChar(c); // push back letter
2284	7571	this.lexsize -= 2; // discard " <" + letter
2285	7571	this.txtend = this.lexsize;
2286	7571	this.state = LEX_STARTTAG; // ready to read tag name
2287
2288		// if some text before < return it now
2289	7571	if (this.txtend > this.txtstart)
2290		{
2291	2903	this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend);
2292	2903	return this.token;
2293		}
2294
2295	4668	continue; // no text so keep going
2296		}
2297
2298		// otherwise treat as CDATA
2299	3	this.state = LEX_CONTENT;
2300	3	this.waswhite = false;
2301	3	continue;
2302
2303	5500	case LEX_ENDTAG :
2304		// </letter
2305	5500	this.txtstart = this.lexsize - 1;
2306
2307		// changed from
2308		// this.in.curcol -= 2;
2309	5500	this.columns -= 2;
2310
2311	5500	c = parseTagName();
2312	5500	this.token = newNode(Node.END_TAG, // create endtag token
2313		this.lexbuf,
2314		this.txtstart,
2315		this.txtend,
2316		TidyUtils.getString(this.lexbuf, this.txtstart, this.txtend - this.txtstart));
2317	5500	this.lexsize = this.txtstart;
2318	5500	this.txtend = this.txtstart;
2319
2320		// skip to '>'
2321	5500	while (c != '>')
2322		{
2323	8	c = this.in.readChar();
2324
2325	8	if (c == StreamIn.END_OF_STREAM)
2326		{
2327	1	break;
2328		}
2329		}
2330
2331	5500	if (c == StreamIn.END_OF_STREAM)
2332		{
2333	1	this.in.ungetChar(c);
2334	1	continue;
2335		}
2336
2337	5499	this.state = LEX_CONTENT;
2338	5499	this.waswhite = false;
2339	5499	return this.token; // the endtag token
2340
2341	7571	case LEX_STARTTAG :
2342		// first letter of tagname
2343	7571	this.txtstart = this.lexsize - 1; // set txtstart to first letter
2344	7571	c = parseTagName();
2345	7571	isempty[0] = false;
2346	7571	attributes = null;
2347	7571	this.token = newNode(
2348	7571	(isempty[0] ? Node.START_END_TAG : Node.START_TAG),
2349		this.lexbuf,
2350		this.txtstart,
2351		this.txtend,
2352		TidyUtils.getString(this.lexbuf, this.txtstart, this.txtend - this.txtstart));
2353
2354		// parse attributes, consuming closing ">"
2355	7571	if (c != '>')
2356		{
2357	3007	if (c == '/')
2358		{
2359	0	this.in.ungetChar(c);
2360		}
2361
2362	3007	attributes = parseAttrs(isempty);
2363		}
2364
2365	7571	if (isempty[0])
2366		{
2367	90	this.token.type = Node.START_END_TAG;
2368		}
2369
2370	7571	this.token.attributes = attributes;
2371	7571	this.lexsize = this.txtstart;
2372	7571	this.txtend = this.txtstart;
2373
2374		// swallow newline following start tag
2375		// special check needed for CRLF sequence
2376		// this doesn't apply to empty elements
2377		// nor to preformatted content that needs escaping
2378
2379	7571	if (
2380
2381	7571	(mode != PREFORMATTED \|\| preContent(this.token))
2382		&& (this.token.expectsContent() \|\| this.token.tag == this.configuration.tt.tagBr))
2383		{
2384
2385	7160	c = this.in.readChar();
2386
2387	7160	if (c == '\r')
2388		{
2389	0	c = this.in.readChar();
2390
2391	0	if (c != '\n')
2392		{
2393	0	this.in.ungetChar(c);
2394		}
2395		}
2396	7160	else if (c != '\n' && c != '\f')
2397		{
2398	4740	this.in.ungetChar(c);
2399		}
2400
2401	7160	this.waswhite = true; // to swallow leading whitespace
2402		}
2403		else
2404		{
2405	411	this.waswhite = false;
2406		}
2407
2408	7571	this.state = LEX_CONTENT;
2409
2410	7571	if (this.token.tag == null)
2411		{
2412	27	report.error(this, null, this.token, Report.UNKNOWN_ELEMENT);
2413		}
2414	7544	else if (!this.configuration.xmlTags)
2415		{
2416	7505	constrainVersion(this.token.tag.versions);
2417
2418	7505	if (TidyUtils.toBoolean(this.token.tag.versions & Dict.VERS_PROPRIETARY))
2419		{
2420		// #427810 - fix by Gary Deschaines 24 May 00
2421	23	if (this.configuration.makeClean && (this.token.tag != this.configuration.tt.tagNobr && //
2422		this.token.tag != this.configuration.tt.tagWbr))
2423		{
2424	14	report.warning(this, null, this.token, Report.PROPRIETARY_ELEMENT);
2425		}
2426		// #427810 - fix by Terry Teague 2 Jul 01
2427	9	else if (!this.configuration.makeClean)
2428		{
2429	9	report.warning(this, null, this.token, Report.PROPRIETARY_ELEMENT);
2430		}
2431		}
2432
2433	7505	if (this.token.tag.getChkattrs() != null)
2434		{
2435	2828	this.token.tag.getChkattrs().check(this, this.token);
2436		}
2437		else
2438		{
2439	4677	this.token.checkAttributes(this);
2440		}
2441
2442		// should this be called before attribute checks?
2443	7505	this.token.repairDuplicateAttributes(this);
2444
2445		}
2446
2447	7571	return this.token; // return start tag
2448
2449	18755	case LEX_COMMENT :
2450		// seen <!-- so look for -->
2451
2452	18755	if (c != '-')
2453		{
2454	18157	continue;
2455		}
2456
2457	598	c = this.in.readChar();
2458	598	addCharToLexer(c);
2459
2460	598	if (c != '-')
2461		{
2462	210	continue;
2463		}
2464
2465	388	end_comment : while (true)
2466		{
2467	388	c = this.in.readChar();
2468
2469	388	if (c == '>')
2470		{
2471	380	if (badcomment != 0)
2472		{
2473	8	report.warning(this, null, null, Report.MALFORMED_COMMENT);
2474		}
2475
2476	380	this.txtend = this.lexsize - 2; // AQ 8Jul2000
2477	380	this.lexbuf[this.lexsize] = (byte) '\0';
2478	380	this.state = LEX_CONTENT;
2479	380	this.waswhite = false;
2480	380	this.token = newNode(Node.COMMENT_TAG, this.lexbuf, this.txtstart, this.txtend);
2481
2482		// now look for a line break
2483
2484	380	c = this.in.readChar();
2485
2486	380	if (c == '\r')
2487		{
2488	0	c = this.in.readChar();
2489
2490	0	if (c != '\n')
2491		{
2492	0	this.token.linebreak = true;
2493		}
2494		}
2495
2496	380	if (c == '\n')
2497		{
2498	368	this.token.linebreak = true;
2499		}
2500		else
2501		{
2502	12	this.in.ungetChar(c);
2503		}
2504
2505	380	return this.token;
2506		}
2507
2508		// note position of first such error in the comment
2509	8	if (badcomment == 0)
2510		{
2511	8	this.lines = this.in.getCurline();
2512	8	this.columns = this.in.getCurcol() - 3;
2513		}
2514
2515	8	badcomment++;
2516	8	if (this.configuration.fixComments)
2517		{
2518	8	this.lexbuf[this.lexsize - 2] = (byte) '=';
2519		}
2520
2521	8	addCharToLexer(c);
2522
2523		// if '-' then look for '>' to end the comment
2524	8	if (c != '-')
2525		{
2526	8	break end_comment;
2527		}
2528
2529		}
2530		// otherwise continue to look for -->
2531	8	this.lexbuf[this.lexsize - 2] = (byte) '=';
2532	8	continue;
2533
2534	9829	case LEX_DOCTYPE :
2535		// seen <!d so look for '> ' munging whitespace
2536
2537	9829	if (TidyUtils.isWhite((char) c))
2538		{
2539	794	if (this.waswhite)
2540		{
2541	140	this.lexsize -= 1;
2542		}
2543
2544	794	this.waswhite = true;
2545		}
2546		else
2547		{
2548	9035	this.waswhite = false;
2549		}
2550
2551	9829	if (inDTDSubset)
2552		{
2553	65	if (c == ']')
2554		{
2555	1	inDTDSubset = false;
2556		}
2557		}
2558	9764	else if (c == '[')
2559		{
2560	1	inDTDSubset = true;
2561		}
2562	9829	if (inDTDSubset \|\| c != '>')
2563		{
2564	9710	continue;
2565		}
2566
2567	119	this.lexsize -= 1;
2568	119	this.txtend = this.lexsize;
2569	119	this.lexbuf[this.lexsize] = (byte) '\0';
2570	119	this.state = LEX_CONTENT;
2571	119	this.waswhite = false;
2572	119	this.token = newNode(Node.DOCTYPE_TAG, this.lexbuf, this.txtstart, this.txtend);
2573		// make a note of the version named by the doctype
2574	119	this.doctype = findGivenVersion(this.token);
2575	119	return this.token;
2576
2577	190	case LEX_PROCINSTR :
2578		// seen <? so look for '> '
2579		// check for PHP preprocessor instructions <?php ... ?>
2580
2581	190	if (this.lexsize - this.txtstart == 3)
2582		{
2583	27	if ((TidyUtils.getString(this.lexbuf, this.txtstart, 3)).equals("php"))
2584		{
2585	2	this.state = LEX_PHP;
2586	2	continue;
2587		}
2588		}
2589
2590	188	if (this.lexsize - this.txtstart == 4)
2591		{
2592	25	if ((TidyUtils.getString(this.lexbuf, this.txtstart, 3)).equals("xml")
2593		&& TidyUtils.isWhite((char) this.lexbuf[this.txtstart + 3]))
2594		{
2595	23	this.state = LEX_XMLDECL;
2596	23	attributes = null;
2597	23	continue;
2598		}
2599		}
2600
2601	165	if (this.configuration.xmlPIs) // insist on ?> as terminator
2602		{
2603	27	if (c != '?')
2604		{
2605	27	continue;
2606		}
2607
2608		// now look for '>'
2609	0	c = this.in.readChar();
2610
2611	0	if (c == StreamIn.END_OF_STREAM)
2612		{
2613	0	report.warning(this, null, null, Report.UNEXPECTED_END_OF_FILE);
2614	0	this.in.ungetChar(c);
2615	0	continue;
2616		}
2617
2618	0	addCharToLexer(c);
2619		}
2620
2621	138	if (c != '>')
2622		{
2623	136	continue;
2624		}
2625
2626	2	this.lexsize -= 1;
2627	2	this.txtend = this.lexsize;
2628	2	this.lexbuf[this.lexsize] = (byte) '\0';
2629	2	this.state = LEX_CONTENT;
2630	2	this.waswhite = false;
2631	2	this.token = newNode(Node.PROC_INS_TAG, this.lexbuf, this.txtstart, this.txtend);
2632	2	return this.token;
2633
2634	0	case LEX_ASP :
2635		// seen <% so look for "%> "
2636	0	if (c != '%')
2637		{
2638	0	continue;
2639		}
2640
2641		// now look for '>'
2642	0	c = this.in.readChar();
2643
2644	0	if (c != '>')
2645		{
2646	0	this.in.ungetChar(c);
2647	0	continue;
2648		}
2649
2650	0	this.lexsize -= 1;
2651	0	this.txtend = this.lexsize;
2652	0	this.lexbuf[this.lexsize] = (byte) '\0';
2653	0	this.state = LEX_CONTENT;
2654	0	this.waswhite = false;
2655	0	this.token = newNode(Node.ASP_TAG, this.lexbuf, this.txtstart, this.txtend);
2656	0	return this.token;
2657
2658	127	case LEX_JSTE :
2659		// seen <# so look for "#> "
2660	127	if (c != '#')
2661		{
2662	126	continue;
2663		}
2664
2665		// now look for '>'
2666	1	c = this.in.readChar();
2667
2668	1	if (c != '>')
2669		{
2670	0	this.in.ungetChar(c);
2671	0	continue;
2672		}
2673
2674	1	this.lexsize -= 1;
2675	1	this.txtend = this.lexsize;
2676	1	this.lexbuf[this.lexsize] = (byte) '\0';
2677	1	this.state = LEX_CONTENT;
2678	1	this.waswhite = false;
2679	1	this.token = newNode(Node.JSTE_TAG, this.lexbuf, this.txtstart, this.txtend);
2680	1	return this.token;
2681
2682	23	case LEX_PHP :
2683		// seen " <?php" so look for "?> "
2684	23	if (c != '?')
2685		{
2686	22	continue;
2687		}
2688
2689		// now look for '>'
2690	1	c = this.in.readChar();
2691
2692	1	if (c != '>')
2693		{
2694	0	this.in.ungetChar(c);
2695	0	continue;
2696		}
2697
2698	1	this.lexsize -= 1;
2699	1	this.txtend = this.lexsize;
2700	1	this.lexbuf[this.lexsize] = (byte) '\0';
2701	1	this.state = LEX_CONTENT;
2702	1	this.waswhite = false;
2703	1	this.token = newNode(Node.PHP_TAG, this.lexbuf, this.txtstart, this.txtend);
2704	1	return this.token;
2705
2706	74	case LEX_XMLDECL : // seen "<?xml" so look for "?>"
2707
2708	74	if (TidyUtils.isWhite((char) c) && c != '?')
2709		{
2710	14	continue;
2711		}
2712
2713		// get pseudo-attribute
2714	60	if (c != '?')
2715		{
2716	39	String name;
2717	39	Node[] asp = new Node[1];
2718	39	Node[] php = new Node[1];
2719	39	AttVal av = new AttVal();
2720	39	int[] pdelim = new int[1];
2721	39	isempty[0] = false;
2722
2723	39	this.in.ungetChar(c);
2724
2725	39	name = this.parseAttribute(isempty, asp, php);
2726	39	av.attribute = name;
2727
2728	39	av.value = this.parseValue(name, true, isempty, pdelim);
2729	39	av.delim = pdelim[0];
2730	39	av.next = attributes;
2731
2732	39	attributes = av;
2733		// continue;
2734		}
2735
2736		// now look for '>'
2737	60	c = this.in.readChar();
2738
2739	60	if (c != '>')
2740		{
2741	37	this.in.ungetChar(c);
2742	37	continue;
2743		}
2744	23	this.lexsize -= 1;
2745	23	this.txtend = this.txtstart;
2746	23	this.lexbuf[this.txtend] = '\0';
2747	23	this.state = LEX_CONTENT;
2748	23	this.waswhite = false;
2749	23	this.token = newNode(Node.XML_DECL, this.lexbuf, this.txtstart, this.txtend);
2750	23	this.token.attributes = attributes;
2751	23	return this.token;
2752
2753	584	case LEX_SECTION :
2754		// seen " <![" so look for "]> "
2755	584	if (c == '[')
2756		{
2757	1	if (this.lexsize == (this.txtstart + 6)
2758		&& (TidyUtils.getString(this.lexbuf, this.txtstart, 6)).equals("CDATA["))
2759		{
2760	1	this.state = LEX_CDATA;
2761	1	this.lexsize -= 6;
2762	1	continue;
2763		}
2764		}
2765
2766	583	if (c != ']')
2767		{
2768	535	continue;
2769		}
2770
2771		// now look for '>'
2772	48	c = this.in.readChar();
2773
2774	48	if (c != '>')
2775		{
2776	0	this.in.ungetChar(c);
2777	0	continue;
2778		}
2779
2780	48	this.lexsize -= 1;
2781	48	this.txtend = this.lexsize;
2782	48	this.lexbuf[this.lexsize] = (byte) '\0';
2783	48	this.state = LEX_CONTENT;
2784	48	this.waswhite = false;
2785	48	this.token = newNode(Node.SECTION_TAG, this.lexbuf, this.txtstart, this.txtend);
2786	48	return this.token;
2787
2788	56	case LEX_CDATA :
2789		// seen " <![CDATA[" so look for "]]> "
2790	56	if (c != ']')
2791		{
2792	55	continue;
2793		}
2794
2795		// now look for ']'
2796	1	c = this.in.readChar();
2797
2798	1	if (c != ']')
2799		{
2800	0	this.in.ungetChar(c);
2801	0	continue;
2802		}
2803
2804		// now look for '>'
2805	1	c = this.in.readChar();
2806
2807	1	if (c != '>')
2808		{
2809	0	this.in.ungetChar(c);
2810	0	continue;
2811		}
2812
2813	1	this.lexsize -= 1;
2814	1	this.txtend = this.lexsize;
2815	1	this.lexbuf[this.lexsize] = (byte) '\0';
2816	1	this.state = LEX_CONTENT;
2817	1	this.waswhite = false;
2818	1	this.token = newNode(Node.CDATA_TAG, this.lexbuf, this.txtstart, this.txtend);
2819	1	return this.token;
2820
2821	0	default :
2822		// should never reach here
2823	0	break;
2824		}
2825		}
2826
2827	285	if (this.state == LEX_CONTENT) // text string
2828		{
2829	282	this.txtend = this.lexsize;
2830
2831	282	if (this.txtend > this.txtstart)
2832		{
2833	6	this.in.ungetChar(c);
2834
2835	6	if (this.lexbuf[this.lexsize - 1] == (byte) ' ')
2836		{
2837	5	this.lexsize -= 1;
2838	5	this.txtend = this.lexsize;
2839		}
2840
2841	6	this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend);
2842	6	return this.token;
2843		}
2844		}
2845	3	else if (this.state == LEX_COMMENT) // comment
2846		{
2847	1	if (c == StreamIn.END_OF_STREAM)
2848		{
2849	1	report.warning(this, null, null, Report.MALFORMED_COMMENT);
2850		}
2851
2852	1	this.txtend = this.lexsize;
2853	1	this.lexbuf[this.lexsize] = (byte) '\0';
2854	1	this.state = LEX_CONTENT;
2855	1	this.waswhite = false;
2856	1	this.token = newNode(Node.COMMENT_TAG, this.lexbuf, this.txtstart, this.txtend);
2857	1	return this.token;
2858		}
2859
2860	278	return null;
2861		}
2862
2863		/**
2864		* parser for ASP within start tags Some people use ASP for to customize attributes Tidy isn't really well suited to
2865		* dealing with ASP This is a workaround for attributes, but won't deal with the case where the ASP is used to
2866		* tailor the attribute value. Here is an example of a work around for using ASP in attribute values:
2867		* <code>href='<%=rsSchool.Fields("ID").Value%>'</code> where the ASP that generates the attribute value is
2868		* masked from Tidy by the quotemarks.
2869		* @return parsed Node
2870		*/
2871	0	public Node parseAsp()
2872		{
2873	0	int c;
2874	0	Node asp = null;
2875
2876	0	this.txtstart = this.lexsize;
2877
2878	0	while ((c = this.in.readChar()) != StreamIn.END_OF_STREAM)
2879		{
2880
2881	0	addCharToLexer(c);
2882
2883	0	if (c != '%')
2884		{
2885	0	continue;
2886		}
2887
2888	0	if ((c = this.in.readChar()) == StreamIn.END_OF_STREAM)
2889		{
2890	0	break;
2891		}
2892	0	addCharToLexer(c);
2893
2894	0	if (c == '>')
2895		{
2896	0	break;
2897		}
2898		}
2899
2900	0	this.lexsize -= 2;
2901	0	this.txtend = this.lexsize;
2902
2903	0	if (this.txtend > this.txtstart)
2904		{
2905	0	asp = newNode(Node.ASP_TAG, this.lexbuf, this.txtstart, this.txtend);
2906		}
2907
2908	0	this.txtstart = this.txtend;
2909	0	return asp;
2910		}
2911
2912		/**
2913		* PHP is like ASP but is based upon XML processing instructions, e.g. <code><?php ... ?></code>.
2914		* @return parsed Node
2915		*/
2916	0	public Node parsePhp()
2917		{
2918	0	int c;
2919	0	Node php = null;
2920
2921	0	this.txtstart = this.lexsize;
2922
2923	0	while ((c = this.in.readChar()) != StreamIn.END_OF_STREAM)
2924		{
2925	0	addCharToLexer(c);
2926
2927	0	if (c != '?')
2928		{
2929	0	continue;
2930		}
2931
2932	0	if ((c = this.in.readChar()) == StreamIn.END_OF_STREAM)
2933		{
2934	0	break;
2935		}
2936	0	addCharToLexer(c);
2937
2938	0	if (c == '>')
2939		{
2940	0	break;
2941		}
2942		}
2943
2944	0	this.lexsize -= 2;
2945	0	this.txtend = this.lexsize;
2946
2947	0	if (this.txtend > this.txtstart)
2948		{
2949	0	php = newNode(Node.PHP_TAG, this.lexbuf, this.txtstart, this.txtend);
2950		}
2951
2952	0	this.txtstart = this.txtend;
2953	0	return php;
2954		}
2955
2956		/**
2957		* consumes the '>' terminating start tags.
2958		* @param isempty flag is passed as array so it can be modified
2959		* @param asp asp Node, passed as array so it can be modified
2960		* @param php php Node, passed as array so it can be modified
2961		* @return parsed attribute
2962		*/
2963	7964	public String parseAttribute(boolean[] isempty, Node[] asp, Node[] php)
2964		{
2965	7964	int start = 0;
2966	7964	String attr;
2967	7964	int c = 0;
2968	7964	int lastc = 0;
2969
2970	7964	asp[0] = null; // clear asp pointer
2971	7964	php[0] = null; // clear php pointer
2972		// skip white space before the attribute
2973
2974	7964	for (;;)
2975		{
2976	9529	c = this.in.readChar();
2977
2978	9529	if (c == '/')
2979		{
2980	93	c = this.in.readChar();
2981
2982	93	if (c == '>')
2983		{
2984	90	isempty[0] = true;
2985	90	return null;
2986		}
2987
2988	3	this.in.ungetChar(c);
2989	3	c = '/';
2990	3	break;
2991		}
2992
2993	9436	if (c == '>')
2994		{
2995	2914	return null;
2996		}
2997
2998	6522	if (c == '<')
2999		{
3000	3	c = this.in.readChar();
3001
3002	3	if (c == '%')
3003		{
3004	0	asp[0] = parseAsp();
3005	0	return null;
3006		}
3007	3	else if (c == '?')
3008		{
3009	0	php[0] = parsePhp();
3010	0	return null;
3011		}
3012
3013	3	this.in.ungetChar(c);
3014	3	if (this.state != LEX_XMLDECL) // FG fix for 532535
3015		{
3016	2	this.in.ungetChar('<'); // fix for 433360
3017		}
3018	3	report.attrError(this, this.token, null, Report.UNEXPECTED_GT);
3019	3	return null;
3020		}
3021
3022	6519	if (c == '=')
3023		{
3024	1	report.attrError(this, this.token, null, Report.UNEXPECTED_EQUALSIGN);
3025	1	continue;
3026		}
3027
3028	6518	if (c == '"' \|\| c == '\'')
3029		{
3030	3	report.attrError(this, this.token, null, Report.UNEXPECTED_QUOTEMARK);
3031	3	continue;
3032		}
3033
3034	6515	if (c == StreamIn.END_OF_STREAM)
3035		{
3036	0	report.attrError(this, this.token, null, Report.UNEXPECTED_END_OF_FILE);
3037	0	this.in.ungetChar(c);
3038	0	return null;
3039		}
3040
3041	6515	if (!TidyUtils.isWhite((char) c))
3042		{
3043	4954	break;
3044		}
3045		}
3046
3047	4957	start = this.lexsize;
3048	4957	lastc = c;
3049
3050	4957	for (;;)
3051		{
3052		// but push back '=' for parseValue()
3053	30063	if (c == '=' \|\| c == '>')
3054		{
3055	4828	this.in.ungetChar(c);
3056	4828	break;
3057		}
3058
3059	25235	if (c == '<' \|\| c == StreamIn.END_OF_STREAM)
3060		{
3061	0	this.in.ungetChar(c);
3062	0	break;
3063		}
3064	25235	if (lastc == '-' && (c == '"' \|\| c == '\''))
3065		{
3066	1	this.lexsize--;
3067	1	this.in.ungetChar(c);
3068	1	break;
3069		}
3070	25234	if (TidyUtils.isWhite((char) c))
3071		{
3072	128	break;
3073		}
3074
3075		// what should be done about non-namechar characters?
3076		// currently these are incorporated into the attr name
3077
3078	25106	if (!this.configuration.xmlTags && TidyUtils.isUpper((char) c))
3079		{
3080	4156	c = TidyUtils.toLower((char) c);
3081		}
3082
3083		// ++len; #427672 - handle attribute names with multibyte chars - fix by Randy Waki - 10 Aug 00
3084	25106	addCharToLexer(c);
3085
3086	25106	lastc = c;
3087	25106	c = this.in.readChar();
3088		}
3089
3090		// #427672 - handle attribute names with multibyte chars - fix by Randy Waki - 10 Aug 00
3091	4957	int len = this.lexsize - start;
3092	4957	attr = (len > 0 ? TidyUtils.getString(this.lexbuf, start, len) : null);
3093	4957	this.lexsize = start;
3094
3095	4957	return attr;
3096		}
3097
3098		/**
3099		* Invoked when < is seen in place of attribute value but terminates on whitespace if not ASP, PHP or Tango this
3100		* routine recognizes ' and " quoted strings.
3101		* @return delimiter
3102		*/
3103	1	public int parseServerInstruction()
3104		{
3105	1	int c, delim = '"';
3106	1	boolean isrule = false;
3107
3108	1	c = this.in.readChar();
3109	1	addCharToLexer(c);
3110
3111		// check for ASP, PHP or Tango
3112	1	if (c == '%' \|\| c == '?' \|\| c == '@')
3113		{
3114	0	isrule = true;
3115		}
3116
3117	1	for (;;)
3118		{
3119	11	c = this.in.readChar();
3120
3121	11	if (c == StreamIn.END_OF_STREAM)
3122		{
3123	0	break;
3124		}
3125
3126	11	if (c == '>')
3127		{
3128	0	if (isrule)
3129		{
3130	0	addCharToLexer(c);
3131		}
3132		else
3133		{
3134	0	this.in.ungetChar(c);
3135		}
3136
3137	0	break;
3138		}
3139
3140		// if not recognized as ASP, PHP or Tango
3141		// then also finish value on whitespace
3142	11	if (!isrule)
3143		{
3144	11	if (TidyUtils.isWhite((char) c))
3145		{
3146	0	break;
3147		}
3148		}
3149
3150	11	addCharToLexer(c);
3151
3152	11	if (c == '"')
3153		{
3154	1	do
3155		{
3156	1	c = this.in.readChar();
3157
3158	1	if (endOfInput()) // #427840 - fix by Terry Teague 30 Jun 01
3159		{
3160	0	report.attrError(this, this.token, null, Report.UNEXPECTED_END_OF_FILE);
3161	0	this.in.ungetChar(c);
3162	0	return 0;
3163		}
3164	1	if (c == '>') // #427840 - fix by Terry Teague 30 Jun 01
3165		{
3166	1	this.in.ungetChar(c);
3167	1	report.attrError(this, this.token, null, Report.UNEXPECTED_GT);
3168	1	return 0;
3169		}
3170
3171	0	addCharToLexer(c);
3172		}
3173	0	while (c != '"');
3174	0	delim = '\'';
3175	0	continue;
3176		}
3177
3178	10	if (c == '\'')
3179		{
3180	0	do
3181		{
3182	0	c = this.in.readChar();
3183
3184	0	if (endOfInput()) // #427840 - fix by Terry Teague 30 Jun 01
3185		{
3186	0	report.attrError(this, this.token, null, Report.UNEXPECTED_END_OF_FILE);
3187	0	this.in.ungetChar(c);
3188	0	return 0;
3189		}
3190	0	if (c == '>') // #427840 - fix by Terry Teague 30 Jun 01
3191		{
3192	0	this.in.ungetChar(c);
3193	0	report.attrError(this, this.token, null, Report.UNEXPECTED_GT);
3194	0	return 0;
3195		}
3196
3197	0	addCharToLexer(c);
3198		}
3199	0	while (c != '\'');
3200		}
3201		}
3202
3203	0	return delim;
3204		}
3205
3206		/**
3207		* Parse an attribute value.
3208		* @param name attribute name
3209		* @param foldCase fold case?
3210		* @param isempty is attribute empty? Passed as an array reference to allow modification
3211		* @param pdelim delimiter, passed as an array reference to allow modification
3212		* @return parsed value
3213		*/
3214	4958	public String parseValue(String name, boolean foldCase, boolean[] isempty, int[] pdelim)
3215		{
3216		// values start with "=" or " = " etc.
3217		// doesn't consume the ">" at end of start tag
3218
3219	4958	int len = 0;
3220	4958	int start;
3221	4958	boolean seenGt = false;
3222	4958	boolean munge = true;
3223	4958	int c = 0;
3224	4958	int lastc, delim, quotewarning;
3225	4958	String value;
3226
3227	4958	delim = 0;
3228	4958	pdelim[0] = '"';
3229
3230		// Henry Zrepa reports that some folk are using the embed element with script attributes where newlines are
3231		// significant and must be preserved
3232
3233	4958	if (this.configuration.literalAttribs)
3234		{
3235	7	munge = false;
3236		}
3237
3238		// skip white space before the '='
3239	4958	while (true)
3240		{
3241	4972	c = this.in.readChar();
3242
3243	4972	if (c == StreamIn.END_OF_STREAM)
3244		{
3245	0	this.in.ungetChar(c);
3246	0	break;
3247		}
3248
3249	4972	if (!TidyUtils.isWhite((char) c))
3250		{
3251	4958	break;
3252		}
3253		}
3254
3255		// c should be '=' if there is a value other legal possibilities are white space, '/' and '>'
3256
3257	4958	if (c != '=' && c != '"' && c != '\'')
3258		{
3259	95	this.in.ungetChar(c);
3260	95	return null;
3261		}
3262
3263		// skip white space after '='
3264
3265	4863	while (true)
3266		{
3267	4954	c = this.in.readChar();
3268
3269	4954	if (c == StreamIn.END_OF_STREAM)
3270		{
3271	0	this.in.ungetChar(c);
3272	0	break;
3273		}
3274
3275	4954	if (!TidyUtils.isWhite((char) c))
3276		{
3277	4863	break;
3278		}
3279		}
3280
3281		// check for quote marks
3282
3283	4863	if (c == '"' \|\| c == '\'')
3284		{
3285	3564	delim = c;
3286		}
3287	1299	else if (c == '<')
3288		{
3289	1	start = this.lexsize;
3290	1	addCharToLexer(c);
3291	1	pdelim[0] = parseServerInstruction();
3292	1	len = this.lexsize - start;
3293	1	this.lexsize = start;
3294	1	return (len > 0 ? TidyUtils.getString(this.lexbuf, start, len) : null);
3295		}
3296		else
3297		{
3298	1298	this.in.ungetChar(c);
3299		}
3300
3301		// and read the value string check for quote mark if needed
3302
3303	4862	quotewarning = 0;
3304	4862	start = this.lexsize;
3305	4862	c = '\0';
3306
3307	4862	while (true)
3308		{
3309	65521	lastc = c; // track last character
3310	65521	c = this.in.readChar();
3311
3312	65521	if (c == StreamIn.END_OF_STREAM)
3313		{
3314	1	report.attrError(this, this.token, null, Report.UNEXPECTED_END_OF_FILE);
3315	1	this.in.ungetChar(c);
3316	1	break;
3317		}
3318
3319	65520	if (delim == (char) 0)
3320		{
3321	6898	if (c == '>')
3322		{
3323	390	this.in.ungetChar(c);
3324	390	break;
3325		}
3326
3327	6508	if (c == '"' \|\| c == '\'')
3328		{
3329	3	report.attrError(this, this.token, null, Report.UNEXPECTED_QUOTEMARK);
3330	3	break;
3331		}
3332
3333	6505	if (c == '<')
3334		{
3335	4	this.in.ungetChar(c); // fix for 433360
3336	4	c = '>';
3337	4	this.in.ungetChar(c);
3338	4	report.attrError(this, this.token, null, Report.UNEXPECTED_GT);
3339	4	break;
3340		}
3341
3342		// For cases like <br clear=all/> need to avoid treating /> as part of the attribute value, however
3343		// care is needed to avoid so treating <a href=http://www.acme.com /> in this way, which would map the
3344		// <a> tag to <a href="http://www.acme.com"/>
3345
3346	6501	if (c == '/')
3347		{
3348		// peek ahead in case of />
3349	0	c = this.in.readChar();
3350
3351	0	if (c == '>' && !AttributeTable.getDefaultAttributeTable().isUrl(name))
3352		{
3353	0	isempty[0] = true;
3354	0	this.in.ungetChar(c);
3355	0	break;
3356		}
3357
3358		// unget peeked char
3359	0	this.in.ungetChar(c);
3360	0	c = '/';
3361		}
3362		}
3363		else
3364		{
3365		// delim is '\'' or '"'
3366	58622	if (c == delim)
3367		{
3368	3563	break;
3369		}
3370
3371		// treat CRLF, CR and LF as single line break
3372
3373	55059	if (c == '\r')
3374		{
3375	0	c = this.in.readChar();
3376	0	if (c != '\n')
3377		{
3378	0	this.in.ungetChar(c);
3379		}
3380
3381	0	c = '\n';
3382		}
3383
3384	55059	if (c == '\n' \|\| c == '<' \|\| c == '>')
3385		{
3386	102	++quotewarning;
3387		}
3388
3389	55059	if (c == '>')
3390		{
3391	14	seenGt = true;
3392		}
3393		}
3394
3395	61560	if (c == '&')
3396		{
3397		// no entities in ID attributes
3398	341	if ("id".equalsIgnoreCase(name))
3399		{
3400	0	report.attrError(this, null, null, Report.ENTITY_IN_ID);
3401	0	continue;
3402		}
3403
3404	341	addCharToLexer(c);
3405	341	parseEntity((short) 0);
3406	341	continue;
3407
3408		}
3409
3410		// kludge for JavaScript attribute values with line continuations in string literals
3411
3412	61219	if (c == '\\')
3413		{
3414	22	c = this.in.readChar();
3415
3416	22	if (c != '\n')
3417		{
3418	22	this.in.ungetChar(c);
3419	22	c = '\\';
3420		}
3421		}
3422
3423	61219	if (TidyUtils.isWhite((char) c))
3424		{
3425	1993	if (delim == (char) 0)
3426		{
3427	901	break;
3428		}
3429
3430	1092	if (munge)
3431		{
3432		// discard line breaks in quoted URLs
3433		// #438650 - fix by Randy Waki
3434	1090	if (c == '\n' && AttributeTable.getDefaultAttributeTable().isUrl(name))
3435		{
3436		// warn that we discard this newline
3437	8	report.attrError(this, this.token, null, Report.NEWLINE_IN_URI);
3438	8	continue;
3439		}
3440
3441	1082	c = ' ';
3442
3443	1082	if (lastc == ' ')
3444		{
3445	61	continue;
3446		}
3447		}
3448		}
3449	59226	else if (foldCase && TidyUtils.isUpper((char) c))
3450		{
3451	18	c = TidyUtils.toLower((char) c);
3452		}
3453
3454	60249	addCharToLexer(c);
3455		}
3456
3457	4862	if (quotewarning > 10 && seenGt && munge)
3458		{
3459		// there is almost certainly a missing trailing quote mark as we have see too many newlines, < or >
3460		// characters. an exception is made for Javascript attributes and the javascript URL scheme which may
3461		// legitimately include < and >, and for attributes starting with "<xml " as generated by Microsoft Office.
3462
3463	1	if (!AttributeTable.getDefaultAttributeTable().isScript(name)
3464		&& !(AttributeTable.getDefaultAttributeTable().isUrl(name) && "javascript:".equals(TidyUtils.getString(
3465		this.lexbuf,
3466		start,
3467		11)))
3468		&& !"<xml ".equals(TidyUtils.getString(this.lexbuf, start, 5))) // #500236 - fix by Klaus Johannes Rusch
3469		// 06 Jan 02
3470		{
3471	0	report.error(this, null, null, Report.SUSPECTED_MISSING_QUOTE);
3472		}
3473		}
3474
3475	4862	len = this.lexsize - start;
3476	4862	this.lexsize = start;
3477
3478	4862	if (len > 0 \|\| delim != 0)
3479		{
3480		// ignore leading and trailing white space for all but title, alt, value and prompts attributes unless
3481		// --literal-attributes is set to yes
3482		// #994841 - Whitespace is removed from value attributes
3483
3484	4861	if (munge && !TidyUtils.isInValuesIgnoreCase(new String[]{"alt", "title", "value", "prompt"}, name))
3485		{
3486	4727	while (TidyUtils.isWhite((char) this.lexbuf[start + len - 1]))
3487		{
3488	1	--len;
3489		}
3490
3491	4727	while (TidyUtils.isWhite((char) this.lexbuf[start]) && start < len)
3492		{
3493	0	++start;
3494	0	--len;
3495		}
3496		}
3497
3498	4861	value = TidyUtils.getString(this.lexbuf, start, len);
3499		}
3500		else
3501		{
3502	1	value = null;
3503		}
3504
3505		// note delimiter if given
3506	4862	if (delim != 0)
3507		{
3508	3564	pdelim[0] = delim;
3509		}
3510		else
3511		{
3512	1298	pdelim[0] = '"';
3513		}
3514
3515	4862	return value;
3516		}
3517
3518		/**
3519		* Check if attr is a valid name.
3520		* @param attr String to check, must be non-null
3521		* @return <code>true</code> if attr is a valid name.
3522		*/
3523	4919	public static boolean isValidAttrName(String attr)
3524		{
3525	4919	char c;
3526	4919	int i;
3527
3528		// first character should be a letter
3529	4919	c = attr.charAt(0);
3530
3531	4919	if (!TidyUtils.isLetter(c))
3532		{
3533	10	return false;
3534		}
3535
3536		// remaining characters should be namechars
3537	4909	for (i = 1; i < attr.length(); i++)
3538		{
3539	19848	c = attr.charAt(i);
3540
3541	19848	if (TidyUtils.isNamechar(c))
3542		{
3543	19824	continue;
3544		}
3545
3546	24	return false;
3547		}
3548
3549	4885	return true;
3550		}
3551
3552		/**
3553		* In CSS1, selectors can contain only the characters A-Z, 0-9, and Unicode characters 161-255, plus dash (-); they
3554		* cannot start with a dash or a digit; they can also contain escaped characters and any Unicode character as a
3555		* numeric code (see next item). The backslash followed by at most four hexadecimal digits (0..9A..F) stands for the
3556		* Unicode character with that number. Any character except a hexadecimal digit can be escaped to remove its special
3557		* meaning, by putting a backslash in front.
3558		* @param buf css selector name
3559		* @return <code>true</code> if the given string is a valid css1 selector name
3560		*/
3561	7	public static boolean isCSS1Selector(String buf)
3562		{
3563	7	if (buf == null)
3564		{
3565	0	return false;
3566		}
3567
3568		// #508936 - CSS class naming for -clean option
3569	7	boolean valid = true;
3570	7	int esclen = 0;
3571	7	char c;
3572	7	int pos;
3573
3574	7	for (pos = 0; valid && pos < buf.length(); ++pos)
3575		{
3576	32	c = buf.charAt(pos);
3577	32	if (c == '\\')
3578		{
3579	3	esclen = 1; // ab\555\444 is 4 chars {'a', 'b', \555, \444}
3580		}
3581	29	else if (Character.isDigit(c))
3582		{
3583		// Digit not 1st, unless escaped (Max length "\112F")
3584	13	if (esclen > 0)
3585		{
3586	8	valid = (++esclen < 6);
3587		}
3588	13	if (valid)
3589		{
3590	12	valid = (pos > 0 \|\| esclen > 0);
3591		}
3592		}
3593		else
3594		{
3595	16	valid = (esclen > 0 // Escaped? Anything goes.
3596		\|\| (pos > 0 && c == '-') // Dash cannot be 1st char
3597		\|\| Character.isLetter(c) // a-z, A-Z anywhere
3598		\|\| (c >= 161 && c <= 255)); // Unicode 161-255 anywhere
3599	16	esclen = 0;
3600		}
3601		}
3602	7	return valid;
3603		}
3604
3605		/**
3606		* Parse tag attributes.
3607		* @param isempty is tag empty?
3608		* @return parsed attribute/value list
3609		*/
3610	3007	public AttVal parseAttrs(boolean[] isempty)
3611		{
3612	3007	AttVal av, list;
3613	3007	String attribute, value;
3614	3007	int[] delim = new int[1];
3615	3007	Node[] asp = new Node[1];
3616	3007	Node[] php = new Node[1];
3617
3618	3007	list = null;
3619
3620	3007	while (!endOfInput())
3621		{
3622	7925	attribute = parseAttribute(isempty, asp, php);
3623
3624	7925	if (attribute == null)
3625		{
3626		// check if attributes are created by ASP markup
3627	3006	if (asp[0] != null)
3628		{
3629	0	av = new AttVal(list, null, asp[0], null, '\0', null, null);
3630	0	list = av;
3631	0	continue;
3632		}
3633
3634		// check if attributes are created by PHP markup
3635	3006	if (php[0] != null)
3636		{
3637	0	av = new AttVal(list, null, null, php[0], '\0', null, null);
3638	0	list = av;
3639	0	continue;
3640		}
3641
3642	3006	break;
3643		}
3644
3645	4919	value = parseValue(attribute, false, isempty, delim);
3646
3647	4919	if (attribute != null && isValidAttrName(attribute))
3648		{
3649	4885	av = new AttVal(list, null, null, null, delim[0], attribute, value);
3650	4885	av.dict = AttributeTable.getDefaultAttributeTable().findAttribute(av);
3651	4885	list = av;
3652		}
3653		else
3654		{
3655	34	av = new AttVal(null, null, null, null, 0, attribute, value);
3656
3657		// #427664 - fix by Gary Peskin 04 Aug 00; other fixes by Dave Raggett
3658	34	if (value != null)
3659		{
3660	8	report.attrError(this, this.token, av, Report.BAD_ATTRIBUTE_VALUE);
3661		}
3662	26	else if (TidyUtils.lastChar(attribute) == '"')
3663		{
3664	9	report.attrError(this, this.token, av, Report.MISSING_QUOTEMARK);
3665		}
3666		else
3667		{
3668	17	report.attrError(this, this.token, av, Report.UNKNOWN_ATTRIBUTE);
3669		}
3670		}
3671		}
3672
3673	3007	return list;
3674		}
3675
3676		/**
3677		* Push a copy of an inline node onto stack but don't push if implicit or OBJECT or APPLET (implicit tags are ones
3678		* generated from the istack) One issue arises with pushing inlines when the tag is already pushed. For instance:
3679		* <code><p><em> text <p><em> more text</code> Shouldn't be mapped to
3680		* <code><p><em> text </em></p><p><em><em> more text </em></em></code>
3681		* @param node Node to be pushed
3682		*/
3683	2539	public void pushInline(Node node)
3684		{
3685	2539	IStack is;
3686
3687	2539	if (node.implicit)
3688		{
3689	89	return;
3690		}
3691
3692	2450	if (node.tag == null)
3693		{
3694	0	return;
3695		}
3696
3697	2450	if (!TidyUtils.toBoolean(node.tag.model & Dict.CM_INLINE))
3698		{
3699	0	return;
3700		}
3701
3702	2450	if (TidyUtils.toBoolean(node.tag.model & Dict.CM_OBJECT))
3703		{
3704	0	return;
3705		}
3706
3707	2450	if (node.tag != this.configuration.tt.tagFont && isPushed(node))
3708		{
3709	28	return;
3710		}
3711
3712		// make sure there is enough space for the stack
3713	2422	is = new IStack();
3714	2422	is.tag = node.tag;
3715	2422	is.element = node.element;
3716	2422	if (node.attributes != null)
3717		{
3718	1869	is.attributes = cloneAttributes(node.attributes);
3719		}
3720	2422	this.istack.push(is);
3721		}
3722
3723		/**
3724		* Pop a copy of an inline node from the stack.
3725		* @param node Node to be popped
3726		*/
3727	2456	public void popInline(Node node)
3728		{
3729	2456	IStack is;
3730
3731	2456	if (node != null)
3732		{
3733
3734	2455	if (node.tag == null)
3735		{
3736	0	return;
3737		}
3738
3739	2455	if (!TidyUtils.toBoolean(node.tag.model & Dict.CM_INLINE))
3740		{
3741	0	return;
3742		}
3743
3744	2455	if (TidyUtils.toBoolean(node.tag.model & Dict.CM_OBJECT))
3745		{
3746	0	return;
3747		}
3748
3749		// if node is </a> then pop until we find an <a>
3750	2455	if (node.tag == this.configuration.tt.tagA)
3751		{
3752
3753	1116	while (this.istack.size() > 0)
3754		{
3755	1109	is = (IStack) this.istack.pop();
3756	1109	if (is.tag == this.configuration.tt.tagA)
3757		{
3758	1108	break;
3759		}
3760		}
3761
3762	1116	if (this.insert >= this.istack.size())
3763		{
3764	0	this.insert = -1;
3765		}
3766	1116	return;
3767		}
3768		}
3769
3770	1340	if (this.istack.size() > 0)
3771		{
3772	1311	is = (IStack) this.istack.pop();
3773	1311	if (this.insert >= this.istack.size())
3774		{
3775	1	this.insert = -1;
3776		}
3777		}
3778		}
3779
3780		/**
3781		* Is the node in the stack?
3782		* @param node Node
3783		* @return <code>true</code> is the node is found in the stack
3784		*/
3785	10931	public boolean isPushed(Node node)
3786		{
3787	10931	int i;
3788	10931	IStack is;
3789
3790	10931	for (i = this.istack.size() - 1; i >= 0; --i)
3791		{
3792	8983	is = (IStack) this.istack.elementAt(i);
3793	8983	if (is.tag == node.tag)
3794		{
3795	148	return true;
3796		}
3797		}
3798
3799	10783	return false;
3800		}
3801
3802		/**
3803		* This has the effect of inserting "missing" inline elements around the contents of blocklevel elements such as P,
3804		* TD, TH, DIV, PRE etc. This procedure is called at the start of ParseBlock. When the inline stack is not empty, as
3805		* will be the case in: <code><i><h1>italic heading</h1></i></code> which is then treated as
3806		* equivalent to <code><h1><i>italic heading</i></h1></code> This is implemented by setting the lexer
3807		* into a mode where it gets tokens from the inline stack rather than from the input stream.
3808		* @param node original node
3809		* @return stack size
3810		*/
3811	3292	public int inlineDup(Node node)
3812		{
3813	3292	int n;
3814
3815	3292	n = this.istack.size() - this.istackbase;
3816	3292	if (n > 0)
3817		{
3818	88	this.insert = this.istackbase;
3819	88	this.inode = node;
3820		}
3821
3822	3292	return n;
3823		}
3824
3825		/**
3826		* @return
3827		*/
3828	99	public Node insertedToken()
3829		{
3830	99	Node node;
3831	99	IStack is;
3832	99	int n;
3833
3834		// this will only be null if inode != null
3835	99	if (this.insert == -1)
3836		{
3837	10	node = this.inode;
3838	10	this.inode = null;
3839	10	return node;
3840		}
3841
3842		// is this is the "latest" node then update the position, otherwise use current values
3843	89	if (this.inode == null)
3844		{
3845	79	this.lines = this.in.getCurline();
3846	79	this.columns = this.in.getCurcol();
3847		}
3848
3849	89	node = newNode(Node.START_TAG, this.lexbuf, this.txtstart, this.txtend);
3850
3851		// GLP: Bugfix 126261. Remove when this change is fixed in istack.c in the original Tidy
3852	89	node.implicit = true;
3853	89	is = (IStack) this.istack.elementAt(this.insert);
3854	89	node.element = is.element;
3855	89	node.tag = is.tag;
3856	89	if (is.attributes != null)
3857		{
3858	75	node.attributes = cloneAttributes(is.attributes);
3859		}
3860
3861		// advance lexer to next item on the stack
3862	89	n = this.insert;
3863
3864		// and recover state if we have reached the end
3865	89	if (++n < this.istack.size())
3866		{
3867	2	this.insert = n;
3868		}
3869		else
3870		{
3871	87	this.insert = -1;
3872		}
3873
3874	89	return node;
3875		}
3876
3877		/**
3878		* Can the given element be removed?
3879		* @param element node
3880		* @return <code>true</code> if he element can be removed
3881		*/
3882	5948	public boolean canPrune(Node element)
3883		{
3884	5948	if (element.type == Node.TEXT_NODE)
3885		{
3886	1267	return true;
3887		}
3888
3889	4681	if (element.content != null)
3890		{
3891	4406	return false;
3892		}
3893
3894	275	if (element.tag == this.configuration.tt.tagA && element.attributes != null)
3895		{
3896	73	return false;
3897		}
3898
3899	202	if (element.tag == this.configuration.tt.tagP && !this.configuration.dropEmptyParas)
3900		{
3901	0	return false;
3902		}
3903
3904	202	if (element.tag == null)
3905		{
3906	0	return false;
3907		}
3908
3909	202	if (TidyUtils.toBoolean(element.tag.model & Dict.CM_ROW))
3910		{
3911	6	return false;
3912		}
3913
3914	196	if (TidyUtils.toBoolean(element.tag.model & Dict.CM_EMPTY))
3915		{
3916	80	return false;
3917		}
3918
3919	116	if (element.tag == this.configuration.tt.tagApplet)
3920		{
3921	0	return false;
3922		}
3923
3924	116	if (element.tag == this.configuration.tt.tagObject)
3925		{
3926	0	return false;
3927		}
3928
3929	116	if (element.tag == this.configuration.tt.tagScript && element.getAttrByName("src") != null)
3930		{
3931	0	return false;
3932		}
3933
3934		// #540555 Empty title tag is trimmed
3935	116	if (element.tag == this.configuration.tt.tagTitle)
3936		{
3937	1	return false;
3938		}
3939
3940		// #433359 - fix by Randy Waki 12 Mar 01 - Empty iframe is trimmed
3941	115	if (element.tag == this.configuration.tt.tagIframe)
3942		{
3943	1	return false;
3944		}
3945
3946	114	if (element.getAttrByName("id") != null \|\| element.getAttrByName("name") != null)
3947		{
3948	0	return false;
3949		}
3950
3951	114	return true;
3952		}
3953
3954		/**
3955		* duplicate name attribute as an id and check if id and name match.
3956		* @param node Node to check for name/it attributes
3957		*/
3958	1113	public void fixId(Node node)
3959		{
3960	1113	AttVal name = node.getAttrByName("name");
3961	1113	AttVal id = node.getAttrByName("id");
3962
3963	1113	if (name != null)
3964		{
3965	91	if (id != null)
3966		{
3967	2	if (id.value != null && !id.value.equals(name.value))
3968		{
3969	0	report.attrError(this, node, name, Report.ID_NAME_MISMATCH);
3970		}
3971		}
3972	89	else if (this.configuration.xmlOut)
3973		{
3974	4	node.addAttribute("id", name.value);
3975		}
3976		}
3977		}
3978
3979		/**
3980		* Defer duplicates when entering a table or other element where the inlines shouldn't be duplicated.
3981		*/
3982	71	public void deferDup()
3983		{
3984	71	this.insert = -1;
3985	71	this.inode = null;
3986		}
3987
3988		/**
3989		* Constraint the html version in the document to the given one. Everything is allowed in proprietary version of
3990		* HTML this is handled here rather than in the tag/attr dicts.
3991		* @param vers html version code
3992		*/
3993	13823	void constrainVersion(int vers)
3994		{
3995	13823	this.versions &= (vers \| Dict.VERS_PROPRIETARY);
3996		}
3997
3998		/**
3999		* Is content acceptable for pre elements?
4000		* @param node content
4001		* @return <code>true</code> if node is acceptable in pre elements
4002		*/
4003	16	protected boolean preContent(Node node)
4004		{
4005		// p is coerced to br's
4006	16	if (node.tag == this.configuration.tt.tagP)
4007		{
4008	0	return true;
4009		}
4010
4011	16	if (node.tag == null
4012		\|\| node.tag == this.configuration.tt.tagP
4013		\|\| !TidyUtils.toBoolean(node.tag.model & (Dict.CM_INLINE \| Dict.CM_NEW)))
4014		{
4015	4	return false;
4016		}
4017	12	return true;
4018		}
4019
4020		/**
4021		* document type.
4022		*/
4023		private static class W3CVersionInfo
4024		{
4025
4026		/**
4027		* name.
4028		*/
4029		String name;
4030
4031		/**
4032		* voyager name.
4033		*/
4034		String voyagerName;
4035
4036		/**
4037		* profile.
4038		*/
4039		String profile;
4040
4041		/**
4042		* code.
4043		*/
4044		short code;
4045
4046		/**
4047		* Instantiates a new W3CVersionInfo.
4048		* @param name version name
4049		* @param voyagerName voyager (xhtml) name
4050		* @param profile VOYAGER_STRICT \| VOYAGER_LOOSE \| VOYAGER_FRAMESET
4051		* @param code unique code for this version info
4052		*/
4053	11	public W3CVersionInfo(String name, String voyagerName, String profile, short code)
4054		{
4055	11	this.name = name;
4056	11	this.voyagerName = voyagerName;
4057	11	this.profile = profile;
4058	11	this.code = code;
4059		}
4060		}
4061
4062		}