View Javadoc

1   /*
2    *  Java HTML Tidy - JTidy
3    *  HTML parser and pretty printer
4    *
5    *  Copyright (c) 1998-2000 World Wide Web Consortium (Massachusetts
6    *  Institute of Technology, Institut National de Recherche en
7    *  Informatique et en Automatique, Keio University). All Rights
8    *  Reserved.
9    *
10   *  Contributing Author(s):
11   *
12   *     Dave Raggett <dsr@w3.org>
13   *     Andy Quick <ac.quick@sympatico.ca> (translation to Java)
14   *     Gary L Peskin <garyp@firstech.com> (Java development)
15   *     Sami Lempinen <sami@lempinen.net> (release management)
16   *     Fabrizio Giustina <fgiust at users.sourceforge.net>
17   *
18   *  The contributing author(s) would like to thank all those who
19   *  helped with testing, bug fixes, and patience.  This wouldn't
20   *  have been possible without all of you.
21   *
22   *  COPYRIGHT NOTICE:
23   * 
24   *  This software and documentation is provided "as is," and
25   *  the copyright holders and contributing author(s) make no
26   *  representations or warranties, express or implied, including
27   *  but not limited to, warranties of merchantability or fitness
28   *  for any particular purpose or that the use of the software or
29   *  documentation will not infringe any third party patents,
30   *  copyrights, trademarks or other rights. 
31   *
32   *  The copyright holders and contributing author(s) will not be
33   *  liable for any direct, indirect, special or consequential damages
34   *  arising out of any use of the software or documentation, even if
35   *  advised of the possibility of such damage.
36   *
37   *  Permission is hereby granted to use, copy, modify, and distribute
38   *  this source code, or portions hereof, documentation and executables,
39   *  for any purpose, without fee, subject to the following restrictions:
40   *
41   *  1. The origin of this source code must not be misrepresented.
42   *  2. Altered versions must be plainly marked as such and must
43   *     not be misrepresented as being the original source.
44   *  3. This Copyright notice may not be removed or altered from any
45   *     source or altered source distribution.
46   * 
47   *  The copyright holders and contributing author(s) specifically
48   *  permit, without fee, and encourage the use of this source code
49   *  as a component for supporting the Hypertext Markup Language in
50   *  commercial products. If you use this source code in a product,
51   *  acknowledgment is not required but would be appreciated.
52   *
53   */
54  package org.w3c.tidy;
55  
56  import java.io.IOException;
57  import java.io.InputStream;
58  
59  import org.w3c.tidy.EncodingUtils.GetBytes;
60  
61  
62  /***
63   * Input Stream Implementation. This implementation is from the c version of tidy and it doesn't take advantage of java
64   * readers.
65   * @author Dave Raggett <a href="mailto:dsr@w3.org">dsr@w3.org </a>
66   * @author Andy Quick <a href="mailto:ac.quick@sympatico.ca">ac.quick@sympatico.ca </a> (translation to Java)
67   * @author Fabrizio Giustina
68   * @version $Revision: 1.28 $ ($Author: fgiust $)
69   */
70  public class StreamInImpl implements StreamIn
71  {
72  
73      /***
74       * number of characters kept in buffer.
75       */
76      private static final int CHARBUF_SIZE = 5;
77  
78      /***
79       * needed for error reporting.
80       */
81      private Lexer lexer;
82  
83      /***
84       * character buffer.
85       */
86      private int[] charbuf = new int[CHARBUF_SIZE];
87  
88      /***
89       * actual position in buffer.
90       */
91      private int bufpos;
92  
93      /***
94       * Private unget buffer for the raw bytes read from the input stream. Normally this will only be used by the UTF-8
95       * decoder to resynchronize the input stream after finding an illegal UTF-8 sequences. But it can be used for other
96       * purposes when reading bytes in ReadCharFromStream.
97       */
98      private char[] rawBytebuf = new char[CHARBUF_SIZE];
99  
100     /***
101      * actual position in rawBytebuf.
102      */
103     private int rawBufpos;
104 
105     /***
106      * has a raw byte been pushed into stack?
107      */
108     private boolean rawPushed;
109 
110     /***
111      * looking for an UTF BOM?
112      */
113     private boolean lookingForBOM = true;
114 
115     /***
116      * has end of stream been reached?
117      */
118     private boolean endOfStream;
119 
120     private boolean pushed;
121 
122     private int tabs;
123 
124     /***
125      * tab size in chars.
126      */
127     private int tabsize;
128 
129     /***
130      * FSM for ISO2022.
131      */
132     private int state;
133 
134     /***
135      * Encoding.
136      */
137     private int encoding;
138 
139     /***
140      * current column number.
141      */
142     private int curcol;
143 
144     /***
145      * last column.
146      */
147     private int lastcol;
148 
149     /***
150      * current line number.
151      */
152     private int curline;
153 
154     /***
155      * input stream.
156      */
157     private InputStream stream;
158 
159     /***
160      * Getter.
161      */
162     private GetBytes getBytes;
163 
164     /***
165      * Avoid mapping values > 127 to entities.
166      */
167     private boolean rawOut;
168 
169     /***
170      * Instatiates a new StreamInImpl.
171      * @param stream input stream
172      * @param configuration Configuration
173      */
174     public StreamInImpl(InputStream stream, Configuration configuration)
175     {
176         this.stream = stream;
177         this.charbuf[0] = '\0';
178         this.tabsize = configuration.tabsize;
179         this.curline = 1;
180         this.curcol = 1;
181         this.encoding = configuration.getInCharEncoding();
182         this.rawOut = configuration.rawOut;
183         this.state = EncodingUtils.FSM_ASCII;
184         this.getBytes = new GetBytes()
185         {
186 
187             StreamInImpl in;
188 
189             GetBytes setStreamIn(StreamInImpl in)
190             {
191                 this.in = in;
192                 return this;
193             }
194 
195             public void doGet(int[] buf, int[] count, boolean unget)
196             {
197                 in.readRawBytesFromStream(buf, count, unget);
198             }
199         } // set the StreamInImpl instance directly
200             .setStreamIn(this);
201     }
202 
203     /***
204      * @see org.w3c.tidy.StreamIn#getCurcol()
205      */
206     public int getCurcol()
207     {
208         return this.curcol;
209     }
210 
211     /***
212      * @see org.w3c.tidy.StreamIn#getCurline()
213      */
214     public int getCurline()
215     {
216         return this.curline;
217     }
218 
219     /***
220      * Setter for <code>lexer</code>.
221      * @param lexer The lexer to set.
222      */
223     public void setLexer(Lexer lexer)
224     {
225         this.lexer = lexer;
226     }
227 
228     /***
229      * @see org.w3c.tidy.StreamIn#readChar()
230      */
231     public int readChar()
232     {
233         int c;
234 
235         if (this.pushed)
236         {
237             c = this.charbuf[--(this.bufpos)];
238             if ((this.bufpos) == 0)
239             {
240                 this.pushed = false;
241             }
242 
243             if (c == '\n')
244             {
245                 this.curcol = 1;
246                 this.curline++;
247             }
248             else
249             {
250                 this.curcol++;
251             }
252 
253             return c;
254         }
255 
256         this.lastcol = this.curcol;
257 
258         if (this.tabs > 0)
259         {
260             this.curcol++;
261             this.tabs--;
262             return ' ';
263         }
264 
265         while (true)
266         {
267             c = readCharFromStream();
268 
269             if (c < 0)
270             {
271                 return END_OF_STREAM;
272             }
273 
274             if (c == '\n')
275             {
276                 this.curcol = 1;
277                 this.curline++;
278                 break;
279             }
280 
281             // #427663 - map '\r' to '\n' - Andy Quick 11 Aug 00
282             if (c == '\r')
283             {
284                 c = readCharFromStream();
285                 if (c != '\n')
286                 {
287                     if (c != END_OF_STREAM) // EOF fix by Terry Teague 12 Aug 01
288                     {
289                         ungetChar(c);
290                     }
291                     c = '\n';
292                 }
293                 this.curcol = 1;
294                 this.curline++;
295                 break;
296             }
297 
298             if (c == '\t')
299             {
300                 this.tabs = this.tabsize - ((this.curcol - 1) % this.tabsize) - 1;
301                 this.curcol++;
302                 c = ' ';
303                 break;
304             }
305 
306             // strip control characters, except for Esc
307             if (c == '\033')
308             {
309                 break;
310             }
311             else if (c == '\015' && !lexer.configuration.xmlTags) //Form Feed is allowed in HTML
312             {
313                 break;
314             }
315             else if (0 < c && c < 32)
316             {
317                 continue; // discard control char
318             }
319 
320             // watch out for chars that have already been decoded such as
321             // IS02022, UTF-8 etc, that don't require further decoding
322             if (rawOut
323                 || this.encoding == Configuration.ISO2022
324                 || this.encoding == Configuration.UTF8
325                 || this.encoding == Configuration.SHIFTJIS // #431953 - RJ
326                 || this.encoding == Configuration.BIG5) // #431953 - RJ
327             {
328                 this.curcol++;
329                 break;
330             }
331 
332             // handle surrogate pairs
333             if ((this.encoding == Configuration.UTF16LE)
334                 || (this.encoding == Configuration.UTF16)
335                 || (this.encoding == Configuration.UTF16BE))
336             {
337                 if (c > EncodingUtils.MAX_UTF8_FROM_UCS4)
338                 {
339                     // invalid UTF-16 value
340                     this.lexer.report.encodingError(this.lexer, Report.INVALID_UTF16 | Report.DISCARDED_CHAR, c);
341                     c = 0;
342                 }
343                 // high surrogate
344                 else if (c >= EncodingUtils.UTF16_LOW_SURROGATE_BEGIN && c <= EncodingUtils.UTF16_LOW_SURROGATE_END)
345                 {
346                     int n, m;
347 
348                     n = c;
349 
350                     m = readCharFromStream();
351                     if (m < 0)
352                     {
353                         return END_OF_STREAM;
354                     }
355                     // low surrogate
356                     if (m >= EncodingUtils.UTF16_HIGH_SURROGATE_BEGIN && m <= EncodingUtils.UTF16_HIGH_SURROGATE_END)
357                     {
358                         // pair found, recombine them
359                         c = (n - EncodingUtils.UTF16_LOW_SURROGATE_BEGIN)
360                             * 0x400
361                             + (m - EncodingUtils.UTF16_HIGH_SURROGATE_BEGIN)
362                             + 0x10000;
363 
364                         // check for invalid pairs
365                         if (((c & 0x0000FFFE) == 0x0000FFFE)
366                             || ((c & 0x0000FFFF) == 0x0000FFFF)
367                             || (c < EncodingUtils.UTF16_SURROGATES_BEGIN))
368                         {
369                             this.lexer.report
370                                 .encodingError(this.lexer, Report.INVALID_UTF16 | Report.DISCARDED_CHAR, c);
371                             c = 0;
372                         }
373                     }
374                     else
375                     {
376                         // not a valid pair
377                         this.lexer.report.encodingError(this.lexer, Report.INVALID_UTF16 | Report.DISCARDED_CHAR, c);
378                         c = 0;
379                         // should we unget the just read char?
380                     }
381                 }
382                 else
383                 {
384                     // no recombination needed
385                 }
386             }
387 
388             if (this.encoding == Configuration.MACROMAN)
389             {
390                 c = EncodingUtils.decodeMacRoman(c);
391             }
392 
393             // produced e.g. as a side-effect of smart quotes in Word
394             // but can't happen if using MACROMAN encoding
395             if (127 < c && c < 160)
396             {
397                 int c1 = 0;
398                 int replaceMode;
399 
400                 // set error position just before offending character
401                 this.lexer.lines = this.curline;
402                 this.lexer.columns = this.curcol;
403 
404                 if ((this.encoding == Configuration.WIN1252)
405                     || (this.lexer.configuration.replacementCharEncoding == Configuration.WIN1252))
406                 {
407                     c1 = EncodingUtils.decodeWin1252(c);
408                 }
409                 else if (this.lexer.configuration.replacementCharEncoding == Configuration.MACROMAN)
410                 {
411                     c1 = EncodingUtils.decodeMacRoman(c);
412                 }
413 
414                 replaceMode = TidyUtils.toBoolean(c1) ? Report.REPLACED_CHAR : Report.DISCARDED_CHAR;
415 
416                 if ((c1 == 0) && (this.encoding == Configuration.WIN1252) || (this.encoding == Configuration.MACROMAN))
417                 {
418                     this.lexer.report.encodingError(this.lexer, Report.VENDOR_SPECIFIC_CHARS | replaceMode, c);
419                 }
420                 else if ((this.encoding != Configuration.WIN1252) && (this.encoding != Configuration.MACROMAN))
421                 {
422                     this.lexer.report.encodingError(this.lexer, Report.INVALID_SGML_CHARS | replaceMode, c);
423                 }
424 
425                 c = c1;
426             }
427 
428             if (c == 0)
429             {
430                 continue; // illegal char is discarded
431             }
432 
433             this.curcol++;
434             break;
435         }
436 
437         return c;
438     }
439 
440     /***
441      * @see org.w3c.tidy.StreamIn#ungetChar(int)
442      */
443     public void ungetChar(int c)
444     {
445         this.pushed = true;
446         if (this.bufpos >= CHARBUF_SIZE)
447         {
448             // pop last element
449             System.arraycopy(this.charbuf, 0, this.charbuf, 1, CHARBUF_SIZE - 1);
450             this.bufpos--;
451         }
452         this.charbuf[(this.bufpos)++] = c;
453 
454         if (c == '\n')
455         {
456             --this.curline;
457         }
458 
459         this.curcol = this.lastcol;
460     }
461 
462     /***
463      * @see org.w3c.tidy.StreamIn#isEndOfStream()
464      */
465     public boolean isEndOfStream()
466     {
467         return this.endOfStream;
468     }
469 
470     /***
471      * @see org.w3c.tidy.StreamIn#readCharFromStream()
472      */
473     public int readCharFromStream()
474     {
475         int c;
476         int[] n = new int[]{0};
477         int[] tempchar = new int[1];
478         int[] count = new int[]{1};
479 
480         readRawBytesFromStream(tempchar, count, false);
481         if (count[0] <= 0)
482         {
483             endOfStream = true;
484             return END_OF_STREAM;
485         }
486 
487         c = tempchar[0];
488 
489         if (lookingForBOM
490             && (this.encoding == Configuration.UTF16
491                 || this.encoding == Configuration.UTF16LE
492                 || this.encoding == Configuration.UTF16BE || this.encoding == Configuration.UTF8))
493         {
494             // check for a Byte Order Mark
495             int c1, bom;
496 
497             lookingForBOM = false;
498 
499             if (c == END_OF_STREAM)
500             {
501                 lookingForBOM = false;
502                 endOfStream = true;
503                 return END_OF_STREAM;
504             }
505 
506             count[0] = 1;
507             readRawBytesFromStream(tempchar, count, false);
508             c1 = tempchar[0];
509 
510             bom = (c << 8) + c1;
511 
512             if (bom == EncodingUtils.UNICODE_BOM_BE)
513             {
514                 // big-endian UTF-16
515                 if (this.encoding != Configuration.UTF16 && this.encoding != Configuration.UTF16BE)
516                 {
517                     this.lexer.report.encodingError(this.lexer, Report.ENCODING_MISMATCH, Configuration.UTF16BE);
518                     // non-fatal error
519                 }
520                 this.encoding = Configuration.UTF16BE;
521                 this.lexer.configuration.setInCharEncoding(Configuration.UTF16BE);
522                 return EncodingUtils.UNICODE_BOM; // return decoded BOM
523             }
524             else if (bom == EncodingUtils.UNICODE_BOM_LE)
525             {
526                 // little-endian UTF-16
527                 if (this.encoding != Configuration.UTF16 && this.encoding != Configuration.UTF16LE)
528                 {
529                     this.lexer.report.encodingError(this.lexer, Report.ENCODING_MISMATCH, Configuration.UTF16LE);
530                     // non-fatal error
531                 }
532                 this.encoding = Configuration.UTF16LE;
533                 this.lexer.configuration.setInCharEncoding(Configuration.UTF16LE);
534                 return EncodingUtils.UNICODE_BOM; // return decoded BOM
535             }
536             else
537             {
538                 int c2;
539 
540                 count[0] = 1;
541                 readRawBytesFromStream(tempchar, count, false);
542                 c2 = tempchar[0];
543 
544                 if (((c << 16) + (c1 << 8) + c2) == EncodingUtils.UNICODE_BOM_UTF8)
545                 {
546                     // UTF-8
547                     this.encoding = Configuration.UTF8;
548                     if (this.encoding != Configuration.UTF8)
549                     {
550                         this.lexer.report.encodingError(this.lexer, Report.ENCODING_MISMATCH, Configuration.UTF8);
551                         // non-fatal error
552                     }
553                     this.lexer.configuration.setInCharEncoding(Configuration.UTF8);
554                     return EncodingUtils.UNICODE_BOM; // return decoded BOM
555                 }
556 
557                 // the 2nd and/or 3rd bytes weren't what we were expecting, so unget the extra 2 bytes
558                 rawPushed = true;
559 
560                 if ((rawBufpos + 1) >= CHARBUF_SIZE)
561                 {
562                     System.arraycopy(rawBytebuf, 2, rawBytebuf, 0, CHARBUF_SIZE - 2);
563                     rawBufpos -= 2;
564                 }
565                 // make sure the bytes are pushed in the right order
566                 rawBytebuf[rawBufpos++] = (char) c2;
567                 rawBytebuf[rawBufpos++] = (char) c1;
568                 // drop through to code below, with the original char
569 
570             }
571         }
572 
573         this.lookingForBOM = false;
574 
575         // A document in ISO-2022 based encoding uses some ESC sequences called "designator" to switch character sets.
576         // The designators defined and used in ISO-2022-JP are: "ESC" + "(" + ? for ISO646 variants "ESC" + "$" + ? and
577         // "ESC" + "$" + "(" + ? for multibyte character sets Where ? stands for a single character used to indicate the
578         // character set for multibyte characters. Tidy handles this by preserving the escape sequence and setting the
579         // top bit of each byte for non-ascii chars. This bit is then cleared on output. The input stream keeps track of
580         // the state to determine when to set/clear the bit.
581 
582         if (this.encoding == Configuration.ISO2022)
583         {
584             if (c == 0x1b) // ESC
585             {
586                 this.state = EncodingUtils.FSM_ESC;
587                 return c;
588             }
589 
590             switch (this.state)
591             {
592                 case EncodingUtils.FSM_ESC :
593                     if (c == '$')
594                     {
595                         this.state = EncodingUtils.FSM_ESCD;
596                     }
597                     else if (c == '(')
598                     {
599                         this.state = EncodingUtils.FSM_ESCP;
600                     }
601                     else
602                     {
603                         this.state = EncodingUtils.FSM_ASCII;
604                     }
605                     break;
606 
607                 case EncodingUtils.FSM_ESCD :
608                     if (c == '(')
609                     {
610                         this.state = EncodingUtils.FSM_ESCDP;
611                     }
612                     else
613                     {
614                         this.state = EncodingUtils.FSM_NONASCII;
615                     }
616                     break;
617 
618                 case EncodingUtils.FSM_ESCDP :
619                     this.state = EncodingUtils.FSM_NONASCII;
620                     break;
621 
622                 case EncodingUtils.FSM_ESCP :
623                     this.state = EncodingUtils.FSM_ASCII;
624                     break;
625 
626                 case EncodingUtils.FSM_NONASCII :
627                     c |= 0x80;
628                     break;
629 
630                 default :
631                     // 
632                     break;
633             }
634 
635             return c;
636         }
637 
638         if (this.encoding == Configuration.UTF16LE)
639         {
640             int c1;
641 
642             count[0] = 1;
643             readRawBytesFromStream(tempchar, count, false);
644             if (count[0] <= 0)
645             {
646                 endOfStream = true;
647                 return END_OF_STREAM;
648             }
649             c1 = tempchar[0];
650 
651             n[0] = (c1 << 8) + c;
652 
653             return n[0];
654         }
655 
656         // UTF-16 is big-endian by default
657         if ((this.encoding == Configuration.UTF16) || (this.encoding == Configuration.UTF16BE))
658         {
659             int c1;
660 
661             count[0] = 1;
662             readRawBytesFromStream(tempchar, count, false);
663             if (count[0] <= 0)
664             {
665                 endOfStream = true;
666                 return END_OF_STREAM;
667             }
668             c1 = tempchar[0];
669 
670             n[0] = (c << 8) + c1;
671 
672             return n[0];
673         }
674 
675         if (this.encoding == Configuration.UTF8)
676         {
677             // deal with UTF-8 encoded char
678             int[] count2 = new int[]{0};
679 
680             // first byte "c" is passed in separately
681             boolean err = EncodingUtils.decodeUTF8BytesToChar(n, c, new byte[0], this.getBytes, count2, 0);
682             if (!err && (n[0] == END_OF_STREAM) && (count2[0] == 1)) /* EOF */
683             {
684                 endOfStream = true;
685                 return END_OF_STREAM;
686             }
687             else if (err)
688             {
689                 /* set error position just before offending character */
690                 this.lexer.lines = this.curline;
691                 this.lexer.columns = this.curcol;
692 
693                 this.lexer.report.encodingError(this.lexer, (short) (Report.INVALID_UTF8 | Report.REPLACED_CHAR), n[0]);
694                 n[0] = 0xFFFD; /* replacement char */
695             }
696 
697             return n[0];
698         }
699 
700         // #431953 - start RJ
701         // This section is suitable for any "multibyte" variable-width character encoding in which a one-byte code is
702         // less than 128, and the first byte of a two-byte code is greater or equal to 128. Note that Big5 and ShiftJIS
703         // fit into this kind, even though their second byte may be less than 128
704 
705         if ((this.encoding == Configuration.BIG5) || (this.encoding == Configuration.SHIFTJIS))
706         {
707             if (c < 128)
708             {
709                 return c;
710             }
711             else if ((this.encoding == Configuration.SHIFTJIS) && (c >= 0xa1 && c <= 0xdf))
712             {
713                 // 461643 - fix suggested by Rick Cameron 14 Sep 01
714                 // for Shift_JIS, the values from 0xa1 through 0xdf represent singe-byte characters (U+FF61 to U+FF9F -
715                 // half-shift Katakana)
716                 return c;
717             }
718             else
719             {
720                 int c1;
721                 count[0] = 1;
722                 readRawBytesFromStream(tempchar, count, false);
723 
724                 if (count[0] <= 0)
725                 {
726                     endOfStream = true;
727                     return END_OF_STREAM;
728                 }
729 
730                 c1 = tempchar[0];
731                 n[0] = (c << 8) + c1;
732                 return n[0];
733             }
734         }
735         // #431953 - end RJ
736         n[0] = c;
737 
738         return n[0];
739     }
740 
741     /***
742      * Read raw bytes from stream, return <= 0 if EOF; or if "unget" is true, Unget the bytes to re-synchronize the
743      * input stream Normally UTF-8 successor bytes are read using this routine.
744      * @param buf character buffer
745      * @param count number of bytes to read
746      * @param unget unget bytes
747      */
748     protected void readRawBytesFromStream(int[] buf, int[] count, boolean unget)
749     {
750 
751         try
752         {
753             for (int i = 0; i < count[0]; i++)
754             {
755                 if (unget)
756                 {
757 
758                     int c = this.stream.read();
759 
760                     // should never get here; testing for 0xFF, a valid char, is not a good idea
761                     if (c == END_OF_STREAM) // || buf[i] == (unsigned char)EndOfStream
762                     {
763                         count[0] = -i;
764                         return;
765                     }
766 
767                     rawPushed = true;
768 
769                     if (rawBufpos >= CHARBUF_SIZE)
770                     {
771                         System.arraycopy(rawBytebuf, 1, rawBytebuf, 0, CHARBUF_SIZE - 1);
772                         rawBufpos--;
773                     }
774                     rawBytebuf[rawBufpos++] = (char) buf[i];
775                 }
776                 else
777                 {
778                     if (rawPushed)
779                     {
780                         buf[i] = rawBytebuf[--rawBufpos];
781                         if (rawBufpos == 0)
782                         {
783                             rawPushed = false;
784                         }
785                     }
786                     else
787                     {
788                         int c = this.stream.read();
789                         if (c == END_OF_STREAM)
790                         {
791                             count[0] = -i;
792                             break;
793                         }
794                         buf[i] = (char) c;
795                     }
796                 }
797             }
798         }
799         catch (IOException e)
800         {
801             System.err.println("StreamInImpl.readRawBytesFromStream: " + e.toString());
802         }
803         return;
804     }
805 
806 }