1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54 package org.w3c.tidy;
55
56 import java.io.IOException;
57 import java.io.InputStream;
58
59 import org.w3c.tidy.EncodingUtils.GetBytes;
60
61
62 /***
63 * Input Stream Implementation. This implementation is from the c version of tidy and it doesn't take advantage of java
64 * readers.
65 * @author Dave Raggett <a href="mailto:dsr@w3.org">dsr@w3.org </a>
66 * @author Andy Quick <a href="mailto:ac.quick@sympatico.ca">ac.quick@sympatico.ca </a> (translation to Java)
67 * @author Fabrizio Giustina
68 * @version $Revision: 1.28 $ ($Author: fgiust $)
69 */
70 public class StreamInImpl implements StreamIn
71 {
72
73 /***
74 * number of characters kept in buffer.
75 */
76 private static final int CHARBUF_SIZE = 5;
77
78 /***
79 * needed for error reporting.
80 */
81 private Lexer lexer;
82
83 /***
84 * character buffer.
85 */
86 private int[] charbuf = new int[CHARBUF_SIZE];
87
88 /***
89 * actual position in buffer.
90 */
91 private int bufpos;
92
93 /***
94 * Private unget buffer for the raw bytes read from the input stream. Normally this will only be used by the UTF-8
95 * decoder to resynchronize the input stream after finding an illegal UTF-8 sequences. But it can be used for other
96 * purposes when reading bytes in ReadCharFromStream.
97 */
98 private char[] rawBytebuf = new char[CHARBUF_SIZE];
99
100 /***
101 * actual position in rawBytebuf.
102 */
103 private int rawBufpos;
104
105 /***
106 * has a raw byte been pushed into stack?
107 */
108 private boolean rawPushed;
109
110 /***
111 * looking for an UTF BOM?
112 */
113 private boolean lookingForBOM = true;
114
115 /***
116 * has end of stream been reached?
117 */
118 private boolean endOfStream;
119
120 private boolean pushed;
121
122 private int tabs;
123
124 /***
125 * tab size in chars.
126 */
127 private int tabsize;
128
129 /***
130 * FSM for ISO2022.
131 */
132 private int state;
133
134 /***
135 * Encoding.
136 */
137 private int encoding;
138
139 /***
140 * current column number.
141 */
142 private int curcol;
143
144 /***
145 * last column.
146 */
147 private int lastcol;
148
149 /***
150 * current line number.
151 */
152 private int curline;
153
154 /***
155 * input stream.
156 */
157 private InputStream stream;
158
159 /***
160 * Getter.
161 */
162 private GetBytes getBytes;
163
164 /***
165 * Avoid mapping values > 127 to entities.
166 */
167 private boolean rawOut;
168
169 /***
170 * Instatiates a new StreamInImpl.
171 * @param stream input stream
172 * @param configuration Configuration
173 */
174 public StreamInImpl(InputStream stream, Configuration configuration)
175 {
176 this.stream = stream;
177 this.charbuf[0] = '\0';
178 this.tabsize = configuration.tabsize;
179 this.curline = 1;
180 this.curcol = 1;
181 this.encoding = configuration.getInCharEncoding();
182 this.rawOut = configuration.rawOut;
183 this.state = EncodingUtils.FSM_ASCII;
184 this.getBytes = new GetBytes()
185 {
186
187 StreamInImpl in;
188
189 GetBytes setStreamIn(StreamInImpl in)
190 {
191 this.in = in;
192 return this;
193 }
194
195 public void doGet(int[] buf, int[] count, boolean unget)
196 {
197 in.readRawBytesFromStream(buf, count, unget);
198 }
199 }
200 .setStreamIn(this);
201 }
202
203 /***
204 * @see org.w3c.tidy.StreamIn#getCurcol()
205 */
206 public int getCurcol()
207 {
208 return this.curcol;
209 }
210
211 /***
212 * @see org.w3c.tidy.StreamIn#getCurline()
213 */
214 public int getCurline()
215 {
216 return this.curline;
217 }
218
219 /***
220 * Setter for <code>lexer</code>.
221 * @param lexer The lexer to set.
222 */
223 public void setLexer(Lexer lexer)
224 {
225 this.lexer = lexer;
226 }
227
228 /***
229 * @see org.w3c.tidy.StreamIn#readChar()
230 */
231 public int readChar()
232 {
233 int c;
234
235 if (this.pushed)
236 {
237 c = this.charbuf[--(this.bufpos)];
238 if ((this.bufpos) == 0)
239 {
240 this.pushed = false;
241 }
242
243 if (c == '\n')
244 {
245 this.curcol = 1;
246 this.curline++;
247 }
248 else
249 {
250 this.curcol++;
251 }
252
253 return c;
254 }
255
256 this.lastcol = this.curcol;
257
258 if (this.tabs > 0)
259 {
260 this.curcol++;
261 this.tabs--;
262 return ' ';
263 }
264
265 while (true)
266 {
267 c = readCharFromStream();
268
269 if (c < 0)
270 {
271 return END_OF_STREAM;
272 }
273
274 if (c == '\n')
275 {
276 this.curcol = 1;
277 this.curline++;
278 break;
279 }
280
281
282 if (c == '\r')
283 {
284 c = readCharFromStream();
285 if (c != '\n')
286 {
287 if (c != END_OF_STREAM)
288 {
289 ungetChar(c);
290 }
291 c = '\n';
292 }
293 this.curcol = 1;
294 this.curline++;
295 break;
296 }
297
298 if (c == '\t')
299 {
300 this.tabs = this.tabsize - ((this.curcol - 1) % this.tabsize) - 1;
301 this.curcol++;
302 c = ' ';
303 break;
304 }
305
306
307 if (c == '\033')
308 {
309 break;
310 }
311 else if (c == '\015' && !lexer.configuration.xmlTags)
312 {
313 break;
314 }
315 else if (0 < c && c < 32)
316 {
317 continue;
318 }
319
320
321
322 if (rawOut
323 || this.encoding == Configuration.ISO2022
324 || this.encoding == Configuration.UTF8
325 || this.encoding == Configuration.SHIFTJIS
326 || this.encoding == Configuration.BIG5)
327 {
328 this.curcol++;
329 break;
330 }
331
332
333 if ((this.encoding == Configuration.UTF16LE)
334 || (this.encoding == Configuration.UTF16)
335 || (this.encoding == Configuration.UTF16BE))
336 {
337 if (c > EncodingUtils.MAX_UTF8_FROM_UCS4)
338 {
339
340 this.lexer.report.encodingError(this.lexer, Report.INVALID_UTF16 | Report.DISCARDED_CHAR, c);
341 c = 0;
342 }
343
344 else if (c >= EncodingUtils.UTF16_LOW_SURROGATE_BEGIN && c <= EncodingUtils.UTF16_LOW_SURROGATE_END)
345 {
346 int n, m;
347
348 n = c;
349
350 m = readCharFromStream();
351 if (m < 0)
352 {
353 return END_OF_STREAM;
354 }
355
356 if (m >= EncodingUtils.UTF16_HIGH_SURROGATE_BEGIN && m <= EncodingUtils.UTF16_HIGH_SURROGATE_END)
357 {
358
359 c = (n - EncodingUtils.UTF16_LOW_SURROGATE_BEGIN)
360 * 0x400
361 + (m - EncodingUtils.UTF16_HIGH_SURROGATE_BEGIN)
362 + 0x10000;
363
364
365 if (((c & 0x0000FFFE) == 0x0000FFFE)
366 || ((c & 0x0000FFFF) == 0x0000FFFF)
367 || (c < EncodingUtils.UTF16_SURROGATES_BEGIN))
368 {
369 this.lexer.report
370 .encodingError(this.lexer, Report.INVALID_UTF16 | Report.DISCARDED_CHAR, c);
371 c = 0;
372 }
373 }
374 else
375 {
376
377 this.lexer.report.encodingError(this.lexer, Report.INVALID_UTF16 | Report.DISCARDED_CHAR, c);
378 c = 0;
379
380 }
381 }
382 else
383 {
384
385 }
386 }
387
388 if (this.encoding == Configuration.MACROMAN)
389 {
390 c = EncodingUtils.decodeMacRoman(c);
391 }
392
393
394
395 if (127 < c && c < 160)
396 {
397 int c1 = 0;
398 int replaceMode;
399
400
401 this.lexer.lines = this.curline;
402 this.lexer.columns = this.curcol;
403
404 if ((this.encoding == Configuration.WIN1252)
405 || (this.lexer.configuration.replacementCharEncoding == Configuration.WIN1252))
406 {
407 c1 = EncodingUtils.decodeWin1252(c);
408 }
409 else if (this.lexer.configuration.replacementCharEncoding == Configuration.MACROMAN)
410 {
411 c1 = EncodingUtils.decodeMacRoman(c);
412 }
413
414 replaceMode = TidyUtils.toBoolean(c1) ? Report.REPLACED_CHAR : Report.DISCARDED_CHAR;
415
416 if ((c1 == 0) && (this.encoding == Configuration.WIN1252) || (this.encoding == Configuration.MACROMAN))
417 {
418 this.lexer.report.encodingError(this.lexer, Report.VENDOR_SPECIFIC_CHARS | replaceMode, c);
419 }
420 else if ((this.encoding != Configuration.WIN1252) && (this.encoding != Configuration.MACROMAN))
421 {
422 this.lexer.report.encodingError(this.lexer, Report.INVALID_SGML_CHARS | replaceMode, c);
423 }
424
425 c = c1;
426 }
427
428 if (c == 0)
429 {
430 continue;
431 }
432
433 this.curcol++;
434 break;
435 }
436
437 return c;
438 }
439
440 /***
441 * @see org.w3c.tidy.StreamIn#ungetChar(int)
442 */
443 public void ungetChar(int c)
444 {
445 this.pushed = true;
446 if (this.bufpos >= CHARBUF_SIZE)
447 {
448
449 System.arraycopy(this.charbuf, 0, this.charbuf, 1, CHARBUF_SIZE - 1);
450 this.bufpos--;
451 }
452 this.charbuf[(this.bufpos)++] = c;
453
454 if (c == '\n')
455 {
456 --this.curline;
457 }
458
459 this.curcol = this.lastcol;
460 }
461
462 /***
463 * @see org.w3c.tidy.StreamIn#isEndOfStream()
464 */
465 public boolean isEndOfStream()
466 {
467 return this.endOfStream;
468 }
469
470 /***
471 * @see org.w3c.tidy.StreamIn#readCharFromStream()
472 */
473 public int readCharFromStream()
474 {
475 int c;
476 int[] n = new int[]{0};
477 int[] tempchar = new int[1];
478 int[] count = new int[]{1};
479
480 readRawBytesFromStream(tempchar, count, false);
481 if (count[0] <= 0)
482 {
483 endOfStream = true;
484 return END_OF_STREAM;
485 }
486
487 c = tempchar[0];
488
489 if (lookingForBOM
490 && (this.encoding == Configuration.UTF16
491 || this.encoding == Configuration.UTF16LE
492 || this.encoding == Configuration.UTF16BE || this.encoding == Configuration.UTF8))
493 {
494
495 int c1, bom;
496
497 lookingForBOM = false;
498
499 if (c == END_OF_STREAM)
500 {
501 lookingForBOM = false;
502 endOfStream = true;
503 return END_OF_STREAM;
504 }
505
506 count[0] = 1;
507 readRawBytesFromStream(tempchar, count, false);
508 c1 = tempchar[0];
509
510 bom = (c << 8) + c1;
511
512 if (bom == EncodingUtils.UNICODE_BOM_BE)
513 {
514
515 if (this.encoding != Configuration.UTF16 && this.encoding != Configuration.UTF16BE)
516 {
517 this.lexer.report.encodingError(this.lexer, Report.ENCODING_MISMATCH, Configuration.UTF16BE);
518
519 }
520 this.encoding = Configuration.UTF16BE;
521 this.lexer.configuration.setInCharEncoding(Configuration.UTF16BE);
522 return EncodingUtils.UNICODE_BOM;
523 }
524 else if (bom == EncodingUtils.UNICODE_BOM_LE)
525 {
526
527 if (this.encoding != Configuration.UTF16 && this.encoding != Configuration.UTF16LE)
528 {
529 this.lexer.report.encodingError(this.lexer, Report.ENCODING_MISMATCH, Configuration.UTF16LE);
530
531 }
532 this.encoding = Configuration.UTF16LE;
533 this.lexer.configuration.setInCharEncoding(Configuration.UTF16LE);
534 return EncodingUtils.UNICODE_BOM;
535 }
536 else
537 {
538 int c2;
539
540 count[0] = 1;
541 readRawBytesFromStream(tempchar, count, false);
542 c2 = tempchar[0];
543
544 if (((c << 16) + (c1 << 8) + c2) == EncodingUtils.UNICODE_BOM_UTF8)
545 {
546
547 this.encoding = Configuration.UTF8;
548 if (this.encoding != Configuration.UTF8)
549 {
550 this.lexer.report.encodingError(this.lexer, Report.ENCODING_MISMATCH, Configuration.UTF8);
551
552 }
553 this.lexer.configuration.setInCharEncoding(Configuration.UTF8);
554 return EncodingUtils.UNICODE_BOM;
555 }
556
557
558 rawPushed = true;
559
560 if ((rawBufpos + 1) >= CHARBUF_SIZE)
561 {
562 System.arraycopy(rawBytebuf, 2, rawBytebuf, 0, CHARBUF_SIZE - 2);
563 rawBufpos -= 2;
564 }
565
566 rawBytebuf[rawBufpos++] = (char) c2;
567 rawBytebuf[rawBufpos++] = (char) c1;
568
569
570 }
571 }
572
573 this.lookingForBOM = false;
574
575
576
577
578
579
580
581
582 if (this.encoding == Configuration.ISO2022)
583 {
584 if (c == 0x1b)
585 {
586 this.state = EncodingUtils.FSM_ESC;
587 return c;
588 }
589
590 switch (this.state)
591 {
592 case EncodingUtils.FSM_ESC :
593 if (c == '$')
594 {
595 this.state = EncodingUtils.FSM_ESCD;
596 }
597 else if (c == '(')
598 {
599 this.state = EncodingUtils.FSM_ESCP;
600 }
601 else
602 {
603 this.state = EncodingUtils.FSM_ASCII;
604 }
605 break;
606
607 case EncodingUtils.FSM_ESCD :
608 if (c == '(')
609 {
610 this.state = EncodingUtils.FSM_ESCDP;
611 }
612 else
613 {
614 this.state = EncodingUtils.FSM_NONASCII;
615 }
616 break;
617
618 case EncodingUtils.FSM_ESCDP :
619 this.state = EncodingUtils.FSM_NONASCII;
620 break;
621
622 case EncodingUtils.FSM_ESCP :
623 this.state = EncodingUtils.FSM_ASCII;
624 break;
625
626 case EncodingUtils.FSM_NONASCII :
627 c |= 0x80;
628 break;
629
630 default :
631
632 break;
633 }
634
635 return c;
636 }
637
638 if (this.encoding == Configuration.UTF16LE)
639 {
640 int c1;
641
642 count[0] = 1;
643 readRawBytesFromStream(tempchar, count, false);
644 if (count[0] <= 0)
645 {
646 endOfStream = true;
647 return END_OF_STREAM;
648 }
649 c1 = tempchar[0];
650
651 n[0] = (c1 << 8) + c;
652
653 return n[0];
654 }
655
656
657 if ((this.encoding == Configuration.UTF16) || (this.encoding == Configuration.UTF16BE))
658 {
659 int c1;
660
661 count[0] = 1;
662 readRawBytesFromStream(tempchar, count, false);
663 if (count[0] <= 0)
664 {
665 endOfStream = true;
666 return END_OF_STREAM;
667 }
668 c1 = tempchar[0];
669
670 n[0] = (c << 8) + c1;
671
672 return n[0];
673 }
674
675 if (this.encoding == Configuration.UTF8)
676 {
677
678 int[] count2 = new int[]{0};
679
680
681 boolean err = EncodingUtils.decodeUTF8BytesToChar(n, c, new byte[0], this.getBytes, count2, 0);
682 if (!err && (n[0] == END_OF_STREAM) && (count2[0] == 1))
683 {
684 endOfStream = true;
685 return END_OF_STREAM;
686 }
687 else if (err)
688 {
689
690 this.lexer.lines = this.curline;
691 this.lexer.columns = this.curcol;
692
693 this.lexer.report.encodingError(this.lexer, (short) (Report.INVALID_UTF8 | Report.REPLACED_CHAR), n[0]);
694 n[0] = 0xFFFD;
695 }
696
697 return n[0];
698 }
699
700
701
702
703
704
705 if ((this.encoding == Configuration.BIG5) || (this.encoding == Configuration.SHIFTJIS))
706 {
707 if (c < 128)
708 {
709 return c;
710 }
711 else if ((this.encoding == Configuration.SHIFTJIS) && (c >= 0xa1 && c <= 0xdf))
712 {
713
714
715
716 return c;
717 }
718 else
719 {
720 int c1;
721 count[0] = 1;
722 readRawBytesFromStream(tempchar, count, false);
723
724 if (count[0] <= 0)
725 {
726 endOfStream = true;
727 return END_OF_STREAM;
728 }
729
730 c1 = tempchar[0];
731 n[0] = (c << 8) + c1;
732 return n[0];
733 }
734 }
735
736 n[0] = c;
737
738 return n[0];
739 }
740
741 /***
742 * Read raw bytes from stream, return <= 0 if EOF; or if "unget" is true, Unget the bytes to re-synchronize the
743 * input stream Normally UTF-8 successor bytes are read using this routine.
744 * @param buf character buffer
745 * @param count number of bytes to read
746 * @param unget unget bytes
747 */
748 protected void readRawBytesFromStream(int[] buf, int[] count, boolean unget)
749 {
750
751 try
752 {
753 for (int i = 0; i < count[0]; i++)
754 {
755 if (unget)
756 {
757
758 int c = this.stream.read();
759
760
761 if (c == END_OF_STREAM)
762 {
763 count[0] = -i;
764 return;
765 }
766
767 rawPushed = true;
768
769 if (rawBufpos >= CHARBUF_SIZE)
770 {
771 System.arraycopy(rawBytebuf, 1, rawBytebuf, 0, CHARBUF_SIZE - 1);
772 rawBufpos--;
773 }
774 rawBytebuf[rawBufpos++] = (char) buf[i];
775 }
776 else
777 {
778 if (rawPushed)
779 {
780 buf[i] = rawBytebuf[--rawBufpos];
781 if (rawBufpos == 0)
782 {
783 rawPushed = false;
784 }
785 }
786 else
787 {
788 int c = this.stream.read();
789 if (c == END_OF_STREAM)
790 {
791 count[0] = -i;
792 break;
793 }
794 buf[i] = (char) c;
795 }
796 }
797 }
798 }
799 catch (IOException e)
800 {
801 System.err.println("StreamInImpl.readRawBytesFromStream: " + e.toString());
802 }
803 return;
804 }
805
806 }