View Javadoc

1   /*
2    *  Java HTML Tidy - JTidy
3    *  HTML parser and pretty printer
4    *
5    *  Copyright (c) 1998-2000 World Wide Web Consortium (Massachusetts
6    *  Institute of Technology, Institut National de Recherche en
7    *  Informatique et en Automatique, Keio University). All Rights
8    *  Reserved.
9    *
10   *  Contributing Author(s):
11   *
12   *     Dave Raggett <dsr@w3.org>
13   *     Andy Quick <ac.quick@sympatico.ca> (translation to Java)
14   *     Gary L Peskin <garyp@firstech.com> (Java development)
15   *     Sami Lempinen <sami@lempinen.net> (release management)
16   *     Fabrizio Giustina <fgiust at users.sourceforge.net>
17   *
18   *  The contributing author(s) would like to thank all those who
19   *  helped with testing, bug fixes, and patience.  This wouldn't
20   *  have been possible without all of you.
21   *
22   *  COPYRIGHT NOTICE:
23   * 
24   *  This software and documentation is provided "as is," and
25   *  the copyright holders and contributing author(s) make no
26   *  representations or warranties, express or implied, including
27   *  but not limited to, warranties of merchantability or fitness
28   *  for any particular purpose or that the use of the software or
29   *  documentation will not infringe any third party patents,
30   *  copyrights, trademarks or other rights. 
31   *
32   *  The copyright holders and contributing author(s) will not be
33   *  liable for any direct, indirect, special or consequential damages
34   *  arising out of any use of the software or documentation, even if
35   *  advised of the possibility of such damage.
36   *
37   *  Permission is hereby granted to use, copy, modify, and distribute
38   *  this source code, or portions hereof, documentation and executables,
39   *  for any purpose, without fee, subject to the following restrictions:
40   *
41   *  1. The origin of this source code must not be misrepresented.
42   *  2. Altered versions must be plainly marked as such and must
43   *     not be misrepresented as being the original source.
44   *  3. This Copyright notice may not be removed or altered from any
45   *     source or altered source distribution.
46   * 
47   *  The copyright holders and contributing author(s) specifically
48   *  permit, without fee, and encourage the use of this source code
49   *  as a component for supporting the Hypertext Markup Language in
50   *  commercial products. If you use this source code in a product,
51   *  acknowledgment is not required but would be appreciated.
52   *
53   */
54  package org.w3c.tidy;
55  
56  import java.io.IOException;
57  import java.io.OutputStream;
58  
59  import org.w3c.tidy.EncodingUtils.PutBytes;
60  
61  
62  /***
63   * Output implementation. This implementation is from the c version of tidy and it doesn't take advantage of java
64   * writers.
65   * @author Dave Raggett <a href="mailto:dsr@w3.org">dsr@w3.org </a>
66   * @author Andy Quick <a href="mailto:ac.quick@sympatico.ca">ac.quick@sympatico.ca </a> (translation to Java)
67   * @author Fabrizio Giustina
68   * @version $Revision: 1.16 $ ($Author: fgiust $)
69   */
70  public class OutImpl implements Out
71  {
72  
73      /***
74       * output encoding.
75       */
76      private int encoding;
77  
78      /***
79       * actual state for ISO 2022.
80       */
81      private int state;
82  
83      /***
84       * output stream.
85       */
86      private OutputStream out;
87  
88      /***
89       * putter callback.
90       */
91      private PutBytes putBytes;
92  
93      /***
94       * newline bytes.
95       */
96      private byte[] newline;
97  
98      /***
99       * Constructor.
100      * @param configuration actual configuration instance (needed for newline configuration)
101      * @param encoding encoding constant
102      * @param out output stream
103      */
104     public OutImpl(Configuration configuration, int encoding, OutputStream out)
105     {
106         this.encoding = encoding;
107         this.state = EncodingUtils.FSM_ASCII;
108         this.out = out;
109 
110         // copy configured newline in bytes
111         this.newline = new byte[configuration.newline.length];
112         for (int j = 0; j < configuration.newline.length; j++)
113         {
114             this.newline[j] = (byte) configuration.newline[j];
115         }
116 
117         this.putBytes = new PutBytes()
118         {
119 
120             private OutImpl impl;
121 
122             PutBytes setOut(OutImpl out)
123             {
124                 this.impl = out;
125                 return this;
126             }
127 
128             public void doPut(byte[] buf, int[] count)
129             {
130                 impl.outcUTF8Bytes(buf, count);
131             }
132         } // set the out instance direclty
133             .setOut(this);
134     }
135 
136     /***
137      * output UTF-8 bytes to output stream.
138      * @param buf array of bytes
139      * @param count number of bytes in buf to write
140      */
141     void outcUTF8Bytes(byte[] buf, int[] count)
142     {
143         try
144         {
145             for (int i = 0; i < count[0]; i++)
146             {
147                 out.write(buf[i]);
148             }
149         }
150         catch (IOException e)
151         {
152             System.err.println("OutImpl.outcUTF8Bytes: " + e.toString());
153         }
154     }
155 
156     /***
157      * .
158      * @see org.w3c.tidy.Out#outc(byte)
159      */
160     public void outc(byte c)
161     {
162         outc(c & 0xFF); // Convert to unsigned.
163     }
164 
165     /***
166      * @see org.w3c.tidy.Out#outc(int)
167      */
168     public void outc(int c)
169     {
170         int ch;
171 
172         try
173         {
174 
175             if (this.encoding == Configuration.MACROMAN)
176             {
177                 if (c < 128)
178                 {
179                     out.write(c);
180                 }
181                 else
182                 {
183                     int i;
184 
185                     for (i = 128; i < 256; i++)
186                     {
187                         if (EncodingUtils.decodeMacRoman(i - 128) == c)
188                         {
189                             out.write(i);
190                             break;
191                         }
192                     }
193                 }
194             }
195             else
196 
197             if (this.encoding == Configuration.WIN1252)
198             {
199                 if (c < 128 || (c > 159 && c < 256))
200                 {
201                     out.write(c);
202                 }
203                 else
204                 {
205                     int i;
206 
207                     for (i = 128; i < 160; i++)
208                     {
209                         if (EncodingUtils.decodeWin1252(i - 128) == c)
210                         {
211                             out.write(i);
212                             break;
213                         }
214                     }
215                 }
216             }
217             else if (this.encoding == Configuration.UTF8)
218             {
219                 int[] count = new int[]{0};
220 
221                 EncodingUtils.encodeCharToUTF8Bytes(c, null, this.putBytes, count);
222                 if (count[0] <= 0)
223                 {
224                     /* ReportEncodingError(in->lexer, INVALID_UTF8 | REPLACED_CHAR, c); */
225                     /* replacement char 0xFFFD encoded as UTF-8 */
226                     out.write(0xEF);
227                     out.write(0xBF);
228                     out.write(0xBF);
229                 }
230             }
231             else if (this.encoding == Configuration.ISO2022)
232             {
233                 if (c == 0x1b) /* ESC */
234                 {
235                     this.state = EncodingUtils.FSM_ESC;
236                 }
237                 else
238                 {
239                     switch (this.state)
240                     {
241                         case EncodingUtils.FSM_ESC :
242                             if (c == '$')
243                             {
244                                 this.state = EncodingUtils.FSM_ESCD;
245                             }
246                             else if (c == '(')
247                             {
248                                 this.state = EncodingUtils.FSM_ESCP;
249                             }
250                             else
251                             {
252                                 this.state = EncodingUtils.FSM_ASCII;
253                             }
254                             break;
255 
256                         case EncodingUtils.FSM_ESCD :
257                             if (c == '(')
258                             {
259                                 this.state = EncodingUtils.FSM_ESCDP;
260                             }
261                             else
262                             {
263                                 this.state = EncodingUtils.FSM_NONASCII;
264                             }
265                             break;
266 
267                         case EncodingUtils.FSM_ESCDP :
268                             this.state = EncodingUtils.FSM_NONASCII;
269                             break;
270 
271                         case EncodingUtils.FSM_ESCP :
272                             this.state = EncodingUtils.FSM_ASCII;
273                             break;
274 
275                         case EncodingUtils.FSM_NONASCII :
276                             c &= 0x7F;
277                             break;
278 
279                         default :
280                             // should not reach here
281                             break;
282                     }
283                 }
284 
285                 this.out.write(c);
286             }
287             else if (this.encoding == Configuration.UTF16LE
288                 || this.encoding == Configuration.UTF16BE
289                 || this.encoding == Configuration.UTF16)
290             {
291                 int i = 1;
292                 int numChars = 1;
293                 int[] theChars = new int[2];
294 
295                 if (c > EncodingUtils.MAX_UTF16_FROM_UCS4)
296                 {
297                     // invalid UTF-16 value
298                     /* ReportEncodingError(in.lexer, INVALID_UTF16 | DISCARDED_CHAR, c); */
299                     c = 0;
300                     numChars = 0;
301                 }
302                 else if (c >= EncodingUtils.UTF16_SURROGATES_BEGIN)
303                 {
304                     // encode surrogate pairs
305 
306                     // check for invalid pairs
307                     if (((c & 0x0000FFFE) == 0x0000FFFE) || ((c & 0x0000FFFF) == 0x0000FFFF))
308                     {
309                         /* ReportEncodingError(in.lexer, INVALID_UTF16 | DISCARDED_CHAR, c); */
310                         c = 0;
311                         numChars = 0;
312                     }
313                     else
314                     {
315                         theChars[0] = (c - EncodingUtils.UTF16_SURROGATES_BEGIN)
316                             / 0x400
317                             + EncodingUtils.UTF16_LOW_SURROGATE_BEGIN;
318                         theChars[1] = (c - EncodingUtils.UTF16_SURROGATES_BEGIN)
319                             % 0x400
320                             + EncodingUtils.UTF16_HIGH_SURROGATE_BEGIN;
321 
322                         // output both
323                         numChars = 2;
324                     }
325                 }
326                 else
327                 {
328                     // just put the char out
329                     theChars[0] = c;
330                 }
331 
332                 for (i = 0; i < numChars; i++)
333                 {
334                     c = theChars[i];
335 
336                     if (this.encoding == Configuration.UTF16LE)
337                     {
338                         ch = c & 0xFF;
339                         out.write(ch);
340                         ch = (c >> 8) & 0xFF;
341                         out.write(ch);
342                     }
343 
344                     else if (this.encoding == Configuration.UTF16BE || this.encoding == Configuration.UTF16)
345                     {
346                         ch = (c >> 8) & 0xFF;
347                         out.write(ch);
348                         ch = c & 0xFF;
349                         out.write(ch);
350                     }
351                 }
352             }
353             // #431953 - start RJ
354             else if (this.encoding == Configuration.BIG5 || this.encoding == Configuration.SHIFTJIS)
355             {
356                 if (c < 128)
357                 {
358                     this.out.write(c);
359                 }
360                 else
361                 {
362                     ch = (c >> 8) & 0xFF;
363                     this.out.write(ch);
364                     ch = c & 0xFF;
365                     this.out.write(ch);
366                 }
367             }
368             // #431953 - end RJ
369             else
370             {
371                 this.out.write(c);
372             }
373         }
374         catch (IOException e)
375         {
376             System.err.println("OutImpl.outc: " + e.toString());
377         }
378     }
379 
380     /***
381      * @see org.w3c.tidy.Out#newline()
382      */
383     public void newline()
384     {
385         try
386         {
387             this.out.write(this.newline);
388             this.out.flush();
389         }
390         catch (IOException e)
391         {
392             System.err.println("OutImpl.newline: " + e.toString());
393         }
394     }
395 
396     /***
397      * Setter for <code>out</code>.
398      * @param out The out to set.
399      */
400     public void setOut(OutputStream out)
401     {
402         this.out = out;
403     }
404 
405     /***
406      * Output a Byte Order Mark.
407      */
408     public void outBOM()
409     {
410         if (this.encoding == Configuration.UTF8
411             || this.encoding == Configuration.UTF16LE
412             || this.encoding == Configuration.UTF16BE
413             || this.encoding == Configuration.UTF16)
414         {
415             outc(EncodingUtils.UNICODE_BOM); // this will take care of encoding the BOM correctly
416         }
417     }
418 
419     /***
420      * @see org.w3c.tidy.Out#close()
421      */
422     public void close()
423     {
424         try
425         {
426             this.out.flush();
427             this.out.close();
428         }
429         catch (IOException e)
430         {
431             System.err.println("OutImpl.close: " + e.toString());
432         }
433     }
434 }