View Javadoc

1   /*
2    *  Java HTML Tidy - JTidy
3    *  HTML parser and pretty printer
4    *
5    *  Copyright (c) 1998-2000 World Wide Web Consortium (Massachusetts
6    *  Institute of Technology, Institut National de Recherche en
7    *  Informatique et en Automatique, Keio University). All Rights
8    *  Reserved.
9    *
10   *  Contributing Author(s):
11   *
12   *     Dave Raggett <dsr@w3.org>
13   *     Andy Quick <ac.quick@sympatico.ca> (translation to Java)
14   *     Gary L Peskin <garyp@firstech.com> (Java development)
15   *     Sami Lempinen <sami@lempinen.net> (release management)
16   *     Fabrizio Giustina <fgiust at users.sourceforge.net>
17   *
18   *  The contributing author(s) would like to thank all those who
19   *  helped with testing, bug fixes, and patience.  This wouldn't
20   *  have been possible without all of you.
21   *
22   *  COPYRIGHT NOTICE:
23   *
24   *  This software and documentation is provided "as is," and
25   *  the copyright holders and contributing author(s) make no
26   *  representations or warranties, express or implied, including
27   *  but not limited to, warranties of merchantability or fitness
28   *  for any particular purpose or that the use of the software or
29   *  documentation will not infringe any third party patents,
30   *  copyrights, trademarks or other rights.
31   *
32   *  The copyright holders and contributing author(s) will not be
33   *  liable for any direct, indirect, special or consequential damages
34   *  arising out of any use of the software or documentation, even if
35   *  advised of the possibility of such damage.
36   *
37   *  Permission is hereby granted to use, copy, modify, and distribute
38   *  this source code, or portions hereof, documentation and executables,
39   *  for any purpose, without fee, subject to the following restrictions:
40   *
41   *  1. The origin of this source code must not be misrepresented.
42   *  2. Altered versions must be plainly marked as such and must
43   *     not be misrepresented as being the original source.
44   *  3. This Copyright notice may not be removed or altered from any
45   *     source or altered source distribution.
46   *
47   *  The copyright holders and contributing author(s) specifically
48   *  permit, without fee, and encourage the use of this source code
49   *  as a component for supporting the Hypertext Markup Language in
50   *  commercial products. If you use this source code in a product,
51   *  acknowledgment is not required but would be appreciated.
52   *
53   */
54  package org.w3c.tidy;
55  
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
58  
59  
/**
 * Maps between Java and IANA character encoding names. Also handles the encoding aliases used in tidy c.
 * <p>
 * Lookups are case-insensitive and tolerate the common code page prefixes (see
 * {@link #toIana(String)} and {@link #toJava(String)}). The alias table is filled once in the static initializer and
 * is only read afterwards.
 * </p>
 * @author Fabrizio Giustina
 * @version $Revision: 804 $ ($Author: fgiust $)
 * @see <a href="http://www.iana.org/assignments/character-sets">IANA character sets registry</a>
 */
public abstract class EncodingNameMapper
{

    /**
     * Map containing uppercase alias - {standard iana, standard java}.
     */
    private static final Map encodingNameMap = new HashMap();

    static
    {
        register("ISO-8859-1", "ISO-8859-1", "ISO8859_1");
        register("ISO8859_1", "ISO-8859-1", "ISO8859_1");
        register("ISO-IR-100", "ISO-8859-1", "ISO8859_1");
        register("LATIN1", "ISO-8859-1", "ISO8859_1");
        register("CSISOLATIN1", "ISO-8859-1", "ISO8859_1");
        register("L1", "ISO-8859-1", "ISO8859_1");
        register("819", "ISO-8859-1", "ISO8859_1");

        register("US-ASCII", "US-ASCII", "ASCII");
        register("ASCII", "US-ASCII", "ASCII");
        register("ISO-IR-6", "US-ASCII", "ASCII");
        register("CSASCII", "US-ASCII", "ASCII");
        register("ISO646-US", "US-ASCII", "ASCII");
        register("US", "US-ASCII", "ASCII");
        register("367", "US-ASCII", "ASCII");

        register("UTF-8", "UTF-8", "UTF8");
        register("UTF8", "UTF-8", "UTF8");
        register("UTF-16", "UTF-16", "Unicode");
        register("UNICODE", "UTF-16", "Unicode");
        register("UTF16", "UTF-16", "Unicode"); // java alias, also used by tidy

        register("UTF-16BE", "UTF-16BE", "UnicodeBig");
        register("UNICODEBIG", "UTF-16BE", "UnicodeBig");
        register("UTF16-BE", "UTF-16BE", "UnicodeBig");
        register("UTF-16LE", "UTF-16LE", "UnicodeLittle");
        register("UNICODELITTLE", "UTF-16LE", "UnicodeLittle");
        register("UTF16-LE", "UTF-16LE", "UnicodeLittle");
        register("UTF16BE", "UTF-16BE", "UnicodeBig"); // tidy
        register("UTF16LE", "UTF-16LE", "UnicodeLittle"); // tidy

        register("BIG5", "BIG5", "Big5");
        register("CSBIG5", "BIG5", "Big5");

        register("SJIS", "SHIFT_JIS", "SJIS");
        register("SHIFT_JIS", "SHIFT_JIS", "SJIS");
        register("CSSHIFTJIS", "CSSHIFTJIS", "SJIS");
        register("MS_KANJI", "MS_KANJI", "SJIS");
        register("SHIFTJIS", "SHIFT_JIS", "SJIS"); // tidy

        register("JIS", "ISO-2022-JP", "JIS");
        register("ISO-2022-JP", "ISO-2022-JP", "JIS");
        register("CSISO2022JP", "CSISO2022JP", "JIS");
        register("ISO2022", "ISO-2022-JP", "JIS"); // tidy

        register("ISO2022KR", "ISO-2022-KR", "ISO2022KR");
        register("ISO-2022-KR", "ISO-2022-KR", "ISO2022KR");
        register("CSISO2022KR", "CSISO2022KR", "ISO2022KR");
        register("ISO-2022-CN", "ISO-2022-CN", "ISO2022CN");
        register("ISO2022CN", "ISO-2022-CN", "ISO2022CN");

        register("MACROMAN", "macintosh", "MacRoman"); // tidy
        register("MACINTOSH", "macintosh", "MacRoman");
        register("MACINTOSH ROMAN", "macintosh", "MacRoman");

        // code pages: keys are what is left after handlecommonAlias() has stripped the IBM/CP/WINDOWS- prefix
        register("37", "IBM037", "CP037");
        register("273", "IBM273", "CP273");
        register("277", "IBM277", "CP277");
        register("278", "IBM278", "CP278");
        register("280", "IBM280", "CP280");
        register("284", "IBM284", "CP284");
        register("285", "IBM285", "CP285");
        register("290", "IBM290", "CP290");
        register("297", "IBM297", "CP297");
        register("420", "IBM420", "CP420");
        register("424", "IBM424", "CP424");
        register("437", "IBM437", "CP437");
        register("500", "IBM500", "CP500");
        register("775", "IBM775", "CP775");
        register("850", "IBM850", "CP850");
        register("852", "IBM852", "CP852");
        register("CSPCP852", "IBM852", "CP852");
        register("855", "IBM855", "CP855");
        register("857", "IBM857", "CP857");
        register("858", "IBM00858", "Cp858");
        register("0858", "IBM00858", "Cp858");
        register("860", "IBM860", "CP860");
        register("861", "IBM861", "CP861");
        register("IS", "IBM861", "CP861"); // "CP-IS" after prefix removal
        register("862", "IBM862", "CP862");
        register("863", "IBM863", "CP863");
        register("864", "IBM864", "CP864");
        register("865", "IBM865", "CP865");
        register("866", "IBM866", "CP866");
        register("868", "IBM868", "CP868");
        register("AR", "IBM868", "CP868"); // "CP-AR" after prefix removal
        register("869", "IBM869", "CP869");
        register("GR", "IBM869", "CP869"); // "CP-GR" after prefix removal
        register("870", "IBM870", "CP870");
        register("871", "IBM871", "CP871");
        register("EBCDIC-CP-IS", "IBM871", "CP871");
        register("918", "CP918", "CP918");
        register("924", "IBM00924", "CP924");
        register("0924", "IBM00924", "CP924");
        register("1026", "IBM1026", "CP1026");
        register("1047", "IBM1047", "Cp1047");
        register("1140", "IBM01140", "Cp1140");
        register("1141", "IBM01141", "Cp1141");
        register("1142", "IBM01142", "Cp1142");
        register("1143", "IBM01143", "Cp1143");
        register("1144", "IBM01144", "Cp1144");
        register("1145", "IBM01145", "Cp1145");
        register("1146", "IBM01146", "Cp1146");
        register("1147", "IBM01147", "Cp1147");
        register("1148", "IBM01148", "Cp1148");
        register("1149", "IBM01149", "Cp1149");
        register("1250", "WINDOWS-1250", "Cp1250");
        register("1251", "WINDOWS-1251", "Cp1251");
        register("1252", "WINDOWS-1252", "Cp1252");
        register("WIN1252", "WINDOWS-1252", "Cp1252"); // tidy
        register("1253", "WINDOWS-1253", "Cp1253");
        register("1254", "WINDOWS-1254", "Cp1254");
        register("1255", "WINDOWS-1255", "Cp1255");
        register("1256", "WINDOWS-1256", "Cp1256");
        register("1257", "WINDOWS-1257", "Cp1257");
        register("1258", "WINDOWS-1258", "Cp1258");

        register("EUC-JP", "EUC-JP", "EUCJIS");
        register("EUCJIS", "EUC-JP", "EUCJIS");
        register("EUC-KR", "EUC-KR", "KSC5601");
        register("KSC5601", "EUC-KR", "KSC5601");
        register("GB2312", "GB2312", "GB2312");
        register("CSGB2312", "GB2312", "GB2312");
        register("X0201", "X0201", "JIS0201");
        register("JIS0201", "X0201", "JIS0201");
        register("X0208", "X0208", "JIS0208");
        register("ISO-IR-87", "ISO-IR-87", "JIS0208");
        // JIS0208 used to be registered twice; the entry that was effective (the last one) is the one kept here
        register("JIS0208", "ISO-IR-87", "JIS0208");
        register("X0212", "X0212", "JIS0212");
        register("JIS0212", "X0212", "JIS0212");
        register("ISO-IR-159", "X0212", "JIS0212");
        register("GB18030", "GB18030", "GB18030");

        register("936", "GBK", "GBK");
        register("MS936", "GBK", "GBK");

        register("MS932", "WINDOWS-31J", "MS932");
        register("31J", "WINDOWS-31J", "MS932"); // "WINDOWS-31J" after prefix removal
        register("CSWINDOWS31J", "WINDOWS-31J", "MS932");
        register("TIS-620", "TIS-620", "TIS620");
        register("TIS620", "TIS-620", "TIS620");

        register("ISO-8859-2", "ISO-8859-2", "ISO8859_2");
        register("ISO8859_2", "ISO-8859-2", "ISO8859_2");
        register("ISO-IR-101", "ISO-8859-2", "ISO8859_2");
        register("LATIN2", "ISO-8859-2", "ISO8859_2");
        register("L2", "ISO-8859-2", "ISO8859_2");

        register("ISO-8859-3", "ISO-8859-3", "ISO8859_3");
        register("ISO8859_3", "ISO-8859-3", "ISO8859_3");
        register("ISO-IR-109", "ISO-8859-3", "ISO8859_3");
        register("LATIN3", "ISO-8859-3", "ISO8859_3");
        register("L3", "ISO-8859-3", "ISO8859_3");

        register("ISO-8859-4", "ISO-8859-4", "ISO8859_4");
        register("ISO8859_4", "ISO-8859-4", "ISO8859_4");
        register("ISO-IR-110", "ISO-8859-4", "ISO8859_4");
        register("LATIN4", "ISO-8859-4", "ISO8859_4");
        register("L4", "ISO-8859-4", "ISO8859_4");

        register("ISO-8859-5", "ISO-8859-5", "ISO8859_5");
        register("ISO8859_5", "ISO-8859-5", "ISO8859_5");
        register("ISO-IR-144", "ISO-8859-5", "ISO8859_5");
        register("CYRILLIC", "ISO-8859-5", "ISO8859_5");

        register("ISO-8859-6", "ISO-8859-6", "ISO8859_6");
        register("ISO8859_6", "ISO-8859-6", "ISO8859_6");
        register("ISO-IR-127", "ISO-8859-6", "ISO8859_6");
        register("ARABIC", "ISO-8859-6", "ISO8859_6");

        register("ISO-8859-7", "ISO-8859-7", "ISO8859_7");
        register("ISO8859_7", "ISO-8859-7", "ISO8859_7");
        register("ISO-IR-126", "ISO-8859-7", "ISO8859_7");
        register("GREEK", "ISO-8859-7", "ISO8859_7");

        register("ISO-8859-8", "ISO-8859-8", "ISO8859_8");
        register("ISO8859_8", "ISO-8859-8", "ISO8859_8");
        register("ISO-8859-8-I", "ISO-8859-8", "ISO8859_8");
        register("ISO-IR-138", "ISO-8859-8", "ISO8859_8");
        register("HEBREW", "ISO-8859-8", "ISO8859_8");
        register("CSISOLATINHEBREW", "ISO-8859-8", "ISO8859_8");

        register("ISO-8859-9", "ISO-8859-9", "ISO8859_9");
        register("ISO8859_9", "ISO-8859-9", "ISO8859_9");
        register("ISO-IR-148", "ISO-8859-9", "ISO8859_9");
        register("LATIN5", "ISO-8859-9", "ISO8859_9");
        register("CSISOLATIN5", "ISO-8859-9", "ISO8859_9");
        register("L5", "ISO-8859-9", "ISO8859_9");

        register("ISO-8859-15", "ISO-8859-15", "ISO8859_15");
        register("ISO8859_15", "ISO-8859-15", "ISO8859_15");

        register("KOI8-R", "KOI8-R", "KOI8_R");
        register("KOI8_R", "KOI8-R", "KOI8_R");
        register("CSKOI8R", "CSKOI8R", "KOI8_R");
    }

    /**
     * Adds one entry to the alias table.
     * @param alias lookup key: uppercase alias, already in the form returned by {@link #handlecommonAlias(String)}
     * @param iana IANA name returned by {@link #toIana(String)} for this alias
     * @param java java name returned by {@link #toJava(String)} for this alias
     */
    private static void register(String alias, String iana, String java)
    {
        encodingNameMap.put(alias, new String[]{iana, java});
    }

    /**
     * Looks up the {iana, java} pair registered for the given encoding name.
     * @param encoding encoding name or alias, may be <code>null</code>
     * @return two-element array {iana name, java name} or <code>null</code> if encoding is null or unknown
     */
    private static String[] lookup(String encoding)
    {
        if (encoding == null)
        {
            return null;
        }
        return (String[]) encodingNameMap.get(handlecommonAlias(encoding));
    }

    /**
     * Convert a Java character encoding name to its IANA equivalent.
     * @param encoding java encoding name or alias
     * @return iana equivalent or null if no match is found.
     */
    public static String toIana(String encoding)
    {
        String[] values = lookup(encoding);
        return values == null ? null : values[0];
    }

    /**
     * "Fix" the name for common alias to reduce the number of entries needed in the hashmap. The name is converted to
     * uppercase, then at most one of the CSIBM, CCSID, IBM-, IBM0, CP-0, IBM, CP0, CP-, CP, WINDOWS- prefixes is
     * removed from it. If none of them matches, a leading ISO_ is rewritten to ISO-.
     * @param encoding encoding name
     * @return "fixed" encoding.
     */
    private static String handlecommonAlias(String encoding)
    {
        // fixed locale: with the default one "iso-8859-1" would not become "ISO-8859-1" under a turkish locale
        String key = encoding.toUpperCase(Locale.ENGLISH);

        // handle common alias; longer prefixes must be tested before the shorter ones they start with
        if (key.startsWith("CSIBM") || key.startsWith("CCSID"))
        {
            key = key.substring(5);
        }
        else if (key.startsWith("IBM-") || key.startsWith("IBM0") || key.startsWith("CP-0"))
        {
            key = key.substring(4);
        }
        else if (key.startsWith("IBM") || key.startsWith("CP0") || key.startsWith("CP-"))
        {
            key = key.substring(3);
        }
        else if (key.startsWith("CP"))
        {
            key = key.substring(2);
        }
        else if (key.startsWith("WINDOWS-"))
        {
            key = key.substring(8);
        }
        else if (key.startsWith("ISO_"))
        {
            key = "ISO-" + key.substring(4);
        }

        return key;
    }

    /**
     * Converts an encoding name to the standard java name. Handles IANA names, legacy names used in tidy and different
     * java encoding alias. See http://www.iana.org/assignments/character-sets.
     * @param encoding IANA encoding name or alias
     * @return java equivalent or null if no match is found.
     */
    public static String toJava(String encoding)
    {
        String[] values = lookup(encoding);
        return values == null ? null : values[1];
    }
}