1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54 package org.w3c.tidy;
55
56 import java.util.HashMap;
57 import java.util.Map;
58
59
/**
 * Maps between Java and IANA character encoding names. Also handles the encoding aliases used in tidy c.
 * <p>
 * Lookups are case-insensitive and tolerate a number of common vendor prefixes (see
 * {@link #toIana(String)} and {@link #toJava(String)}). Unknown names yield <code>null</code>.
 * </p>
 * @author Fabrizio Giustina
 * @version $Revision: 804 $ ($Author: fgiust $)
 * @see <a href="http://www.iana.org/assignments/character-sets">IANA character sets</a>
 */
public abstract class EncodingNameMapper
{

    /**
     * Map containing uppercase alias - {standard iana, standard java}. Index 0 of each value is the IANA name,
     * index 1 is the Java name. Populated once by the static initializer and only read afterwards.
     */
    private static final Map encodingNameMap = new HashMap();

    static
    {
        encodingNameMap.put("ISO-8859-1", new String[]{"ISO-8859-1", "ISO8859_1"});
        encodingNameMap.put("ISO8859_1", new String[]{"ISO-8859-1", "ISO8859_1"});
        encodingNameMap.put("ISO-IR-100", new String[]{"ISO-8859-1", "ISO8859_1"});
        encodingNameMap.put("LATIN1", new String[]{"ISO-8859-1", "ISO8859_1"});
        encodingNameMap.put("CSISOLATIN1", new String[]{"ISO-8859-1", "ISO8859_1"});
        encodingNameMap.put("L1", new String[]{"ISO-8859-1", "ISO8859_1"});
        encodingNameMap.put("819", new String[]{"ISO-8859-1", "ISO8859_1"});

        encodingNameMap.put("US-ASCII", new String[]{"US-ASCII", "ASCII"});
        encodingNameMap.put("ASCII", new String[]{"US-ASCII", "ASCII"});
        encodingNameMap.put("ISO-IR-6", new String[]{"US-ASCII", "ASCII"});
        encodingNameMap.put("CSASCII", new String[]{"US-ASCII", "ASCII"});
        encodingNameMap.put("ISO646-US", new String[]{"US-ASCII", "ASCII"});
        encodingNameMap.put("US", new String[]{"US-ASCII", "ASCII"});
        encodingNameMap.put("367", new String[]{"US-ASCII", "ASCII"});

        encodingNameMap.put("UTF-8", new String[]{"UTF-8", "UTF8"});
        encodingNameMap.put("UTF8", new String[]{"UTF-8", "UTF8"});
        encodingNameMap.put("UTF-16", new String[]{"UTF-16", "Unicode"});
        encodingNameMap.put("UNICODE", new String[]{"UTF-16", "Unicode"});
        encodingNameMap.put("UTF16", new String[]{"UTF-16", "Unicode"});

        encodingNameMap.put("UTF-16BE", new String[]{"UTF-16BE", "UnicodeBig"});
        encodingNameMap.put("UNICODEBIG", new String[]{"UTF-16BE", "UnicodeBig"});
        encodingNameMap.put("UTF16-BE", new String[]{"UTF-16BE", "UnicodeBig"});
        encodingNameMap.put("UTF-16LE", new String[]{"UTF-16LE", "UnicodeLittle"});
        encodingNameMap.put("UNICODELITTLE", new String[]{"UTF-16LE", "UnicodeLittle"});
        encodingNameMap.put("UTF16-LE", new String[]{"UTF-16LE", "UnicodeLittle"});
        encodingNameMap.put("UTF16BE", new String[]{"UTF-16BE", "UnicodeBig"});
        encodingNameMap.put("UTF16LE", new String[]{"UTF-16LE", "UnicodeLittle"});

        encodingNameMap.put("BIG5", new String[]{"BIG5", "Big5"});
        encodingNameMap.put("CSBIG5", new String[]{"BIG5", "Big5"});

        encodingNameMap.put("SJIS", new String[]{"SHIFT_JIS", "SJIS"});
        encodingNameMap.put("SHIFT_JIS", new String[]{"SHIFT_JIS", "SJIS"});
        encodingNameMap.put("CSSHIFTJIS", new String[]{"CSSHIFTJIS", "SJIS"});
        encodingNameMap.put("MS_KANJI", new String[]{"MS_KANJI", "SJIS"});
        encodingNameMap.put("SHIFTJIS", new String[]{"SHIFT_JIS", "SJIS"});

        encodingNameMap.put("JIS", new String[]{"ISO-2022-JP", "JIS"});
        encodingNameMap.put("ISO-2022-JP", new String[]{"ISO-2022-JP", "JIS"});
        encodingNameMap.put("CSISO2022JP", new String[]{"CSISO2022JP", "JIS"});
        encodingNameMap.put("ISO2022", new String[]{"ISO-2022-JP", "JIS"});

        encodingNameMap.put("ISO2022KR", new String[]{"ISO-2022-KR", "ISO2022KR"});
        encodingNameMap.put("ISO-2022-KR", new String[]{"ISO-2022-KR", "ISO2022KR"});
        encodingNameMap.put("CSISO2022KR", new String[]{"CSISO2022KR", "ISO2022KR"});
        encodingNameMap.put("ISO-2022-CN", new String[]{"ISO-2022-CN", "ISO2022CN"});
        encodingNameMap.put("ISO2022CN", new String[]{"ISO-2022-CN", "ISO2022CN"});

        encodingNameMap.put("MACROMAN", new String[]{"macintosh", "MacRoman"});
        encodingNameMap.put("MACINTOSH", new String[]{"macintosh", "MacRoman"});
        encodingNameMap.put("MACINTOSH ROMAN", new String[]{"macintosh", "MacRoman"});

        // Code pages are keyed by their bare number: vendor prefixes (IBM, CP, WINDOWS-, ...) are stripped by
        // handlecommonAlias() before the lookup.
        encodingNameMap.put("37", new String[]{"IBM037", "CP037"});
        encodingNameMap.put("273", new String[]{"IBM273", "CP273"});
        encodingNameMap.put("277", new String[]{"IBM277", "CP277"});
        encodingNameMap.put("278", new String[]{"IBM278", "CP278"});
        encodingNameMap.put("280", new String[]{"IBM280", "CP280"});
        encodingNameMap.put("284", new String[]{"IBM284", "CP284"});
        encodingNameMap.put("285", new String[]{"IBM285", "CP285"});
        encodingNameMap.put("290", new String[]{"IBM290", "CP290"});
        encodingNameMap.put("297", new String[]{"IBM297", "CP297"});
        encodingNameMap.put("420", new String[]{"IBM420", "CP420"});
        encodingNameMap.put("424", new String[]{"IBM424", "CP424"});
        encodingNameMap.put("437", new String[]{"IBM437", "CP437"});
        encodingNameMap.put("500", new String[]{"IBM500", "CP500"});
        encodingNameMap.put("775", new String[]{"IBM775", "CP775"});
        encodingNameMap.put("850", new String[]{"IBM850", "CP850"});
        encodingNameMap.put("852", new String[]{"IBM852", "CP852"});
        encodingNameMap.put("CSPCP852", new String[]{"IBM852", "CP852"});
        encodingNameMap.put("855", new String[]{"IBM855", "CP855"});
        encodingNameMap.put("857", new String[]{"IBM857", "CP857"});
        encodingNameMap.put("858", new String[]{"IBM00858", "Cp858"});
        encodingNameMap.put("0858", new String[]{"IBM00858", "Cp858"});
        encodingNameMap.put("860", new String[]{"IBM860", "CP860"});
        encodingNameMap.put("861", new String[]{"IBM861", "CP861"});
        encodingNameMap.put("IS", new String[]{"IBM861", "CP861"});
        encodingNameMap.put("862", new String[]{"IBM862", "CP862"});
        encodingNameMap.put("863", new String[]{"IBM863", "CP863"});
        encodingNameMap.put("864", new String[]{"IBM864", "CP864"});
        encodingNameMap.put("865", new String[]{"IBM865", "CP865"});
        encodingNameMap.put("866", new String[]{"IBM866", "CP866"});
        encodingNameMap.put("868", new String[]{"IBM868", "CP868"});
        encodingNameMap.put("AR", new String[]{"IBM868", "CP868"});
        encodingNameMap.put("869", new String[]{"IBM869", "CP869"});
        encodingNameMap.put("GR", new String[]{"IBM869", "CP869"});
        encodingNameMap.put("870", new String[]{"IBM870", "CP870"});
        encodingNameMap.put("871", new String[]{"IBM871", "CP871"});
        encodingNameMap.put("EBCDIC-CP-IS", new String[]{"IBM871", "CP871"});
        encodingNameMap.put("918", new String[]{"CP918", "CP918"});
        encodingNameMap.put("924", new String[]{"IBM00924", "CP924"});
        encodingNameMap.put("0924", new String[]{"IBM00924", "CP924"});
        encodingNameMap.put("1026", new String[]{"IBM1026", "CP1026"});
        encodingNameMap.put("1047", new String[]{"IBM1047", "Cp1047"});
        encodingNameMap.put("1140", new String[]{"IBM01140", "Cp1140"});
        encodingNameMap.put("1141", new String[]{"IBM01141", "Cp1141"});
        encodingNameMap.put("1142", new String[]{"IBM01142", "Cp1142"});
        encodingNameMap.put("1143", new String[]{"IBM01143", "Cp1143"});
        encodingNameMap.put("1144", new String[]{"IBM01144", "Cp1144"});
        encodingNameMap.put("1145", new String[]{"IBM01145", "Cp1145"});
        encodingNameMap.put("1146", new String[]{"IBM01146", "Cp1146"});
        encodingNameMap.put("1147", new String[]{"IBM01147", "Cp1147"});
        encodingNameMap.put("1148", new String[]{"IBM01148", "Cp1148"});
        encodingNameMap.put("1149", new String[]{"IBM01149", "Cp1149"});
        encodingNameMap.put("1250", new String[]{"WINDOWS-1250", "Cp1250"});
        encodingNameMap.put("1251", new String[]{"WINDOWS-1251", "Cp1251"});
        encodingNameMap.put("1252", new String[]{"WINDOWS-1252", "Cp1252"});
        encodingNameMap.put("WIN1252", new String[]{"WINDOWS-1252", "Cp1252"});
        encodingNameMap.put("1253", new String[]{"WINDOWS-1253", "Cp1253"});
        encodingNameMap.put("1254", new String[]{"WINDOWS-1254", "Cp1254"});
        encodingNameMap.put("1255", new String[]{"WINDOWS-1255", "Cp1255"});
        encodingNameMap.put("1256", new String[]{"WINDOWS-1256", "Cp1256"});
        encodingNameMap.put("1257", new String[]{"WINDOWS-1257", "Cp1257"});
        encodingNameMap.put("1258", new String[]{"WINDOWS-1258", "Cp1258"});

        encodingNameMap.put("EUC-JP", new String[]{"EUC-JP", "EUCJIS"});
        encodingNameMap.put("EUCJIS", new String[]{"EUC-JP", "EUCJIS"});
        encodingNameMap.put("EUC-KR", new String[]{"EUC-KR", "KSC5601"});
        encodingNameMap.put("KSC5601", new String[]{"EUC-KR", "KSC5601"});
        encodingNameMap.put("GB2312", new String[]{"GB2312", "GB2312"});
        encodingNameMap.put("CSGB2312", new String[]{"GB2312", "GB2312"});
        encodingNameMap.put("X0201", new String[]{"X0201", "JIS0201"});
        encodingNameMap.put("JIS0201", new String[]{"X0201", "JIS0201"});
        encodingNameMap.put("X0208", new String[]{"X0208", "JIS0208"});
        encodingNameMap.put("ISO-IR-87", new String[]{"ISO-IR-87", "JIS0208"});
        // The java name JIS0208 has two IANA aliases (X0208 and ISO-IR-87); ISO-IR-87 is the one reported back.
        encodingNameMap.put("JIS0208", new String[]{"ISO-IR-87", "JIS0208"});
        encodingNameMap.put("X0212", new String[]{"X0212", "JIS0212"});
        encodingNameMap.put("JIS0212", new String[]{"X0212", "JIS0212"});
        encodingNameMap.put("ISO-IR-159", new String[]{"X0212", "JIS0212"});
        encodingNameMap.put("GB18030", new String[]{"GB18030", "GB18030"});

        encodingNameMap.put("936", new String[]{"GBK", "GBK"});
        encodingNameMap.put("MS936", new String[]{"GBK", "GBK"});

        encodingNameMap.put("MS932", new String[]{"WINDOWS-31J", "MS932"});
        encodingNameMap.put("31J", new String[]{"WINDOWS-31J", "MS932"});
        encodingNameMap.put("CSWINDOWS31J", new String[]{"WINDOWS-31J", "MS932"});
        encodingNameMap.put("TIS-620", new String[]{"TIS-620", "TIS620"});
        encodingNameMap.put("TIS620", new String[]{"TIS-620", "TIS620"});

        encodingNameMap.put("ISO-8859-2", new String[]{"ISO-8859-2", "ISO8859_2"});
        encodingNameMap.put("ISO8859_2", new String[]{"ISO-8859-2", "ISO8859_2"});
        encodingNameMap.put("ISO-IR-101", new String[]{"ISO-8859-2", "ISO8859_2"});
        encodingNameMap.put("LATIN2", new String[]{"ISO-8859-2", "ISO8859_2"});
        encodingNameMap.put("L2", new String[]{"ISO-8859-2", "ISO8859_2"});

        encodingNameMap.put("ISO-8859-3", new String[]{"ISO-8859-3", "ISO8859_3"});
        encodingNameMap.put("ISO8859_3", new String[]{"ISO-8859-3", "ISO8859_3"});
        encodingNameMap.put("ISO-IR-109", new String[]{"ISO-8859-3", "ISO8859_3"});
        encodingNameMap.put("LATIN3", new String[]{"ISO-8859-3", "ISO8859_3"});
        encodingNameMap.put("L3", new String[]{"ISO-8859-3", "ISO8859_3"});

        encodingNameMap.put("ISO-8859-4", new String[]{"ISO-8859-4", "ISO8859_4"});
        encodingNameMap.put("ISO8859_4", new String[]{"ISO-8859-4", "ISO8859_4"});
        encodingNameMap.put("ISO-IR-110", new String[]{"ISO-8859-4", "ISO8859_4"});
        encodingNameMap.put("LATIN4", new String[]{"ISO-8859-4", "ISO8859_4"});
        encodingNameMap.put("L4", new String[]{"ISO-8859-4", "ISO8859_4"});

        encodingNameMap.put("ISO-8859-5", new String[]{"ISO-8859-5", "ISO8859_5"});
        encodingNameMap.put("ISO8859_5", new String[]{"ISO-8859-5", "ISO8859_5"});
        encodingNameMap.put("ISO-IR-144", new String[]{"ISO-8859-5", "ISO8859_5"});
        encodingNameMap.put("CYRILLIC", new String[]{"ISO-8859-5", "ISO8859_5"});

        encodingNameMap.put("ISO-8859-6", new String[]{"ISO-8859-6", "ISO8859_6"});
        encodingNameMap.put("ISO8859_6", new String[]{"ISO-8859-6", "ISO8859_6"});
        encodingNameMap.put("ISO-IR-127", new String[]{"ISO-8859-6", "ISO8859_6"});
        encodingNameMap.put("ARABIC", new String[]{"ISO-8859-6", "ISO8859_6"});

        encodingNameMap.put("ISO-8859-7", new String[]{"ISO-8859-7", "ISO8859_7"});
        encodingNameMap.put("ISO8859_7", new String[]{"ISO-8859-7", "ISO8859_7"});
        encodingNameMap.put("ISO-IR-126", new String[]{"ISO-8859-7", "ISO8859_7"});
        encodingNameMap.put("GREEK", new String[]{"ISO-8859-7", "ISO8859_7"});

        encodingNameMap.put("ISO-8859-8", new String[]{"ISO-8859-8", "ISO8859_8"});
        encodingNameMap.put("ISO8859_8", new String[]{"ISO-8859-8", "ISO8859_8"});
        encodingNameMap.put("ISO-8859-8-I", new String[]{"ISO-8859-8", "ISO8859_8"});
        encodingNameMap.put("ISO-IR-138", new String[]{"ISO-8859-8", "ISO8859_8"});
        encodingNameMap.put("HEBREW", new String[]{"ISO-8859-8", "ISO8859_8"});
        // csISOLatinHebrew is the IANA alias of ISO-8859-8 (Hebrew), not of ISO-8859-9 (Latin5/Turkish).
        encodingNameMap.put("CSISOLATINHEBREW", new String[]{"ISO-8859-8", "ISO8859_8"});

        encodingNameMap.put("ISO-8859-9", new String[]{"ISO-8859-9", "ISO8859_9"});
        encodingNameMap.put("ISO8859_9", new String[]{"ISO-8859-9", "ISO8859_9"});
        encodingNameMap.put("ISO-IR-148", new String[]{"ISO-8859-9", "ISO8859_9"});
        encodingNameMap.put("LATIN5", new String[]{"ISO-8859-9", "ISO8859_9"});
        encodingNameMap.put("CSISOLATIN5", new String[]{"ISO-8859-9", "ISO8859_9"});
        encodingNameMap.put("L5", new String[]{"ISO-8859-9", "ISO8859_9"});

        encodingNameMap.put("ISO-8859-15", new String[]{"ISO-8859-15", "ISO8859_15"});
        encodingNameMap.put("ISO8859_15", new String[]{"ISO-8859-15", "ISO8859_15"});

        encodingNameMap.put("KOI8-R", new String[]{"KOI8-R", "KOI8_R"});
        encodingNameMap.put("KOI8_R", new String[]{"KOI8-R", "KOI8_R"});
        encodingNameMap.put("CSKOI8R", new String[]{"CSKOI8R", "KOI8_R"});
    }

    /**
     * Convert a Java character encoding name to its IANA equivalent. The lookup is case-insensitive and accepts any
     * alias known to this class, with or without the common vendor prefixes.
     * @param encoding java encoding name or alias, may be <code>null</code>
     * @return iana equivalent or null if no match is found (or if <code>encoding</code> is null).
     */
    public static String toIana(String encoding)
    {
        String[] values = lookup(encoding);
        return values == null ? null : values[0];
    }

    /**
     * Finds the {iana, java} pair registered for the given name.
     * <p>
     * The name is first normalized by {@link #handlecommonAlias(String)}. If the normalized key is unknown and starts
     * with zeros, the lookup is retried without them, so that zero padded code page names such as
     * <code>csIBM037</code>, <code>IBM-037</code> or <code>CCSID00858</code> resolve to the entries registered under
     * the bare number.
     * </p>
     * @param encoding encoding name or alias, may be <code>null</code>
     * @return two elements array {iana, java} or null if no match is found.
     */
    private static String[] lookup(String encoding)
    {
        if (encoding == null)
        {
            return null;
        }

        String key = handlecommonAlias(encoding);
        String[] values = (String[]) encodingNameMap.get(key);

        if (values == null && key.startsWith("0"))
        {
            // retry only after a miss, so that explicit keys like "0858" keep working unchanged
            int firstNonZero = 0;
            while (firstNonZero < key.length() && key.charAt(firstNonZero) == '0')
            {
                firstNonZero++;
            }
            values = (String[]) encodingNameMap.get(key.substring(firstNonZero));
        }

        return values;
    }

    /**
     * "Fix" the name for common alias to reduce the number of entries needed in the hashmap. It actually removes CSIBM,
     * CCSID, IBM-, IBM0, CP-0, IBM, CP0, CP-, CP, WINDOWS- prefixes from given name, and rewrites a leading ISO_ to
     * ISO-. The result is always uppercase.
     * @param encoding encoding name, must not be null
     * @return "fixed" encoding.
     */
    private static String handlecommonAlias(String encoding)
    {
        // Encoding names are plain ASCII: use a fixed locale so that the default one cannot alter the result
        // (with a Turkish default locale "iso" would be uppercased to "\u0130SO" and never match).
        String key = encoding.toUpperCase(java.util.Locale.ENGLISH);

        // Longer prefixes are tested first, only one prefix is ever removed.
        if (key.startsWith("CSIBM") || key.startsWith("CCSID"))
        {
            key = key.substring(5);
        }
        else if (key.startsWith("IBM-") || key.startsWith("IBM0") || key.startsWith("CP-0"))
        {
            key = key.substring(4);
        }
        else if (key.startsWith("IBM") || key.startsWith("CP0") || key.startsWith("CP-"))
        {
            key = key.substring(3);
        }
        else if (key.startsWith("CP"))
        {
            key = key.substring(2);
        }
        else if (key.startsWith("WINDOWS-"))
        {
            key = key.substring(8);
        }
        else if (key.startsWith("ISO_"))
        {
            key = "ISO-" + key.substring(4);
        }

        return key;
    }

    /**
     * Converts an encoding name to the standard java name. Handles IANA names, legacy names used in tidy and different
     * java encoding alias. See http://www.iana.org/assignments/character-sets. The lookup is case-insensitive.
     * @param encoding IANA encoding name or alias, may be <code>null</code>
     * @return java equivalent or null if no match is found (or if <code>encoding</code> is null).
     */
    public static String toJava(String encoding)
    {
        String[] values = lookup(encoding);
        return values == null ? null : values[1];
    }
}