1 |
| |
2 |
| |
3 |
| |
4 |
| |
5 |
| |
6 |
| |
7 |
| |
8 |
| |
9 |
| |
10 |
| |
11 |
| |
12 |
| |
13 |
| |
14 |
| |
15 |
| |
16 |
| |
17 |
| |
18 |
| |
19 |
| |
20 |
| |
21 |
| |
22 |
| |
23 |
| |
24 |
| |
25 |
| |
26 |
| |
27 |
| |
28 |
| |
29 |
| |
30 |
| |
31 |
| |
32 |
| |
33 |
| |
34 |
| |
35 |
| |
36 |
| |
37 |
| |
38 |
| |
39 |
| |
40 |
| |
41 |
| |
42 |
| |
43 |
| |
44 |
| |
45 |
| |
46 |
| |
47 |
| |
48 |
| |
49 |
| |
50 |
| |
51 |
| |
52 |
| |
53 |
| |
54 |
| package org.w3c.tidy; |
55 |
| |
56 |
| |
57 |
| |
58 |
| |
59 |
| |
60 |
| public final class EncodingUtils |
61 |
| { |
62 |
| |
63 |
| |
64 |
| |
65 |
| |
66 |
| public static final int UNICODE_BOM_BE = 0xFEFF; |
67 |
| |
68 |
| |
69 |
| |
70 |
| |
71 |
| public static final int UNICODE_BOM = UNICODE_BOM_BE; |
72 |
| |
73 |
| |
74 |
| |
75 |
| |
76 |
| public static final int UNICODE_BOM_LE = 0xFFFE; |
77 |
| |
78 |
| |
79 |
| |
80 |
| |
81 |
| public static final int UNICODE_BOM_UTF8 = 0xEFBBBF; |
82 |
| |
83 |
| |
84 |
| |
85 |
| |
86 |
| |
87 |
| |
88 |
| public static final int FSM_ASCII = 0; |
89 |
| |
90 |
| |
91 |
| |
92 |
| |
93 |
| public static final int FSM_ESC = 1; |
94 |
| |
95 |
| |
96 |
| |
97 |
| |
98 |
| public static final int FSM_ESCD = 2; |
99 |
| |
100 |
| |
101 |
| |
102 |
| |
103 |
| public static final int FSM_ESCDP = 3; |
104 |
| |
105 |
| |
106 |
| |
107 |
| |
108 |
| public static final int FSM_ESCP = 4; |
109 |
| |
110 |
| |
111 |
| |
112 |
| |
113 |
| public static final int FSM_NONASCII = 5; |
114 |
| |
115 |
| |
116 |
| |
117 |
| |
118 |
| public static final int MAX_UTF8_FROM_UCS4 = 0x10FFFF; |
119 |
| |
120 |
| |
121 |
| |
122 |
| |
123 |
| public static final int MAX_UTF16_FROM_UCS4 = 0x10FFFF; |
124 |
| |
125 |
| |
126 |
| |
127 |
| |
128 |
| public static final int LOW_UTF16_SURROGATE = 0xD800; |
129 |
| |
130 |
| |
131 |
| |
132 |
| |
133 |
| public static final int UTF16_SURROGATES_BEGIN = 0x10000; |
134 |
| |
135 |
| |
136 |
| |
137 |
| |
138 |
| public static final int UTF16_LOW_SURROGATE_BEGIN = 0xD800; |
139 |
| |
140 |
| |
141 |
| |
142 |
| |
143 |
| public static final int UTF16_LOW_SURROGATE_END = 0xDBFF; |
144 |
| |
145 |
| |
146 |
| |
147 |
| |
148 |
| public static final int UTF16_HIGH_SURROGATE_BEGIN = 0xDC00; |
149 |
| |
150 |
| |
151 |
| |
152 |
| |
153 |
| public static final int UTF16_HIGH_SURROGATE_END = 0xDFFF; |
154 |
| |
155 |
| |
156 |
| |
157 |
| |
158 |
| public static final int HIGH_UTF16_SURROGATE = 0xDFFF; |
159 |
| |
160 |
| |
161 |
| |
162 |
| |
163 |
| private static final int UTF8_BYTE_SWAP_NOT_A_CHAR = 0xFFFE; |
164 |
| |
165 |
| |
166 |
| |
167 |
| |
168 |
| private static final int UTF8_NOT_A_CHAR = 0xFFFF; |
169 |
| |
170 |
| |
171 |
| |
172 |
| |
173 |
| private static final int[] WIN2UNICODE = { |
174 |
| 0x20AC, |
175 |
| 0x0000, |
176 |
| 0x201A, |
177 |
| 0x0192, |
178 |
| 0x201E, |
179 |
| 0x2026, |
180 |
| 0x2020, |
181 |
| 0x2021, |
182 |
| 0x02C6, |
183 |
| 0x2030, |
184 |
| 0x0160, |
185 |
| 0x2039, |
186 |
| 0x0152, |
187 |
| 0x0000, |
188 |
| 0x017D, |
189 |
| 0x0000, |
190 |
| 0x0000, |
191 |
| 0x2018, |
192 |
| 0x2019, |
193 |
| 0x201C, |
194 |
| 0x201D, |
195 |
| 0x2022, |
196 |
| 0x2013, |
197 |
| 0x2014, |
198 |
| 0x02DC, |
199 |
| 0x2122, |
200 |
| 0x0161, |
201 |
| 0x203A, |
202 |
| 0x0153, |
203 |
| 0x0000, |
204 |
| 0x017E, |
205 |
| 0x0178}; |
206 |
| |
207 |
| |
208 |
| |
209 |
| |
210 |
| private static final int[] MAC2UNICODE = { |
211 |
| |
212 |
| 0x00C4, |
213 |
| 0x00C5, |
214 |
| 0x00C7, |
215 |
| 0x00C9, |
216 |
| 0x00D1, |
217 |
| 0x00D6, |
218 |
| 0x00DC, |
219 |
| 0x00E1, |
220 |
| 0x00E0, |
221 |
| 0x00E2, |
222 |
| 0x00E4, |
223 |
| 0x00E3, |
224 |
| 0x00E5, |
225 |
| 0x00E7, |
226 |
| 0x00E9, |
227 |
| 0x00E8, |
228 |
| 0x00EA, |
229 |
| 0x00EB, |
230 |
| 0x00ED, |
231 |
| 0x00EC, |
232 |
| 0x00EE, |
233 |
| 0x00EF, |
234 |
| 0x00F1, |
235 |
| 0x00F3, |
236 |
| 0x00F2, |
237 |
| 0x00F4, |
238 |
| 0x00F6, |
239 |
| 0x00F5, |
240 |
| 0x00FA, |
241 |
| 0x00F9, |
242 |
| 0x00FB, |
243 |
| 0x00FC, |
244 |
| 0x2020, |
245 |
| 0x00B0, |
246 |
| 0x00A2, |
247 |
| 0x00A3, |
248 |
| 0x00A7, |
249 |
| 0x2022, |
250 |
| 0x00B6, |
251 |
| 0x00DF, |
252 |
| 0x00AE, |
253 |
| 0x00A9, |
254 |
| 0x2122, |
255 |
| 0x00B4, |
256 |
| 0x00A8, |
257 |
| 0x2260, |
258 |
| 0x00C6, |
259 |
| 0x00D8, |
260 |
| 0x221E, |
261 |
| 0x00B1, |
262 |
| 0x2264, |
263 |
| 0x2265, |
264 |
| 0x00A5, |
265 |
| 0x00B5, |
266 |
| 0x2202, |
267 |
| 0x2211, |
268 |
| |
269 |
| 0x220F, |
270 |
| 0x03C0, |
271 |
| 0x222B, |
272 |
| 0x00AA, |
273 |
| 0x00BA, |
274 |
| 0x03A9, |
275 |
| 0x00E6, |
276 |
| 0x00F8, |
277 |
| 0x00BF, |
278 |
| 0x00A1, |
279 |
| 0x00AC, |
280 |
| 0x221A, |
281 |
| 0x0192, |
282 |
| 0x2248, |
283 |
| 0x2206, |
284 |
| 0x00AB, |
285 |
| 0x00BB, |
286 |
| 0x2026, |
287 |
| 0x00A0, |
288 |
| 0x00C0, |
289 |
| 0x00C3, |
290 |
| 0x00D5, |
291 |
| 0x0152, |
292 |
| 0x0153, |
293 |
| 0x2013, |
294 |
| 0x2014, |
295 |
| 0x201C, |
296 |
| 0x201D, |
297 |
| 0x2018, |
298 |
| 0x2019, |
299 |
| 0x00F7, |
300 |
| 0x25CA, |
301 |
| |
302 |
| 0x00FF, |
303 |
| 0x0178, |
304 |
| 0x2044, |
305 |
| 0x20AC, |
306 |
| 0x2039, |
307 |
| 0x203A, |
308 |
| 0xFB01, |
309 |
| 0xFB02, |
310 |
| 0x2021, |
311 |
| 0x00B7, |
312 |
| 0x201A, |
313 |
| 0x201E, |
314 |
| 0x2030, |
315 |
| 0x00C2, |
316 |
| 0x00CA, |
317 |
| 0x00C1, |
318 |
| 0x00CB, |
319 |
| 0x00C8, |
320 |
| 0x00CD, |
321 |
| 0x00CE, |
322 |
| 0x00CF, |
323 |
| 0x00CC, |
324 |
| 0x00D3, |
325 |
| 0x00D4, |
326 |
| |
327 |
| |
328 |
| 0xF8FF, |
329 |
| 0x00D2, |
330 |
| 0x00DA, |
331 |
| 0x00DB, |
332 |
| 0x00D9, |
333 |
| 0x0131, |
334 |
| 0x02C6, |
335 |
| 0x02DC, |
336 |
| 0x00AF, |
337 |
| 0x02D8, |
338 |
| 0x02D9, |
339 |
| 0x02DA, |
340 |
| 0x00B8, |
341 |
| 0x02DD, |
342 |
| 0x02DB, |
343 |
| 0x02C7}; |
344 |
| |
345 |
| |
346 |
| |
347 |
| |
348 |
| |
349 |
| private static final int[] SYMBOL2UNICODE = { |
350 |
| 0x0000, |
351 |
| 0x0001, |
352 |
| 0x0002, |
353 |
| 0x0003, |
354 |
| 0x0004, |
355 |
| 0x0005, |
356 |
| 0x0006, |
357 |
| 0x0007, |
358 |
| 0x0008, |
359 |
| 0x0009, |
360 |
| 0x000A, |
361 |
| 0x000B, |
362 |
| 0x000C, |
363 |
| 0x000D, |
364 |
| 0x000E, |
365 |
| 0x000F, |
366 |
| |
367 |
| 0x0010, |
368 |
| 0x0011, |
369 |
| 0x0012, |
370 |
| 0x0013, |
371 |
| 0x0014, |
372 |
| 0x0015, |
373 |
| 0x0016, |
374 |
| 0x0017, |
375 |
| 0x0018, |
376 |
| 0x0019, |
377 |
| 0x001A, |
378 |
| 0x001B, |
379 |
| 0x001C, |
380 |
| 0x001D, |
381 |
| 0x001E, |
382 |
| 0x001F, |
383 |
| |
384 |
| 0x0020, |
385 |
| 0x0021, |
386 |
| 0x2200, |
387 |
| 0x0023, |
388 |
| 0x2203, |
389 |
| 0x0025, |
390 |
| 0x0026, |
391 |
| 0x220D, |
392 |
| 0x0028, |
393 |
| 0x0029, |
394 |
| 0x2217, |
395 |
| 0x002B, |
396 |
| 0x002C, |
397 |
| 0x2212, |
398 |
| 0x002E, |
399 |
| 0x002F, |
400 |
| |
401 |
| 0x0030, |
402 |
| 0x0031, |
403 |
| 0x0032, |
404 |
| 0x0033, |
405 |
| 0x0034, |
406 |
| 0x0035, |
407 |
| 0x0036, |
408 |
| 0x0037, |
409 |
| 0x0038, |
410 |
| 0x0039, |
411 |
| 0x003A, |
412 |
| 0x003B, |
413 |
| 0x003C, |
414 |
| 0x003D, |
415 |
| 0x003E, |
416 |
| 0x003F, |
417 |
| |
418 |
| 0x2245, |
419 |
| 0x0391, |
420 |
| 0x0392, |
421 |
| 0x03A7, |
422 |
| 0x0394, |
423 |
| 0x0395, |
424 |
| 0x03A6, |
425 |
| 0x0393, |
426 |
| 0x0397, |
427 |
| 0x0399, |
428 |
| 0x03D1, |
429 |
| 0x039A, |
430 |
| 0x039B, |
431 |
| 0x039C, |
432 |
| 0x039D, |
433 |
| 0x039F, |
434 |
| |
435 |
| 0x03A0, |
436 |
| 0x0398, |
437 |
| 0x03A1, |
438 |
| 0x03A3, |
439 |
| 0x03A4, |
440 |
| 0x03A5, |
441 |
| 0x03C2, |
442 |
| 0x03A9, |
443 |
| 0x039E, |
444 |
| 0x03A8, |
445 |
| 0x0396, |
446 |
| 0x005B, |
447 |
| 0x2234, |
448 |
| 0x005D, |
449 |
| 0x22A5, |
450 |
| 0x005F, |
451 |
| |
452 |
| 0x00AF, |
453 |
| 0x03B1, |
454 |
| 0x03B2, |
455 |
| 0x03C7, |
456 |
| 0x03B4, |
457 |
| 0x03B5, |
458 |
| 0x03C6, |
459 |
| 0x03B3, |
460 |
| 0x03B7, |
461 |
| 0x03B9, |
462 |
| 0x03D5, |
463 |
| 0x03BA, |
464 |
| 0x03BB, |
465 |
| 0x03BC, |
466 |
| 0x03BD, |
467 |
| 0x03BF, |
468 |
| |
469 |
| 0x03C0, |
470 |
| 0x03B8, |
471 |
| 0x03C1, |
472 |
| 0x03C3, |
473 |
| 0x03C4, |
474 |
| 0x03C5, |
475 |
| 0x03D6, |
476 |
| 0x03C9, |
477 |
| 0x03BE, |
478 |
| 0x03C8, |
479 |
| 0x03B6, |
480 |
| 0x007B, |
481 |
| 0x007C, |
482 |
| 0x007D, |
483 |
| 0x223C, |
484 |
| 0x003F, |
485 |
| |
486 |
| 0x0000, |
487 |
| 0x0000, |
488 |
| 0x0000, |
489 |
| 0x0000, |
490 |
| 0x0000, |
491 |
| 0x0000, |
492 |
| 0x0000, |
493 |
| 0x0000, |
494 |
| 0x0000, |
495 |
| 0x0000, |
496 |
| 0x0000, |
497 |
| 0x0000, |
498 |
| 0x0000, |
499 |
| 0x0000, |
500 |
| 0x0000, |
501 |
| 0x0000, |
502 |
| |
503 |
| 0x0000, |
504 |
| 0x0000, |
505 |
| 0x0000, |
506 |
| 0x0000, |
507 |
| 0x0000, |
508 |
| 0x0000, |
509 |
| 0x0000, |
510 |
| 0x0000, |
511 |
| 0x0000, |
512 |
| 0x0000, |
513 |
| 0x0000, |
514 |
| 0x0000, |
515 |
| 0x0000, |
516 |
| 0x0000, |
517 |
| 0x0000, |
518 |
| 0x0000, |
519 |
| |
520 |
| 0x00A0, |
521 |
| 0x03D2, |
522 |
| 0x2032, |
523 |
| 0x2264, |
524 |
| 0x2044, |
525 |
| 0x221E, |
526 |
| 0x0192, |
527 |
| 0x2663, |
528 |
| 0x2666, |
529 |
| 0x2665, |
530 |
| 0x2660, |
531 |
| 0x2194, |
532 |
| 0x2190, |
533 |
| 0x2191, |
534 |
| 0x2192, |
535 |
| 0x2193, |
536 |
| |
537 |
| 0x00B0, |
538 |
| 0x00B1, |
539 |
| 0x2033, |
540 |
| 0x2265, |
541 |
| 0x00D7, |
542 |
| 0x221D, |
543 |
| 0x2202, |
544 |
| 0x00B7, |
545 |
| 0x00F7, |
546 |
| 0x2260, |
547 |
| 0x2261, |
548 |
| 0x2248, |
549 |
| 0x2026, |
550 |
| 0x003F, |
551 |
| 0x003F, |
552 |
| 0x21B5, |
553 |
| |
554 |
| 0x2135, |
555 |
| 0x2111, |
556 |
| 0x211C, |
557 |
| 0x2118, |
558 |
| 0x2297, |
559 |
| 0x2295, |
560 |
| 0x2205, |
561 |
| 0x2229, |
562 |
| 0x222A, |
563 |
| 0x2283, |
564 |
| 0x2287, |
565 |
| 0x2284, |
566 |
| 0x2282, |
567 |
| 0x2286, |
568 |
| 0x2208, |
569 |
| 0x2209, |
570 |
| |
571 |
| 0x2220, |
572 |
| 0x2207, |
573 |
| 0x00AE, |
574 |
| 0x00A9, |
575 |
| 0x2122, |
576 |
| 0x220F, |
577 |
| 0x221A, |
578 |
| 0x22C5, |
579 |
| 0x00AC, |
580 |
| 0x2227, |
581 |
| 0x2228, |
582 |
| 0x21D4, |
583 |
| 0x21D0, |
584 |
| 0x21D1, |
585 |
| 0x21D2, |
586 |
| 0x21D3, |
587 |
| |
588 |
| 0x25CA, |
589 |
| 0x2329, |
590 |
| 0x00AE, |
591 |
| 0x00A9, |
592 |
| 0x2122, |
593 |
| 0x2211, |
594 |
| 0x003F, |
595 |
| 0x003F, |
596 |
| 0x003F, |
597 |
| 0x003F, |
598 |
| 0x003F, |
599 |
| 0x003F, |
600 |
| 0x003F, |
601 |
| 0x003F, |
602 |
| 0x003F, |
603 |
| 0x003F, |
604 |
| |
605 |
| 0x20AC, |
606 |
| 0x232A, |
607 |
| 0x222B, |
608 |
| 0x2320, |
609 |
| 0x003F, |
610 |
| 0x2321, |
611 |
| 0x003F, |
612 |
| 0x003F, |
613 |
| 0x003F, |
614 |
| 0x003F, |
615 |
| 0x003F, |
616 |
| 0x003F, |
617 |
| 0x003F, |
618 |
| 0x003F, |
619 |
| 0x003F, |
620 |
| 0x003F}; |
621 |
| |
622 |
| |
623 |
| |
624 |
| |
625 |
| private static final ValidUTF8Sequence[] VALID_UTF8 = { |
626 |
| new ValidUTF8Sequence(0x0000, 0x007F, 1, new char[]{0x00, 0x7F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}), |
627 |
| new ValidUTF8Sequence(0x0080, 0x07FF, 2, new char[]{0xC2, 0xDF, 0x80, 0xBF, 0x00, 0x00, 0x00, 0x00}), |
628 |
| new ValidUTF8Sequence(0x0800, 0x0FFF, 3, new char[]{0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF, 0x00, 0x00}), |
629 |
| new ValidUTF8Sequence(0x1000, 0xFFFF, 3, new char[]{0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF, 0x00, 0x00}), |
630 |
| new ValidUTF8Sequence(0x10000, 0x3FFFF, 4, new char[]{0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF}), |
631 |
| new ValidUTF8Sequence(0x40000, 0xFFFFF, 4, new char[]{0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF}), |
632 |
| new ValidUTF8Sequence(0x100000, 0x10FFFF, 4, new char[]{0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF})}; |
633 |
| |
634 |
| |
635 |
| |
636 |
| |
637 |
| private static final int NUM_UTF8_SEQUENCES = VALID_UTF8.length; |
638 |
| |
639 |
| |
640 |
| |
641 |
| |
642 |
| private static final int[] OFFSET_UTF8_SEQUENCES = {0, |
643 |
| 1, |
644 |
| 2, |
645 |
| 4, |
646 |
| NUM_UTF8_SEQUENCES}; |
647 |
| |
648 |
| |
649 |
| |
650 |
| |
651 |
0
/**
 * Not meant to be instantiated: every member visible in this class is static.
 */
private EncodingUtils()
{
    // intentionally empty
}
655 |
| |
656 |
| |
657 |
| |
658 |
| |
659 |
| |
660 |
| |
661 |
0
| protected static int decodeWin1252(int c)
|
662 |
| { |
663 |
0
| return WIN2UNICODE[c - 128];
|
664 |
| } |
665 |
| |
666 |
| |
667 |
| |
668 |
| |
669 |
| |
670 |
| |
671 |
0
| protected static int decodeMacRoman(int c)
|
672 |
| { |
673 |
0
| if (127 < c)
|
674 |
| { |
675 |
0
| c = MAC2UNICODE[c - 128];
|
676 |
| } |
677 |
0
| return c;
|
678 |
| } |
679 |
| |
680 |
| |
681 |
| |
682 |
| |
683 |
| |
684 |
| |
685 |
0
| static int decodeSymbolFont(int c)
|
686 |
| { |
687 |
0
| if (c > 255)
|
688 |
| { |
689 |
0
| return c;
|
690 |
| } |
691 |
| |
692 |
0
| return SYMBOL2UNICODE[c];
|
693 |
| } |
694 |
| |
695 |
| |
696 |
| |
697 |
| |
698 |
| |
699 |
| |
700 |
| |
701 |
| |
702 |
| |
703 |
| |
704 |
| |
705 |
1025
| static boolean decodeUTF8BytesToChar(int[] c, int firstByte, byte[] successorBytes, GetBytes getter, int[] count,
|
706 |
| int startInSuccessorBytesArray) |
707 |
| { |
708 |
1025
| byte[] buf = new byte[10];
|
709 |
| |
710 |
1025
| int ch = 0;
|
711 |
1025
| int n = 0;
|
712 |
1025
| int i, bytes = 0;
|
713 |
1025
| boolean hasError = false;
|
714 |
| |
715 |
1025
| if (successorBytes.length != 0)
|
716 |
| { |
717 |
1025
| buf = successorBytes;
|
718 |
| } |
719 |
| |
720 |
| |
721 |
1025
| if (firstByte == StreamIn.END_OF_STREAM)
|
722 |
| { |
723 |
| |
724 |
0
| c[0] = firstByte;
|
725 |
0
| count[0] = 1;
|
726 |
0
| return false;
|
727 |
| } |
728 |
| |
729 |
1025
| ch = TidyUtils.toUnsigned(firstByte);
|
730 |
| |
731 |
1025
| if (ch <= 0x7F)
|
732 |
| { |
733 |
3
| n = ch;
|
734 |
3
| bytes = 1;
|
735 |
| } |
736 |
1022
| else if ((ch & 0xE0) == 0xC0)
|
737 |
| { |
738 |
912
| n = ch & 31;
|
739 |
912
| bytes = 2;
|
740 |
| } |
741 |
110
| else if ((ch & 0xF0) == 0xE0)
|
742 |
| { |
743 |
110
| n = ch & 15;
|
744 |
110
| bytes = 3;
|
745 |
| } |
746 |
0
| else if ((ch & 0xF8) == 0xF0)
|
747 |
| { |
748 |
0
| n = ch & 7;
|
749 |
0
| bytes = 4;
|
750 |
| } |
751 |
0
| else if ((ch & 0xFC) == 0xF8)
|
752 |
| { |
753 |
0
| n = ch & 3;
|
754 |
0
| bytes = 5;
|
755 |
0
| hasError = true;
|
756 |
| } |
757 |
0
| else if ((ch & 0xFE) == 0xFC)
|
758 |
| { |
759 |
0
| n = ch & 1;
|
760 |
0
| bytes = 6;
|
761 |
0
| hasError = true;
|
762 |
| } |
763 |
| else |
764 |
| { |
765 |
| |
766 |
0
| n = ch;
|
767 |
0
| bytes = 1;
|
768 |
0
| hasError = true;
|
769 |
| } |
770 |
| |
771 |
1025
| for (i = 1; i < bytes; ++i)
|
772 |
| { |
773 |
1132
| int[] tempCount = new int[1];
|
774 |
| |
775 |
| |
776 |
1132
| if (getter != null && (bytes - i > 0))
|
777 |
| { |
778 |
0
| tempCount[0] = 1;
|
779 |
0
| int[] buftocopy = new int[]{buf[startInSuccessorBytesArray + i - 1]};
|
780 |
| |
781 |
0
| getter.doGet(buftocopy, tempCount, false);
|
782 |
| |
783 |
0
| if (tempCount[0] <= 0)
|
784 |
| { |
785 |
0
| hasError = true;
|
786 |
0
| bytes = i;
|
787 |
0
| break;
|
788 |
| } |
789 |
| } |
790 |
| |
791 |
1132
| if ((buf[startInSuccessorBytesArray + i - 1] & 0xC0) != 0x80)
|
792 |
| { |
793 |
| |
794 |
0
| hasError = true;
|
795 |
0
| bytes = i;
|
796 |
0
| if (getter != null)
|
797 |
| { |
798 |
0
| int[] buftocopy = new int[]{buf[startInSuccessorBytesArray + i - 1]};
|
799 |
0
| tempCount[0] = 1;
|
800 |
0
| getter.doGet(buftocopy, tempCount, true);
|
801 |
| } |
802 |
0
| break;
|
803 |
| } |
804 |
| |
805 |
1132
| n = (n << 6) | (buf[startInSuccessorBytesArray + i - 1] & 0x3F);
|
806 |
| } |
807 |
| |
808 |
1025
| if (!hasError && ((n == UTF8_BYTE_SWAP_NOT_A_CHAR) || (n == UTF8_NOT_A_CHAR)))
|
809 |
| { |
810 |
0
| hasError = true;
|
811 |
| } |
812 |
| |
813 |
1025
| if (!hasError && (n > MAX_UTF8_FROM_UCS4))
|
814 |
| { |
815 |
0
| hasError = true;
|
816 |
| } |
817 |
| |
818 |
1025
| if (!hasError && (n >= UTF16_LOW_SURROGATE_BEGIN) && (n <= UTF16_HIGH_SURROGATE_END))
|
819 |
| { |
820 |
| |
821 |
0
| hasError = true;
|
822 |
| } |
823 |
| |
824 |
1025
| if (!hasError)
|
825 |
| { |
826 |
1025
| int lo = OFFSET_UTF8_SEQUENCES[bytes - 1];
|
827 |
1025
| int hi = OFFSET_UTF8_SEQUENCES[bytes] - 1;
|
828 |
| |
829 |
| |
830 |
1025
| if ((n < VALID_UTF8[lo].lowChar) || (n > VALID_UTF8[hi].highChar))
|
831 |
| { |
832 |
0
| hasError = true;
|
833 |
| } |
834 |
| else |
835 |
| { |
836 |
1025
| hasError = true;
|
837 |
| |
838 |
1025
| for (i = lo; i <= hi; i++)
|
839 |
| { |
840 |
1135
| int tempCount;
|
841 |
1135
| char theByte;
|
842 |
| |
843 |
1135
| for (tempCount = 0; tempCount < bytes; tempCount++)
|
844 |
| { |
845 |
2291
| if (!TidyUtils.toBoolean(tempCount))
|
846 |
| { |
847 |
1135
| theByte = (char) firstByte;
|
848 |
| } |
849 |
| else |
850 |
| { |
851 |
1156
| theByte = (char) buf[startInSuccessorBytesArray + tempCount - 1];
|
852 |
| } |
853 |
2291
| if ((theByte >= VALID_UTF8[i].validBytes[(tempCount * 2)])
|
854 |
| && (theByte <= VALID_UTF8[i].validBytes[(tempCount * 2) + 1])) |
855 |
| { |
856 |
1025
| hasError = false;
|
857 |
| } |
858 |
2291
| if (hasError)
|
859 |
| { |
860 |
98
| break;
|
861 |
| } |
862 |
| } |
863 |
| } |
864 |
| } |
865 |
| } |
866 |
| |
867 |
1025
| count[0] = bytes;
|
868 |
| |
869 |
1025
| c[0] = n;
|
870 |
| |
871 |
| |
872 |
| |
873 |
1025
| return hasError;
|
874 |
| |
875 |
| } |
876 |
| |
877 |
| |
878 |
| |
879 |
| |
880 |
| |
881 |
| |
882 |
| |
883 |
| |
884 |
| |
885 |
331874
| static boolean encodeCharToUTF8Bytes(int c, byte[] encodebuf, PutBytes putter, int[] count)
|
886 |
| { |
887 |
331874
| int bytes = 0;
|
888 |
| |
889 |
331874
| byte[] buf = new byte[10];
|
890 |
| |
891 |
331874
| if (encodebuf != null)
|
892 |
| { |
893 |
331874
| buf = encodebuf;
|
894 |
| } |
895 |
| |
896 |
331874
| boolean hasError = false;
|
897 |
| |
898 |
331874
| if (c <= 0x7F)
|
899 |
| { |
900 |
319513
| buf[0] = (byte) c;
|
901 |
319513
| bytes = 1;
|
902 |
| } |
903 |
12361
| else if (c <= 0x7FF)
|
904 |
| { |
905 |
12251
| buf[0] = (byte) (0xC0 | (c >> 6));
|
906 |
12251
| buf[1] = (byte) (0x80 | (c & 0x3F));
|
907 |
12251
| bytes = 2;
|
908 |
| } |
909 |
110
| else if (c <= 0xFFFF)
|
910 |
| { |
911 |
110
| buf[0] = (byte) (0xE0 | (c >> 12));
|
912 |
110
| buf[1] = (byte) (0x80 | ((c >> 6) & 0x3F));
|
913 |
110
| buf[2] = (byte) (0x80 | (c & 0x3F));
|
914 |
110
| bytes = 3;
|
915 |
110
| if ((c == UTF8_BYTE_SWAP_NOT_A_CHAR) || (c == UTF8_NOT_A_CHAR))
|
916 |
| { |
917 |
0
| hasError = true;
|
918 |
| } |
919 |
110
| else if ((c >= UTF16_LOW_SURROGATE_BEGIN) && (c <= UTF16_HIGH_SURROGATE_END))
|
920 |
| { |
921 |
| |
922 |
0
| hasError = true;
|
923 |
| } |
924 |
| } |
925 |
0
| else if (c <= 0x1FFFFF)
|
926 |
| { |
927 |
0
| buf[0] = (byte) (0xF0 | (c >> 18));
|
928 |
0
| buf[1] = (byte) (0x80 | ((c >> 12) & 0x3F));
|
929 |
0
| buf[2] = (byte) (0x80 | ((c >> 6) & 0x3F));
|
930 |
0
| buf[3] = (byte) (0x80 | (c & 0x3F));
|
931 |
0
| bytes = 4;
|
932 |
0
| if (c > MAX_UTF8_FROM_UCS4)
|
933 |
| { |
934 |
0
| hasError = true;
|
935 |
| } |
936 |
| } |
937 |
0
| else if (c <= 0x3FFFFFF)
|
938 |
| { |
939 |
0
| buf[0] = (byte) (0xF8 | (c >> 24));
|
940 |
0
| buf[1] = (byte) (0x80 | (c >> 18));
|
941 |
0
| buf[2] = (byte) (0x80 | ((c >> 12) & 0x3F));
|
942 |
0
| buf[3] = (byte) (0x80 | ((c >> 6) & 0x3F));
|
943 |
0
| buf[4] = (byte) (0x80 | (c & 0x3F));
|
944 |
0
| bytes = 5;
|
945 |
0
| hasError = true;
|
946 |
| } |
947 |
0
| else if (c <= 0x7FFFFFFF)
|
948 |
| { |
949 |
0
| buf[0] = (byte) (0xFC | (c >> 30));
|
950 |
0
| buf[1] = (byte) (0x80 | ((c >> 24) & 0x3F));
|
951 |
0
| buf[2] = (byte) (0x80 | ((c >> 18) & 0x3F));
|
952 |
0
| buf[3] = (byte) (0x80 | ((c >> 12) & 0x3F));
|
953 |
0
| buf[4] = (byte) (0x80 | ((c >> 6) & 0x3F));
|
954 |
0
| buf[5] = (byte) (0x80 | (c & 0x3F));
|
955 |
0
| bytes = 6;
|
956 |
0
| hasError = true;
|
957 |
| } |
958 |
| else |
959 |
| { |
960 |
0
| hasError = true;
|
961 |
| } |
962 |
| |
963 |
331874
| if (!hasError && putter != null)
|
964 |
| { |
965 |
0
| int[] tempCount = new int[]{bytes};
|
966 |
0
| putter.doPut(buf, tempCount);
|
967 |
| |
968 |
0
| if (tempCount[0] < bytes)
|
969 |
| { |
970 |
0
| hasError = true;
|
971 |
| } |
972 |
| } |
973 |
| |
974 |
331874
| count[0] = bytes;
|
975 |
331874
| return hasError;
|
976 |
| } |
977 |
| |
978 |
| |
979 |
| |
980 |
| |
981 |
| |
982 |
/**
 * Callback used by the UTF-8 decoder to fetch (or push back) bytes one at a time.
 */
interface GetBytes
{

    /**
     * Fetches or un-gets bytes. As used by the decoder in this class, {@code buf} is a one-element array and
     * {@code count[0]} is set to 1 before the call; a value {@code <= 0} in {@code count[0]} afterwards is treated
     * as "no byte available". NOTE(review): the fetched byte is presumably returned in {@code buf[0]} - confirm
     * against the implementations.
     * @param buf holds the byte(s) being transferred
     * @param count {@code count[0]} is the number of bytes requested on entry and is inspected after the call
     * @param unget {@code true} to push the byte(s) in {@code buf} back, {@code false} to read
     */
    void doGet(int[] buf, int[] count, boolean unget);
}
993 |
| |
994 |
| |
995 |
| |
996 |
| |
997 |
/**
 * Callback used by the UTF-8 encoder to hand over the bytes it produced.
 */
interface PutBytes
{

    /**
     * Receives encoded bytes. As used by the encoder in this class, {@code count[0]} holds the number of bytes
     * offered in {@code buf} on entry; if it is smaller than that after the call the encoder reports an error.
     * @param buf buffer whose leading bytes are to be written
     * @param count {@code count[0]} is the number of bytes offered on entry and is inspected after the call
     */
    void doPut(byte[] buf, int[] count);
}
1007 |
| } |