View Javadoc

1   /*
2    *  Java HTML Tidy - JTidy
3    *  HTML parser and pretty printer
4    *
5    *  Copyright (c) 1998-2000 World Wide Web Consortium (Massachusetts
6    *  Institute of Technology, Institut National de Recherche en
7    *  Informatique et en Automatique, Keio University). All Rights
8    *  Reserved.
9    *
10   *  Contributing Author(s):
11   *
12   *     Dave Raggett <dsr@w3.org>
13   *     Andy Quick <ac.quick@sympatico.ca> (translation to Java)
14   *     Gary L Peskin <garyp@firstech.com> (Java development)
15   *     Sami Lempinen <sami@lempinen.net> (release management)
16   *     Fabrizio Giustina <fgiust at users.sourceforge.net>
17   *
18   *  The contributing author(s) would like to thank all those who
19   *  helped with testing, bug fixes, and patience.  This wouldn't
20   *  have been possible without all of you.
21   *
22   *  COPYRIGHT NOTICE:
23   *
24   *  This software and documentation is provided "as is," and
25   *  the copyright holders and contributing author(s) make no
26   *  representations or warranties, express or implied, including
27   *  but not limited to, warranties of merchantability or fitness
28   *  for any particular purpose or that the use of the software or
29   *  documentation will not infringe any third party patents,
30   *  copyrights, trademarks or other rights.
31   *
32   *  The copyright holders and contributing author(s) will not be
33   *  liable for any direct, indirect, special or consequential damages
34   *  arising out of any use of the software or documentation, even if
35   *  advised of the possibility of such damage.
36   *
37   *  Permission is hereby granted to use, copy, modify, and distribute
38   *  this source code, or portions hereof, documentation and executables,
39   *  for any purpose, without fee, subject to the following restrictions:
40   *
41   *  1. The origin of this source code must not be misrepresented.
42   *  2. Altered versions must be plainly marked as such and must
43   *     not be misrepresented as being the original source.
44   *  3. This Copyright notice may not be removed or altered from any
45   *     source or altered source distribution.
46   *
47   *  The copyright holders and contributing author(s) specifically
48   *  permit, without fee, and encourage the use of this source code
49   *  as a component for supporting the Hypertext Markup Language in
50   *  commercial products. If you use this source code in a product,
51   *  acknowledgment is not required but would be appreciated.
52   *
53   */
54  
55  package org.w3c.tidy;
56  
57  /**
58   * Utility class with handy methods, mainly for String handling or for reproducing c behaviours.
59   * @author Fabrizio Giustina
60   * @version $Revision $ ($Author $)
61   */
62  public final class TidyUtils
63  {
64  
65      /**
66       * char type: digit.
67       */
68      private static final short DIGIT = 1;
69  
70      /**
71       * char type: letter.
72       */
73      private static final short LETTER = 2;
74  
75      /**
76       * char type: namechar.
77       */
78      private static final short NAMECHAR = 4;
79  
80      /**
81       * char type: whitespace.
82       */
83      private static final short WHITE = 8;
84  
85      /**
86       * char type: newline.
87       */
88      private static final short NEWLINE = 16;
89  
90      /**
91       * char type: lowercase.
92       */
93      private static final short LOWERCASE = 32;
94  
95      /**
96       * char type: uppercase.
97       */
98      private static final short UPPERCASE = 64;
99  
100     /**
101      * used to classify chars for lexical purposes.
102      */
103     private static short[] lexmap = new short[128];
104 
// Fills the lexical classification table. Flags are OR-ed into the entries, so the order of the calls is
// irrelevant.
static
{
    // letters: also valid name chars, tagged with their case
    mapStr("ABCDEFGHIJKLMNOPQRSTUVWXYZ", (short) (UPPERCASE | LETTER | NAMECHAR));
    mapStr("abcdefghijklmnopqrstuvwxyz", (short) (LOWERCASE | LETTER | NAMECHAR));
    // digits: also valid name chars
    mapStr("0123456789", (short) (DIGIT | NAMECHAR));
    // punctuation allowed in names
    mapStr("-.:_", NAMECHAR);
    // blanks
    mapStr(" \t", WHITE);
    // line breaks count as whitespace too
    mapStr("\r\n\f", (short) (NEWLINE | WHITE));
}
114 
/**
 * Private constructor: this class only offers static helpers and must not be instantiated.
 */
private TidyUtils()
{
    // nothing to initialize
}
122 
123     /**
124      * Converts a int to a boolean.
125      * @param value int value
126      * @return <code>true</code> if value is != 0
127      */
128     static boolean toBoolean(int value)
129     {
130         return value != 0;
131     }
132 
133     /**
134      * convert an int to unsigned (& 0xFF).
135      * @param c signed int
136      * @return unsigned int
137      */
138     static int toUnsigned(int c)
139     {
140         return c & 0xFF;
141     }
142 
143     /**
144      * check if the first String contains the second one.
145      * @param s1 full String
146      * @param len1 maximum position in String
147      * @param s2 String to search for
148      * @return true if s1 contains s2 in the range 0-len1
149      */
150     static boolean wsubstrn(String s1, int len1, String s2)
151     {
152         int searchIndex = s1.indexOf(s2);
153         return searchIndex > -1 && searchIndex <= len1;
154     }
155 
156     /**
157      * check if the first String contains the second one (ignore case).
158      * @param s1 full String
159      * @param len1 maximum position in String
160      * @param s2 String to search for
161      * @return true if s1 contains s2 in the range 0-len1
162      */
163     static boolean wsubstrncase(String s1, int len1, String s2)
164     {
165         return wsubstrn(s1.toLowerCase(), len1, s2.toLowerCase());
166     }
167 
168     /**
169      * return offset of cc from beginning of s1, -1 if not found.
170      * @param s1 String
171      * @param len1 maximum offset (values > than lenl are ignored and returned as -1)
172      * @param cc character to search for
173      * @return index of cc in s1
174      */
175     static int wstrnchr(String s1, int len1, char cc)
176     {
177         int indexOf = s1.indexOf(cc);
178         if (indexOf < len1)
179         {
180             return indexOf;
181         }
182 
183         return -1;
184     }
185 
186     /**
187      * Same as wsubstrn, but without a specified length.
188      * @param s1 full String
189      * @param s2 String to search for
190      * @return <code>true</code> if s2 is found in s2 (case insensitive search)
191      */
192     static boolean wsubstr(String s1, String s2)
193     {
194         int i;
195         int len1 = s1.length();
196         int len2 = s2.length();
197 
198         for (i = 0; i <= len1 - len2; ++i)
199         {
200             if (s2.equalsIgnoreCase(s1.substring(i)))
201             {
202                 return true;
203             }
204         }
205 
206         return false;
207     }
208 
209     /**
210      * Is the character a hex digit?
211      * @param c char
212      * @return <code>true</code> if he given character is a hex digit
213      */
214     static boolean isxdigit(char c)
215     {
216         return Character.isDigit(c) || (Character.toLowerCase(c) >= 'a' && Character.toLowerCase(c) <= 'f');
217     }
218 
219     /**
220      * Check if the string valueToCheck is contained in validValues array (case insesitie comparison).
221      * @param validValues array of valid values
222      * @param valueToCheck value to search for
223      * @return <code>true</code> if valueToCheck is found in validValues
224      */
225     static boolean isInValuesIgnoreCase(String[] validValues, String valueToCheck)
226     {
227         int len = validValues.length;
228         for (int j = 0; j < len; j++)
229         {
230             if (validValues[j].equalsIgnoreCase(valueToCheck))
231             {
232                 return true;
233             }
234         }
235         return false;
236     }
237 
238     /**
239      * Return true if substring s is in p and isn't all in upper case. This is used to check the case of SYSTEM, PUBLIC,
240      * DTD and EN.
241      * @param s substring
242      * @param p full string
243      * @param len how many chars to check in p
244      * @return true if substring s is in p and isn't all in upper case
245      */
246     public static boolean findBadSubString(String s, String p, int len)
247     {
248         int n = s.length();
249         int i = 0;
250         String ps;
251 
252         while (n < len)
253         {
254             ps = p.substring(i, i + n);
255             if (s.equalsIgnoreCase(ps))
256             {
257                 return (!ps.equals(s.substring(0, n)));
258             }
259 
260             ++i;
261             --len;
262         }
263 
264         return false;
265     }
266 
267     /**
268      * Is the given char a valid xml letter?
269      * @param c char
270      * @return <code>true</code> if the char is a valid xml letter
271      */
272     static boolean isXMLLetter(char c)
273     {
274         return ((c >= 0x41 && c <= 0x5a)
275             || (c >= 0x61 && c <= 0x7a)
276             || (c >= 0xc0 && c <= 0xd6)
277             || (c >= 0xd8 && c <= 0xf6)
278             || (c >= 0xf8 && c <= 0xff)
279             || (c >= 0x100 && c <= 0x131)
280             || (c >= 0x134 && c <= 0x13e)
281             || (c >= 0x141 && c <= 0x148)
282             || (c >= 0x14a && c <= 0x17e)
283             || (c >= 0x180 && c <= 0x1c3)
284             || (c >= 0x1cd && c <= 0x1f0)
285             || (c >= 0x1f4 && c <= 0x1f5)
286             || (c >= 0x1fa && c <= 0x217)
287             || (c >= 0x250 && c <= 0x2a8)
288             || (c >= 0x2bb && c <= 0x2c1)
289             || c == 0x386
290             || (c >= 0x388 && c <= 0x38a)
291             || c == 0x38c
292             || (c >= 0x38e && c <= 0x3a1)
293             || (c >= 0x3a3 && c <= 0x3ce)
294             || (c >= 0x3d0 && c <= 0x3d6)
295             || c == 0x3da
296             || c == 0x3dc
297             || c == 0x3de
298             || c == 0x3e0
299             || (c >= 0x3e2 && c <= 0x3f3)
300             || (c >= 0x401 && c <= 0x40c)
301             || (c >= 0x40e && c <= 0x44f)
302             || (c >= 0x451 && c <= 0x45c)
303             || (c >= 0x45e && c <= 0x481)
304             || (c >= 0x490 && c <= 0x4c4)
305             || (c >= 0x4c7 && c <= 0x4c8)
306             || (c >= 0x4cb && c <= 0x4cc)
307             || (c >= 0x4d0 && c <= 0x4eb)
308             || (c >= 0x4ee && c <= 0x4f5)
309             || (c >= 0x4f8 && c <= 0x4f9)
310             || (c >= 0x531 && c <= 0x556)
311             || c == 0x559
312             || (c >= 0x561 && c <= 0x586)
313             || (c >= 0x5d0 && c <= 0x5ea)
314             || (c >= 0x5f0 && c <= 0x5f2)
315             || (c >= 0x621 && c <= 0x63a)
316             || (c >= 0x641 && c <= 0x64a)
317             || (c >= 0x671 && c <= 0x6b7)
318             || (c >= 0x6ba && c <= 0x6be)
319             || (c >= 0x6c0 && c <= 0x6ce)
320             || (c >= 0x6d0 && c <= 0x6d3)
321             || c == 0x6d5
322             || (c >= 0x6e5 && c <= 0x6e6)
323             || (c >= 0x905 && c <= 0x939)
324             || c == 0x93d
325             || (c >= 0x958 && c <= 0x961)
326             || (c >= 0x985 && c <= 0x98c)
327             || (c >= 0x98f && c <= 0x990)
328             || (c >= 0x993 && c <= 0x9a8)
329             || (c >= 0x9aa && c <= 0x9b0)
330             || c == 0x9b2
331             || (c >= 0x9b6 && c <= 0x9b9)
332             || (c >= 0x9dc && c <= 0x9dd)
333             || (c >= 0x9df && c <= 0x9e1)
334             || (c >= 0x9f0 && c <= 0x9f1)
335             || (c >= 0xa05 && c <= 0xa0a)
336             || (c >= 0xa0f && c <= 0xa10)
337             || (c >= 0xa13 && c <= 0xa28)
338             || (c >= 0xa2a && c <= 0xa30)
339             || (c >= 0xa32 && c <= 0xa33)
340             || (c >= 0xa35 && c <= 0xa36)
341             || (c >= 0xa38 && c <= 0xa39)
342             || (c >= 0xa59 && c <= 0xa5c)
343             || c == 0xa5e
344             || (c >= 0xa72 && c <= 0xa74)
345             || (c >= 0xa85 && c <= 0xa8b)
346             || c == 0xa8d
347             || (c >= 0xa8f && c <= 0xa91)
348             || (c >= 0xa93 && c <= 0xaa8)
349             || (c >= 0xaaa && c <= 0xab0)
350             || (c >= 0xab2 && c <= 0xab3)
351             || (c >= 0xab5 && c <= 0xab9)
352             || c == 0xabd
353             || c == 0xae0
354             || (c >= 0xb05 && c <= 0xb0c)
355             || (c >= 0xb0f && c <= 0xb10)
356             || (c >= 0xb13 && c <= 0xb28)
357             || (c >= 0xb2a && c <= 0xb30)
358             || (c >= 0xb32 && c <= 0xb33)
359             || (c >= 0xb36 && c <= 0xb39)
360             || c == 0xb3d
361             || (c >= 0xb5c && c <= 0xb5d)
362             || (c >= 0xb5f && c <= 0xb61)
363             || (c >= 0xb85 && c <= 0xb8a)
364             || (c >= 0xb8e && c <= 0xb90)
365             || (c >= 0xb92 && c <= 0xb95)
366             || (c >= 0xb99 && c <= 0xb9a)
367             || c == 0xb9c
368             || (c >= 0xb9e && c <= 0xb9f)
369             || (c >= 0xba3 && c <= 0xba4)
370             || (c >= 0xba8 && c <= 0xbaa)
371             || (c >= 0xbae && c <= 0xbb5)
372             || (c >= 0xbb7 && c <= 0xbb9)
373             || (c >= 0xc05 && c <= 0xc0c)
374             || (c >= 0xc0e && c <= 0xc10)
375             || (c >= 0xc12 && c <= 0xc28)
376             || (c >= 0xc2a && c <= 0xc33)
377             || (c >= 0xc35 && c <= 0xc39)
378             || (c >= 0xc60 && c <= 0xc61)
379             || (c >= 0xc85 && c <= 0xc8c)
380             || (c >= 0xc8e && c <= 0xc90)
381             || (c >= 0xc92 && c <= 0xca8)
382             || (c >= 0xcaa && c <= 0xcb3)
383             || (c >= 0xcb5 && c <= 0xcb9)
384             || c == 0xcde
385             || (c >= 0xce0 && c <= 0xce1)
386             || (c >= 0xd05 && c <= 0xd0c)
387             || (c >= 0xd0e && c <= 0xd10)
388             || (c >= 0xd12 && c <= 0xd28)
389             || (c >= 0xd2a && c <= 0xd39)
390             || (c >= 0xd60 && c <= 0xd61)
391             || (c >= 0xe01 && c <= 0xe2e)
392             || c == 0xe30
393             || (c >= 0xe32 && c <= 0xe33)
394             || (c >= 0xe40 && c <= 0xe45)
395             || (c >= 0xe81 && c <= 0xe82)
396             || c == 0xe84
397             || (c >= 0xe87 && c <= 0xe88)
398             || c == 0xe8a
399             || c == 0xe8d
400             || (c >= 0xe94 && c <= 0xe97)
401             || (c >= 0xe99 && c <= 0xe9f)
402             || (c >= 0xea1 && c <= 0xea3)
403             || c == 0xea5
404             || c == 0xea7
405             || (c >= 0xeaa && c <= 0xeab)
406             || (c >= 0xead && c <= 0xeae)
407             || c == 0xeb0
408             || (c >= 0xeb2 && c <= 0xeb3)
409             || c == 0xebd
410             || (c >= 0xec0 && c <= 0xec4)
411             || (c >= 0xf40 && c <= 0xf47)
412             || (c >= 0xf49 && c <= 0xf69)
413             || (c >= 0x10a0 && c <= 0x10c5)
414             || (c >= 0x10d0 && c <= 0x10f6)
415             || c == 0x1100
416             || (c >= 0x1102 && c <= 0x1103)
417             || (c >= 0x1105 && c <= 0x1107)
418             || c == 0x1109
419             || (c >= 0x110b && c <= 0x110c)
420             || (c >= 0x110e && c <= 0x1112)
421             || c == 0x113c
422             || c == 0x113e
423             || c == 0x1140
424             || c == 0x114c
425             || c == 0x114e
426             || c == 0x1150
427             || (c >= 0x1154 && c <= 0x1155)
428             || c == 0x1159
429             || (c >= 0x115f && c <= 0x1161)
430             || c == 0x1163
431             || c == 0x1165
432             || c == 0x1167
433             || c == 0x1169
434             || (c >= 0x116d && c <= 0x116e)
435             || (c >= 0x1172 && c <= 0x1173)
436             || c == 0x1175
437             || c == 0x119e
438             || c == 0x11a8
439             || c == 0x11ab
440             || (c >= 0x11ae && c <= 0x11af)
441             || (c >= 0x11b7 && c <= 0x11b8)
442             || c == 0x11ba
443             || (c >= 0x11bc && c <= 0x11c2)
444             || c == 0x11eb
445             || c == 0x11f0
446             || c == 0x11f9
447             || (c >= 0x1e00 && c <= 0x1e9b)
448             || (c >= 0x1ea0 && c <= 0x1ef9)
449             || (c >= 0x1f00 && c <= 0x1f15)
450             || (c >= 0x1f18 && c <= 0x1f1d)
451             || (c >= 0x1f20 && c <= 0x1f45)
452             || (c >= 0x1f48 && c <= 0x1f4d)
453             || (c >= 0x1f50 && c <= 0x1f57)
454             || c == 0x1f59
455             || c == 0x1f5b
456             || c == 0x1f5d
457             || (c >= 0x1f5f && c <= 0x1f7d)
458             || (c >= 0x1f80 && c <= 0x1fb4)
459             || (c >= 0x1fb6 && c <= 0x1fbc)
460             || c == 0x1fbe
461             || (c >= 0x1fc2 && c <= 0x1fc4)
462             || (c >= 0x1fc6 && c <= 0x1fcc)
463             || (c >= 0x1fd0 && c <= 0x1fd3)
464             || (c >= 0x1fd6 && c <= 0x1fdb)
465             || (c >= 0x1fe0 && c <= 0x1fec)
466             || (c >= 0x1ff2 && c <= 0x1ff4)
467             || (c >= 0x1ff6 && c <= 0x1ffc)
468             || c == 0x2126
469             || (c >= 0x212a && c <= 0x212b)
470             || c == 0x212e
471             || (c >= 0x2180 && c <= 0x2182)
472             || (c >= 0x3041 && c <= 0x3094)
473             || (c >= 0x30a1 && c <= 0x30fa)
474             || (c >= 0x3105 && c <= 0x312c)
475             || (c >= 0xac00 && c <= 0xd7a3)
476             || (c >= 0x4e00 && c <= 0x9fa5)
477             || c == 0x3007
478             || (c >= 0x3021 && c <= 0x3029)
479             || (c >= 0x4e00 && c <= 0x9fa5)
480             || c == 0x3007 || (c >= 0x3021 && c <= 0x3029));
481     }
482 
483     /**
484      * Is the given char valid in xml name?
485      * @param c char
486      * @return <code>true</code> if the char is a valid xml name char
487      */
488     static boolean isXMLNamechar(char c)
489     {
490         return (isXMLLetter(c)
491             || c == '.'
492             || c == '_'
493             || c == ':'
494             || c == '-'
495             || (c >= 0x300 && c <= 0x345)
496             || (c >= 0x360 && c <= 0x361)
497             || (c >= 0x483 && c <= 0x486)
498             || (c >= 0x591 && c <= 0x5a1)
499             || (c >= 0x5a3 && c <= 0x5b9)
500             || (c >= 0x5bb && c <= 0x5bd)
501             || c == 0x5bf
502             || (c >= 0x5c1 && c <= 0x5c2)
503             || c == 0x5c4
504             || (c >= 0x64b && c <= 0x652)
505             || c == 0x670
506             || (c >= 0x6d6 && c <= 0x6dc)
507             || (c >= 0x6dd && c <= 0x6df)
508             || (c >= 0x6e0 && c <= 0x6e4)
509             || (c >= 0x6e7 && c <= 0x6e8)
510             || (c >= 0x6ea && c <= 0x6ed)
511             || (c >= 0x901 && c <= 0x903)
512             || c == 0x93c
513             || (c >= 0x93e && c <= 0x94c)
514             || c == 0x94d
515             || (c >= 0x951 && c <= 0x954)
516             || (c >= 0x962 && c <= 0x963)
517             || (c >= 0x981 && c <= 0x983)
518             || c == 0x9bc
519             || c == 0x9be
520             || c == 0x9bf
521             || (c >= 0x9c0 && c <= 0x9c4)
522             || (c >= 0x9c7 && c <= 0x9c8)
523             || (c >= 0x9cb && c <= 0x9cd)
524             || c == 0x9d7
525             || (c >= 0x9e2 && c <= 0x9e3)
526             || c == 0xa02
527             || c == 0xa3c
528             || c == 0xa3e
529             || c == 0xa3f
530             || (c >= 0xa40 && c <= 0xa42)
531             || (c >= 0xa47 && c <= 0xa48)
532             || (c >= 0xa4b && c <= 0xa4d)
533             || (c >= 0xa70 && c <= 0xa71)
534             || (c >= 0xa81 && c <= 0xa83)
535             || c == 0xabc
536             || (c >= 0xabe && c <= 0xac5)
537             || (c >= 0xac7 && c <= 0xac9)
538             || (c >= 0xacb && c <= 0xacd)
539             || (c >= 0xb01 && c <= 0xb03)
540             || c == 0xb3c
541             || (c >= 0xb3e && c <= 0xb43)
542             || (c >= 0xb47 && c <= 0xb48)
543             || (c >= 0xb4b && c <= 0xb4d)
544             || (c >= 0xb56 && c <= 0xb57)
545             || (c >= 0xb82 && c <= 0xb83)
546             || (c >= 0xbbe && c <= 0xbc2)
547             || (c >= 0xbc6 && c <= 0xbc8)
548             || (c >= 0xbca && c <= 0xbcd)
549             || c == 0xbd7
550             || (c >= 0xc01 && c <= 0xc03)
551             || (c >= 0xc3e && c <= 0xc44)
552             || (c >= 0xc46 && c <= 0xc48)
553             || (c >= 0xc4a && c <= 0xc4d)
554             || (c >= 0xc55 && c <= 0xc56)
555             || (c >= 0xc82 && c <= 0xc83)
556             || (c >= 0xcbe && c <= 0xcc4)
557             || (c >= 0xcc6 && c <= 0xcc8)
558             || (c >= 0xcca && c <= 0xccd)
559             || (c >= 0xcd5 && c <= 0xcd6)
560             || (c >= 0xd02 && c <= 0xd03)
561             || (c >= 0xd3e && c <= 0xd43)
562             || (c >= 0xd46 && c <= 0xd48)
563             || (c >= 0xd4a && c <= 0xd4d)
564             || c == 0xd57
565             || c == 0xe31
566             || (c >= 0xe34 && c <= 0xe3a)
567             || (c >= 0xe47 && c <= 0xe4e)
568             || c == 0xeb1
569             || (c >= 0xeb4 && c <= 0xeb9)
570             || (c >= 0xebb && c <= 0xebc)
571             || (c >= 0xec8 && c <= 0xecd)
572             || (c >= 0xf18 && c <= 0xf19)
573             || c == 0xf35
574             || c == 0xf37
575             || c == 0xf39
576             || c == 0xf3e
577             || c == 0xf3f
578             || (c >= 0xf71 && c <= 0xf84)
579             || (c >= 0xf86 && c <= 0xf8b)
580             || (c >= 0xf90 && c <= 0xf95)
581             || c == 0xf97
582             || (c >= 0xf99 && c <= 0xfad)
583             || (c >= 0xfb1 && c <= 0xfb7)
584             || c == 0xfb9
585             || (c >= 0x20d0 && c <= 0x20dc)
586             || c == 0x20e1
587             || (c >= 0x302a && c <= 0x302f)
588             || c == 0x3099
589             || c == 0x309a
590             || (c >= 0x30 && c <= 0x39)
591             || (c >= 0x660 && c <= 0x669)
592             || (c >= 0x6f0 && c <= 0x6f9)
593             || (c >= 0x966 && c <= 0x96f)
594             || (c >= 0x9e6 && c <= 0x9ef)
595             || (c >= 0xa66 && c <= 0xa6f)
596             || (c >= 0xae6 && c <= 0xaef)
597             || (c >= 0xb66 && c <= 0xb6f)
598             || (c >= 0xbe7 && c <= 0xbef)
599             || (c >= 0xc66 && c <= 0xc6f)
600             || (c >= 0xce6 && c <= 0xcef)
601             || (c >= 0xd66 && c <= 0xd6f)
602             || (c >= 0xe50 && c <= 0xe59)
603             || (c >= 0xed0 && c <= 0xed9)
604             || (c >= 0xf20 && c <= 0xf29)
605             || c == 0xb7
606             || c == 0x2d0
607             || c == 0x2d1
608             || c == 0x387
609             || c == 0x640
610             || c == 0xe46
611             || c == 0xec6
612             || c == 0x3005
613             || (c >= 0x3031 && c <= 0x3035)
614             || (c >= 0x309d && c <= 0x309e) || (c >= 0x30fc && c <= 0x30fe));
615     }
616 
617     /**
618      * Is the given character a single or double quote?
619      * @param c char
620      * @return <code>true</code> if c is " or '
621      */
622     static boolean isQuote(int c)
623     {
624         return (c == '\'' || c == '\"');
625     }
626 
627     /**
628      * Should always be able convert to/from UTF-8, so encoding exceptions are converted to an Error to avoid adding
629      * throws declarations in lots of methods.
630      * @param str String
631      * @return utf8 bytes
632      * @see String#getBytes()
633      */
634     public static byte[] getBytes(String str)
635     {
636         try
637         {
638             return str.getBytes("UTF8");
639         }
640         catch (java.io.UnsupportedEncodingException e)
641         {
642             throw new Error("String to UTF-8 conversion failed: " + e.getMessage());
643         }
644     }
645 
646     /**
647      * Should always be able convert to/from UTF-8, so encoding exceptions are converted to an Error to avoid adding
648      * throws declarations in lots of methods.
649      * @param bytes byte array
650      * @param offset starting offset in byte array
651      * @param length length in byte array starting from offset
652      * @return same as <code>new String(bytes, offset, length, "UTF8")</code>
653      */
654     public static String getString(byte[] bytes, int offset, int length)
655     {
656         try
657         {
658             return new String(bytes, offset, length, "UTF8");
659         }
660         catch (java.io.UnsupportedEncodingException e)
661         {
662             throw new Error("UTF-8 to string conversion failed: " + e.getMessage());
663         }
664     }
665 
666     /**
667      * Return the last char in string. This is useful when trailing quotemark is missing on an attribute
668      * @param str String
669      * @return last char in String
670      */
671     public static int lastChar(String str)
672     {
673         if (str != null && str.length() > 0)
674         {
675             return str.charAt(str.length() - 1);
676         }
677 
678         return 0;
679     }
680 
681     /**
682      * Determines if the specified character is whitespace.
683      * @param c char
684      * @return <code>true</code> if char is whitespace.
685      */
686     public static boolean isWhite(char c)
687     {
688         short m = map(c);
689         return TidyUtils.toBoolean(m & WHITE);
690     }
691 
692     /**
693      * Is the given char a digit?
694      * @param c char
695      * @return <code>true</code> if the given char is a digit
696      */
697     public static boolean isDigit(char c)
698     {
699         short m;
700         m = map(c);
701         return TidyUtils.toBoolean(m & DIGIT);
702     }
703 
704     /**
705      * Is the given char a letter?
706      * @param c char
707      * @return <code>true</code> if the given char is a letter
708      */
709     public static boolean isLetter(char c)
710     {
711         short m;
712         m = map(c);
713         return TidyUtils.toBoolean(m & LETTER);
714     }
715 
716     /**
717      * Is the given char valid in name? (letter, digit or "-", ".", ":", "_")
718      * @param c char
719      * @return <code>true</code> if char is a name char.
720      */
721     public static boolean isNamechar(char c)
722     {
723         short map = map(c);
724 
725         return TidyUtils.toBoolean(map & NAMECHAR);
726     }
727 
728     /**
729      * Determines if the specified character is a lowercase character.
730      * @param c char
731      * @return <code>true</code> if char is lower case.
732      */
733     public static boolean isLower(char c)
734     {
735         short map = map(c);
736 
737         return TidyUtils.toBoolean(map & LOWERCASE);
738     }
739 
740     /**
741      * Determines if the specified character is a uppercase character.
742      * @param c char
743      * @return <code>true</code> if char is upper case.
744      */
745     public static boolean isUpper(char c)
746     {
747         short map = map(c);
748 
749         return TidyUtils.toBoolean(map & UPPERCASE);
750     }
751 
752     /**
753      * Maps the given character to its lowercase equivalent.
754      * @param c char
755      * @return lowercase char.
756      */
757     public static char toLower(char c)
758     {
759         short m = map(c);
760 
761         if (TidyUtils.toBoolean(m & UPPERCASE))
762         {
763             c = (char) (c + 'a' - 'A');
764         }
765 
766         return c;
767     }
768 
769     /**
770      * Maps the given character to its uppercase equivalent.
771      * @param c char
772      * @return uppercase char.
773      */
774     public static char toUpper(char c)
775     {
776         short m = map(c);
777 
778         if (TidyUtils.toBoolean(m & LOWERCASE))
779         {
780             c = (char) (c + 'A' - 'a');
781         }
782 
783         return c;
784     }
785 
786     /**
787      * Fold case of a char.
788      * @param c char
789      * @param tocaps convert to caps
790      * @param xmlTags use xml tags? If true no change will be performed
791      * @return folded char
792      * @todo check the use of xmlTags parameter
793      */
794     public static char foldCase(char c, boolean tocaps, boolean xmlTags)
795     {
796 
797         if (!xmlTags)
798         {
799 
800             if (tocaps)
801             {
802                 if (isLower(c))
803                 {
804                     c = toUpper(c);
805                 }
806             }
807             else
808             {
809                 // force to lower case
810                 if (isUpper(c))
811                 {
812                     c = toLower(c);
813                 }
814             }
815         }
816 
817         return c;
818     }
819 
820     /**
821      * Classify chars in String and put them in lexmap.
822      * @param str String
823      * @param code code associated to chars in the String
824      */
825     private static void mapStr(String str, short code)
826     {
827         int c;
828         for (int i = 0; i < str.length(); i++)
829         {
830             c = str.charAt(i);
831             lexmap[c] |= code;
832         }
833     }
834 
835     /**
836      * Returns the constant which defines the classification of char in lexmap.
837      * @param c char
838      * @return char type
839      */
840     private static short map(char c)
841     {
842         return (c < 128 ? lexmap[c] : 0);
843     }
844 
845     /**
846      * Is the given character encoding supported?
847      * @param name character encoding name
848      * @return <code>true</code> if encoding is supported, false otherwhise.
849      */
850     public static boolean isCharEncodingSupported(String name)
851     {
852         name = EncodingNameMapper.toJava(name);
853         if (name == null)
854         {
855             return false;
856         }
857 
858         try
859         {
860             "".getBytes(name);
861         }
862         catch (java.io.UnsupportedEncodingException e)
863         {
864             return false;
865         }
866         return true;
867     }
868 }