java.nio.charset.Charset class but knows many
* more aliases and is compatible with Java 1.3. It will use a simple detection
* mechanism to detect what character sets the current VM supports. This will
* be a sub-set of the character sets listed in the
*
* Java 1.5 (J2SE5.0) Supported Encodings document.
* <p>
* The IANA Character Sets document has been used to determine the preferred
* MIME character set names and to get a list of known aliases.
* </p>
* <p>
* This is a complete list of the character sets known to this class:
* </p>
| Canonical (Java) name | *MIME preferred | *Aliases | *
| ASCII | *US-ASCII | *ANSI_X3.4-1968 iso-ir-6 ANSI_X3.4-1986 ISO_646.irv:1991 ISO646-US us IBM367 cp367 csASCII ascii7 646 iso_646.irv:1983 | *
| Big5 | *Big5 | *csBig5 CN-Big5 BIG-FIVE BIGFIVE | *
| Big5_HKSCS | *Big5-HKSCS | *big5hkscs | *
| Big5_Solaris | *? | ** |
| Cp037 | *IBM037 | *ebcdic-cp-us ebcdic-cp-ca ebcdic-cp-wt ebcdic-cp-nl csIBM037 | *
| Cp1006 | *? | ** |
| Cp1025 | *? | ** |
| Cp1026 | *IBM1026 | *csIBM1026 | *
| Cp1046 | *? | ** |
| Cp1047 | *IBM1047 | *IBM-1047 | *
| Cp1097 | *? | ** |
| Cp1098 | *? | ** |
| Cp1112 | *? | ** |
| Cp1122 | *? | ** |
| Cp1123 | *? | ** |
| Cp1124 | *? | ** |
| Cp1140 | *IBM01140 | *CCSID01140 CP01140 ebcdic-us-37+euro | *
| Cp1141 | *IBM01141 | *CCSID01141 CP01141 ebcdic-de-273+euro | *
| Cp1142 | *IBM01142 | *CCSID01142 CP01142 ebcdic-dk-277+euro ebcdic-no-277+euro | *
| Cp1143 | *IBM01143 | *CCSID01143 CP01143 ebcdic-fi-278+euro ebcdic-se-278+euro | *
| Cp1144 | *IBM01144 | *CCSID01144 CP01144 ebcdic-it-280+euro | *
| Cp1145 | *IBM01145 | *CCSID01145 CP01145 ebcdic-es-284+euro | *
| Cp1146 | *IBM01146 | *CCSID01146 CP01146 ebcdic-gb-285+euro | *
| Cp1147 | *IBM01147 | *CCSID01147 CP01147 ebcdic-fr-297+euro | *
| Cp1148 | *IBM01148 | *CCSID01148 CP01148 ebcdic-international-500+euro | *
| Cp1149 | *IBM01149 | *CCSID01149 CP01149 ebcdic-is-871+euro | *
| Cp1250 | *windows-1250 | ** |
| Cp1251 | *windows-1251 | ** |
| Cp1252 | *windows-1252 | ** |
| Cp1253 | *windows-1253 | ** |
| Cp1254 | *windows-1254 | ** |
| Cp1255 | *windows-1255 | ** |
| Cp1256 | *windows-1256 | ** |
| Cp1257 | *windows-1257 | ** |
| Cp1258 | *windows-1258 | ** |
| Cp1381 | *? | ** |
| Cp1383 | *? | ** |
| Cp273 | *IBM273 | *csIBM273 | *
| Cp277 | *IBM277 | *EBCDIC-CP-DK EBCDIC-CP-NO csIBM277 | *
| Cp278 | *IBM278 | *CP278 ebcdic-cp-fi ebcdic-cp-se csIBM278 | *
| Cp280 | *IBM280 | *ebcdic-cp-it csIBM280 | *
| Cp284 | *IBM284 | *ebcdic-cp-es csIBM284 | *
| Cp285 | *IBM285 | *ebcdic-cp-gb csIBM285 | *
| Cp297 | *IBM297 | *ebcdic-cp-fr csIBM297 | *
| Cp33722 | *? | ** |
| Cp420 | *IBM420 | *ebcdic-cp-ar1 csIBM420 | *
| Cp424 | *IBM424 | *ebcdic-cp-he csIBM424 | *
| Cp437 | *IBM437 | *437 csPC8CodePage437 | *
| Cp500 | *IBM500 | *ebcdic-cp-be ebcdic-cp-ch csIBM500 | *
| Cp737 | *? | ** |
| Cp775 | *IBM775 | *csPC775Baltic | *
| Cp838 | *IBM-Thai | ** |
| Cp850 | *IBM850 | *850 csPC850Multilingual | *
| Cp852 | *IBM852 | *852 csPCp852 | *
| Cp855 | *IBM855 | *855 csIBM855 | *
| Cp856 | *? | ** |
| Cp857 | *IBM857 | *857 csIBM857 | *
| Cp858 | *IBM00858 | *CCSID00858 CP00858 PC-Multilingual-850+euro | *
| Cp860 | *IBM860 | *860 csIBM860 | *
| Cp861 | *IBM861 | *861 cp-is csIBM861 | *
| Cp862 | *IBM862 | *862 csPC862LatinHebrew | *
| Cp863 | *IBM863 | *863 csIBM863 | *
| Cp864 | *IBM864 | *cp864 csIBM864 | *
| Cp865 | *IBM865 | *865 csIBM865 | *
| Cp866 | *IBM866 | *866 csIBM866 | *
| Cp868 | *IBM868 | *cp-ar csIBM868 | *
| Cp869 | *IBM869 | *cp-gr csIBM869 | *
| Cp870 | *IBM870 | *ebcdic-cp-roece ebcdic-cp-yu csIBM870 | *
| Cp871 | *IBM871 | *ebcdic-cp-is csIBM871 | *
| Cp875 | *? | ** |
| Cp918 | *IBM918 | *ebcdic-cp-ar2 csIBM918 | *
| Cp921 | *? | ** |
| Cp922 | *? | ** |
| Cp930 | *? | ** |
| Cp933 | *? | ** |
| Cp935 | *? | ** |
| Cp937 | *? | ** |
| Cp939 | *? | ** |
| Cp942 | *? | ** |
| Cp942C | *? | ** |
| Cp943 | *? | ** |
| Cp943C | *? | ** |
| Cp948 | *? | ** |
| Cp949 | *? | ** |
| Cp949C | *? | ** |
| Cp950 | *? | ** |
| Cp964 | *? | ** |
| Cp970 | *? | ** |
| EUC_CN | *GB2312 | *x-EUC-CN csGB2312 euccn euc-cn gb2312-80 gb2312-1980 CN-GB CN-GB-ISOIR165 | *
| EUC_JP | *EUC-JP | *csEUCPkdFmtJapanese Extended_UNIX_Code_Packed_Format_for_Japanese eucjis x-eucjp eucjp x-euc-jp | *
| EUC_JP_LINUX | *? | ** |
| EUC_JP_Solaris | *? | ** |
| EUC_KR | *EUC-KR | *csEUCKR ksc5601 5601 ksc5601_1987 ksc_5601 ksc5601-1987 ks_c_5601-1987 euckr | *
| EUC_TW | *EUC-TW | *x-EUC-TW cns11643 euctw | *
| GB18030 | *GB18030 | *gb18030-2000 | *
| GBK | *windows-936 | *CP936 MS936 ms_936 x-mswin-936 | *
| ISCII91 | *? | *x-ISCII91 iscii | *
| ISO2022CN | *ISO-2022-CN | ** |
| ISO2022JP | *ISO-2022-JP | *csISO2022JP JIS jis_encoding csjisencoding | *
| ISO2022KR | *ISO-2022-KR | *csISO2022KR | *
| ISO2022_CN_CNS | *? | ** |
| ISO2022_CN_GB | *? | ** |
| ISO8859_1 | *ISO-8859-1 | *ISO_8859-1:1987 iso-ir-100 ISO_8859-1 latin1 l1 IBM819 CP819 csISOLatin1 8859_1 819 IBM-819 ISO8859-1 ISO_8859_1 | *
| ISO8859_13 | *ISO-8859-13 | ** |
| ISO8859_15 | *ISO-8859-15 | *ISO_8859-15 Latin-9 8859_15 csISOlatin9 IBM923 cp923 923 L9 IBM-923 ISO8859-15 LATIN9 LATIN0 csISOlatin0 ISO8859_15_FDIS | *
| ISO8859_2 | *ISO-8859-2 | *ISO_8859-2:1987 iso-ir-101 ISO_8859-2 latin2 l2 csISOLatin2 8859_2 iso8859_2 | *
| ISO8859_3 | *ISO-8859-3 | *ISO_8859-3:1988 iso-ir-109 ISO_8859-3 latin3 l3 csISOLatin3 8859_3 | *
| ISO8859_4 | *ISO-8859-4 | *ISO_8859-4:1988 iso-ir-110 ISO_8859-4 latin4 l4 csISOLatin4 8859_4 | *
| ISO8859_5 | *ISO-8859-5 | *ISO_8859-5:1988 iso-ir-144 ISO_8859-5 cyrillic csISOLatinCyrillic 8859_5 | *
| ISO8859_6 | *ISO-8859-6 | *ISO_8859-6:1987 iso-ir-127 ISO_8859-6 ECMA-114 ASMO-708 arabic csISOLatinArabic 8859_6 | *
| ISO8859_7 | *ISO-8859-7 | *ISO_8859-7:1987 iso-ir-126 ISO_8859-7 ELOT_928 ECMA-118 greek greek8 csISOLatinGreek 8859_7 sun_eu_greek | *
| ISO8859_8 | *ISO-8859-8 | *ISO_8859-8:1988 iso-ir-138 ISO_8859-8 hebrew csISOLatinHebrew 8859_8 | *
| ISO8859_9 | *ISO-8859-9 | *ISO_8859-9:1989 iso-ir-148 ISO_8859-9 latin5 l5 csISOLatin5 8859_9 | *
| JISAutoDetect | *? | ** |
| JIS_C6626-1983 | *JIS_C6626-1983 | *x-JIS0208 JIS0208 csISO87JISX0208 x0208 JIS_X0208-1983 iso-ir-87 | *
| JIS_X0201 | *JIS_X0201 | *X0201 JIS0201 csHalfWidthKatakana | *
| JIS_X0212-1990 | *JIS_X0212-1990 | *iso-ir-159 x0212 JIS0212 csISO159JISX02121990 | *
| KOI8_R | *KOI8-R | *csKOI8R koi8 | *
| MS874 | *windows-874 | *cp874 | *
| MS932 | *Windows-31J | *windows-932 csWindows31J x-ms-cp932 | *
| MS949 | *windows-949 | *windows949 ms_949 x-windows-949 | *
| MS950 | *windows-950 | *x-windows-950 | *
| MS950_HKSCS | ** | * |
| MacArabic | *? | ** |
| MacCentralEurope | *? | ** |
| MacCroatian | *? | ** |
| MacCyrillic | *? | ** |
| MacDingbat | *? | ** |
| MacGreek | *MacGreek | ** |
| MacHebrew | *? | ** |
| MacIceland | *? | ** |
| MacRoman | *MacRoman | *Macintosh MAC csMacintosh | *
| MacRomania | *? | ** |
| MacSymbol | *? | ** |
| MacThai | *? | ** |
| MacTurkish | *? | ** |
| MacUkraine | *? | ** |
| SJIS | *Shift_JIS | *MS_Kanji csShiftJIS shift-jis x-sjis pck | *
| TIS620 | *TIS-620 | ** |
| UTF-16 | *UTF-16 | *UTF_16 | *
| UTF8 | *UTF-8 | ** |
| UnicodeBig | *? | ** |
| UnicodeBigUnmarked | *UTF-16BE | *X-UTF-16BE UTF_16BE ISO-10646-UCS-2 | *
| UnicodeLittle | *? | ** |
| UnicodeLittleUnmarked | *UTF-16LE | *UTF_16LE X-UTF-16LE | *
| x-Johab | *johab | *johab cp1361 ms1361 ksc5601-1992 ksc5601_1992 | *
| x-iso-8859-11 | *? | ** |
/**
* Returns {@code true} if the specified character falls into the US
* ASCII character set (Unicode range 0000 to 007f).
*
* @param ch
* character to test.
* @return true if the specified character falls into the US
* ASCII character set, false otherwise.
*/
public static boolean isASCII(char ch) {
    // A Java char is an unsigned 16-bit value, so "no bit above the low
    // seven is set" is the same test as "not greater than 0x7F".
    return ch <= 0x7F;
}
/**
 * Tests whether every character of a string lies in the US ASCII range.
 * An empty string contains no offending character and therefore yields
 * {@code true}.
 *
 * @param s
 *            string to test; must not be {@code null}.
 * @return {@code true} if no character of {@code s} falls outside the US
 *         ASCII character set, {@code false} otherwise.
 * @throws IllegalArgumentException
 *             if {@code s} is {@code null}.
 */
public static boolean isASCII(final String s) {
    if (s == null) {
        throw new IllegalArgumentException("String may not be null");
    }
    final int end = s.length();
    int pos = 0;
    // Stop at the first character that fails the per-character test.
    while (pos < end) {
        final char current = s.charAt(pos);
        if (!isASCII(current)) {
            return false;
        }
        pos++;
    }
    return true;
}
/**
 * Tests whether a character is one of the four whitespace characters
 * recognised by this class: SP, HT, CR or LF.
 *
 * @param ch
 *            character to test.
 * @return {@code true} if {@code ch} equals {@code SP}, {@code HT},
 *         {@code CR} or {@code LF}, {@code false} otherwise.
 */
public static boolean isWhitespace(char ch) {
    // Horizontal whitespace first, then the two line terminators; the
    // comparison order matches the original single expression.
    if (ch == SP || ch == HT) {
        return true;
    }
    return ch == CR || ch == LF;
}
/**
 * Tests whether a string is made up solely of whitespace characters, as
 * defined by {@link #isWhitespace(char)}. An empty string contains no
 * non-whitespace character and therefore yields {@code true}.
 *
 * @param s
 *            string to test; must not be {@code null}.
 * @return {@code true} if every character of {@code s} is a whitespace
 *         character, {@code false} otherwise.
 * @throws IllegalArgumentException
 *             if {@code s} is {@code null}.
 */
public static boolean isWhitespace(final String s) {
    if (s == null) {
        throw new IllegalArgumentException("String may not be null");
    }
    final int end = s.length();
    int pos = 0;
    // Stop at the first character that is not whitespace.
    while (pos < end) {
        final char current = s.charAt(pos);
        if (!isWhitespace(current)) {
            return false;
        }
        pos++;
    }
    return true;
}
/**
 * Determines if the VM supports encoding (chars to bytes) the specified
 * character set. NOTE: the given character set name may not be known to
 * the VM even if this method returns {@code true}. Use
 * {@link #toJavaCharset(String)} to get the canonical Java character set
 * name.
 *
 * @param charsetName
 *            the character set name; it is lower-cased before the lookup.
 *            May be {@code null}.
 * @return {@code true} if encoding is supported, {@code false} otherwise
 *         (including when {@code charsetName} is {@code null}).
 */
public static boolean isEncodingSupported(String charsetName) {
    if (charsetName == null) {
        // A missing name cannot be supported; answer false instead of
        // failing with a NullPointerException.
        return false;
    }
    // NOTE(review): toLowerCase() uses the default locale. Presumably the
    // entries of encodingSupported were lower-cased the same way when they
    // were added - confirm before switching to a fixed locale.
    return encodingSupported.contains(charsetName.toLowerCase());
}
/**
 * Determines if the VM supports decoding (bytes to chars) the specified
 * character set. NOTE: the given character set name may not be known to
 * the VM even if this method returns {@code true}. Use
 * {@link #toJavaCharset(String)} to get the canonical Java character set
 * name.
 *
 * @param charsetName
 *            the character set name; it is lower-cased before the lookup.
 *            May be {@code null}.
 * @return {@code true} if decoding is supported, {@code false} otherwise
 *         (including when {@code charsetName} is {@code null}).
 */
public static boolean isDecodingSupported(String charsetName) {
    if (charsetName == null) {
        // A missing name cannot be supported; answer false instead of
        // failing with a NullPointerException.
        return false;
    }
    // NOTE(review): toLowerCase() uses the default locale. Presumably the
    // entries of decodingSupported were lower-cased the same way when they
    // were added - confirm before switching to a fixed locale.
    return decodingSupported.contains(charsetName.toLowerCase());
}
/**
 * Gets the preferred MIME character set name for the specified character
 * set, or {@code null} if not known.
 *
 * @param charsetName
 *            the character set name to look for; it is lower-cased before
 *            the lookup. May be {@code null}.
 * @return the MIME preferred name, or {@code null} if the name is
 *         {@code null} or not known.
 */
public static String toMimeCharset(String charsetName) {
    if (charsetName == null) {
        // A missing name is simply "not known"; return null instead of
        // failing with a NullPointerException.
        return null;
    }
    // NOTE(review): toLowerCase() uses the default locale. Presumably the
    // keys of charsetMap were lower-cased the same way when the map was
    // built - confirm before switching to a fixed locale.
    final Charset c = charsetMap.get(charsetName.toLowerCase());
    return (c != null) ? c.mime : null;
}
/**
 * Gets the canonical Java character set name for the specified character
 * set, or {@code null} if not known. This should be called before doing
 * any conversions using the Java API. NOTE: you must use
 * {@link #isEncodingSupported(String)} or
 * {@link #isDecodingSupported(String)} to make sure the returned Java
 * character set is supported by the current VM.
 *
 * @param charsetName
 *            the character set name to look for; it is lower-cased before
 *            the lookup. May be {@code null}.
 * @return the canonical Java name, or {@code null} if the name is
 *         {@code null} or not known.
 */
public static String toJavaCharset(String charsetName) {
    if (charsetName == null) {
        // A missing name is simply "not known"; return null instead of
        // failing with a NullPointerException.
        return null;
    }
    // NOTE(review): toLowerCase() uses the default locale. Presumably the
    // keys of charsetMap were lower-cased the same way when the map was
    // built - confirm before switching to a fixed locale.
    final Charset c = charsetMap.get(charsetName.toLowerCase());
    return (c != null) ? c.canonical : null;
}
/**
 * Resolves a character set name to a {@link java.nio.charset.Charset},
 * falling back to ISO-8859-1 whenever the name cannot be used.
 *
 * @param charsetName
 *            the character set name to resolve; may be {@code null}.
 * @return the character set registered under {@code charsetName}, or the
 *         ISO-8859-1 character set if the name is {@code null}, is not a
 *         legal character set name, or is not supported by this VM.
 */
public static java.nio.charset.Charset getCharset(String charsetName) {
    final String fallbackName = "ISO-8859-1";
    // A missing name resolves to the fallback character set.
    final String requestedName = (charsetName != null) ? charsetName : fallbackName;
    java.nio.charset.Charset resolved = null;
    try {
        resolved = java.nio.charset.Charset.forName(requestedName);
    } catch (IllegalCharsetNameException ignored) {
        // Not a legal character set name: use the fallback below.
    } catch (UnsupportedCharsetException ignored) {
        // Legal name, but this VM has no such character set: use the
        // fallback below.
    }
    if (resolved == null) {
        resolved = java.nio.charset.Charset.forName(fallbackName);
    }
    return resolved;
}
/*
* Uncomment the code below and run the main method to regenerate the
* Javadoc table above when the known charsets change.
*/
/*
private static String dumpHtmlTable() {
List| Canonical (Java) name | \n"); sb.append(" *MIME preferred | \n"); sb.append(" *Aliases | \n"); sb.append(" *
| " + c.canonical + " | \n"); sb.append(" *" + (c.mime == null ? "?" : c.mime)+ " | \n"); sb.append(" *"); for (int i = 0; c.aliases != null && i < c.aliases.length; i++) { sb.append(c.aliases[i] + " "); } sb.append(" | \n"); sb.append(" *