Mega Code Archive

Returns the string constructed from the specified character sequence by de-accenting each of its characters.

/*
 * LingPipe v. 3.9
 * Copyright (C) 2003-2010 Alias-i
 *
 * This program is licensed under the Alias-i Royalty Free License
 * Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the Alias-i
 * Royalty Free License Version 1 for more details.
 *
 * You should have received a copy of the Alias-i Royalty Free License
 * Version 1 along with this program; if not, visit
 * http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt or contact
 * Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211,
 * +1 (718) 290-9170.
 */

//package com.aliasi.util;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.text.DecimalFormat;

/**
 * Static utility methods for processing strings, characters and
 * string buffers.
 *
 * @author  Bob Carpenter
 * @version 4.0.1
 * @since   LingPipe1.0
 * @see java.lang.Character
 * @see java.lang.String
 * @see java.lang.StringBuilder
 */
public class Strings {

    /**
     * Returns the string constructed from the specified character
     * sequence by de-accenting each of its characters.  The result
     * always has the same length as the input, because every
     * character is mapped to exactly one character.  See {@link
     * #deAccentLatin1(char)} for details of the de-accenting.
     *
     * @param cSeq Character sequence to de-accent.
     * @return De-accented version of input.
     * @throws NullPointerException If the character sequence is
     * {@code null}.
     */
    public static String deAccentLatin1(CharSequence cSeq) {
        char[] cs = new char[cSeq.length()];
        for (int i = 0; i < cs.length; ++i) {
            cs[i] = deAccentLatin1(cSeq.charAt(i));
        }
        return new String(cs);
    }

    /**
     * Returns the equivalent de-accented character for characters in
     * the Latin-1 (ISO-8859-1) range (0000-00FF).  Characters not in
     * the Latin-1 range are returned as-is.
     *
     * <p>Note that Latin-1 is a superset of ASCII, and the unsigned byte
     * encoding of Latin-1 characters (ISO-8859-1) provides the same
     * code points as Unicode for characters.
     *
     * <p>Because the result is a single character, multi-letter
     * equivalents are truncated: the AE ligatures map to
     * <code>A</code>/<code>a</code> and the sharp s maps to
     * <code>s</code>.  Eth maps to <code>D</code>/<code>d</code> and
     * thorn to <code>P</code>/<code>p</code>.  The multiplication
     * sign (U+00D7) and division sign (U+00F7) are not letters and
     * are returned unchanged.
     *
     * <p>The <code>unicode.org</code> site supplies a complete <a
     * href="http://unicode.org/charts/PDF/U0080.pdf">Latin-1
     * Supplement</a>, listing the code points for each character.
     *
     * @param c Character to de-accent.
     * @return Equivalent character without accent.
     */
    public static char deAccentLatin1(char c) {
        switch (c) {
        // ---- upper case ----
        case '\u00C0':
        case '\u00C1':
        case '\u00C2':
        case '\u00C3':
        case '\u00C4':
        case '\u00C5':
        case '\u00C6': // capital AE ligature
            return 'A';
        case '\u00C7':
            return 'C';
        case '\u00C8':
        case '\u00C9':
        case '\u00CA':
        case '\u00CB':
            return 'E';
        case '\u00CC':
        case '\u00CD':
        case '\u00CE':
        case '\u00CF':
            return 'I';
        case '\u00D0': // capital eth
            return 'D';
        case '\u00D1':
            return 'N';
        case '\u00D2':
        case '\u00D3':
        case '\u00D4':
        case '\u00D5':
        case '\u00D6':
        case '\u00D8': // U+00D7 (multiplication sign) deliberately skipped
            return 'O';
        case '\u00D9':
        case '\u00DA':
        case '\u00DB':
        case '\u00DC':
            return 'U';
        case '\u00DD':
            return 'Y';
        case '\u00DE': // runic letter thorn
            return 'P';

        // ---- lower case ----
        case '\u00DF': // sharp s; upper case is SS
            return 's';
        case '\u00E0':
        case '\u00E1':
        case '\u00E2':
        case '\u00E3':
        case '\u00E4':
        case '\u00E5':
        case '\u00E6': // ae ligature
            return 'a';
        case '\u00E7':
            return 'c';
        case '\u00E8':
        case '\u00E9':
        case '\u00EA':
        case '\u00EB':
            return 'e';
        case '\u00EC':
        case '\u00ED':
        case '\u00EE':
        case '\u00EF':
            return 'i';
        case '\u00F0': // small eth
            return 'd';
        case '\u00F1':
            return 'n';
        case '\u00F2':
        case '\u00F3':
        case '\u00F4':
        case '\u00F5':
        case '\u00F6':
        case '\u00F8': // U+00F7 (division sign) deliberately skipped
            return 'o';
        case '\u00F9':
        case '\u00FA':
        case '\u00FB':
        case '\u00FC':
            return 'u';
        case '\u00FD':
        case '\u00FF':
            return 'y';
        case '\u00FE': // runic letter thorn
            return 'p';

        default:
            return c;
        }
    }

}