/*
* Copyright 2009 David Jurgens
*
* This file is part of the S-Space package and is covered under the terms and
* conditions therein.
*
* The S-Space package is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as published
* by the Free Software Foundation and distributed hereunder to you.
*
* THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
* EXPRESS OR IMPLIED ARE MADE. BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
* NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY
* PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
* WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
* RIGHTS.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package edu.ucla.sspace.text;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOError;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
/**
 * A collection of static methods for processing text.
 *
 * <p>The lookup tables used for HTML unescaping are filled once in the static
 * initializer and are only read afterwards.
 *
 * @author David Jurgens
 */
public class StringUtils {

    /**
     * Uninstantiable
     */
    private StringUtils() {}

    /**
     * A mapping from named HTML character references, written in full with
     * the leading '&amp;' and the trailing ';', to their unicode character
     * equivalents.
     */
    private static final Map<String,String> HTML_CODES
        = new HashMap<String,String>();

    /**
     * A mapping from decimal numeric character references, written in full
     * with the leading "&amp;#" and the trailing ';', to their unicode
     * character equivalents.
     */
    private static final Map<String,String> LATIN1_CODES
        = new HashMap<String,String>();

    static {
        // Values are built from code points instead of non-ASCII literals so
        // that the compiled tables do not depend on the source file encoding.

        // The non-breaking space is normalized to an ordinary space.
        HTML_CODES.put("&nbsp;", " ");

        // Entity names for the code points U+00C0 through U+00FF, in code
        // point order.  A null marks a code point that has no entry in this
        // table (U+00D7, multiplication sign and U+00F7, division sign).
        String[] letterNames = {
            "Agrave", "Aacute", "Acirc", "Atilde", "Auml", "Aring", "AElig",
            "Ccedil", "Egrave", "Eacute", "Ecirc", "Euml", "Igrave", "Iacute",
            "Icirc", "Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc",
            "Otilde", "Ouml", null, "Oslash", "Ugrave", "Uacute", "Ucirc",
            "Uuml", "Yacute", "THORN", "szlig",
            "agrave", "aacute", "acirc", "atilde", "auml", "aring", "aelig",
            "ccedil", "egrave", "eacute", "ecirc", "euml", "igrave", "iacute",
            "icirc", "iuml", "eth", "ntilde", "ograve", "oacute", "ocirc",
            "otilde", "ouml", null, "oslash", "ugrave", "uacute", "ucirc",
            "uuml", "yacute", "thorn", "yuml"
        };
        for (int i = 0; i < letterNames.length; ++i) {
            if (letterNames[i] != null) {
                HTML_CODES.put("&" + letterNames[i] + ";",
                               String.valueOf((char)(0xC0 + i)));
            }
        }

        HTML_CODES.put("&lt;", "<");
        HTML_CODES.put("&gt;", ">");
        HTML_CODES.put("&quot;", "\"");
        HTML_CODES.put("&amp;", "&");

        // The apostrophe is accepted both with and without zero padding.
        LATIN1_CODES.put("&#39;", "'");
        LATIN1_CODES.put("&#039;", "'");

        // As with the named form, the non-breaking space becomes a plain
        // space.
        LATIN1_CODES.put("&#160;", " ");

        // NOTE(review): only the even-numbered references in the range
        // 162-254 are registered.  The gaps look accidental; confirm before
        // extending the table to the odd-numbered code points.
        for (int code = 162; code <= 254; code += 2) {
            LATIN1_CODES.put("&#" + code + ";", String.valueOf((char)code));
        }

        LATIN1_CODES.put("&#34;", "\"");
        LATIN1_CODES.put("&#38;", "&");
        // The right single quotation mark (U+2019) is deliberately flattened
        // to an ASCII apostrophe.
        LATIN1_CODES.put("&#8217;", "'");
    }

    /**
     * Loads each line of the file as a list of strings, in file order.
     *
     * @param f the file to read, decoded with the platform default charset
     *
     * @return the lines of the file without their line terminators
     *
     * @throws IOError if any exception occurs while reading the file
     */
    public static List<String> loadFileAsList(File f) {
        return readLinesInto(f, new ArrayList<String>());
    }

    /**
     * Loads the contents of a file as a set of strings, with each line being
     * treated as a separate instance.
     *
     * @param f the file to read, decoded with the platform default charset
     *
     * @return the distinct lines of the file without their line terminators
     *
     * @throws IOError if any exception occurs while reading the file
     */
    public static Set<String> loadFileAsSet(File f) {
        return readLinesInto(f, new HashSet<String>());
    }

    /**
     * Adds every line of {@code f} to {@code sink} and returns {@code sink}.
     * The reader is closed even when reading fails part way through.
     */
    private static <C extends Collection<String>> C readLinesInto(File f,
                                                                  C sink) {
        try {
            BufferedReader br = new BufferedReader(new FileReader(f));
            try {
                String line;
                while ((line = br.readLine()) != null) {
                    sink.add(line);
                }
            } finally {
                br.close();
            }
            return sink;
        } catch (IOException ioe) {
            throw new IOError(ioe);
        }
    }

    /**
     * Returns the decoded form of a complete character reference such as
     * {@code &amp;} or {@code &#162;}, or {@code null} if the reference is
     * not in either table.  Named references are tried first.
     */
    private static String decode(String encoded) {
        String decoded = HTML_CODES.get(encoded);
        return (decoded != null) ? decoded : LATIN1_CODES.get(encoded);
    }

    /**
     * Returns the provided string where all recognized HTML special
     * characters (e.g. {@code &nbsp;}) have been replaced with their unicode
     * equivalents.  Unrecognized references and stray '&amp;' characters are
     * left untouched, and decoded text is never decoded a second time.
     *
     * @param source a String possibly containing escaped HTML characters
     *
     * @return the unescaped text, or {@code source} itself when nothing was
     *         replaced
     *
     * @throws NullPointerException if {@code source} is {@code null}
     */
    public static final String unescapeHTML(String source) {
        // Created lazily so that input without any recognized reference is
        // returned as-is without copying.
        StringBuilder sb = null;
        // the position just past the last replaced reference
        int last = 0;

        int start = source.indexOf('&');
        while (start > -1) {
            int end = source.indexOf(';', start + 1);
            if (end < 0)
                break;

            String decoded = decode(source.substring(start, end + 1));
            if (decoded == null) {
                // Not a known reference.  Resume at the next '&' after this
                // one rather than after the ';', otherwise a stray '&' would
                // hide a real reference that lies between it and the ';'.
                start = source.indexOf('&', start + 1);
            } else {
                if (sb == null)
                    sb = new StringBuilder(source.length());
                // copy the text between the previous reference and this one
                sb.append(source, last, start).append(decoded);
                last = end + 1;
                start = source.indexOf('&', last);
            }
        }

        // if there weren't any substitutions, don't bother to create a new
        // String
        if (sb == null)
            return source;

        // otherwise finish the substitution by appending all the text from
        // the last substitution until the end of the string
        sb.append(source, last, source.length());
        return sb.toString();
    }

    /**
     * Modifies the provided {@link StringBuilder} in place by replacing all
     * recognized HTML special characters (e.g. {@code &nbsp;}) with their
     * unicode equivalents.  Unrecognized references and stray '&amp;'
     * characters are left untouched.
     *
     * @param source text possibly containing escaped HTML characters
     *
     * @throws NullPointerException if {@code source} is {@code null}
     */
    public static final void unescapeHTML(StringBuilder source) {
        int start = source.indexOf("&");
        while (start > -1) {
            int end = source.indexOf(";", start + 1);
            if (end < 0)
                break;

            String decoded = decode(source.substring(start, end + 1));
            // If the string had encoded HTML that was recognized, replace it
            // with the decoded version
            if (decoded != null) {
                source.replace(start, end + 1, decoded);
            }

            // Use start + 1 rather than end, since the decoded text may be
            // shorter than the encoded version.  However, don't use start in
            // case the decoded character was actually a '&'.
            start = source.indexOf("&", start + 1);
        }
    }
}