// BlogBridge -- RSS feed reader, manager, and web based service
// Copyright (C) 2002-2006 by R. Pito Salas
//
// This program is free software; you can redistribute it and/or modify it under
// the terms of the GNU General Public License as published by the Free Software Foundation;
// either version 2 of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
// without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
// See the GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License along with this program;
// if not, write to the Free Software Foundation, Inc., 59 Temple Place,
// Suite 330, Boston, MA 02111-1307 USA
//
// Contact: R. Pito Salas
// mailto:pitosalas@users.sourceforge.net
// More information: about BlogBridge
// http://www.blogbridge.com
// http://sourceforge.net/projects/blogbridge
//
// $Id: StringUtils.java,v 1.42 2007/11/07 17:16:48 spyromus Exp $
//
package com.salas.bb.utils;
import com.salas.bb.utils.i18n.Strings;
import sun.io.Converters;
import sun.misc.BASE64Encoder;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLEncoder;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Collection of string utilities.
*/
public final class StringUtils extends org.apache.commons.lang.StringUtils
{
private static final Logger LOG = Logger.getLogger(StringUtils.class.getName());
private static final String[] SIZE_UNIT = { "Bytes", "Kb", "Mb", "Gb" };
private static final DecimalFormat FORMAT = new DecimalFormat();
private static final Pattern PATTERN_KEYWORDS =
Pattern.compile("\\s*((\\\"([^\\\"]*)\\\"|([^\\s\\\"]+))\\s*)");
private static final Pattern PATTERN_URL_WITH_PROTOCOL =
Pattern.compile("^[a-zA-Z]+:/");
private static final Pattern PATTERN_PUNCTUATION =
Pattern.compile("([\\-\\+#\\$%^&\\_*\\s,.\\(\\)\\[\\]<>!\\?\"':;/\\\\])+");
private static final Map<String, Character> ENTITIES = new HashMap<String, Character>();
static
{
FORMAT.setMaximumFractionDigits(1);
FORMAT.setMinimumFractionDigits(1);
String[] fromA0 = {
"nbsp", "iexcl", "cent", "pound", "curren", "yen", "brvbar", "sect",
"uml", "copy", "ordf", "laquo", "not", "shy", "reg", "macr",
"deg", "plusmn", "sup2", "sup3", "acute", "micro", "para", "middot",
"cedil", "sup1", "ordm", "raquo", "frac14", "frac12", "frac34", "iquest",
"Agrave", "Aacute", "Acirc", "Atilde", "Auml", "Aring", "AElig", "Ccedil",
"Egrave", "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc", "Iuml",
"ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde", "Ouml", "times",
"Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml", "Yacute", "THORN", "szlig",
"agrave", "aacute", "acirc", "atilde", "auml", "aring", "aelig", "ccedil",
"egrave", "eacute", "ecirc", "euml", "igrave", "iacute", "icirc", "iuml",
"eth", "ntilde", "ograve", "oacute", "ocirc", "otilde", "ouml", "divide",
"oslash", "ugrave", "uacute", "ucirc", "uuml", "yacute", "thorn", "yuml"
};
for (int i = 0; i < fromA0.length; i++) ENTITIES.put(fromA0[i], (char)(0xa0 + i));
ENTITIES.put("trade", (char)8482);
ENTITIES.put("OElig", (char)338);
ENTITIES.put("oelig", (char)339);
ENTITIES.put("Scaron", (char)352);
ENTITIES.put("scaron", (char)353);
ENTITIES.put("Yuml", (char)376);
ENTITIES.put("circ", (char)710);
ENTITIES.put("tilde", (char)732);
ENTITIES.put("ensp", (char)8194);
ENTITIES.put("emsp", (char)8195);
ENTITIES.put("thinsp", (char)8201);
ENTITIES.put("zwnj", (char)8204);
ENTITIES.put("zwj", (char)8205);
ENTITIES.put("lrm", (char)8206);
ENTITIES.put("rrm", (char)8207);
ENTITIES.put("ndash", (char)8211);
ENTITIES.put("mdash", (char)8212);
ENTITIES.put("lsquo", (char)8216);
ENTITIES.put("rsquo", (char)8217);
ENTITIES.put("sbquo", (char)8218);
ENTITIES.put("ldquo", (char)8220);
ENTITIES.put("rdquo", (char)8221);
ENTITIES.put("bdquo", (char)8222);
ENTITIES.put("dagger", (char)8224);
ENTITIES.put("Dagger", (char)8225);
ENTITIES.put("hellip", (char)8230);
ENTITIES.put("permil", (char)8240);
ENTITIES.put("lsaquo", (char)8249);
ENTITIES.put("rsaquo", (char)8250);
ENTITIES.put("euro", (char)8364);
ENTITIES.put("amp", '&');
ENTITIES.put("lt", '<');
ENTITIES.put("gt", '>');
ENTITIES.put("apos", '\'');
ENTITIES.put("quot", '"');
}
/**
* Hidden utility class constructor.
*/
private StringUtils()
{
}
/**
* Converts array of bytes in UTF-8 encoding to appropriate string. If encoding
* isn't supported then the array will be converted into string using default
* encoding and record will be put in log with severe priority.
*
* @param string bytes forming string.
*
* @return resulting string.
*/
public static String fromUTF8(byte[] string)
{
if (string == null) return null;
String str;
try
{
str = new String(string, "UTF-8");
} catch (UnsupportedEncodingException e)
{
LOG.severe(Strings.error("utf8.not.supported"));
str = new String(string);
}
return str;
}
/**
* Converts array of byte arrays in UTF-8 encoding to array of strings. The notes are
* the same as for <code>fromUTF8(String)</code> method.
*
* @param strings array of byte arrays to decode.
*
* @return resulting array of strings.
*/
public static String[] fromUTF8(byte[][] strings)
{
if (strings == null) return null;
String[] strs = new String[strings.length];
for (int i = 0; i < strings.length; i++)
{
byte[] string = strings[i];
strs[i] = fromUTF8(string);
}
return strs;
}
/**
* Converts string into array of bytes in UTF-8 encoding. If UTF-8 encoding isn't supported
* then the tring is converted into bytes in default encoding and record is put in log
* with severe priority.
*
* @param string string to convert.
*
* @return resulting array of bytes.
*/
public static byte[] toUTF8(String string)
{
if (string == null) return null;
byte[] result;
try
{
result = string.getBytes("UTF-8");
} catch (UnsupportedEncodingException e)
{
LOG.severe(Strings.error("utf8.not.supported"));
result = string.getBytes();
}
return result;
}
/**
* Converts multi-line text into the array of strings.
*
* @param text text.
*
* @return array of strings.
*/
public static String[] multilineToArray(String text)
{
return text == null ? null : split(text, "\n");
}
/**
* Converts any value to multi-line text. Arrays of strings are converted that each string
* appears on its own line.
*
* @param value arbitrary value.
*
* @return multi-line text.
*/
public static String anyToMultiline(Object value)
{
String result = Constants.EMPTY_STRING;
if (value instanceof String[])
{
result = arrayToMultiline((String[])value);
} else if (value != null)
{
result = value.toString();
}
return result;
}
/**
* Converts array of strings to multi-line text where each string appears on it's own line.
*
* @param aStrings array of strings.
*
* @return multi-line.
*/
public static String arrayToMultiline(String[] aStrings)
{
return aStrings == null ? null : join(aStrings, "\n");
}
/**
* Converts text from source encoding into Unicode. If encoding isn't supported you will get
* the original text.
*
* @param text text.
* @param sourceEncoding source encoding.
*
* @return converted text.
*/
public static String decodeForced(String text, String sourceEncoding)
{
if (text == null) return null;
if (sourceEncoding == null || sourceEncoding.equals(Converters.getDefaultEncodingName()))
return text;
try
{
text = new String(text.getBytes("ISO8859-1"), sourceEncoding);
} catch (UnsupportedEncodingException e)
{
// We don't cate about it.
}
return text;
}
/**
* Returns the first line of article.
*
* @param aText text to scan.
*
* @return first text line.
*/
public static String getFirstSentense(String aText)
{
if (aText == null) return null;
int size = aText.length();
int start;
int length;
for (start = 0; start < size && Character.isWhitespace(aText.charAt(start)); start++);
for (length = 0; start + length < size &&
(!isSentenseTerminator(aText.charAt(start + length))); length++);
return length > 0 ? aText.substring(start, start + length).trim() : Constants.EMPTY_STRING;
}
/**
* Returns TRUE if char is a sentense terminator.
*
* @param ch char to test.
*
* @return TRUE if char is a sentense terminator.
*/
public static boolean isSentenseTerminator(char ch)
{
return ch == '.' || ch == '?' || ch == '!';
}
/**
* Returns the stringified size.
*
* @param size size in bytes.
*
* @return string representation.
*/
public static String sizeToString(double size)
{
return sizeToString(size, 0);
}
/**
* Returns the stringified size.
*
* @param size size in units.
* @param unitIndex unit index.
*
* @return string represenation.
*/
private static String sizeToString(double size, int unitIndex)
{
String value;
if (size < 512 || unitIndex == SIZE_UNIT.length - 1)
{
value = FORMAT.format(size) + " " + SIZE_UNIT[unitIndex];
} else {
value = sizeToString(size / 1024, unitIndex + 1);
}
return value;
}
/**
* Encodes string to be put in URL.
*
* <p>Example:</p>
* <pre>
* input string: 'a &?b'
* output string: 'a+%26%3Fb'
* </pre>
*
* @param str string to encode.
*
* @return encoded string or NULL if source was NULL.
*/
public static String encodeForURL(String str)
{
if (str == null) return null;
try
{
str = URLEncoder.encode(str, "UTF-8");
} catch (UnsupportedEncodingException e)
{
throw new RuntimeException(Strings.error("utf8.not.supported"), e);
}
return str;
}
/**
* Converts list of keywords separated by whitespace. Each keyword can actually
* contain several words if enclosed in double quotes.
*
* @param keywords keywords string.
*
* @return list of keywords or <code>NULL</code> if <code>keywords</code>
* were <code>NULL</code>.
*/
public static String[] keywordsToArray(String keywords)
{
List<String> matches = keywordsToList(keywords);
return matches == null ? null : matches.toArray(new String[matches.size()]);
}
/**
* Converts list of keywords separated by whitespace. Each keyword can actually
* contain several words if enclosed in double quotes.
*
* @param keywords keywords string.
*
* @return list of keywords or <code>NULL</code> if <code>keywords</code>
* were <code>NULL</code>.
*/
public static List<String> keywordsToList(String keywords)
{
if (keywords == null) return null;
Matcher mat = PATTERN_KEYWORDS.matcher(keywords);
List<String> matches = new ArrayList<String>();
while (mat.find())
{
String keyword = mat.group(3);
if (keyword == null) keyword = mat.group(2);
if (keyword != null &&
!"*".equals(keyword) &&
!"+".equals(keyword) &&
!matches.contains(keyword)) matches.add(keyword);
}
return matches;
}
/**
* Places multi-word keyword in quotes only if it isn't in quotes already.
*
* @param keyword param to quote if necessary.
*
* @return updated keyword.
*/
public static String quoteKeywordIfNecessary(String keyword)
{
keyword = keyword.trim();
if (keyword.indexOf(' ') != -1 &&
keyword.charAt(0) != '"' && keyword.charAt(keyword.length() - 1) != '"')
{
keyword = "\"" + keyword + "\"";
}
return keyword;
}
/**
* Converts keywords from <code>a|b|c d|e</code> or <code>a, b, c d, e</code>
* looks to <code>a b "c d" e</code>.
*
* @param aKeywords keywords to convert.
*
* @return new-look keywords.
*/
public static String convertKeywordsToNewFormat(String aKeywords)
{
String result;
if (aKeywords.indexOf('|') != -1)
{
result = breakAndRejoinKeywords(aKeywords, "|");
} else if (aKeywords.indexOf(',') != -1)
{
result = breakAndRejoinKeywords(aKeywords, ",");
} else result = aKeywords;
return result;
}
/**
* Breaks current keywords list appart and rejoins it using curren keywords
* rules.
*
* @param aKeywords list of keywords.
* @param currentSeparator the separator to be used for breaking.
*
* @return newly formed keywords list.
*/
private static String breakAndRejoinKeywords(String aKeywords, String currentSeparator)
{
String[] keywordsList = split(aKeywords, currentSeparator);
return arrayToQuotedKeywords(keywordsList);
}
/**
* Converts the array of keywords into the space-delimited list with quoted multi-word
* items.
*
* @param aKeywordsList list.
*
* @return space-delimete and quoted list of keywords.
*/
public static String arrayToQuotedKeywords(String[] aKeywordsList)
{
String result = null;
if (aKeywordsList != null)
{
if (aKeywordsList.length > 0)
{
StringBuffer buf = new StringBuffer();
buf.append(quoteKeywordIfNecessary(aKeywordsList[0]));
for (int i = 1; i < aKeywordsList.length; i++)
{
buf.append(" ").append(quoteKeywordIfNecessary(aKeywordsList[i]));
}
result = buf.toString();
} else result = "";
}
return result;
}
/**
* Digests the buffer with key using MD5 algorithm.
*
* @param buffer buffer.
* @param key key is secret key (password or something else which isn't
* going to be passed over network).
*
* @return digested buffer.
*
* @throws NoSuchAlgorithmException if there's no MD5 algorithm implemetation.
*/
public static byte[] digestMD5(String buffer, String key)
throws NoSuchAlgorithmException
{
MessageDigest md5 = MessageDigest.getInstance("MD5");
md5.update(buffer.getBytes());
return md5.digest(key.getBytes());
}
/**
* Creates basic authentication token given user name and password.
*
* @param user user name.
* @param password password.
*
* @return token.
*/
public static String createBasicAuthToken(String user, String password)
{
String token = user + ":" + password;
String base64Token = new BASE64Encoder().encode(token.getBytes());
return "Basic " + base64Token;
}
/**
* Creates pattern from the keywords list.
*
* @param keywords keywords list.
*
* @return keywords regex pattern.
*/
public static String keywordsToPattern(String keywords)
{
return keywordsToPattern(keywordsToArray(keywords));
}
/**
* Creates pattern from the keywords list.
*
* @param aKeywords keywords list.
*
* @return keywords regex pattern.
*/
public static String keywordsToPattern(String[] aKeywords)
{
String pattern;
if (aKeywords != null && aKeywords.length > 0)
{
pattern = join(aKeywords, "|");
pattern = pattern.replaceAll("\\\\", "\\\\\\\\");
pattern = pattern.replaceAll("\\.", "\\\\.");
pattern = pattern.replaceAll("\\n+", "|");
pattern = pattern.replaceAll("\\?", "\\\\?");
pattern = pattern.replaceAll("\\(", "\\\\(");
pattern = pattern.replaceAll("\\)", "\\\\)");
pattern = pattern.replaceAll("\\[", "\\\\[");
pattern = pattern.replaceAll("\\]", "\\\\]");
pattern = pattern.replaceAll("\\++", "\\\\w+");
pattern = pattern.replaceAll("\\s+", "\\\\s+");
pattern = pattern.replaceAll("\\*+", "\\\\w*");
pattern = pattern.replaceAll("\\\\s\\+\\\\w\\*\\\\s\\+", "\\\\s+(\\\\w*\\\\s+)?");
pattern = pattern.replaceAll("\\|\\|+", "\\|");
pattern = pattern.replaceAll("(^\\||\\|$)", "");
String start = "\\W";
String end = start;
if (pattern.startsWith("\\s+"))
{
start = "\\s";
pattern = pattern.substring(3);
}
if (pattern.endsWith("\\s+"))
{
end = "\\s";
pattern = pattern.substring(0, pattern.length() - 3);
}
pattern = "(^|" + start + ")(" + pattern.trim() + ")($|" + end + ")";
} else pattern = null;
return pattern;
}
/**
* Performs different cleanups of URL. Removes extra spaces, converts "feed://" into "http://",
* takes only first line of draggeed URL which is actual link under FireFox 1.5 (Win).
*
* @param link link being dragged into application.
*
* @return final link.
*/
public static String cleanDraggedURL(String link)
{
if (link == null) return null;
link = link.trim();
// If URL starts with feed:// we change it to http://
if (link.startsWith("feed:")) link = "http:" + link.substring(5);
// FireFox 1.5 under Win has two lines: URL and description taken from page
// We leave only the first line -- the URL
int index = link.indexOf(0x0a);
if (index != -1) link = link.substring(0, index).trim();
return link;
}
/**
* Scans for tags definitions in micro-format. The text should contain A-links to
* tag categories with "rel" attribute equal to "tag". The last section of URL is
* taken as tag.
*
* @param aText text to parse.
*
* @return list of tags detected.
*/
public static String[] collectTags(String aText)
{
List<String> tagsList = null;
if (aText != null)
{
Pattern pat = Pattern.compile("<a\\s+[^>]*rel\\s*=\\s*['\"]tag['\"][^>]*>",
Pattern.CASE_INSENSITIVE);
Matcher matcher = pat.matcher(aText);
Pattern patTag = null;
while (matcher.find())
{
if (tagsList == null)
{
tagsList = new ArrayList<String>();
patTag = Pattern.compile(
"href\\s*=\\s*['\"]([^'\"/]+/+)+([\\+a-zA-Z0-9]+)['\"]");
}
Matcher m2 = patTag.matcher(matcher.group());
if (m2.find()) tagsList.add(m2.group(2).replaceAll("\\+", " "));
}
}
return tagsList == null
? Constants.EMPTY_STRING_LIST
: tagsList.toArray(new String[tagsList.size()]);
}
/**
* Adds protocol part to URL (http://) if none is specified and removes spaces around text.
*
* @param url source URL.
*
* @return modified URL.
*/
public static String fixURL(String url)
{
if (url != null)
{
url = url.trim();
if (url.length() == 0)
{
url = null;
} else if (url.startsWith("feed:"))
{
url = url.substring(5).replaceAll("^/+", "");
url = fixURL(url);
} else if (!PATTERN_URL_WITH_PROTOCOL.matcher(url).find())
{
url = "http://" + url;
}
}
return url;
}
/**
* Unescapes the string.
*
* @param str string.
*
* @return unescaped version.
*/
public static String quickUnescape(String str)
{
if (str == null) return null;
str = str.replaceAll("&", "&");
str = str.replaceAll("<", "<");
str = str.replaceAll(">", ">");
str = str.replaceAll("'", "'");
str = str.replaceAll(""", "\"");
return str;
}
/**
* Complete recoding of all HTML entities into Unicode symbols.
*
* @param str string.
*
* @return result.
*/
public static String unescape(String str)
{
if (isEmpty(str)) return str;
Pattern p = Pattern.compile("&(([^#;\\s]{3,6})|#([0-9]{1,4})|#x([0-9a-fA-F]{1,4}));");
Matcher m = p.matcher(str);
StringBuffer sb = new StringBuffer();
while (m.find())
{
Character c;
String strEntity = m.group(2);
String decEntity = m.group(3);
String hexEntity = m.group(4);
if (strEntity != null)
{
// String entity
c = ENTITIES.get(strEntity);
} else
{
c = decEntity != null ? (char)Integer.parseInt(decEntity) : (char)Integer.parseInt(hexEntity, 16);
}
m.appendReplacement(sb, c == null ? m.group() : c.toString());
}
m.appendTail(sb);
return sb.toString();
}
/**
* Checks whether the given strings is a valid e-mail address.
*
* @param email address.
*
* @return <code>TRUE</code> if valid.
*/
public static boolean isValidEmail(String email)
{
return !isEmpty(email) && email.trim().matches("^[^@]+@[^\\.]+(\\.[^\\.]+)+$");
}
/**
* Converts the list of URLs in string form into the array of URL objects.
*
* @param str URLs string.
*
* @return array of URLs.
*/
public static URL[] strToURLs(String str)
{
URL[] newURLs = null;
if (isNotEmpty(str))
{
String[] urls = split(str, Constants.URL_SEPARATOR);
List<URL> urlsList = new ArrayList<URL>(urls.length);
for (String url : urls)
{
try
{
urlsList.add(new URL(url));
} catch (MalformedURLException e)
{
// Wrong url specified -- skipping
}
}
newURLs = urlsList.toArray(new URL[urlsList.size()]);
}
return newURLs;
}
/**
* Returns space-separated list of first <code>N</code> words.
*
* @param str string.
* @param n number of words.
*
* @return list or <code>NULL</code> if string is <code>NULL</code> or <code>N</code> is less than <code>1</code>.
*/
public static String getUpToNWords(String str, int n)
{
if (str == null || n < 1) return null;
String[] split = split(str, " ,.()[]<>!?\"':;/\\", n + 1);
if (split.length > 1) split[split.length - 1] = "";
return join(split, " ").trim();
}
private static Pattern lastPattern;
private static int lastSentences = -1;
/**
* Returns the excerpt consisting of given number of sentences unless they are shorter or longer than given
* limits. In this case minimum or maximum allowed number of characters plus "..." are returned.
*
* @param str string to process.
* @param sentences the number of sentences.
* @param min minimum characters.
* @param max maximum characters.
*
* @return the result.
*/
public static String excerpt(String str, int sentences, int min, int max)
{
if (str == null) return str;
String res;
// Create pattern or reuse
Pattern pat;
if (lastSentences != sentences)
{
String patS = "^([^\\.!?]+(\\.+|!+|\\?+)+){" + sentences + "}";
pat = Pattern.compile(patS);
lastSentences = sentences;
lastPattern = pat;
} else pat = lastPattern;
// Match the string
Matcher m = pat.matcher(str);
if (m.find())
{
res = m.group().trim();
int len = res.length();
if (len < min) res = excerpt(str, min); else
if (len > max) res = excerpt(str, max);
} else res = excerpt(str, max);
return res;
}
/**
* Returns the excerpt with given number of characters plus "..." unless
* the string is shorter than limit.
*
* @param str string to process.
* @param len number of characters.
*
* @return the result.
*/
public static String excerpt(String str, int len)
{
return str == null || str.length() <= len ? str : str.substring(0, len) + "...";
}
/**
* Takes the string, removes punctuation, lowercases it and returns the words in a given range glued
* together with spaces. If there's not enough words, the maximum available number of them is returned.
*
* @param str string.
* @param from the first word to return.
* @param to the last word to return.
*
* @return the result.
*/
public static String getWordsInRange(String str, int from, int to)
{
if (isEmpty(str)) return str;
if (from > to) throw new IllegalArgumentException("From can't be bigger than To.");
// Remove all punctuation and collapse spaces plus lowercase
str = PATTERN_PUNCTUATION.matcher(str).replaceAll(" ").toLowerCase().trim();
String[] strs = str.split(" ");
// Figure out what are our limits
from = Math.min(from, strs.length);
to = Math.min(to + 1, strs.length);
// Glue words back together
String[] arr = new String[to - from];
System.arraycopy(strs, from,arr, 0, to - from);
return join(arr, " ");
}
/**
* Safely intern's a string.
*
* @param s string.
*
* @return intern'ed version.
*/
public static String intern(String s)
{
return s == null ? null : s.intern();
}
/**
* Finds all http://* links.
*
* @param text text.
*
* @return links list.
*/
public static List<String> collectLinks(String text)
{
ArrayList<String> links = new ArrayList<String>();
Pattern pattern = Pattern.compile("(https?://[^\\s,]*)(\\s|,|$)", Pattern.CASE_INSENSITIVE);
Matcher matcher = pattern.matcher(text);
while (matcher.find())
{
String link = matcher.group(1);
// Remove trailing dots and commas
link = link.replaceAll("[,.]+$", "");
links.add(link);
}
return links;
}
}