/**
* Vosao CMS. Simple CMS for Google App Engine.
*
* Copyright (C) 2009-2010 Vosao development team.
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
* email: vosao.dev@gmail.com
*/
package org.vosao.utils;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
public class StrUtil {
private static Log logger = LogFactory.getLog(StrUtil.class);
public static final String DESCRIPTION_REGEX =
"<?(meta|META)\\s.*?((content|CONTENT)=\"(.*?)\"\\s+(name|NAME)=\"(description|DESCRIPTION)\"|(name|NAME)=\"(description|DESCRIPTION)\"\\s+(content|CONTENT)=\"(.*?)\")\\s*/>";
public static final Pattern DESCRIPTION_PATTERN = Pattern.compile(DESCRIPTION_REGEX);
public static final String KEYWORDS_REGEX =
"<?(meta|META)\\s.*?((content|CONTENT)=\"(.*?)\"\\s+(name|NAME)=\"(keywords|KEYWORDS)\"|(name|NAME)=\"(keywords|KEYWORDS)\"\\s+(content|CONTENT)=\"(.*?)\")\\s*/>";
public static final Pattern KEYWORDS_PATTERN = Pattern.compile(KEYWORDS_REGEX);
public static final String HEAD_CLOSE_REGEX = "</(head|HEAD)>";
public static final Pattern HEAD_CLOSE_PATTERN = Pattern.compile(HEAD_CLOSE_REGEX);
private static String _toCSV(Collection<String> list) {
StringBuffer result = new StringBuffer();
int count = 0;
for (String item : list) {
result.append((count == 0 ? "" : ",")).append(item);
count++;
}
return result.toString();
}
public static String toCSV(Set<String> list) {
return _toCSV(list);
}
public static String toCSV(List<String> list) {
return _toCSV(list);
}
public static List<String> fromCSV(String data) {
List<String> result = new ArrayList<String>();
if (!StringUtils.isEmpty(data)) {
if (data.indexOf(',') == -1) {
result.add(data);
return result;
}
for (String s : data.split(",")) {
result.add(s);
}
}
return result;
}
/**
* Gzip the input string into a byte[].
*
* @param input
* @return
* @throws IOException
*/
public static byte[] zipStringToBytes(String input) throws IOException {
ByteArrayOutputStream bos = new ByteArrayOutputStream();
BufferedOutputStream bufos = new BufferedOutputStream(
new GZIPOutputStream(bos));
bufos.write(input.getBytes("UTF-8"));
bufos.close();
byte[] retval = bos.toByteArray();
bos.close();
return retval;
}
/**
* Unzip a string out of the given gzipped byte array.
*
* @param bytes
* @return
* @throws IOException
*/
public static String unzipStringFromBytes(byte[] bytes) throws IOException {
ByteArrayInputStream bis = new ByteArrayInputStream(bytes);
BufferedInputStream bufis = new BufferedInputStream(
new GZIPInputStream(bis));
ByteArrayOutputStream bos = new ByteArrayOutputStream();
byte[] buf = new byte[1024];
int len;
while ((len = bufis.read(buf)) > 0) {
bos.write(buf, 0, len);
}
String retval = bos.toString("UTF-8");
bis.close();
bufis.close();
bos.close();
return retval;
}
public static String removeJavascript(String s) {
StringBuffer buf = new StringBuffer(s);
int scriptStart = buf.indexOf("<script");
while (scriptStart > 0) {
int scriptEnd = buf.indexOf("</script>", scriptStart) + 9;
if (scriptEnd > 0) {
buf.replace(scriptStart, scriptEnd, "");
}
scriptStart = buf.indexOf("<script", scriptStart);
}
return buf.toString();
}
private static final String[] VELOCITY_PATTERNS = {"\\$\\{.*\\}",
"##.*$",
"#for\\s*\\(.*",
"#set\\s*\\(.*",
"\\$\\w+\\.\\w+\\(.*\\)",
"\\$\\w+\\.\\w+",
"#if\\s*\\(.*\\)",
"#end"};
private static final String[] XML_PATTERNS = {"<\\!\\[CDATA\\[", "\\]\\]>"};
private static final String[] HTML_PATTERNS = {"\\>", "\\<", "\\ ",
"[\\n.,`~@#$%\\^\\&*\\(\\)\\[\\]\\-\\=\\/\\|]"};
public static String extractSearchTextFromHTML(String html) {
String result = removeJavascript(html).replaceAll("<.*?>", "");
for (String pattern : VELOCITY_PATTERNS) {
result = result.replaceAll(pattern, "");
}
for (String pattern : XML_PATTERNS) {
result = result.replaceAll(pattern, "");
}
for (String pattern : HTML_PATTERNS) {
result = result.replaceAll(pattern, " ");
}
return result;
}
public static String extractTextFromHTML(String html) {
return removeJavascript(html).replaceAll("<.*?>", " ")
.replaceAll(" ", " ")
.replaceAll(" ", " ")
.replaceAll("\n+", " ")
.replaceAll("\t+", " ")
.replaceAll(" +", " ");
}
public static List<Long> toLong(List<String> list) {
List<Long> result = new ArrayList<Long>();
for (String s : list) {
try {
result.add(Long.valueOf(s));
}
catch (NumberFormatException e) {
logger.error("Wrong number format " + s);
}
}
return result;
}
public static String[] splitByWord(String data) {
return data.split("[ ,.:?!~#\n\t]+");
}
/**
* Unpack title from old csv format: enTitle1,ruTitle2
* @param data - data to unpack.
* @return - result map.
*/
public static Map<String, String> unpack06Title(String data) {
Map<String, String> result = new HashMap<String, String>();
if (StringUtils.isEmpty(data)) {
return result;
}
String[] items = data.split(",");
for (String item : items) {
if (item.length() > 2) {
String key = item.substring(0, 2);
String value = item.substring(2);
result.put(key, value);
}
}
return result;
}
}