package com.multysite.util;
import java.io.UnsupportedEncodingException;
import java.util.Locale;
import java.util.regex.Pattern;
import com.cybozu.labs.langdetect.Detector;
import com.cybozu.labs.langdetect.DetectorFactory;
import com.cybozu.labs.langdetect.LangDetectException;
public class StringHelper {
// Set to true by getAliasByLanguage() once DetectorFactory.loadProfile(...) has returned
// without throwing, so the profiles are not loaded again on later calls.
// Plain static flag: no synchronization is applied around it.
private static boolean isLoadLanguageProfile = false;
/**
 * Tells whether a keyword candidate starts with one of the built-in stop words
 * (mostly Vietnamese connectives plus "source").
 *
 * <p>The comparison is a case-insensitive <em>prefix</em> match, so "Theory" is
 * reported because it starts with "theo". Case folding uses {@link Locale#ROOT} so
 * the answer does not depend on the JVM default locale.
 *
 * @param str the candidate to test; must not be {@code null}
 * @return {@code true} if the lower-cased candidate starts with any stop word
 */
public static boolean checkRemove(String str) {
    String stopWords = "theo,Nhưng,Vì,Có,Các,Vì thế,Chính vì thế,Thậm chí,Tuy nhiên,Vì vậy,Nhiều,Đừng,Tùy,Hiện nay,Tuỳ theo,Tuy vậy,sau,Nay,Thông,Thế mà,Bữa nay,Kế đến,Người,Chẳng,Trong,Chuyện,Không,Những,Chiếc,trước,source,chuyến,thường,nguồn,chúng";
    String candidate = str.toLowerCase(Locale.ROOT);
    for (String stopWord : stopWords.split(",")) {
        // "source" in the list already covers the literal "Source" case, which the old
        // code tried (and, after lower-casing, failed) to test separately.
        if (candidate.startsWith(stopWord.toLowerCase(Locale.ROOT))) {
            return true;
        }
    }
    return false;
}
/** Token of two or more characters whose first character is an upper-case Latin or Vietnamese letter. */
private static final Pattern KEYWORD_CAPITALIZED = Pattern
        .compile("^[A-ZĐÀÁÂÃÈÉÊÌÍÒÓÔÕÙÚÝĂĨŨƠƯẠẢẤẦẨẪẬẮẰẲẴẶẸẺẼẾỀỂỄỆỈỊỌỎỐỒỔỖỘỚỜỞỠỢỤỦỨỪỬỮỰ]+[\\w\\W]+");

/** Token that contains a '.' after at least one leading non-dot character (a trailing '.' counts). */
private static final Pattern KEYWORD_HAS_DOT = Pattern.compile("^[^.]+[.]+[\\w\\W]*");

/** Token that contains a '.' after a leading non-dot character AND has more text after the dot(s). */
private static final Pattern KEYWORD_HAS_INNER_DOT = Pattern.compile("^[^.]+[.]+[\\w\\W]+");

/** Characters that are blanked out of a keyword before it is written to the result. */
private static final Pattern KEYWORD_PUNCTUATION = Pattern.compile("[\\,\\=\\+ .!@#$%^&*()_-]+");

/** Maximum number of keywords returned by {@link #keyword(String)}. */
private static final int KEYWORD_LIMIT = 5;

/** Maximum number of following tokens that may be appended to the first token of a phrase. */
private static final int KEYWORD_MAX_EXTRA_TOKENS = 3;

/** Minimum length (in chars, before clean-up) for a phrase to be kept. */
private static final int KEYWORD_MIN_LENGTH = 5;

/**
 * Extracts up to five keyword candidates (capitalised words or short capitalised
 * phrases) from free text and returns them as one comma-separated string.
 *
 * <p>How it works:
 * <ol>
 * <li>The text is tokenised on spaces. Runs of periods are glued to their neighbours
 * (surrounding spaces removed), every other punctuation run becomes a period followed
 * by a token break, and curly double quotes become periods. As a result a token that
 * touches a sentence or clause boundary carries a '.'.</li>
 * <li>Scanning skips the first and the last token. A token starts a phrase when it
 * begins with an upper-case letter, has at least two characters and carries no '.'
 * after its first character.</li>
 * <li>Up to three following capitalised tokens are appended (space separated). A
 * following token with text after a '.' is never appended; one that merely ends in
 * '.' is appended but ends the phrase. The last token of the text is never appended.</li>
 * <li>A phrase is dropped when it is shorter than five characters, when it equals an
 * already accepted phrase ignoring case, or when {@code checkRemove} rejects it.</li>
 * <li>Accepted phrases have punctuation replaced by single spaces, are trimmed and
 * joined with ','.</li>
 * </ol>
 *
 * @param str the text to scan; must not be {@code null}
 * @return the accepted keywords joined by ',' (empty string when none qualify)
 */
public static String keyword(String str) {
    // "daucham" is a temporary placeholder for period runs so that they survive the
    // generic punctuation rewrite below; it is turned back into '.' afterwards.
    String normalized = str.trim()
            .replaceAll("[ ]*[.]+[ ]*", "daucham")
            .replaceAll("[ ]+", "-")
            .replaceAll("[\\,\\=\\+.!@#$%^&*()_]+", ".-")
            .replace("“", ".")
            .replace("”", ".")
            .replaceAll("daucham", ".");
    String[] words = normalized.split("-");

    String[] accepted = new String[KEYWORD_LIMIT];
    int count = 0;
    StringBuilder result = new StringBuilder();

    // Index 0 (first token) and the last index are deliberately never phrase starts.
    for (int i = 1; i < words.length - 1 && count < KEYWORD_LIMIT; i++) {
        String first = words[i];
        if (!KEYWORD_CAPITALIZED.matcher(first).matches()
                || KEYWORD_HAS_DOT.matcher(first).matches()) {
            continue;
        }

        // Grow the phrase; i ends on the last token that was consumed so the outer
        // loop resumes right after the phrase.
        String phrase = first;
        for (int extra = 0; extra < KEYWORD_MAX_EXTRA_TOKENS
                && keywordCanAbsorbNext(words, i); extra++) {
            phrase = phrase + " " + words[i + 1];
            i++;
        }

        if (phrase.length() < KEYWORD_MIN_LENGTH
                || keywordAlreadyAccepted(accepted, count, phrase)
                || checkRemove(phrase)) {
            continue;
        }

        if (count > 0) {
            result.append(',');
        }
        result.append(KEYWORD_PUNCTUATION.matcher(phrase).replaceAll(" ")
                .replace("“", "").trim());
        accepted[count] = phrase;
        count++;
    }
    return result.toString();
}

/** Returns true when the token after index i may be appended to the phrase that currently ends at i. */
private static boolean keywordCanAbsorbNext(String[] words, int i) {
    if (i >= words.length - 2) {
        return false; // the last token of the text is never absorbed
    }
    String next = words[i + 1];
    return KEYWORD_CAPITALIZED.matcher(next).matches()
            && !KEYWORD_HAS_INNER_DOT.matcher(next).matches()
            && !KEYWORD_HAS_DOT.matcher(words[i]).matches();
}

/** Returns true when phrase equals one of the first count accepted phrases, ignoring case. */
private static boolean keywordAlreadyAccepted(String[] accepted, int count, String phrase) {
    String wanted = phrase.toLowerCase(Locale.ROOT);
    for (int j = 0; j < count; j++) {
        if (accepted[j].toLowerCase(Locale.ROOT).equals(wanted)) {
            return true;
        }
    }
    return false;
}
/**
 * Builds a lower-case URL slug that contains only ASCII letters, digits and '-'.
 *
 * <p>Apostrophes and slashes are dropped; every run of other non-word characters
 * (and underscores) becomes a single '-'; one leading and one trailing '-' are
 * stripped. Non-ASCII letters count as non-word characters and therefore also turn
 * into '-'.
 *
 * <p>Lower-casing uses {@link Locale#ROOT}; with the default locale a Turkish JVM
 * would map 'I' to a dotless i and leak a non-ASCII character into the slug.
 *
 * @param str the text to convert; must not be {@code null}
 * @return the slug (possibly empty)
 */
public static String replace(String str) {
    String slug = str.replaceAll("[\\'\\/]+", "");
    // One pass replaces the former punctuation / space / dash / \W passes: all of
    // them together mapped each run of [\W_] to exactly one '-'.
    slug = slug.replaceAll("[\\W_]+", "-");
    slug = slug.replaceAll("^-", "");
    slug = slug.replaceAll("-$", "");
    return slug.toLowerCase(Locale.ROOT);
}
/**
 * Removes every {@code width: <value>;} style declaration from the given markup.
 *
 * <p>The value part matches ASCII letters, digits and spaces only, so a unit such as
 * '%' is left behind, and the word "width" is matched wherever it occurs (for
 * example inside "max-width").
 *
 * @param str the markup to clean; must not be {@code null}
 * @return the markup without the matched width declarations
 */
public static String cleanContent(String str) {
    final String widthDeclaration = "width[ ]*:[ ]*[0-9a-zA-Z ]+[;]*";
    return str.replaceAll(widthDeclaration, "");
}
/**
 * Produces a short plain-text description: strips markup tags, trims the result and
 * limits it to roughly 250 characters.
 *
 * <p>Text longer than 250 chars is cut at 250 and the trailing (possibly partial)
 * word, together with the spaces before it, is replaced by {@code " ..."}. If the
 * first 250 chars contain no space at all, the hard cut is returned unchanged.
 *
 * <p>Tags are matched as '&lt;' up to the next '&gt;', including across line breaks
 * (the previous pattern could not cross a line break and left such tags in place).
 *
 * @param str the markup or text to shorten; must not be {@code null}
 * @return the stripped and, if necessary, truncated text
 */
public static String cutDescription(String str) {
    String text = str.replaceAll("<[^>]*>", "").trim();
    if (text.length() <= 250) {
        return text;
    }
    return text.substring(0, 250).replaceAll("[ ]+[^ ]*$", " ...");
}
/**
 * Builds a lower-case URL slug while keeping letters of any script.
 *
 * <p>Apostrophes and slashes are dropped; runs of the listed ASCII punctuation,
 * spaces and dashes become a single '-'; one leading and one trailing '-' are
 * stripped. Unlike {@code replace}, characters outside ASCII are left untouched.
 *
 * <p>Lower-casing uses {@link Locale#ROOT} so Latin letters in the title are folded
 * the same way regardless of the JVM default locale.
 *
 * @param str the text to convert; must not be {@code null}
 * @return the slug (possibly empty)
 */
public static String replaceSpace(String str) {
    String slug = str.replaceAll("[\\'\\/]+", "");
    slug = slug.replaceAll(
            "[-\\!\"\\@\\#\\,\\.\\$%\\^&\\*\\(\\)_\\+\\=\\?\\;\\:\\~\\`\\{\\}\\[\\]\\|\\\\]+",
            "-");
    // Spaces become dashes and dash runs collapse, in a single pass.
    slug = slug.replaceAll("[ -]+", "-");
    slug = slug.replaceAll("^-", "");
    slug = slug.replaceAll("-$", "");
    return slug.toLowerCase(Locale.ROOT);
}
/**
 * Builds a lower-case URL slug from Vietnamese text by first replacing accented
 * Vietnamese letters with their unaccented ASCII base letter.
 *
 * <p>After transliteration the same clean-up as in {@code replaceSpace} is applied:
 * apostrophes and slashes are dropped, runs of ASCII punctuation, spaces and dashes
 * become a single '-', and one leading and one trailing '-' are stripped.
 *
 * <p>Only the letter forms listed in the table below are recognised.
 * NOTE(review): text that encodes accents as separate combining marks would need to
 * be normalised by the caller first - confirm whether such input can occur.
 *
 * @param str the text to convert; must not be {@code null}
 * @return the slug (possibly empty)
 */
public static String replaceVietnamese(String str) {
    // Each row: target base letter followed by every accented form mapped to it.
    String[][] foldings = {
            { "a", "ấ", "ầ", "ẩ", "ẫ", "ậ", "Ấ", "Ầ", "Ẩ", "Ẫ", "Ậ",
                    "ắ", "ằ", "ẳ", "ẵ", "ặ", "Ắ", "Ằ", "Ẳ", "Ẵ", "Ặ", "á", "à",
                    "ả", "ã", "ạ", "â", "ă", "Á", "À", "Ả", "Ã", "Ạ", "Â", "Ă" },
            { "e", "ế", "ề", "ể", "ễ", "ệ", "Ế", "Ề", "Ể", "Ễ", "Ệ",
                    "é", "è", "ẻ", "ẽ", "ẹ", "ê", "É", "È", "Ẻ", "Ẽ", "Ẹ", "Ê" },
            { "i", "í", "ì", "ỉ", "ĩ", "ị", "Í", "Ì", "Ỉ", "Ĩ", "Ị" },
            // "Ỗ" was previously missing here (the list carried "Ô" twice instead).
            { "o", "ố", "ồ", "ổ", "ỗ", "ộ", "Ố", "Ồ", "Ổ", "Ỗ", "Ộ",
                    "ớ", "ờ", "ở", "ỡ", "ợ", "Ớ", "Ờ", "Ở", "Ỡ", "Ợ", "ó", "ò",
                    "ỏ", "õ", "ọ", "ô", "ơ", "Ó", "Ò", "Ỏ", "Õ", "Ọ", "Ô", "Ơ" },
            { "u", "ứ", "ừ", "ử", "ữ", "ự", "Ứ", "Ừ", "Ử", "Ữ", "Ự",
                    "ú", "ù", "ủ", "ũ", "ụ", "ư", "Ú", "Ù", "Ủ", "Ũ", "Ụ", "Ư" },
            { "y", "ý", "ỳ", "ỷ", "ỹ", "ỵ", "Ý", "Ỳ", "Ỷ", "Ỹ", "Ỵ" },
            { "d", "đ", "Đ" } };

    String slug = str;
    for (String[] folding : foldings) {
        String base = folding[0];
        for (int k = 1; k < folding.length; k++) {
            // Literal replacement; no regular expression is needed for single letters.
            slug = slug.replace(folding[k], base);
        }
    }

    slug = slug.replaceAll("[\\'\\/]+", "");
    slug = slug.replaceAll(
            "[-\\!\"\\@\\#\\,\\.\\$%\\^&\\*\\(\\)_\\+\\=\\?\\;\\:\\~\\`\\{\\}\\[\\]\\|\\\\]+",
            "-");
    slug = slug.replaceAll("[ -]+", "-");
    slug = slug.replaceAll("^-", "");
    slug = slug.replaceAll("-$", "");
    return slug.toLowerCase(Locale.ROOT);
}
/**
 * Converts a tag label into a lower-case slug.
 *
 * <p>'#' is spelled out as "Sharp" and '+' as "Plus" (so "C#" and "C++" stay
 * distinguishable); every run of characters other than '.', ASCII letters, digits
 * and '_' becomes a single '-'; one leading and one trailing '-' are stripped.
 *
 * <p>Lower-casing uses {@link Locale#ROOT} so the slug does not depend on the JVM
 * default locale.
 *
 * @param str the tag label; must not be {@code null}
 * @return the tag slug (possibly empty)
 */
public static String tag(String str) {
    String slug = str.replace("#", "Sharp").replace("+", "Plus");
    slug = slug.replaceAll("[^.\\w]+", "-");
    slug = slug.replaceAll("^-", "");
    slug = slug.replaceAll("-$", "");
    return slug.toLowerCase(Locale.ROOT);
}
public static String remove(String str) {
str = str.replaceAll("[\'\"]+", """);
return str;
}
/**
 * Checks whether a string is missing or has no characters.
 *
 * @param string the value to test; may be {@code null}
 * @return {@code true} for {@code null} or the empty string, {@code false} otherwise
 *         (a string made only of whitespace is not considered empty)
 */
public static boolean StringIsNullOrEmpty(String string) {
    return string == null || string.length() == 0;
}
/**
 * Builds a URL alias for a title, choosing the slug routine from the detected
 * language of the title.
 *
 * <ul>
 * <li>"vi" uses {@code replaceVietnamese};</li>
 * <li>"zh-cn", "zh-tw", "ko" and "ja" use {@code replaceSpace};</li>
 * <li>every other result, including a failed detection, uses {@code replace}.</li>
 * </ul>
 *
 * <p>If the resulting alias has fewer than 8 characters, the title itself is
 * returned with spaces turned into '-' and lower-cased (no other clean-up is applied
 * on that path).
 *
 * <p>The detection profiles are loaded on first use via
 * {@code DetectorFactory.loadProfile("language_detect")}; the argument is presumably
 * a profile directory resolved by the library - verify against the deployment.
 *
 * @param title the title to convert
 * @return the lower-case alias
 */
public static String getAliasByLanguage(String title) {
    String lang = "";
    try {
        if (!isLoadLanguageProfile) {
            DetectorFactory.loadProfile("language_detect");
            isLoadLanguageProfile = true;
        }
        Detector detector = DetectorFactory.create();
        detector.append(title);
        lang = detector.detect();
    } catch (LangDetectException e) {
        // Detection is best effort: lang stays empty and the default routine is used.
        e.printStackTrace();
    }

    String alias;
    if (lang.equalsIgnoreCase("vi")) {
        alias = replaceVietnamese(title);
    } else if (lang.equalsIgnoreCase("zh-cn") || lang.equalsIgnoreCase("zh-tw")
            || lang.equalsIgnoreCase("ko") || lang.equalsIgnoreCase("ja")) {
        alias = replaceSpace(title);
    } else {
        alias = replace(title);
    }

    // Locale.ROOT keeps the result independent of the JVM default locale.
    if (alias.length() < 8) {
        return title.replaceAll(" ", "-").toLowerCase(Locale.ROOT);
    }
    return alias.toLowerCase(Locale.ROOT);
}
/**
 * Re-decodes a string: encodes it as ISO-8859-1 bytes and interprets those bytes as
 * UTF-8.
 *
 * @param input the string to re-decode; must not be {@code null}
 * @return the re-decoded string, or the empty string if either charset name is
 *         reported as unsupported
 */
public static String getUTF8FromString(String input) {
    try {
        byte[] latin1Bytes = input.getBytes("ISO-8859-1");
        return new String(latin1Bytes, "UTF-8");
    } catch (UnsupportedEncodingException e) {
        e.printStackTrace();
        return "";
    }
}
}