/**
* Copyright (C) 2010 Peter Karich <jetwick_@_pannous_._info>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.jetwick.util;
import com.google.api.translate.Language;
import com.google.inject.Guice;
import com.google.inject.Inject;
import com.google.inject.Injector;
import com.google.inject.Module;
import de.jetwick.config.DefaultModule;
import de.jetwick.data.JTag;
import de.jetwick.es.ElasticTweetSearch;
import de.jetwick.data.JTweet;
import de.jetwick.data.JUser;
import de.jetwick.es.ElasticTagSearch;
import de.jetwick.es.ElasticUserSearch;
import de.jetwick.es.TweetQuery;
import java.io.BufferedWriter;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.Date;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import org.elasticsearch.action.admin.indices.optimize.OptimizeResponse;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Command line maintenance tool for the tweet, user and tag indices and for
 * the word lists shipped with {@link JTweet}.
 * <p>
 * Execute e.g. via
 * ./myjava de.jetwick.util.Statistics exportNoiseWords=solr/conf/stopwords.txt
 * <p>
 * The argument names understood by {@link #start(Map)} are: optimize,
 * listTweets, exportNoiseWords, importTags (together with tagFile),
 * clearPropertiesOfTags, readStopAndClear and translate.
 *
 * @author Peter Karich, peat_hal 'at' users 'dot' sourceforge 'dot' net
 */
public class Statistics {

    private static final Logger logger = LoggerFactory.getLogger(Statistics.class);

    /**
     * Parses the command line, builds the Guice injector from
     * {@link DefaultModule} and runs {@link #start(Map)} on an injected
     * instance.
     *
     * @param args the raw command line arguments, handed to Helper.parseArguments
     * @throws Exception whatever {@link #start(Map)} throws
     */
    public static void main(String[] args) throws Exception {
        Map<String, String> map = Helper.parseArguments(args);
        logger.info("arguments:" + map);
        // NOTE(review): start() below does not read a "print" argument, so
        // this default currently triggers no action - confirm if still needed.
        if (args.length == 0) {
            map.put("print", "timetabling");
        }

        Module module = new DefaultModule();
        Injector injector = Guice.createInjector(module);
        injector.getInstance(Statistics.class).start(map);
    }

    @Inject
    private ElasticTweetSearch tweetSearch;
    @Inject
    private ElasticUserSearch userSearch;
    @Inject
    private ElasticTagSearch tagSearch;

    public Statistics() {
    }

    /**
     * Runs the actions selected by the keys present in the given map. An
     * action is selected when its key maps to a non-null value.
     * <ul>
     * <li>optimize: optimizes the tweet index and then the user index to one
     * segment; processing continues with the remaining keys afterwards</li>
     * <li>listTweets: runs the value as query (the value "true" is replaced
     * by "**:*"), prints the collected users and returns</li>
     * <li>exportNoiseWords: writes the sorted keys of JTweet.NOISE_WORDS to
     * the file named by the value and returns</li>
     * <li>importTags: imports the tags from the file named by the "tagFile"
     * argument</li>
     * <li>clearPropertiesOfTags: see {@link #clearPropertiesOfTags()}</li>
     * <li>readStopAndClear: prints the words of the noise_words_pt.txt
     * resource</li>
     * <li>translate: translates the lang_det_en.txt resource to Portuguese</li>
     * </ul>
     *
     * @param map argument name to argument value
     * @throws Exception if one of the selected actions fails
     */
    public void start(Map<String, String> map) throws Exception {
        String argStr = map.get("optimize");
        if (argStr != null) {
            int segments = 1;
            logger.info("Start optimizing for twindex");
            OptimizeResponse rsp = tweetSearch.optimize(tweetSearch.getIndexName(), segments);
            logger.info("Optimized twindex to " + segments + " segments for " + rsp.getSuccessfulShards() + "/" + rsp.getTotalShards() + " shards.\n Now uindex");
            // Fixed: the second pass has to target the user index; before, the
            // tweet index was optimized twice and the user index never.
            rsp = userSearch.optimize(userSearch.getIndexName(), segments);
            logger.info("Optimized uindex to " + segments + " segments for " + rsp.getSuccessfulShards() + "/" + rsp.getTotalShards() + " shards.");
        }

        argStr = map.get("listTweets");
        if (argStr != null) {
            // NOTE(review): "**:*" is passed on unchanged as the query string -
            // confirm that this is the intended match-all syntax.
            if ("true".equals(argStr)) {
                argStr = "**:*";
            }

            List<JUser> list = new ArrayList<JUser>();
            long ret = tweetSearch.query(list, new TweetQuery(argStr, false)).
                    getHits().getTotalHits();
            logger.info("Found: " + ret + " users. Returned: " + list.size());
            print(list);
            return;
        }

        // specify file via exportNoiseWords=stopwords.txt
        argStr = map.get("exportNoiseWords");
        if (argStr != null) {
            write(new TreeSet<String>(JTweet.NOISE_WORDS.keySet()), argStr);
            return;
        }

        argStr = map.get("importTags");
        if (argStr != null) {
            importTags(map.get("tagFile"));
        }

        argStr = map.get("clearPropertiesOfTags");
        if (argStr != null) {
            clearPropertiesOfTags();
        }

        argStr = map.get("readStopAndClear");
        if (argStr != null) {
            // other resources used so far: noise_words_fr.txt, lang_det_sp.txt
            readStopwords(JTweet.class.getResourceAsStream("noise_words_pt.txt"));
        }

        argStr = map.get("translate");
        if (argStr != null) {
            translate(Language.PORTUGUESE);
        }
    }

    /**
     * Prints every element of the list on its own line to standard out.
     *
     * @param list the elements to print
     */
    public void print(List<?> list) {
        for (Object o : list) {
            System.out.println(o);
        }
    }

    /**
     * Synchronizes the tag index with the terms listed in the given file.
     * Lines are trimmed, lines with at most one remaining character are
     * skipped and the rest is normalized via JTag.toLowerCaseOnlyOnTerms.
     * Of the tags returned by findAll(0, 1000) those missing in the file are
     * deleted; terms which are already indexed are not added again.
     *
     * @param file the file to read the terms from, one per line
     * @throws IOException if reading the file fails
     */
    public void importTags(String file) throws IOException {
        Set<String> newTags = new TreeSet<String>();
        for (String str : Helper.readFile(file)) {
            String term = str.trim();
            if (term.length() > 1) {
                newTags.add(JTag.toLowerCaseOnlyOnTerms(term));
            }
        }

        // delete only those tags which are no longer listed and
        // store only those tags which are new
        boolean ignoreSearchError = false;
        try {
            for (JTag tag : tagSearch.findAll(0, 1000)) {
                if (!newTags.contains(tag.getTerm())) {
                    tagSearch.deleteByName(tag.getTerm());
                } else {
                    newTags.remove(tag.getTerm());
                }
            }
        } catch (Exception ex) {
            // best effort: still add the new tags and tell addAll about the failure
            ignoreSearchError = true;
            logger.info("Tag index seems to be not available or empty! Message:" + ex.getMessage());
        }

        tagSearch.addAll(newTags, true, ignoreSearchError);
        tagSearch.optimize();
        logger.info("Imported tag:" + newTags.size() + " all tags:" + tagSearch.findAll(0, 1000).size());
    }

    /**
     * Calls clearProperties on the tags returned by findAll(0, 1000), writes
     * the results back via bulkUpdate and optimizes the tag index.
     *
     * @throws IOException declared for callers; raised by the tag search calls
     */
    public void clearPropertiesOfTags() throws IOException {
        Set<JTag> newTags = new LinkedHashSet<JTag>();
        // number of tags visited; may differ from newTags.size() as the set drops duplicates
        int counter = 0;
        for (JTag tag : tagSearch.findAll(0, 1000)) {
            counter++;
            newTags.add(tag.clearProperties());
        }

        tagSearch.bulkUpdate(newTags, tagSearch.getIndexName(), true);
        tagSearch.optimize();
        logger.info(counter + " Updated:" + newTags.size() + " tags " + newTags);
    }

    /**
     * Writes a '#' header line followed by one word per line to the given
     * file, encoded with Helper.UTF8.
     *
     * @param words the words to write, in iteration order
     * @param file the path of the file to create or overwrite
     * @throws Exception if the file cannot be opened or written
     */
    public void write(Set<String> words, String file) throws Exception {
        BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file), Helper.UTF8));
        try {
            writer.write("# Written from YTweet via Statistics class! " + new Date());
            // Fixed: terminate the header, otherwise the first word ends up
            // on the comment line.
            writer.write("\n");
            for (String str : words) {
                writer.write(str);
                writer.write("\n");
            }
        } finally {
            // release the file handle even if a write failed
            writer.close();
        }
    }

    /**
     * Reads a word list from the stream and prints the distinct, lower-cased
     * words in sorted order to standard out. Empty lines and lines starting
     * with "//" are skipped; a line containing spaces contributes every one
     * of its space separated words.
     *
     * @param is the stream to read the word list from
     * @throws Exception if reading fails
     */
    public void readStopwords(InputStream is) throws Exception {
        List<String> list = Helper.readFile(Helper.createBuffReader(is));
        Set<String> set = new TreeSet<String>();
        for (String str : list) {
            if (str.isEmpty() || str.startsWith("//")) {
                continue;
            }

            // a line without a space yields exactly one token here
            for (String tmp : str.toLowerCase().split(" ")) {
                String word = tmp.trim();
                // consecutive spaces or whitespace-only lines produce empty tokens
                if (!word.isEmpty()) {
                    set.add(word);
                }
            }
        }

        for (String str : set) {
            System.out.println(str);
        }
    }

    /**
     * Translates the words of the lang_det_en.txt resource from English to
     * the given language and prints the distinct results in sorted order to
     * standard out. The words are sent in batches: a batch is flushed as soon
     * as it holds more than 1500 characters, the remainder after the last
     * line. A batch which cannot be translated is logged and skipped.
     *
     * @param lang the target language
     * @throws Exception if the resource cannot be read
     */
    public void translate(Language lang) throws Exception {
        List<String> list = Helper.readFile(Helper.createBuffReader(JTweet.class.getResourceAsStream("lang_det_en.txt")));
        Set<String> res = new TreeSet<String>();
        Set<String> cache = new LinkedHashSet<String>();
        int charCounter = 0;
        for (String str : list) {
            if (str.isEmpty() || str.startsWith("//")) {
                continue;
            }

            str = str.toLowerCase().trim();
            charCounter += str.length();
            cache.add(str);
            if (charCounter > 1500) {
                translateBatch(cache, lang, res);
                charCounter = 0;
                cache.clear();
            }
        }

        // Fixed: the words collected after the last full batch were dropped before.
        translateBatch(cache, lang, res);

        System.out.println("=======================\n\n");
        for (String str : res) {
            System.out.println(str);
        }
    }

    /**
     * Translates one batch of words and adds the cleaned results to res.
     * Failures are logged and do not abort the caller.
     */
    private void translateBatch(Set<String> batch, Language lang, Set<String> res) {
        if (batch.isEmpty()) {
            return;
        }

        try {
            // NOTE(review): Translate has no import in this file - presumably
            // a class of this package; verify.
            // batch.toString() has the form "[a, b, c]", hence the split on ','
            // and the removal of the brackets below
            String gTranslated = Translate.execute(batch.toString(), Language.ENGLISH, lang);
            for (String tmp : gTranslated.split(",")) {
                tmp = tmp.toLowerCase().trim().replaceAll("\\[", "").replaceAll("\\]", "");
                res.add(tmp);
            }
        } catch (Exception ex) {
            logger.error("Cannot translate " + batch.size() + " lines", ex);
        }
    }
}