/**
* Copyright (C) 2010 Peter Karich <jetwick_@_pannous_._info>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.jetwick.tw.cmd;
import de.jetwick.data.UrlEntry;
import de.jetwick.data.JTweet;
import de.jetwick.util.AnyExecutor;
import de.jetwick.tw.TweetDetector;
import de.jetwick.util.StopWatch;
import java.util.Iterator;
import java.util.List;
import java.util.Map.Entry;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* @author Peter Karich, peat_hal 'at' users 'dot' sourceforge 'dot' net
*/
public class TermCreateCommand implements AnyExecutor<JTweet> {
private Logger logger = LoggerFactory.getLogger(getClass());
private boolean termRemoving = true;
private StopWatch sw1 = new StopWatch();
private StopWatch sw2 = new StopWatch();
private StopWatch sw3 = new StopWatch();
private StopWatch sw4 = new StopWatch();
public TermCreateCommand() {
//http://en.wikipedia.org/wiki/Phonetic_algorithm
//http://en.wikipedia.org/wiki/Approximate_string_matching
//http://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence
//BUT we are using a technic from TextProfileSignature:
// create a list of tokens and their frequency, separated by spaces, in the order of decreasing frequency.
// This list is then submitted to an MD5 hash calculation.
// Only suited for short strings: JaroWinklerDistance, LevensteinDistance,new NGramDistance.
// Use relative termMinFrequency!!
}
public TermCreateCommand setSw1(StopWatch sw1) {
this.sw1 = sw1;
return this;
}
public TermCreateCommand setSw2(StopWatch sw2) {
this.sw2 = sw2;
return this;
}
public TermCreateCommand setSw3(StopWatch sw3) {
this.sw3 = sw3;
return this;
}
public TermCreateCommand setSw4(StopWatch sw4) {
this.sw4 = sw4;
return this;
}
public TermCreateCommand(boolean termRemoving) {
this.termRemoving = termRemoving;
}
@Override
public JTweet execute(JTweet tw) {
// HINT: do only modify the current tweets' quality!
double qual = tw.getQuality();
calcTermsWithoutNoise(tw);
int maxTerms = 0;
for (Entry<String, Integer> entry : tw.getTextTerms().getSortedFreqLimit(0.05f)) {
if (entry.getValue() > maxTerms)
maxTerms = entry.getValue();
}
// term occurs more than one time on the current tweet?
if (maxTerms > 4) {
qual = Math.max(0, 100 - maxTerms * 8);
// tw.addQualAction("MT,");
}
// now calculate quality via comparing to existing tweets
StringFreqMap otherTerms = new StringFreqMap();
StringFreqMap otherLangs = new StringFreqMap();
tw.setQuality((int) qual);
qual = checkSpamInExistingTweets(tw, otherTerms, otherLangs);
tw.setQuality((int) qual);
// prepare indexing and remove terms which do NOT occur in other tweets (i.e. are 'unimportant')
if (termRemoving) {
Iterator<String> iter = tw.getTextTerms().keySet().iterator();
while (iter.hasNext()) {
String term = iter.next();
Integer integ = otherTerms.get(term);
if (integ == null || integ < 1)
iter.remove();
}
}
// language detection from tw.getLanguages()
tw.setLanguage(detectLanguage(tw, otherLangs));
return tw;
}
public String detectLanguage(JTweet tweet, StringFreqMap languages) {
if (tweet.getLanguages().size() > 0) {
List<Entry<String, Integer>> list = tweet.getLanguages().getSorted();
int index = 0;
Entry<String, Integer> lEntry = list.get(index);
if (tweet.getLanguages().size() > index + 1 && TweetDetector.UNKNOWN_LANG.equals(lEntry.getKey())) {
index++;
lEntry = list.get(index);
}
if (tweet.getLanguages().size() > index + 1 && !TweetDetector.UNKNOWN_LANG.equals(lEntry.getKey())) {
if (lEntry.getValue() - 1 < list.get(index + 1).getValue())
// the second language seems also important
return TweetDetector.UNKNOWN_LANG;
}
if (languages.containsKey(lEntry.getKey()) || lEntry.getValue() > 2)
return lEntry.getKey();
}
return TweetDetector.UNKNOWN_LANG;
}
public double checkSpamInExistingTweets(JTweet currentTweet,
StringFreqMap mergedTerms, StringFreqMap mergedLangs) {
double qual = currentTweet.getQuality();
sw1.start();
StringFreqMap urlMap = new StringFreqMap();
for (JTweet older : currentTweet.getFromUser().getOwnTweets()) {
for (UrlEntry entry : older.getUrlEntries()) {
urlMap.inc(entry.getResolvedUrl(), 1);
}
}
sw1.stop();
sw2.start();
boolean sameUrl = false;
for (JTweet older : currentTweet.getFromUser().getOwnTweets()) {
if (older == currentTweet)
continue;
sw3.start();
// create tags to decide if tags of currentTweet are important
calcTermsWithoutNoise(older);
sw3.stop();
// count only one term per tweet
mergedTerms.addOne2All(older.getTextTerms());
// count languages as they appear
mergedLangs.addValue2All(older.getLanguages());
// compare only to older tweets
if (currentTweet.getCreatedAt().getTime() <= older.getCreatedAt().getTime())
continue;
// we don't need the signature because we have the jaccard index
// performance improvement of comparison: use int[] instead of byte[]
// if (Arrays.equals(tw.getTextSignature(), older.getTextSignature()))
// qual = qual * 0.7;
double ji = calcJaccardIndex(currentTweet.getTextTerms(), older.getTextTerms());
if (ji >= 0.8) {
// nearly equal terms
qual *= JTweet.QUAL_BAD / 100.0;
// currentTweet.addQualAction("JB," + older.getTwitterId() + ",");
} else if (ji >= 0.6 && currentTweet.getQualReductions() < 3) {
// e.g. if 3 of 4 terms occur in both tweets
qual *= JTweet.QUAL_LOW / 100.0;
// currentTweet.addQualAction("JL," + older.getTwitterId() + ",");
}
if (!sameUrl) {
sw4.start();
for (UrlEntry entry : older.getUrlEntries()) {
Integer urlCounts = urlMap.get(entry.getResolvedUrl());
if (urlCounts != null) {
if ((urlCounts == 2 || urlCounts == 3) && currentTweet.getQualReductions() < 3) {
sameUrl = true;
qual *= JTweet.QUAL_LOW / 100.0;
// currentTweet.addQualAction("UL," + older.getTwitterId() + ",");
} else if (urlCounts > 3) {
sameUrl = true;
// tweeted about the identical url title!
qual *= JTweet.QUAL_BAD / 100.0;
// currentTweet.addQualAction("UB," + older.getTwitterId() + ",");
}
}
}
sw4.stop();
}
}
sw2.stop();
return qual;
}
public static double calcJaccardIndex(StringFreqMap map1, StringFreqMap map2) {
int a = map1.andSize(map2);
double b = map1.orSize(map2);
return a / b;
}
public void calcTermsWithoutNoise(JTweet tw) {
if (tw.getTextTerms().size() > 0)
return;
TweetDetector extractor = new TweetDetector().runOne(tw.getText());
tw.setTextTerms(extractor.getTerms());
tw.setLanguages(extractor.getLanguages());
// create text signature, sort against frequency
// int termsAtOnce = 2;
// int packs = 3;
// int maxTerms = packs * termsAtOnce;
// Signature sig = null;
// List<Signature> sigs = new ArrayList<Signature>();
// List<Entry<String, Integer>> frequentTerms = tw.getTextTerms().getSortedTermLimited(maxTerms);
//// Set<String> sortedTerms = new TreeSet<String>();
//// for (Entry<String, Integer> e : frequentTerms) {
//// sortedTerms.add(e.getKey());
//// }
//
// // now sort the remaining terms alphabetically to compensate term mixure
// // TODO: it would be better to sort terms alphabetically only if they have equal count
// int counter = 0;
// for (Entry<String, Integer> term : frequentTerms) {
//// for (String term : sortedTerms) {
// if (counter++ % termsAtOnce == 0) {
// //sig = new MD5Signature();
// // we can convert this easily to long because it is only 64 bit
// sig = new Lookup3Signature();
// sigs.add(sig);
// }
//
// sig.add(term.getKey());
//// sig.add(term);
// }
//
// for (Signature tmpSig : sigs) {
// tw.addTextSignature(Helper.byteArray2long(tmpSig.getSignature()));
// }
}
}