/**
* Copyright (C) 2010 Peter Karich <jetwick_@_pannous_._info>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.jetwick.data;
import de.jetwick.tw.TweetDetector;
import de.jetwick.tw.Twitter4JTweet;
import de.jetwick.tw.cmd.StringFreqMap;
import de.jetwick.util.Helper;
import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.Date;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import twitter4j.Tweet;
/**
 * A single tweet as handled by jetwick: its text, author, reply and retweet
 * relations, detected terms/languages and a quality score that is used to
 * flag spam (see {@link #isSpam()}).
 *
 * Identity ({@link #equals(Object)} and {@link #hashCode()}) is based on the
 * twitter id only.
 *
 * The class also hosts the static noise-word and language-detection word
 * tables which are filled once in the static initializer.
 *
 * @author Peter Karich, peat_hal 'at' users 'dot' sourceforge 'dot' net
 */
public class JTweet implements ElasticObject<JTweet>, Serializable {

    private static final long serialVersionUID = 1L;
    public static final int MAX_LENGTH = 800;
    // kept as raw type: the generic signature of TwitterIdComparator is not visible here
    public static final Comparator tweetIdComparator = new TwitterIdComparator();
    public static final int QUAL_MAX = 100;
    //
    // detect three other *similar* tweets THEN BAD
    // (LOW/100)^2 = 0.5625; (LOW/100)^3 = 0.4219 < BAD/100
    public static final int QUAL_LOW = 75;
    //
    // detect two other nearly *identical* tweet THEN SPAM
    // (BAD/100)^2 = 0.25 < SPAM/100
    public static final int QUAL_BAD = 50;
    public static final int QUAL_SPAM = 26;
    private final long twitterId;
    private String text;
    private Set<JTweet> replies = new LinkedHashSet<JTweet>();
    private int retweetCount;
    private boolean retweet = false;
    private boolean daemon = false;
    private long version;
    private Date createdAt;
    private Date updatedAt;
    private JUser fromUser;
    private JTweet inReplyOf;
    private long inReplyTwitterId = -1L;
    private String location;
    private StringFreqMap textTerms = new StringFreqMap(8);
    private StringFreqMap languages = new StringFreqMap(4);
    private String language = TweetDetector.UNKNOWN_LANG;
    private int quality;
    // lazily computed cache, see getLowerCaseText()
    private String lowerCaseText;
    private List<UrlEntry> urlEntries;
    private int replyCount;
    private String qualDebug;
    private int qualReductions = 0;
    private Collection<Long> duplicates = new LinkedHashSet<Long>(4);
    private Date instantiatedAt = new Date();
    private String feedSource;
    private double latitude;
    private double longitude;
    private int updateCount;
    private boolean isProtected = false;

    /**
     * Creates a tweet from a twitter4j tweet and links it to its author via
     * {@link #setFromUser(JUser)}.
     *
     * NOTE(review): the original comment said "You'll need to call init after
     * that constructor"; no such method is visible in this class - confirm
     * whether that note is still valid.
     *
     * @param tw the twitter4j tweet to copy id, text, date and location from
     * @param fromUser the author; must not be null because the tweet is
     * registered at the user
     */
    public JTweet(Tweet tw, JUser fromUser) {
        this(tw);
        setFromUser(fromUser);
    }

    /**
     * Creates a tweet with the current time as creation date and links it to
     * its author via {@link #setFromUser(JUser)}.
     *
     * @param fromUser the author; must not be null because the tweet is
     * registered at the user
     */
    public JTweet(long id, String text, JUser fromUser) {
        this(id, text, new Date());
        setFromUser(fromUser);
    }

    /**
     * For tests only! Use the constructor which initializes the user too!
     */
    public JTweet(Tweet tw) {
        this(tw.getId(), tw.getText(), tw.getCreatedAt());

        // if tweet was retrieved via Status object
        if (tw instanceof Twitter4JTweet) {
            Twitter4JTweet myTw = (Twitter4JTweet) tw;
            inReplyTwitterId = myTw.getInReplyToStatusId();
            // keep the empty list created by the delegated constructor if the
            // source has no entries; otherwise addUrlEntry/setUrlEntries
            // would fail with a NullPointerException later on
            List<UrlEntry> sourceEntries = myTw.getUrlEntries();
            if (sourceEntries != null)
                urlEntries = sourceEntries;
        }

        // most tweets have location == null. See user.location
        if (tw.getGeoLocation() != null)
            setGeoLocation(tw.getGeoLocation().getLatitude(),
                    tw.getGeoLocation().getLongitude());

        location = tw.getLocation();
    }

    /**
     * for tests only
     *
     * Starts with the maximal quality {@link #QUAL_MAX} and an empty list of
     * url entries.
     */
    public JTweet(long id, String text, Date createdAt) {
        quality = QUAL_MAX;
        this.twitterId = id;
        setText_(text);
        this.createdAt = createdAt;
        urlEntries = new ArrayList<UrlEntry>(1);
    }

    /**
     * Appends one url entry.
     *
     * @return this tweet, for call chaining
     */
    public JTweet addUrlEntry(UrlEntry ue) {
        urlEntries.add(ue);
        return this;
    }

    /**
     * @return the internal (live) collection of url entries
     */
    public Collection<UrlEntry> getUrlEntries() {
        return urlEntries;
    }

    /**
     * Replaces the content of the internal url entry list with the specified
     * entries. The internal list instance is kept so that references handed
     * out via {@link #getUrlEntries()} see the new content.
     *
     * @param entries the new entries, must not be null
     */
    public void setUrlEntries(Collection<UrlEntry> entries) {
        // Copy first: 'entries' may be the very list returned by
        // getUrlEntries() and clearing it up-front would lose all elements.
        List<UrlEntry> copy = new ArrayList<UrlEntry>(entries);
        if (urlEntries == null) {
            urlEntries = copy;
            return;
        }
        urlEntries.clear();
        urlEntries.addAll(copy);
    }

    @Override
    public long getVersion() {
        return version;
    }

    /**
     * @param version the new version, must not be negative
     * @return this tweet, for call chaining
     * @throws IllegalStateException if version is negative
     */
    @Override
    public JTweet setVersion(long version) {
        if (version < 0)
            throw new IllegalStateException("version cannot be negative:" + version);

        this.version = version;
        return this;
    }

    public int getUpdateCount() {
        return updateCount;
    }

    public JTweet setUpdateCount(int updateCount) {
        this.updateCount = updateCount;
        return this;
    }

    /**
     * @return the lower-cased text. The value is computed on first access and
     * cached afterwards.
     */
    public String getLowerCaseText() {
        if (lowerCaseText == null)
            lowerCaseText = getText().toLowerCase();

        return lowerCaseText;
    }

    public String getLocation() {
        return location;
    }

    public void setLocation(String location) {
        this.location = location;
    }

    public StringFreqMap getLanguages() {
        return languages;
    }

    public void setLanguages(StringFreqMap languages) {
        this.languages = languages;
    }

    public String getLanguage() {
        return language;
    }

    public void setLanguage(String language) {
        this.language = language;
    }

    public StringFreqMap getTextTerms() {
        return textTerms;
    }

    public void setTextTerms(StringFreqMap textTerms) {
        this.textTerms = textTerms;
    }

    /**
     * @return the id of the tweet this tweet replies to or -1 if there is
     * none, see {@link #isDefaultInReplyId(long)}
     */
    public long getInReplyTwitterId() {
        return inReplyTwitterId;
    }

    public JTweet setInReplyTwitterId(long inReplyTwitterId) {
        this.inReplyTwitterId = inReplyTwitterId;
        return this;
    }

    public Long getTwitterId() {
        return twitterId;
    }

    public Date getCreatedAt() {
        return createdAt;
    }

    public JTweet setCreatedAt(Date createdAt) {
        this.createdAt = createdAt;
        return this;
    }

    public Date getUpdatedAt() {
        return updatedAt;
    }

    public JTweet setUpdatedAt(Date updatedAt) {
        this.updatedAt = updatedAt;
        return this;
    }

    /**
     * Marks this tweet as persistent by setting the update date to now.
     *
     * @return this tweet, for call chaining
     */
    public JTweet makePersistent() {
        setUpdatedAt(new Date());
        return this;
    }

    /**
     * @return false if this tweet should be deleted after some days. This is
     * the case as long as no update date was set.
     */
    public boolean isPersistent() {
        return updatedAt != null;
    }

    /**
     * Sets the author of this tweet.
     *
     * @param fromUser the author; must not be null if reverse is true
     * @param reverse if true this tweet is additionally registered at the
     * user via addOwnTweet(this, false)
     */
    public void setFromUser(JUser fromUser, boolean reverse) {
        this.fromUser = fromUser;
        if (reverse)
            fromUser.addOwnTweet(this, false);
    }

    /**
     * Sets the author and registers this tweet at the user.
     *
     * @return this tweet, for call chaining
     */
    public JTweet setFromUser(JUser fromUser) {
        setFromUser(fromUser, true);
        return this;
    }

    public JUser getFromUser() {
        return fromUser;
    }

    /**
     * Sets the stored reply counter. Replies attached via
     * {@link #addReply(JTweet)} are counted on top, see
     * {@link #getReplyCount()}.
     */
    public void setReplyCount(int rp) {
        this.replyCount = rp;
    }

    /**
     * Attaches the specified tweet as reply and makes this tweet its
     * 'in reply of' tweet.
     *
     * @return this tweet, for call chaining
     */
    public JTweet addReply(JTweet tw) {
        replies.add(tw);
        tw.setInReplyOf(this);
        return this;
    }

    /**
     * @return the stored reply counter plus the number of attached replies
     */
    public int getReplyCount() {
        // TODO better design! (do not mix count and replies)
        return replyCount + replies.size();
    }

    public JTweet getInReplyOf() {
        return inReplyOf;
    }

    /**
     * Sets the tweet this tweet replies to and keeps the 'in reply' twitter
     * id in sync: -1 for null, otherwise the id of the specified tweet.
     */
    public void setInReplyOf(JTweet inReplyOf) {
        this.inReplyOf = inReplyOf;
        if (inReplyOf == null)
            inReplyTwitterId = -1L;
        else
            inReplyTwitterId = inReplyOf.getTwitterId();
    }

    /**
     * Sets the stored retweet counter. Attached replies which are retweets
     * are counted on top, see {@link #getRetweetCount()}.
     *
     * @return this tweet, for call chaining
     */
    public JTweet setRetweetCount(int rt) {
        this.retweetCount = rt;
        return this;
    }

    /**
     * @return the stored retweet counter plus the number of attached replies
     * which are retweets
     */
    public int getRetweetCount() {
        // TODO better design! (do not mix count and replies)
        int attachedRetweets = 0;
        for (JTweet reply : replies) {
            if (reply.isRetweet())
                attachedRetweets++;
        }
        return retweetCount + attachedRetweets;
    }

    public String getText() {
        return text;
    }

    /**
     * Stores the whitelisted text and derives the retweet flag from it.
     */
    private void setText_(String t) {
        // original author's note: skip non-utf8 characters, otherwise we have
        // major problems while querying solr
        this.text = Helper.xmlCharacterWhitelist(t);
        // drop a previously cached value so that it cannot get out of sync
        lowerCaseText = null;
        retweet = getLowerCaseText().contains("rt @");
    }

    /**
     * @return true if the lower-cased text contains "rt @"
     */
    public boolean isRetweet() {
        return retweet;
    }

    /**
     * @return the trimmed text after the first space which follows the first
     * "rt @" marker (i.e. the text after the user name) or an empty string if
     * there is no such marker or no space after it
     */
    public String extractRTText() {
        // the marker is searched case insensitive, i.e. in the lower-cased text
        int index1 = getLowerCaseText().indexOf("rt @");
        if (index1 < 0)
            return "";

        // skip the user name: continue after the next space
        index1 = getText().indexOf(" ", index1 + 4);
        if (index1 < 0)
            return "";

        return getText().substring(index1 + 1).trim();
    }

    /**
     * @return true if this tweet is a retweet and its lower-cased text
     * contains "rt @USER: TEXT" or "rt @USER TEXT", where USER is the string
     * representation of the author of the specified tweet and TEXT is its
     * lower-cased text
     */
    public boolean isRetweetOf(JTweet tw) {
        // e.g. return true if this.text == RT @userA: text
        // to lower case is necessary because the case of the fromUser isn't important
        if (!isRetweet())
            return false;

        String thisT = getLowerCaseText();
        String extT = tw.getLowerCaseText();
        // NOTE(review): the user is concatenated via its toString(); this can
        // only match if that yields the lower-cased screen name - confirm
        // against JUser
        String prefix = "rt @" + tw.getFromUser();
        return thisT.contains(prefix + ": " + extT) || thisT.contains(prefix + " " + extT);
    }

    public JTweet setDaemon(boolean daemon) {
        this.daemon = daemon;
        return this;
    }

    /**
     * If a tweet is added to the system and it is a retweet but no original
     * tweet can be found a daemon tweet will be created to reflect this
     * missing tweet.
     *
     * daemon tweets are expensive to look for and only 0.3% of the tweets (!)
     * are only reactivated daemon tweets!
     */
    public boolean isDaemon() {
        return daemon;
    }

    public int getQuality() {
        return quality;
    }

    /**
     * Multiplies the quality with the specified factor. The product is
     * truncated toward zero when it is stored back into the int quality.
     *
     * @return this tweet, for call chaining
     */
    public JTweet multiplyQuality(double factor) {
        quality *= factor;
        return this;
    }

    public JTweet setQuality(int quality) {
        this.quality = quality;
        return this;
    }

    /**
     * @return true if the quality is in the range [0, {@link #QUAL_SPAM})
     */
    public boolean isSpam() {
        return quality < JTweet.QUAL_SPAM && quality >= 0;
    }

    /**
     * For debugging purposes: appends the specified string to the quality
     * debug string and increases the number of quality reductions.
     */
    public void addQualAction(String str) {
        if (qualDebug == null)
            qualDebug = str;
        else
            qualDebug += str;

        qualReductions++;
    }

    public String getQualDebug() {
        return qualDebug;
    }

    public int getQualReductions() {
        return qualReductions;
    }

    /**
     * @return true if the specified id is -1, the value used when a tweet is
     * not a reply
     */
    public static boolean isDefaultInReplyId(long inReplyTwitterId) {
        return inReplyTwitterId == -1;
    }

    /**
     * skip tweets with identical id or identical text. For the latter case:
     * greater ids will win and identical text is only skipped if there is no
     * tweet in-between. see the test case
     *
     * Every element is compared with its direct predecessor of the original
     * list only (even if that predecessor was just removed), so the list
     * should be sorted before, see {@link #sortAndDeduplicate(List)}.
     */
    public static void deduplicate(List<JTweet> list) {
        // now remove tweets if they have the identical twitterId or text.
        // the standard hashCode/equals are based on the twitterId only
        Iterator<JTweet> iter = list.iterator();
        JTweet prevTweet = null;
        while (iter.hasNext()) {
            JTweet tw = iter.next();
            if (prevTweet != null && (tw.getTwitterId().equals(prevTweet.getTwitterId())
                    || tw.getText().equals(prevTweet.getText()))) {
                iter.remove();
            }

            prevTweet = tw;
        }
    }

    /**
     * Sorts the list with {@link #tweetIdComparator} and removes neighbouring
     * duplicates afterwards, see {@link #deduplicate(List)}.
     */
    // the comparator constant is declared as raw type
    @SuppressWarnings("unchecked")
    public static void sortAndDeduplicate(List<JTweet> list) {
        Collections.sort(list, tweetIdComparator);
        deduplicate(list);
    }

    @Override
    public int hashCode() {
        return 67 * 5 + (int) (this.twitterId ^ (this.twitterId >>> 32));
    }

    /**
     * Two tweets are equal if they are of the same class and have the same
     * twitter id.
     */
    @Override
    public boolean equals(Object obj) {
        if (obj == null || getClass() != obj.getClass())
            return false;

        return this.twitterId == ((JTweet) obj).twitterId;
    }

    @Override
    public String toString() {
        return twitterId + " " + createdAt + " " + text + " v" + getVersion();
    }
    /** maps a noise term to the languages/categories it was registered for */
    public static final Map<String, Set<String>> NOISE_WORDS = new LinkedHashMap<String, Set<String>>();
    /** maps a language detection term to the languages it was registered for */
    public static final Map<String, Set<String>> LANG_DET_WORDS = new LinkedHashMap<String, Set<String>>();
    public static final Set<String> NOISE_WORDS_SINGLE = new LinkedHashSet<String>(Arrays.asList(new String[]{
                "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z"
            }));
    public static final Set<String> NOISE_WORDS_NUM = new LinkedHashSet<String>(Arrays.asList(new String[]{
                "00", "01", "02", "03", "04", "05", "06", "07", "08", "09",
                "1", "10", "100", "11", "12", "13", "14", "15", "16", "17",
                "18", "19", "2", "20", "21", "22", "23", "24", "25", "26",
                "27", "28", "29", "3", "30", "31", "32", "33", "34", "35",
                "36", "37", "38", "39", "4", "40", "41", "42", "43", "44",
                "45", "46", "47", "48", "49", "5", "50", "51", "52", "53",
                "54", "55", "56", "57", "58", "59", "6", "60", "61", "62",
                "63", "64", "65", "66", "67", "68", "69", "7", "70", "71",
                "72", "73", "74", "75", "76", "77", "78", "79", "8", "80",
                "81", "82", "83", "84", "85", "86", "87", "88", "89", "9",
                "90", "91", "92", "93", "94", "95", "96", "97", "98", "99", "100", "000"}));
    public static final Set<String> NOISE_WORDS_MISC = new LinkedHashSet<String>(Arrays.asList(new String[]{
                // ### TWITTER
                "ah",
                "aw", "cu", "ff",
                "haha", "hahaha", "hehe", "hey", "hi",
                "pls",
                "rt", "re", "soo", "thx",
                "yeah", "via",
                "/by", "/cc", "/via",
                "+1", "-1", ";d", "^^",
                // ### MISC
                ".", ",", ";", "ur", "tx", "ini", "ii", "iii",
                "//", "\\n", "\n", "com", "de", "el", "en", "je", "jp", "lol",
                "ne", "om", "ve", "ya", "yr", "za"
            }));
    // ### Foreign-language terms ###
    public static final Set<String> NOISE_WORDS_UNSORTED = new LinkedHashSet<String>(Arrays.asList(new String[]{
                "¿qué",
                "ak", "aku", "aja", "al", "ada", "amb", "así", "au", "avec",
                "δεν",
                "bien", "boa", "bom", "bueno",
                "ca", "ça", "cap", "ce", "c'est", "cek", "ces", "che", "chi", "ci",
                "col", "com", "como", "con", "crec", "cosa", "cuando", "cumpleaños",
                "dan", "dans", "dc", "del", "decir", "dólar", "dong", "dua",
                "di",
                "ed", "een", "ei", "el", "els", "em", "en", "entre", "era", "és", "est",
                "está", "esta", "estes", "estoy", "eso", "et", "été", "ex",
                "fer", "fu",
                "ga", "ge", "gue",
                "ha", "hay", "han", "het", "ho", "hoy",
                "ik",
                // "inte" had a stray leading space which could never match a trimmed term
                "il", "inte", "iv",
                "jajaja", "je", "jo", "jos", "ju",
                "και",
                "ki",
                "ke",
                "la", "las", "le", "les", "lett", "leur", "li", "lo", "los",
                "mas", "más",
                "mejor", "més", "merci", "ma", "me", "mi", "mon", "muchas", "muy",
                "με",
                "não", "nada",
                "ne", "ni", "nih", "non", "nor", "nos", "notre", "nu",
                "nya",
                "ga", "gracias", "gua", "guau",
                "θα",
                "opció", "ou", "oui",
                "par", "para", "pas", "per", "pero", "por", "pour", "pro",
                "qualche", "que", "qu", "qui",
                "san",
                "se", "sen", "ses", "sí", "si", "sin",
                "sólo", "son", "somme", "soirée", "sous",
                "su", "suis", "sul", "sur", "sus",
                "ta", "també", "te", "té", "tem", "ti", "tinc", "tion", "tive", "todos", "το", "tous", "tra", "très", "tu",
                "uma", "un", "una", "une", "ut",
                "va", "van", "να", "vi", "vie", "vos", "vous", "votre",
                "yang", "για", "yg", "yo", "qué"}));
    public static final Set<String> PHRASE_WHITE_LIST = new LinkedHashSet<String>(Arrays.asList(new String[]{
                "bin laden", // -> otherwise wrong language detection for 'alqaedatracker' because of 'bin'
                "open source"
            }));

    static {
        // fill collection for language detection
        for (String lang : TweetDetector.LANGS) {
            importFrom(LANG_DET_WORDS, lang);
        }

        // fill collection for noise word determination
        for (String lang : TweetDetector.LANGS) {
            importNoiseFrom(NOISE_WORDS, lang);
        }

        // the language specific noise terms are used for language detection too
        for (Entry<String, Set<String>> noiseTerms : NOISE_WORDS.entrySet()) {
            addFrom(LANG_DET_WORDS, noiseTerms);
        }

        addFrom(NOISE_WORDS, TweetDetector.UNKNOWN_LANG, NOISE_WORDS_UNSORTED);

        // indifferent
        addFrom(NOISE_WORDS, TweetDetector.MISC_TERMS, NOISE_WORDS_MISC);
        addFrom(NOISE_WORDS, TweetDetector.SINGLE_CHAR_TERMS, NOISE_WORDS_SINGLE);
        addFrom(NOISE_WORDS, TweetDetector.NUM_TERMS, NOISE_WORDS_NUM);
    }

    /**
     * Registers all terms of the class path resource
     * noise_words_&lt;lang&gt;.txt (relative to this class) for the specified
     * language.
     *
     * @throws RuntimeException if the resource is missing or cannot be read
     */
    public static void importNoiseFrom(Map<String, Set<String>> words, String lang) {
        addFrom(words, lang, readResourceLines("noise_words_" + lang + ".txt"));
    }

    /**
     * Registers all terms of the class path resource
     * lang_det_&lt;lang&gt;.txt (relative to this class) for the specified
     * language.
     *
     * @throws RuntimeException if the resource is missing or cannot be read
     */
    public static void importFrom(Map<String, Set<String>> words, String lang) {
        addFrom(words, lang, readResourceLines("lang_det_" + lang + ".txt"));
    }

    /**
     * Reads all lines of a resource which is located relative to this class.
     */
    private static List<String> readResourceLines(String resourceName) {
        InputStream in = JTweet.class.getResourceAsStream(resourceName);
        // getResourceAsStream returns null instead of throwing for a missing resource
        if (in == null)
            throw new IllegalStateException("resource not found relative to "
                    + JTweet.class.getName() + ": " + resourceName);

        try {
            return Helper.readFile(Helper.createBuffReader(in));
        } catch (Exception ex) {
            throw new RuntimeException("cannot read resource " + resourceName, ex);
        } finally {
            // close explicitly in case the helper does not; closing an
            // already closed stream has no effect
            try {
                in.close();
            } catch (IOException ignored) {
                // nothing sensible can be done if closing fails
            }
        }
    }

    /**
     * Registers every term of the collection for the specified language.
     * Terms are trimmed and lower-cased; null or empty elements and comment
     * lines (starting with "//" after optional leading white space) are
     * skipped.
     */
    public static void addFrom(Map<String, Set<String>> words, String lang, Collection<String> collection) {
        for (String raw : collection) {
            String term = normalizeTerm(raw);
            if (term == null)
                continue;

            Set<String> langs = langsFor(words, term);
            langs.add(lang);
            words.put(term, langs);
        }
    }

    /**
     * Registers the key of the specified entry for all languages of the
     * entry. The key is normalized like the terms in
     * {@link #addFrom(Map, String, Collection)}.
     */
    public static void addFrom(Map<String, Set<String>> words, Entry<String, Set<String>> entry) {
        String term = normalizeTerm(entry.getKey());
        if (term == null)
            return;

        Set<String> langs = langsFor(words, term);
        langs.addAll(entry.getValue());
        words.put(term, langs);
    }

    /**
     * @return the trimmed and lower-cased term or null if the raw string
     * should be skipped (null, empty or a comment line)
     */
    private static String normalizeTerm(String raw) {
        if (raw == null || raw.isEmpty())
            return null;

        // trim before the comment check so that indented comment lines are
        // skipped too. Note: a white space only string still results in the
        // empty term, as before.
        String term = raw.trim();
        if (term.startsWith("//"))
            return null;

        return term.toLowerCase();
    }

    /**
     * @return the language set registered for the term or a new, not yet
     * registered set
     */
    private static Set<String> langsFor(Map<String, Set<String>> words, String term) {
        Set<String> langs = words.get(term);
        if (langs == null)
            langs = new LinkedHashSet<String>(10);

        return langs;
    }

    /**
     * specifies how many existing tweets with similar content were found
     *
     * @return the internal (live) collection of twitter ids which were added
     * via {@link #addDuplicate(long)}
     */
    public Collection<Long> getDuplicates() {
        return duplicates;
    }

    public void addDuplicate(long twId) {
        duplicates.add(twId);
    }

    /**
     * @return the rounded number of seconds since this object was
     * instantiated
     */
    public int getQueueAgeInSeconds() {
        return Math.round((System.currentTimeMillis() - instantiatedAt.getTime()) / 1000f);
    }

    public JTweet setFeedSource(String feedSource) {
        this.feedSource = feedSource;
        return this;
    }

    public String getFeedSource() {
        return feedSource;
    }

    /**
     * @return the twitter id as string
     */
    @Override
    public String getId() {
        return Long.toString(getTwitterId());
    }

    public JTweet setGeoLocation(double lat, double lon) {
        latitude = lat;
        longitude = lon;
        return this;
    }

    /**
     * @return latitude
     */
    public double getLat() {
        return latitude;
    }

    /**
     * @return longitude
     */
    public double getLon() {
        return longitude;
    }

    /**
     * This method specifies how this tweet should get updated from a
     * tweet - either an out-of-date tweet fetched from index or otherway around
     *
     * Nothing is changed if this tweet has more retweets than the specified
     * one. Otherwise reply counter, retweet counter, replies and duplicates
     * are taken from the specified tweet. Note that the replies of the
     * specified tweet get this tweet as their 'in reply of' tweet.
     *
     * @param a the tweet to update from, must have the same id
     * @return this tweet
     * @throws IllegalStateException if the ids differ
     */
    @Override
    public JTweet updateFrom(JTweet a) {
        if (!getId().equals(a.getId()))
            throw new IllegalStateException("ids have to be the same to call update. This:" + this + " update:" + a);

        // updating from itself would clear replies and duplicates before
        // they could be copied back
        if (a == this)
            return this;

        if (getRetweetCount() > a.getRetweetCount())
            return this;

        setReplyCount(a.replyCount);
        setRetweetCount(a.retweetCount);

        replies.clear();
        for (JTweet repl : a.replies) {
            addReply(repl);
        }

        duplicates.clear();
        for (Long val : a.getDuplicates()) {
            addDuplicate(val);
        }
        return this;
    }

    public JTweet setProtected(boolean aProtected) {
        isProtected = aProtected;
        return this;
    }

    public boolean isProtected() {
        return isProtected;
    }

    /**
     * @return the resolved url of the first url entry or null if there are no
     * url entries
     */
    public String getUrl() {
        if (getUrlEntries() == null || getUrlEntries().isEmpty())
            return null;

        return getUrlEntries().iterator().next().getResolvedUrl();
    }
}