/**
* Copyright (C) 2010 Peter Karich <jetwick_@_pannous_._info>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.jetwick.tw.cmd;
import de.jetwick.data.UrlEntry;
import de.jetwick.data.JTweet;
import de.jetwick.data.JUser;
import de.jetwick.tw.FakeUrlExtractor;
import de.jetwick.tw.TweetDetector;
import de.jetwick.util.MyDate;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Date;
import java.util.List;
import java.util.Map.Entry;
import org.junit.Test;
import static org.junit.Assert.*;
/**
 * Tests for {@link TermCreateCommand}. The tests run the command over small
 * sets of {@link JTweet}s and then check the extracted text terms, the
 * resulting quality values and the detected language.
 *
 * @author Peter Karich, peat_hal 'at' users 'dot' sourceforge 'dot' net
 */
public class TermCreateCommandTest {

    /**
     * Runs a {@link TermCreateCommand} over the given tweets with term
     * removing enabled.
     *
     * @param tweets the tweets to process
     */
    static void execute(Collection<JTweet> tweets) {
        execute(tweets, true);
    }

    /**
     * Attaches the URL entries found by a {@link FakeUrlExtractor} in every
     * tweet text to the tweet and then runs a {@link TermCreateCommand}
     * through a {@link SerialCommandExecutor}.
     *
     * @param tweets the tweets to process
     * @param termRemoving flag passed to the {@link TermCreateCommand} constructor
     */
    static void execute(Collection<JTweet> tweets, boolean termRemoving) {
        // remove executor since we only have one remaining command?
        for (JTweet tw : tweets) {
            for (UrlEntry entry : new FakeUrlExtractor().setText(tw.getText()).run().getUrlEntries()) {
                tw.addUrlEntry(entry);
            }
        }
        new SerialCommandExecutor(tweets).add(new TermCreateCommand(termRemoving)).execute();
    }

    /**
     * Runs a {@link TermCreateCommand} over a single tweet with term removing
     * disabled.
     *
     * @param tw the tweet to process
     */
    static void execute(JTweet tw) {
        execute(Arrays.asList(tw), false);
    }

    /**
     * Expects a tweet containing a term once to get a higher quality than
     * tweets repeating that term, and the tweet with the most repetitions to
     * get the lowest quality.
     */
    @Test
    public void testQuality() {
        JTweet tw1 = createSolrTweet(1L, "@lwr32 #JAVA! "
                + "#COFFEE! #JAVA! #COFFEE! #JAVA! #COFFEE! #JAVA! #COFFEE! #JAVA! #COFFEE! #JAVA! #COFFEE! #JAVA! #COFFEE! #JAVA!", "usera");
        JTweet tw2 = createSolrTweet(2L, "@meggytron JAH-VA! java java java java "
                + "java java java. /Dante's Peak #requirescaffeine mashup", "userb");
        JTweet tw3 = createSolrTweet(3L, "@ierinleker ...JAVA JAVA JAVA JAVA JAVA "
                + "JAVA JAVA http://twitpic.com/2kk65u", "userc");
        JTweet tw4 = createSolrTweet(4L, "java", "userd");

        execute(Arrays.asList(tw1, tw2, tw3, tw4));

        assertTrue(tw4.getQuality() > tw3.getQuality());
        // both tweets have 7 java terms
        assertEquals(tw3.getQuality(), tw2.getQuality());
        assertTrue(tw2.getQuality() > tw1.getQuality());
    }

    /**
     * Checks the quality of near-duplicate tweets of one user in three
     * scenarios: ten almost identical tweets linking to the same page, two
     * similar accident reports with different links, and two similar news
     * tweets with different links.
     */
    @Test
    public void testQuality2() {
        String[] tweetsAsStr = new String[]{
            "Fernsehen entut Werder Bremen vs FC Twente Enschede http://watchlivefree.blogspot.com/2010/11/fernsehen-werder-bremen-vs-fc-twente.html",
            "Fernsehen taek Werder Bremen vs FC Twente Enschede http://watchlivefree.blogspot.com/2010/11/fernsehen-werder-bremen-vs-fc-twente.html",
            "Fernsehen stream Werder Bremen vs FC Twente Enschede http://watchlivefree.blogspot.com/2010/11/fernsehen-werder-bremen-vs-fc-twente.html",
            "Fernsehen live Werder Bremen vs FC Twente Enschede http://watchlivefree.blogspot.com/2010/11/fernsehen-werder-bremen-vs-fc-twente.html",
            "Televisie kijken Werder Bremen vs FC Twente Enschede http://watchlivefree.blogspot.com/2010/11/fernsehen-werder-bremen-vs-fc-twente.html",
            "Fernsehen Werder Bremen vs FC Twente Enschede http://watchlivefree.blogspot.com/2010/11/fernsehen-werder-bremen-vs-fc-twente.html",
            "Fernsehen kijken Werder Bremen vs FC Twente Enschede http://watchlivefree.blogspot.com/2010/11/fernsehen-werder-bremen-vs-fc-twente.html",
            "Fernsehen Televisie Werder Bremen vs FC Twente Enschede http://watchlivefree.blogspot.com/2010/11/fernsehen-werder-bremen-vs-fc-twente.html",
            "Fernsehen Werder Bremen vs FC Twente Enschede http://watchlivefree.blogspot.com/2010/11/fernsehen-werder-bremen-vs-fc-twente.html#1live",
            "Fernsehen Werder Bremen vs FC Twente Enschede http://watchlivefree.blogspot.com/2010/11/fernsehen-werder-bremen-vs-fc-twente.html#1"
        };
        List<JTweet> list = new ArrayList<JTweet>(tweetsAsStr.length);
        int counter = 0;
        JUser user = new JUser("sakilamahipallb");
        for (String text : tweetsAsStr) {
            // the running number serves as tweet id and as creation time (in ms)
            counter++;
            list.add(new JTweet(counter, text, user).setCreatedAt(new Date(counter)));
        }

        execute(list);

        int spamCounter = 0;
        // the first tweet is skipped; every later one has to be rated below QUAL_LOW
        for (JTweet tw : list.subList(1, list.size())) {
            assertTrue("tweet:" + tw, tw.getQuality() < JTweet.QUAL_LOW);
            if (tw.getQuality() < JTweet.QUAL_SPAM) {
                spamCounter++;
            }
        }
        // a lot of those tweets are spam - not only bad!
        assertTrue("spam tweets:" + spamCounter, spamCounter > 5);

        user = new JUser("user2");
        JTweet tw1 = new JTweet(1L, "E Grant Rd / N Swan Rd Accident no injury (Tue 3:24 PM) http://tinyurl.com/5hwubc", user).setCreatedAt(new Date(1));
        JTweet tw2 = new JTweet(2L, "N Columbus Bl / E Grant Rd Accident no injury (Tue 3:26 PM) http://tinyurl.com/658t96", user).setCreatedAt(new Date(2));
        execute(Arrays.asList(tw1, tw2));
        // assertTrue("tweet:" + tw1, tw1.getQuality() < SolrTweet.QUAL_MAX);
        assertTrue("tweet:" + tw1, tw1.getQuality() > JTweet.QUAL_SPAM);
        assertTrue("tweet:" + tw2, tw2.getQuality() < JTweet.QUAL_MAX);
        assertTrue("tweet:" + tw2, tw2.getQuality() > JTweet.QUAL_SPAM);

        user = new JUser("user2");
        tw1 = new JTweet(1L, "Werder Bremen verliert sein Heimspiel gegen Twente http://goo.gl/fb/fKFEi #werder #svw", user).setCreatedAt(new Date(1));
        tw2 = new JTweet(2L, "Werder Bremen verliert gegen Twente Enschede http://goo.gl/fb/O8maL #werder #svw", user).setCreatedAt(new Date(2));
        execute(Arrays.asList(tw1, tw2));
        // assertEquals reports the actual quality on failure, unlike assertTrue(a == b)
        assertEquals("tweet:" + tw1, JTweet.QUAL_MAX, tw1.getQuality());
        assertTrue("tweet:" + tw2, tw2.getQuality() < JTweet.QUAL_MAX);
        assertTrue("tweet:" + tw2, tw2.getQuality() > JTweet.QUAL_SPAM);
    }

    /**
     * Three tweets of one user share the same URL but carry different
     * resolved titles. The first tweet has to keep the maximum quality and
     * the later ones must stay above the spam threshold.
     */
    @Test
    public void testDecreaseQualityOnlyOnce() {
        String url1, url2, url3;
        url1 = url2 = url3 = "http://watchlivefree.blogspot.com";
        // NOTE(review): the third text has no space before the URL - looks
        // unintended, confirm before changing the fixture
        String[] tweetsAsStr = new String[]{
            "blap notspamword " + url1,
            "blup secondnotspamword " + url2,
            "bli secondsomething" + url3};

        JUser user = new JUser("user1");
        JTweet tw1 = new JTweet(1L, tweetsAsStr[0], user).setCreatedAt(new Date(1L));
        tw1.getUrlEntries().add(new UrlEntry(5, 123, url1).setResolvedTitle("title1"));
        JTweet tw2 = new JTweet(2L, tweetsAsStr[1], user).setCreatedAt(new Date(2L));
        tw2.getUrlEntries().add(new UrlEntry(5, 123, url2).setResolvedTitle("title2"));
        JTweet tw3 = new JTweet(3L, tweetsAsStr[2], user).setCreatedAt(new Date(3L));
        tw3.getUrlEntries().add(new UrlEntry(5, 123, url3).setResolvedTitle("title3"));

        execute(Arrays.asList(tw1, tw2, tw3));

        assertEquals(JTweet.QUAL_MAX, tw1.getQuality());
        assertTrue(tw2.getQuality() > JTweet.QUAL_SPAM);
        assertTrue(tw3.getQuality() > JTweet.QUAL_SPAM);
    }

    // disabled test, kept for reference
//    @Test
//    public void testUrlTitleQuality() {
//        String url1 = "http://watchlivefree.blogspot.different.domain.com",
//                url2 = "http://watchlivefree.blogspot.com";
//        String[] tweetsAsStr = new String[]{
//            "blap notspamword " + url1,
//            "blup secondnotspamword " + url2};
//
//        JUser user = new JUser("user1");
//        JTweet tw1 = new JTweet(1L, tweetsAsStr[0], user).setCreatedAt(new Date(1L));
//        tw1.getUrlEntries().add(new UrlEntry(5, 123, url1).setResolvedTitle("identical title"));
//        JTweet tw2 = new JTweet(2L, tweetsAsStr[1], user).setCreatedAt(new Date(2L));
//        tw2.getUrlEntries().add(new UrlEntry(5, 123, url2).setResolvedTitle("identical title"));
//
//        execute(Arrays.asList(tw1, tw2));
//
//        assertTrue("tweet:" + tw1, tw1.getQuality() > 90);
//        assertTrue("tweet:" + tw2, tw2.getQuality() < 90);
//    }

    /**
     * Expects two text terms for a two-word tweet, both for a stand-alone
     * tweet and for tweets sharing one user.
     */
    @Test
    public void testExecute() {
        JTweet tw = new JTweet(1L, "java lava", new JUser("tmp")).setCreatedAt(new Date(1L));
        execute(tw);
        assertEquals(2, tw.getTextTerms().size());

        JUser u = new JUser("peter");
        tw = new JTweet(1L, "java lava", u);
        JTweet tw2 = new JTweet(2L, "peter java", u).setCreatedAt(new Date(2L));
        // only tw is executed; tw2 is presumably reached via the shared
        // user - verify against JTweet/TermCreateCommand
        execute(tw);
        assertEquals(2, tw.getTextTerms().size());
        assertEquals(2, tw2.getTextTerms().size());
    }

    /**
     * Creates a tweet for a new user with the given name. The creation date
     * is derived from the id via {@link MyDate}.
     *
     * @param id tweet id, also fed into {@link MyDate} for the creation date
     * @param twText tweet text
     * @param user screen name of the user to create
     * @return the new tweet
     */
    JTweet createSolrTweet(long id, String twText, String user) {
        return new JTweet(id, twText, new JUser(user)).setCreatedAt(new MyDate(id).toDate());
    }

    /**
     * Expects exactly three terms for the executed tweet; neither "term1"
     * (only present in the user's other tweet) nor "not" may be among them.
     */
    @Test
    public void testTermDetection() {
        JUser user = new JUser("Peter");
        user.addOwnTweet(new JTweet(1, "term1 term2 term1", user));
        JTweet tw = new JTweet(2, "term3 not term2 important term3", user);
        user.addOwnTweet(tw);

        execute(tw);

        Collection<Entry<String, Integer>> coll = tw.getTextTerms().entrySet();
        assertEquals(3, coll.size());

        int term1Hits = 0;
        int notHits = 0;
        for (Entry<String, Integer> e : coll) {
            if (e.getKey().equals("term1")) {
                term1Hits++;
            }
            if (e.getKey().equals("not")) {
                notHits++;
            }
        }
        assertEquals(0, term1Hits);
        assertEquals(0, notHits);
    }

    /**
     * Expects a single term for a one-word tweet even though another tweet
     * of the same user contains that word as hashtag.
     */
    @Test
    public void testTermDetection2() {
        JUser user = new JUser("Peter");
        JTweet tw1 = new JTweet(1, "#term1 #term1", user);
        user.addOwnTweet(tw1);
        JTweet tw2 = new JTweet(2, "term1", user);
        user.addOwnTweet(tw2);

        execute(tw2);

        // two tweets with 'term1'
//        assertEquals(2, (int) extractor.run().getSortedTerms().get(0).getValue());
        assertEquals(1, tw2.getTextTerms().size());
    }

    /**
     * Expects only two terms for a tweet that repeats the phrase
     * "A Year Without Rain" seven times.
     */
    @Test
    public void testTermDetection3() {
        JTweet tw = new JTweet(1L, "A Year Without Rain "
                + "A Year Without Rain A Year Without Rain A Year Without Rain "
                + "A Year Without Rain A Year Without Rain A Year Without Rain", new JUser("peter"));
        execute(tw);
        assertEquals(2, tw.getTextTerms().size());
    }

    /**
     * Executes one tweet of a user owning three tweets and checks that the
     * quality of the executed tweet drops while the other one is unchanged.
     * Afterwards the term and language frequencies collected by
     * {@code checkSpamInExistingTweets} are verified.
     */
    @Test
    public void testOtherTweets() {
        JUser u = new JUser("peter");
        JTweet tw1 = new JTweet(1L, "A Year Without Rain Will Give Us desert xyz", u).setCreatedAt(new Date(2L));
        // tw2 is older than tw1
        JTweet tw2 = new JTweet(2L, "A Year Without Rain Will Give Us really fat desert", u).setCreatedAt(new Date(1L));
        // tw3 is not referenced again; presumably the constructor registers
        // it with the user so that it shows up in the frequencies - verify
        JTweet tw3 = new JTweet(3L, "great hui desert", u).setCreatedAt(new Date(0L));
        tw1.setQuality(100);
        tw2.setQuality(89);

        execute(tw1);

        // unchanged
        assertEquals(89, tw2.getQuality());
        assertTrue(tw1.getQuality() < 100);

        tw1.setQuality(100);
        StringFreqMap tFreq = new StringFreqMap();
        StringFreqMap lFreq = new StringFreqMap();
        new TermCreateCommand().checkSpamInExistingTweets(tw1, tFreq, lFreq);

        // without tw1
        // (the int casts pick the assertEquals(long, long) overload for the boxed values)
        assertEquals(9, (int) lFreq.get(TweetDetector.EN));
        assertEquals(1, (int) lFreq.get(TweetDetector.DE));
        assertEquals(6, (int) tw1.getLanguages().get(TweetDetector.EN));

        // without tw1
        assertEquals(2, (int) tFreq.get("desert"));
        assertEquals(1, (int) tFreq.get("hui"));
        assertNull(tFreq.get("xyz"));
        assertEquals(1, (int) tw2.getTextTerms().get("fat"));
    }

    /**
     * Checks the language stored in the tweet after execution for several
     * English, German and Dutch sample texts.
     */
    @Test
    public void testLanguageDetection2() {
        JUser user = new JUser("peter");
        JTweet tw1 = new JTweet(0, "this is lastwordIsNotRecognizedBecauseItCouldBeStrippedOut", user);
        execute(tw1);
        assertEquals(2, tw1.getLanguages().get(TweetDetector.EN).intValue());
        assertEquals(TweetDetector.UNKNOWN_LANG, tw1.getLanguage());

        // now the language is detected because a lot of noise words (NOISE_WORDS) were found
        JTweet tw = new JTweet(2, "viele ist dort deutscher Tweet!", user);
        execute(tw);
        assertEquals(TweetDetector.DE, tw.getLanguage());

        user = new JUser("peter");
        tw = new JTweet(3L, "Togos with @munckytown on lunch break. "
                + "Hall and Oates \"kiss on my list\" is playing... groovy", user);
        execute(tw);
        assertEquals(TweetDetector.EN, tw.getLanguage());

        user = new JUser("peter");
        tw = new JTweet(4L, "@ibood Bedankt voor de code! :-)", user);
        execute(tw);
        // only de and en are known so detect as unknown!
        assertEquals(TweetDetector.UNKNOWN_LANG, tw.getLanguage());

        // now detect the nl language
        tw = new JTweet(5L, "@MrDeek Klinkt goed toch, een bestek set is altijd leuk om te krijgen of te geven!", user);
        execute(tw);
        assertEquals(TweetDetector.NL, tw.getLanguage());
    }

    /**
     * Calls {@code detectLanguage} directly with hand-made language
     * frequencies of the tweet and of the other tweets.
     */
    @Test
    public void testLanguageDetection3() {
        // a single "de" hit and no other languages => unknown
        JTweet tw = new JTweet(1L, "tmptext", new JUser("tmp"));
        tw.getLanguages().inc("de", 1);
        StringFreqMap otherLanguages = new StringFreqMap();
        assertEquals(TweetDetector.UNKNOWN_LANG, new TermCreateCommand().detectLanguage(tw, otherLanguages));

        // two "de" hits plus one "de" hit in the other languages => de
        tw = new JTweet(1L, "tmptext", new JUser("tmp"));
        tw.getLanguages().inc("de", 2);
        otherLanguages = new StringFreqMap().set("de", 1);
        assertEquals("de", new TermCreateCommand().detectLanguage(tw, otherLanguages));

        // same as before but with additional unknown-language hits => still de
        tw = new JTweet(1L, "tmptext", new JUser("tmp"));
        tw.getLanguages().inc(TweetDetector.UNKNOWN_LANG, 2);
        tw.getLanguages().inc("de", 2);
        otherLanguages = new StringFreqMap().set("de", 1);
        assertEquals("de", new TermCreateCommand().detectLanguage(tw, otherLanguages));

        // "de" and "en" have identical counts => unknown
        tw = new JTweet(1L, "tmptext", new JUser("tmp"));
        tw.getLanguages().inc(TweetDetector.UNKNOWN_LANG, 2);
        tw.getLanguages().inc("de", 2);
        tw.getLanguages().inc("en", 2);
        otherLanguages = new StringFreqMap().set("de", 1).set("en", 1);
        assertEquals(TweetDetector.UNKNOWN_LANG, new TermCreateCommand().detectLanguage(tw, otherLanguages));
    }

    // disabled test, kept for reference
//    @Test
//    public void testSignature() {
//        SolrTweet tw = new SolrTweet(1L, "wtf wtf text", new SolrUser("tmp"));
//        new TermCreateCommand().calcTermsWithoutNoise(tw);
//        assertTrue(tw.getTextSignature().size() > 0);
//        SolrTweet tw2 = new SolrTweet(2L, "wtf wtf text", new SolrUser("tmp"));
//        new TermCreateCommand().calcTermsWithoutNoise(tw2);
//        assertEquals(tw.getTextSignature(), tw2.getTextSignature());
//
//        SolrTweet tw3 = new SolrTweet(3L, "wtf wtf text wikileaks info", new SolrUser("tmp"));
//        new TermCreateCommand().calcTermsWithoutNoise(tw3);
//        int counter = 0;
//        for (Long val : tw3.getTextSignature()) {
//            if (tw2.getTextSignature().contains(val))
//                counter++;
//        }
//        assertTrue("At least on signature should be identical for tweet2 and tweet3", counter > 0);
//    }
}