/*
* Copyright 2010 Peter Karich jetwick_@_pannous_._info
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.jetwick.es;
import java.util.regex.Pattern;
import de.jetwick.util.MyDate;
import org.elasticsearch.search.facet.filter.FilterFacet;
import org.elasticsearch.search.SearchHits;
import de.jetwick.config.Configuration;
import de.jetwick.data.UrlEntry;
import de.jetwick.data.JTweet;
import de.jetwick.data.JUser;
import de.jetwick.tw.Extractor;
import de.jetwick.tw.cmd.SerialCommandExecutor;
import de.jetwick.tw.cmd.TermCreateCommand;
import de.jetwick.util.AnyExecutor;
import de.jetwick.util.Helper;
import de.jetwick.util.MapEntry;
import de.jetwick.util.StopWatch;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Date;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeSet;
import java.util.concurrent.atomic.AtomicInteger;
import org.elasticsearch.action.get.GetResponse;
import org.elasticsearch.action.search.SearchRequestBuilder;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.action.search.SearchType;
import org.elasticsearch.client.Client;
import org.elasticsearch.common.unit.DistanceUnit;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.json.JsonXContent;
import org.elasticsearch.index.query.FilterBuilder;
import org.elasticsearch.index.query.FilterBuilders;
import org.elasticsearch.index.query.GeoDistanceFilterBuilder;
import org.elasticsearch.index.query.NotFilterBuilder;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.index.query.QueryStringQueryBuilder;
import org.elasticsearch.index.query.RangeFilterBuilder;
import org.elasticsearch.index.search.geo.GeoDistance;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.facet.Facet;
import org.elasticsearch.search.facet.FacetBuilders;
import org.elasticsearch.search.facet.Facets;
import org.elasticsearch.search.facet.query.QueryFacet;
import org.elasticsearch.search.facet.terms.TermsFacet;
import org.elasticsearch.search.facet.terms.TermsFacetBuilder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Provides search functionality via elasticsearch.
*
* @author Peter Karich, jetwick_@_pannous_._info
*/
public class ElasticTweetSearch extends AbstractElasticSearchQueueEnabled<JTweet> {
public static final long OLDEST_DT_IN_MILLIS = 4 * 24 * MyDate.ONE_HOUR;
public static final String TITLE = "dest_title_t";
public static final String TWEET_TEXT = "tw";
public static final String DATE = "dt";
public static final String DATE_FACET = "datefacet";
public static final String RT_COUNT = "retw_i";
public static final String DUP_COUNT = "dups_i";
public static final String IS_RT = "crt_b";
public static final String UPDATE_DT = "update_dt";
public static final String TAG = "tag";
public static final String INREPLY_ID = "inreply_l";
public static final String QUALITY = "quality_i";
public static final String LANG = "lang";
public static final String URL_COUNT = "url_i";
public static final String FIRST_URL_TITLE = "dest_title_1_s";
public static final String USER = "user";
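// the following filter strings use Lucene query syntax; e.g. "[1 TO *]" matches all values >= 1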
public static final String FILTER_NO_DUPS = DUP_COUNT + ":0";
public static final String FILTER_ONLY_DUPS = DUP_COUNT + ":[1 TO *]";
public static final String FILTER_NO_URL_ENTRY = URL_COUNT + ":0";
public static final String FILTER_URL_ENTRY = URL_COUNT + ":[1 TO *]";
public static final String FILTER_NO_SPAM = QUALITY + ":[" + (JTweet.QUAL_SPAM + 1) + " TO *]";
public static final String FILTER_SPAM = QUALITY + ":[* TO " + JTweet.QUAL_SPAM + "]";
public static final String RELEVANCE = "relevance";
public static final String _ID = "_id_";
private String indexName = "twindex";
private List<AnyExecutor<JTweet>> commitListener = new ArrayList<AnyExecutor<JTweet>>(1);
private Logger logger = LoggerFactory.getLogger(getClass());
public ElasticTweetSearch() {
}
public ElasticTweetSearch(Configuration config) {
this(config.getTweetSearchUrl());
}
public ElasticTweetSearch(String url) {
super(url);
}
public ElasticTweetSearch(Client client) {
super(client);
}
@Override
public String getIndexName() {
return indexName;
}
@Override
public void setIndexName(String indexName) {
this.indexName = indexName;
}
@Override
public String getIndexType() {
return "tweet";
}
Client getClient() {
return client;
}
public void deleteUntil(Date removeUntil) {
logger.info("Deleting tweets older than " + removeUntil);
NotFilterBuilder notPersistentFilter = FilterBuilders.notFilter(FilterBuilders.existsFilter(UPDATE_DT));
FilterBuilder fewRetweetsFilter = FilterBuilders.rangeFilter(RT_COUNT).lt(100);
RangeFilterBuilder tooOldFilter = FilterBuilders.rangeFilter(DATE);
tooOldFilter.lte(removeUntil);
FilterBuilder filter = FilterBuilders.andFilter(tooOldFilter,
notPersistentFilter, fewRetweetsFilter);
client.prepareDeleteByQuery(getIndexName()).setTypes(getIndexType()).
setQuery(QueryBuilders.filteredQuery(QueryBuilders.matchAllQuery(), filter)).
execute().
actionGet();
}
public void delete(Collection<JTweet> tws) {
if (tws.isEmpty())
return;
try {
for (JTweet tw : tws) {
deleteById(tw.getId());
}
} catch (Exception ex) {
throw new RuntimeException(ex);
}
}
@Override
public XContentBuilder createDoc(JTweet tw) throws IOException {
if (tw.getFromUser() == null) {
// this happened when UpdateResult.addNewTweet(tweet1) and UpdateResult.addRemovedTweet(tweet1)
// were called at the same time; should be fixed via: if (!removedTweets.contains(tweet)) newTweets.add(tweet);
logger.error("fromUser of tweet must not be null:" + tw.getTwitterId() + " " + tw.getText());
return null;
}
// daemon tweets have no known twitterId and no known createdAt date
if (tw.isDaemon())
return null;
XContentBuilder b = JsonXContent.contentBuilder().startObject();
b.field(TWEET_TEXT, tw.getText());
b.field("tw_i", tw.getText().length());
b.field(UPDATE_DT, tw.getUpdatedAt());
b.field(DATE, tw.getCreatedAt());
b.field(IS_RT, tw.isRetweet());
if (tw.getLocation() == null)
b.field("loc", tw.getFromUser().getLocation());
else
b.field("loc", tw.getLocation());
b.field("geo", tw.getLat() + "," + tw.getLon());
if (!JTweet.isDefaultInReplyId(tw.getInReplyTwitterId()))
b.field(INREPLY_ID, tw.getInReplyTwitterId());
b.field(USER, tw.getFromUser().getScreenName());
b.field("iconUrl", tw.getFromUser().getProfileImageUrl());
double relevancy = tw.getCreatedAt().getTime() / MyDate.ONE_HOUR;
// every 14 retweets boosts the tweet one hour further
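// e.g. 28 retweets push the tweet 2 "hours" further; the boost is capped at 100 retweets below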
float scale = 14;
if (tw.getRetweetCount() <= 100)
relevancy += tw.getRetweetCount() / scale;
else
relevancy += 100 / scale;
if (tw.getText().length() <= 30)
relevancy *= 0.5;
if (tw.getQuality() <= 65)
relevancy *= 0.5;
b.field(RELEVANCE, relevancy);
for (Entry<String, Integer> entry : tw.getTextTerms().entrySet()) {
b.field(TAG, entry.getKey());
}
int counter = 0;
for (UrlEntry urlEntry : tw.getUrlEntries()) {
counter++;
b.field("orig_url_" + counter + "_s", urlEntry.getOriginalUrl(tw));
b.field("url_pos_" + counter + "_s", urlEntry.getIndex() + "," + urlEntry.getLastIndex());
b.field("dest_url_" + counter + "_s", urlEntry.getResolvedUrl());
if (!Helper.isEmpty(urlEntry.getResolvedDomain()))
b.field("dest_domain_" + counter + "_s", urlEntry.getResolvedDomain());
if (!Helper.isEmpty(urlEntry.getResolvedTitle()))
b.field("dest_title_" + counter + "_s", urlEntry.getResolvedTitle());
if (counter == 1)
b.field(TITLE, urlEntry.getResolvedTitle());
if (counter >= 3)
break;
}
b.field(URL_COUNT, counter);
b.field(DUP_COUNT, tw.getDuplicates().size());
b.field(LANG, tw.getLanguage());
b.field(QUALITY, tw.getQuality());
b.field("repl_i", tw.getReplyCount());
b.field(RT_COUNT, tw.getRetweetCount());
b.endObject();
return b;
}
@Override
public JTweet readDoc(String idAsStr, long version, Map<String, Object> source) {
// if we use in mapping: "_source" : {"enabled" : false}
// we need to include all fields in query to use doc.getFields()
// instead of doc.getSource()
String name = (String) source.get(USER);
String text = (String) source.get(TWEET_TEXT);
if (text == null || name == null || idAsStr == null) {
logger.error("Null tweet text or id!!!??" + idAsStr + " " + name + " " + text);
return new JTweet(-1L, "", new JUser(""));
}
JUser user = new JUser(name);
user.setLocation((String) source.get("loc"));
user.setProfileImageUrl((String) source.get("iconUrl"));
long id = Long.parseLong(idAsStr);
JTweet tw = new JTweet(id, text, user);
tw.setVersion(version);
String p = (String) source.get("geo");
if (p != null)
try {
String[] strs = p.split(",");
double lat = Double.parseDouble(strs[0]);
double lon = Double.parseDouble(strs[1]);
tw.setGeoLocation(lat, lon);
} catch (Exception ex) {
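// ignore malformed "lat,lon" strings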
}
tw.setCreatedAt(Helper.toDateNoNPE((String) source.get(DATE)));
tw.setUpdatedAt(Helper.toDateNoNPE((String) source.get(UPDATE_DT)));
int rt = ((Number) source.get(RT_COUNT)).intValue();
int rp = ((Number) source.get("repl_i")).intValue();
tw.setRetweetCount(rt);
tw.setReplyCount(rp);
if (source.get(QUALITY) != null)
tw.setQuality(((Number) source.get(QUALITY)).intValue());
tw.setLanguage((String) source.get(LANG));
if (source.get(INREPLY_ID) != null) {
long replyId = ((Number) source.get(INREPLY_ID)).longValue();
tw.setInReplyTwitterId(replyId);
}
tw.setUrlEntries(Arrays.asList(parseUrlEntries(source)));
return tw;
}
public UrlEntry[] parseUrlEntries(Map<String, Object> source) {
int urlCount = 0;
try {
urlCount = ((Number) source.get(URL_COUNT)).intValue();
} catch (Exception ex) {
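// treat a missing or malformed url count as zero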
}
if (urlCount == 0)
return new UrlEntry[0];
UrlEntry urls[] = new UrlEntry[urlCount];
for (int i = 0; i < urls.length; i++) {
urls[i] = new UrlEntry();
}
for (int counter = 0; counter < urls.length; counter++) {
String str = (String) source.get("url_pos_" + (counter + 1) + "_s");
String[] strs = str.split(",");
urls[counter].setIndex(Integer.parseInt(strs[0]));
urls[counter].setLastIndex(Integer.parseInt(strs[1]));
}
for (int counter = 0; counter < urls.length; counter++) {
String str = (String) source.get("dest_url_" + (counter + 1) + "_s");
urls[counter].setResolvedUrl(str);
}
for (int counter = 0; counter < urls.length; counter++) {
String str = (String) source.get("dest_domain_" + (counter + 1) + "_s");
urls[counter].setResolvedDomain(str);
}
for (int counter = 0; counter < urls.length; counter++) {
String str = (String) source.get("dest_title_" + (counter + 1) + "_s");
urls[counter].setResolvedTitle(str);
}
return urls;
}
/**
* Finds a reason for a (trending) topic:
* 1. query via q=topic
* 2. the retweet count filter should be high enough to avoid noise,
* but not so high that no results remain -> use facets with fine
* grained buckets and determine the appropriate filter query
* 3. return the created query (with sort 'oldest' added)
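*
* A minimal usage sketch (assuming "tweetSearch" is a configured instance of this class):
* <pre>{@code
* JetwickQuery topicQuery = tweetSearch.createFindOriginQuery(null, "wikileaks", 20);
* SearchResponse rsp = tweetSearch.query(topicQuery);
* }</pre>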
*/
public JetwickQuery createFindOriginQuery(JetwickQuery oldQuery, String tag, int minResults) {
if (tag.isEmpty())
return new TweetQuery("");
try {
JetwickQuery q;
if (oldQuery == null)
q = new TweetQuery(tag);
else
q = oldQuery.getCopy().setQuery(tag);
// copy current state of q into resQuery!
JetwickQuery resQuery = q.getCopy();
// more fine grained information about retweets
Map<String, Integer> orderedFQ = new LinkedHashMap<String, Integer>();
orderedFQ.put("[16 TO *]", 16);
orderedFQ.put("[11 TO 15]", 11);
orderedFQ.put("[6 TO 10]", 6);
orderedFQ.put("[1 TO 5]", 1);
orderedFQ.put("0", 0);
q.setSize(0).addFilterQuery(IS_RT, false);
for (String facQ : orderedFQ.keySet()) {
q.addFacetQuery(RT_COUNT, facQ);
}
SearchResponse rsp = query(q);
long results = rsp.getHits().getTotalHits();
if (results == 0)
return new TweetQuery(tag);
resQuery.addFilterQuery(IS_RT, false);
resQuery.setSort(DATE, "asc");
long counter = 0;
for (Entry<String, Integer> entry : orderedFQ.entrySet()) {
FilterFacet ff = rsp.getFacets().facet(RT_COUNT + ":" + entry.getKey());
// System.out.println("facets:" + ff.count());
counter += ff.count();
if (counter >= minResults) {
if (entry.getValue() > 0)
resQuery.addFilterQuery(RT_COUNT, "[" + entry.getValue() + " TO *]");
break;
}
}
return resQuery;//.attachFacetibility();
} catch (Exception ex) {
throw new RuntimeException(ex);
}
}
Collection<JUser> search(String str) {
List<JUser> user = new ArrayList<JUser>();
query(user, new TweetQuery(str));
return user;
}
@Override
public SearchResponse query(JetwickQuery query) {
return query(new ArrayList<JUser>(), query);
}
public SearchResponse query(Collection<JUser> users, JetwickQuery query) {
return query(users, super.query(query));
}
public SearchResponse query(Collection<JUser> users, SearchResponse rsp) {
SearchHit[] docs = rsp.getHits().getHits();
Map<String, JUser> usersMap = new LinkedHashMap<String, JUser>();
for (SearchHit sd : docs) {
// System.out.println(sd.getExplanation().toString());
JUser u = readDoc(sd.getId(), sd.getVersion(), sd.getSource()).getFromUser();
JUser uOld = usersMap.get(u.getScreenName());
if (uOld == null)
usersMap.put(u.getScreenName(), u);
else
uOld.addOwnTweet(u.getOwnTweets().iterator().next());
}
users.addAll(usersMap.values());
return rsp;
}
public Collection<JTweet> searchReplies(long id, boolean retweet) {
try {
JetwickQuery sq = new TweetQuery(true).addFilterQuery(IS_RT, retweet).addFilterQuery(INREPLY_ID, id);
SearchResponse rsp = query(sq);
return collectObjects(rsp);
} catch (Exception ex) {
logger.error("Error while searchReplies", ex);
return Collections.emptySet();
}
}
void testUpdate(JTweet tmpTweets) {
queueObject(tmpTweets);
forceEmptyQueueAndRefresh();
}
void testUpdate(Collection<JTweet> tmpTweets) {
queueObjects(tmpTweets);
forceEmptyQueueAndRefresh();
}
/**
* Updates a list of tweets together with their replies and retweets.
*
* @param tmpTweets the tweets to feed into the index
* @param removeUntil the date before which old tweets should be removed
* @param performDelete whether old tweets should be deleted at all (avoid too frequent removals!)
* @return the updated tweets
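*
* A minimal usage sketch (assuming "tweetSearch" is a configured instance of this class
* and "fetchedTweets" plus the "removeUntil" date are provided by the caller):
* <pre>{@code
* Collection<JTweet> updated = tweetSearch.update(fetchedTweets, removeUntil, false);
* }</pre>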
*/
public Collection<JTweet> update(Collection<JTweet> tmpTweets, Date removeUntil, boolean performDelete) {
try {
Map<String, JUser> usersMap = new LinkedHashMap<String, JUser>();
Map<Long, JTweet> existingTweets = new LinkedHashMap<Long, JTweet>();
StringBuilder idStr = new StringBuilder();
int counts = 0;
// we can add max ~150 tweets per request (otherwise the webcontainer won't handle the long request)
for (JTweet tw : tmpTweets) {
if (counts > 0)
idStr.append(" OR ");
counts++;
idStr.append(tw.getTwitterId());
}
// get existing tweets and users
JetwickQuery query = new TweetQuery().addFilterQuery(_ID + getIndexType(), idStr.toString()).setSize(counts);
SearchResponse rsp = query(query);
SearchHits docs = rsp.getHits();
for (SearchHit sd : docs) {
JTweet tw = readDoc(sd.getId(), sd.getVersion(), sd.getSource());
existingTweets.put(tw.getTwitterId(), tw);
JUser u = tw.getFromUser();
JUser uOld = usersMap.get(u.getScreenName());
if (uOld == null)
usersMap.put(u.getScreenName(), u);
else
uOld.addOwnTweet(u.getOwnTweets().iterator().next());
}
// Avoid storing existing tweets again
Map<Long, JTweet> twMap = new LinkedHashMap<Long, JTweet>();
for (JTweet tmpTweet : tmpTweets) {
// do not store if too old
if (!tmpTweet.isPersistent() && tmpTweet.getCreatedAt().getTime() < removeUntil.getTime())
continue;
JTweet exTw = existingTweets.get(tmpTweet.getTwitterId());
// feed if new or if it should be persistent
if (exTw == null || tmpTweet.isPersistent()) {
String name = tmpTweet.getFromUser().getScreenName();
JUser u = usersMap.get(name);
if (u == null) {
u = tmpTweet.getFromUser();
usersMap.put(name, u);
}
u.addOwnTweet(tmpTweet);
// tweet does not exist. so store it into the todo map
twMap.put(tmpTweet.getTwitterId(), tmpTweet);
// overwrite existing tweets if persistent BUT update version
if (tmpTweet.isPersistent() && exTw != null)
tmpTweet.setVersion(exTw.getVersion());
}
}
LinkedHashSet<JTweet> updateTweets = new LinkedHashSet<JTweet>(twMap.values());
updateTweets.addAll(findReplies(twMap));
updateTweets.addAll(findRetweets(twMap, usersMap));
updateTweets.addAll(findDuplicates(twMap));
// add the additionally fetched tweets to the user but do not add to updateTweets
// this is a bit expensive ~30-40sec for every store call on a large index!
// fetchMoreTweets(twMap, usersMap);
store(updateTweets, false);
// We do not receive deleted tweets. But do we need to re-store the tweets
// for which a deleted tweet was a retweet? No: for "userA: text" and
// "userB: RT @usera: text" the retweet is always indexed AFTER the original.
if (performDelete) {
logger.info("Deleting tweets older than " + removeUntil);
deleteUntil(removeUntil);
}
return updateTweets;
} catch (Exception ex) {
throw new RuntimeException(ex);
}
}
private StopWatch sw1 = new StopWatch();
private StopWatch sw2 = new StopWatch();
private StopWatch sw3 = new StopWatch();
private StopWatch sw4 = new StopWatch();
void store(Collection<JTweet> tweets, boolean refresh) {
try {
if (tweets.isEmpty())
return;
tweets = new SerialCommandExecutor(tweets).add(
new TermCreateCommand().setSw1(sw1).setSw2(sw2).setSw3(sw3).setSw4(sw4)).execute();
List<JTweet> list = new ArrayList<JTweet>(tweets);
Collection<Integer> failedArticleIndices = bulkUpdate(list, getIndexName());
for (Integer integ : failedArticleIndices) {
JTweet tw = list.get(integ);
tw.setUpdateCount(tw.getUpdateCount() + 1);
if (tw.getUpdateCount() > 10)
logger.warn("PROBLEM: skipped tweet. it failed " + tw.getUpdateCount() + " times:" + tw);
else
queueFailedObject(tw);
}
} catch (Exception e) {
logger.error("Exception while updating.", e);
}
}
/**
* Ensures that every user has at least 5 tweets cached, to make spam
* detection more effective.
*/
public void fetchMoreTweets(Map<Long, JTweet> tweets, final Map<String, JUser> userMap) {
for (JUser us : userMap.values()) {
// guarantee 5 tweets to be in the cache
if (us.getOwnTweets().size() > 4)
continue;
// fetch 10 tweets if less than 5 tweets are in the cache
JetwickQuery query = new TweetQuery().addFilterQuery("user", us.getScreenName()).setSize(10);
try {
SearchResponse rsp = query(query);
SearchHits docs = rsp.getHits();
for (SearchHit sd : docs) {
JTweet tw = readDoc(sd.getId(), sd.getVersion(), sd.getSource());
JTweet twOld = tweets.get(tw.getTwitterId());
if (twOld == null)
us.addOwnTweet(tw);
}
} catch (Exception ex) {
throw new RuntimeException(ex);
}
}
}
/**
* Connects tweets via their retweet text.
*
* @return all tweets which should be updated
*/
public Collection<JTweet> findRetweets(Map<Long, JTweet> tweets, final Map<String, JUser> userMap) {
// 1. check if 'tweets' contains originals which were retweeted -> only done for 'tweets'
// 2. check if 'tweets' contains retweets -> done for 'tweets' and for tweets already in the index
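// e.g. "userB: RT @usera: some text" gets connected to userA's original "some text" tweet if it exists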
final Set<JTweet> updatedTweets = new LinkedHashSet<JTweet>();
Extractor extractor = new Extractor() {
@Override
public boolean onNewUser(int index, String user) {
boolean isRetweet = index >= 3 && text.substring(index - 3, index).equalsIgnoreCase("rt ");
if (isRetweet) {
user = user.toLowerCase();
JUser existingUser = userMap.get(user);
JTweet resTw = null;
// check ifRetweetOf against local tweets
if (existingUser != null)
for (JTweet tmp : existingUser.getOwnTweets()) {
if (tmp.getCreatedAt().getTime() < tweet.getCreatedAt().getTime()
&& tweet.isRetweetOf(tmp)) {
if (addReplyNoTricks(tmp, tweet)) {
resTw = tmp;
break;
}
}
}
// check ifRetweetOf against tweets existing in index
if (resTw == null)
resTw = connectToOrigTweet(tweet, user);
if (resTw != null) {
updatedTweets.add(resTw);
return false;
}
}
// returning false above stops the Extractor once the original tweet was found; otherwise keep scanning
return true;
}
};
for (JTweet tw : tweets.values()) {
if (tw.isRetweet()) {
extractor.setTweet(tw).run();
}
}
return updatedTweets;
}
/**
* Adds a relation from the given retweet to its existing/original tweet.
*/
public JTweet connectToOrigTweet(JTweet tw, String toUserStr) {
if (tw.isRetweet()) {
// do not connect if retweeted user == user who retweets
if (toUserStr.equals(tw.getFromUser().getScreenName()))
return null;
try {
// connect retweets only to tweets that are at most searchTweetsDays old
SearchResponse rsp = query(new TweetQuery(JetwickQuery.escapeQuery(tw.extractRTText())).addFilterQuery(USER, toUserStr).
addFilterQuery(IS_RT, false).
setSize(10));
List<JTweet> existingTw = collectObjects(rsp);
for (JTweet tmp : existingTw) {
boolean isRetweet = tw.isRetweetOf(tmp);
if (isRetweet) {
boolean check = addReplyNoTricks(tmp, tw);
if (check)
return tmp;
}
}
} catch (Exception ex) {
logger.error("couldn't connect tweet to orig tweet:" + ex.getMessage());
}
}
return null;
}
/**
* Connects tweets via their inReplyId.
*
* @return all tweets which should be updated
*/
public Collection<JTweet> findReplies(Map<Long, JTweet> tweets) {
Set<JTweet> updatedTweets = new LinkedHashSet<JTweet>();
Map<Long, JTweet> replyMap = new LinkedHashMap<Long, JTweet>();
for (JTweet tw : tweets.values()) {
if (!JTweet.isDefaultInReplyId(tw.getInReplyTwitterId()) && !tw.isRetweet())
replyMap.put(tw.getInReplyTwitterId(), tw);
}
Iterator<JTweet> iter = tweets.values().iterator();
findRepliesInBatch(iter, tweets, replyMap, updatedTweets);
return updatedTweets;
}
protected void findRepliesInBatch(Iterator<JTweet> iter, Map<Long, JTweet> origTweets,
Map<Long, JTweet> replyIdToTweetMap, Collection<JTweet> updatedTweets) {
int counter = 0;
StringBuilder idStr = new StringBuilder();
StringBuilder replyIdStr = new StringBuilder();
while (iter.hasNext()) {
JTweet tw = iter.next();
JTweet tmp = replyIdToTweetMap.get(tw.getTwitterId());
if (tmp != null) {
if (addReplyNoTricks(tw, tmp)) {
updatedTweets.add(tw);
updatedTweets.add(tmp);
}
} else {
if (replyIdStr.length() > 0)
replyIdStr.append(" OR ");
replyIdStr.append(tw.getTwitterId());
}
if (JTweet.isDefaultInReplyId(tw.getInReplyTwitterId()))
continue;
tmp = origTweets.get(tw.getInReplyTwitterId());
if (tmp != null) {
if (addReplyNoTricks(tmp, tw)) {
updatedTweets.add(tw);
updatedTweets.add(tmp);
}
} else {
counter++;
if (idStr.length() > 0)
idStr.append(" OR ");
idStr.append(tw.getInReplyTwitterId());
}
}
try {
// get tweets which reply to our input tweets
// INREPLY_ID:"tweets[i].id"
if (replyIdStr.length() > 0) {
JetwickQuery query = new TweetQuery().addFilterQuery(INREPLY_ID, replyIdStr.toString()).setSize(origTweets.size());
findRepliesForOriginalTweets(query, origTweets, updatedTweets);
}
// get original tweets where we have replies
if (idStr.length() > 0) {
JetwickQuery query = new TweetQuery().addFilterQuery(_ID + getIndexType(), idStr.toString()).setSize(counter);
selectOriginalTweetsWithReplies(query, origTweets.values(), updatedTweets);
}
} catch (Exception ex) {
logger.error("couldn't find replies in a batch query", ex);
}
}
protected void findRepliesForOriginalTweets(JetwickQuery query, Map<Long, JTweet> tweets,
Collection<JTweet> updatedTweets) {
Map<Long, JTweet> replyMap = new LinkedHashMap<Long, JTweet>();
SearchResponse rsp = query(query);
SearchHits docs = rsp.getHits();
for (SearchHit sd : docs) {
JTweet tw = readDoc(sd.getId(), sd.getVersion(), sd.getSource());
replyMap.put(tw.getTwitterId(), tw);
}
for (JTweet inReplSolrTweet : replyMap.values()) {
if (JTweet.isDefaultInReplyId(inReplSolrTweet.getInReplyTwitterId()))
continue;
JTweet origTw = tweets.get(inReplSolrTweet.getInReplyTwitterId());
if (origTw != null && addReplyNoTricks(origTw, inReplSolrTweet)) {
updatedTweets.add(origTw);
updatedTweets.add(inReplSolrTweet);
}
}
}
protected void selectOriginalTweetsWithReplies(JetwickQuery query, Collection<JTweet> tweets,
Collection<JTweet> updatedTweets) {
SearchResponse rsp = query(query);
SearchHits docs = rsp.getHits();
Map<Long, JTweet> origMap = new LinkedHashMap<Long, JTweet>();
for (SearchHit sd : docs) {
JTweet tw = readDoc(sd.getId(), sd.getVersion(), sd.getSource());
origMap.put(tw.getTwitterId(), tw);
}
if (origMap.size() > 0)
for (JTweet inReplSolrTweet : tweets) {
if (JTweet.isDefaultInReplyId(inReplSolrTweet.getInReplyTwitterId()))
continue;
JTweet origTw = origMap.get(inReplSolrTweet.getInReplyTwitterId());
if (origTw != null && addReplyNoTricks(origTw, inReplSolrTweet)) {
updatedTweets.add(origTw);
updatedTweets.add(inReplSolrTweet);
}
}
}
public boolean addReplyNoTricks(JTweet orig, JTweet reply) {
if (orig.getFromUser().equals(reply.getFromUser()))
return false;
try {
// ensure that reply.user does not already have a tweet in orig.replies
JetwickQuery q = new TweetQuery().addFilterQuery(INREPLY_ID, orig.getTwitterId()).
addFilterQuery("-" + _ID + getIndexType(), reply.getTwitterId()).
addFilterQuery("user", reply.getFromUser().getScreenName());
if (query(q).getHits().getTotalHits() > 0)
return false;
orig.addReply(reply);
return true;
} catch (Exception ex) {
logger.error("couldn't add reply to:" + orig, ex);
return false;
}
}
/**
* @param exec will be called directly after the tweets have been fed
* into the index. WARNING: it is not guaranteed that the tweets are
* already searchable, as every index has a (near) real-time latency
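*
* A usage sketch (assuming {@code AnyExecutor<JTweet>} declares a single
* {@code execute(JTweet)} callback returning a JTweet, as it is used in this class):
* <pre>{@code
* tweetSearch.addListener(new AnyExecutor<JTweet>() {
*     public JTweet execute(JTweet tw) {
*         System.out.println("indexed tweet " + tw.getTwitterId());
*         return tw;
*     }
* });
* }</pre>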
*/
public void addListener(AnyExecutor<JTweet> exec) {
if (!commitListener.contains(exec))
commitListener.add(exec);
}
public void removeListener(AnyExecutor<JTweet> exec) {
commitListener.remove(exec);
}
public JTweet findByTwitterId(Long twitterId) {
try {
GetResponse rsp = client.prepareGet(getIndexName(), getIndexType(), Long.toString(twitterId)).
execute().actionGet();
if (rsp.getSource() == null)
return null;
return readDoc(rsp.getId(), rsp.getVersion(), rsp.getSource());
} catch (Exception ex) {
throw new RuntimeException(ex);
}
}
public Collection<String> getUserChoices(JetwickQuery lastQ, String input) {
try {
if (input.length() < 1)
return Collections.emptyList();
// NOT context dependent any longer ...
input = input.toLowerCase();
SearchRequestBuilder srb = createSearchBuilder();
srb.setQuery(QueryBuilders.fieldQuery(USER, input + "*"));
List<JUser> users = new ArrayList<JUser>();
query(users, srb.execute().actionGet());
Set<String> res = new TreeSet<String>();
for (JUser u : users) {
if (u.getScreenName().startsWith(input))
res.add(u.getScreenName());
if (res.size() > 9)
break;
}
return res;
} catch (Exception ex) {
logger.error("Error while getUserChoices:" + input + " " + lastQ, ex);
return Collections.emptyList();
}
}
public Collection<String> getQueryChoices(JetwickQuery lastQ, String input) {
try {
if (input.length() < 2)
return Collections.emptyList();
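// split the input at the last space: only the last term (secPart) is completed,
// while firstPart is kept as the already typed prefix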
String firstPart = "";
String secPart = input;
int index = input.lastIndexOf(" ");
Set<String> existingTerms = new HashSet<String>();
if (index > 0) {
firstPart = input.substring(0, index);
secPart = input.substring(index + 1);
for (String tmp : input.split(" ")) {
existingTerms.add(tmp.toLowerCase().trim());
}
} else
existingTerms.add(secPart);
if (lastQ == null) {
lastQ = new TweetQuery(firstPart, false);
} else {
lastQ = lastQ.getCopy().setQuery(firstPart);
// remove any date restrictions
lastQ.removeFilterQueries(DATE);
lastQ.removeFacets();
}
SearchRequestBuilder srb = createSearchBuilder();
lastQ.initRequestBuilder(srb);
TermsFacetBuilder tfb = FacetBuilders.termsFacet(TAG).field(TAG);
if (!secPart.trim().isEmpty())
tfb.regex(secPart + ".*", Pattern.DOTALL);
srb.addFacet(tfb);
SearchResponse rsp = query(new ArrayList<JUser>(), srb.execute().actionGet());
Set<String> res = new TreeSet<String>();
TermsFacet tf = rsp.facets().facet(TAG);
if (tf != null) {
for (TermsFacet.Entry cnt : tf.entries()) {
String lowerSugg = cnt.getTerm().toLowerCase();
if (existingTerms.contains(lowerSugg))
continue;
if (lowerSugg.startsWith(secPart)) {
if (firstPart.isEmpty())
res.add(cnt.getTerm());
else
res.add(firstPart + " " + cnt.getTerm());
}
if (res.size() > 9)
break;
}
}
return res;
} catch (Exception ex) {
logger.error("Error while getQueryChoices:" + input + " " + lastQ + " -> Error:" + ex.getMessage());
return Collections.emptyList();
}
}
JUser findByUserName(String uName) {
try {
List<JUser> list = new ArrayList<JUser>();
// get all tweets of the user so set rows large ...
query(list, new TweetQuery().addFilterQuery("user", uName.toLowerCase()).setSize(10));
if (list.isEmpty())
return null;
return list.get(0);
} catch (Exception ex) {
throw new RuntimeException(ex);
}
}
public List<JTweet> searchTweets(JetwickQuery q) {
try {
return collectObjects(query(q));
} catch (Exception ex) {
throw new RuntimeException(ex);
}
}
public Collection<String> searchTrends(JetwickQuery q, int limit) {
try {
q.addFacetField(TAG);
SearchResponse rsp = query(q);
Facets facets = rsp.facets();
if (facets == null)
return Collections.emptyList();
Set<String> set = new LinkedHashSet<String>();
for (Facet facet : facets.facets()) {
if (facet instanceof TermsFacet) {
TermsFacet ff = (TermsFacet) facet;
for (TermsFacet.Entry e : ff.entries()) {
if (e.count() > limit)
set.add(e.getTerm());
}
}
}
return set;
} catch (Exception ex) {
throw new RuntimeException(ex);
}
}
public String getTweetsAsString(JetwickQuery q, String separator) {
StringBuilder sb = new StringBuilder();
List<JTweet> tmpTweets = searchTweets(q);
for (JTweet tweet : tmpTweets) {
sb.append(Helper.toTwitterHref(tweet.getFromUser().getScreenName(), tweet.getTwitterId()));
sb.append(separator);
sb.append(tweet.getRetweetCount());
sb.append(separator);
sb.append(tweet.getText().replaceAll("\n", " "));
sb.append("\n");
}
return sb.toString();
}
public Collection<JTweet> findDuplicates(Map<Long, JTweet> tweets) {
final Set<JTweet> updatedTweets = new LinkedHashSet<JTweet>();
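// NOTE: duplicates are recorded directly on the passed-in tweets via addDuplicate();
// this result set itself is never filled here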
TermCreateCommand termCommand = new TermCreateCommand();
double JACC_BORDER = 0.7;
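// Jaccard index = |A intersect B| / |A union B| over the term sets;
// e.g. {a,b,c} vs {a,b,d} yields 2/4 = 0.5, which is below the 0.7 border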
for (JTweet currentTweet : tweets.values()) {
if (currentTweet.isRetweet())
continue;
JetwickQuery reqBuilder = new SimilarTweetQuery(currentTweet, false).addLatestDateFilter(24);
if (currentTweet.getTextTerms().size() < 3)
continue;
int dups = 0;
try {
// find dups in index
for (JTweet simTweet : collectObjects(query(reqBuilder))) {
if (simTweet.getTwitterId().equals(currentTweet.getTwitterId()))
continue;
termCommand.calcTermsWithoutNoise(simTweet);
if (TermCreateCommand.calcJaccardIndex(currentTweet.getTextTerms(), simTweet.getTextTerms())
>= JACC_BORDER) {
currentTweet.addDuplicate(simTweet.getTwitterId());
dups++;
}
}
} catch (Exception ex) {
logger.error("Error while findDuplicate query execution", ex);
}
// find dups in tweets map
for (JTweet simTweet : tweets.values()) {
if (simTweet.getTwitterId().equals(currentTweet.getTwitterId()) || simTweet.isRetweet())
continue;
if (currentTweet.getCreatedAt().getTime() < simTweet.getCreatedAt().getTime())
continue;
termCommand.calcTermsWithoutNoise(simTweet);
if (TermCreateCommand.calcJaccardIndex(currentTweet.getTextTerms(), simTweet.getTextTerms())
>= JACC_BORDER) {
currentTweet.addDuplicate(simTweet.getTwitterId());
dups++;
}
}
// tw.setDuplicates(dups);
}
return updatedTweets;
}
public SearchResponse updateSavedSearches(final Collection<SavedSearch> savedSearches) {
JetwickQuery q = new TweetQuery() {
@Override
protected void processFacetQueries(SearchRequestBuilder srb) {
for (SavedSearch ss : savedSearches) {
srb.addFacet(FacetBuilders.queryFacet(SAVED_SEARCHES + "_" + ss.getId(),
createQSQB(ss.calcFacetQuery())));
}
}
}.setFrom(0).setSize(0);
return query(q);
}
QueryStringQueryBuilder createQSQB(String qStr) {
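// dismax over tweet text, title and user; boost 0 lets the user field match
// without (noticeably) influencing the score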
return QueryBuilders.queryString(qStr).
useDisMax(true).defaultOperator(QueryStringQueryBuilder.Operator.AND).
field(ElasticTweetSearch.TWEET_TEXT).field(TITLE).field(USER, 0);
}
/**
* @return a collection of filter keys which could be removed to increase
* the number of results. Of course, this can only be a heuristic,
* sorted against the result count of each filter query.
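*
* A usage sketch (assuming "tweetSearch" is a configured instance and "currentQuery"
* is the query whose filters should be relaxed):
* <pre>{@code
* Collection<String> removableFilterKeys = tweetSearch.suggestRemoval(currentQuery);
* }</pre>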
*/
public Collection<String> suggestRemoval(final JetwickQuery q) {
SearchResponse rsp = query(new TweetQuery() {
@Override
protected void processFacetQueries(SearchRequestBuilder srb) {
int counter = 0;
String initFacetQ = SavedSearch.buildInitialFacetQuery(q.getQuery());
for (Entry<String, Object> e : q.getFilterQueries()) {
String facetQuery = initFacetQ + " AND " + e.getKey() + ":" + e.getValue().toString();
srb.addFacet(FacetBuilders.queryFacet("ss_" + counter, createQSQB(facetQuery)));
counter++;
}
}
});
List<Entry<String, Long>> list = new ArrayList<Entry<String, Long>>();
int counter = 0;
boolean forceDateSuggestion = false;
for (Entry<String, Object> e : q.getFilterQueries()) {
QueryFacet qf = (QueryFacet) rsp.facets().facet("ss_" + counter);
list.add(new MapEntry<String, Long>(e.getKey(), qf.count()));
counter++;
if (DATE.equals(e.getKey())) {
try {
String str = (String) e.getValue();
int index = str.indexOf(" ");
// get from date
if (index > 0)
str = str.substring(1, index);
if ((new Date().getTime() - Helper.toDate(str).getTime()) / MyDate.ONE_DAY <= 1)
forceDateSuggestion = true;
} catch (Exception ex) {
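// ignore filter values we cannot parse as a date range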
}
}
}
Helper.sortInplaceLongReverse(list);
Collection<String> res = new LinkedHashSet<String>();
for (Entry<String, Long> e : list) {
if (e.getValue() > 0)
res.add(e.getKey());
}
if (forceDateSuggestion)
res.add(DATE);
return res;
}
public List<JTweet> searchGeo(double lat, double lon, double length) {
GeoDistanceFilterBuilder geoFilter = FilterBuilders.geoDistanceFilter("geo").
lat(lat).lon(lon).distance(length, DistanceUnit.KILOMETERS).geoDistance(GeoDistance.PLANE);
SearchRequestBuilder srb = createSearchBuilder();
srb.setQuery(QueryBuilders.filteredQuery(QueryBuilders.matchAllQuery(), geoFilter));
return collectObjects(srb.execute().actionGet());
}
public Set<String> getQuerySuggestions(JetwickQuery query, SearchResponse rsp, long hits) {
TermsFacet tags = (TermsFacet) rsp.facets().facet(ElasticTweetSearch.TAG);
if (tags == null)
return Collections.emptySet();
Set<String> tmp = new LinkedHashSet<String>();
for (TermsFacet.Entry e : tags.entries()) {
// logger.info(e.term() + " " + e.count() + " " + hits);
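// heuristic: keep only tags that occur disproportionately often relative to the total hits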
if (e.count() > hits / 10000.0 + 1) {
boolean contains = false;
for (String tmpTerm : tmp) {
if (e.term().contains(tmpTerm) || tmpTerm.contains(e.term()))
contains = true;
}
if (!contains)
tmp.add(e.term());
}
}
Set<String> qSuggestions = new LinkedHashSet<String>();
int counter = 0;
for (String t : tmp) {
if (query.getQuery().contains(t) || t.contains(query.getQuery()))
continue;
qSuggestions.add(query.getQuery() + " " + t);
qSuggestions.add(query.getQuery() + " -" + t);
if (++counter > 2)
break;
}
if (qSuggestions.size() > 0)
qSuggestions.add(query.getQuery());
return qSuggestions;
}
public GetResponse findByTwitterIdRaw(Long twitterId) {
return client.prepareGet(getIndexName(), getIndexType(), Long.toString(twitterId)).
execute().actionGet();
}
SearchRequestBuilder createSearchBuilder(String indexName) {
return client.prepareSearch(indexName).setTypes(getIndexType()).setVersion(true);
}
private Map<String, JTweet> tweets = new LinkedHashMap<String, JTweet>(100);
private StopWatch sw = new StopWatch();
private int tweetCounter = 0;
private AtomicInteger feededTweets = new AtomicInteger(0);
private Collection<JTweet> protectedTweets = new LinkedHashSet<JTweet>();
private int feedCounter = 0;
public int getFeededTweets() {
return feededTweets.get();
}
@Override
public void innerAdd(JTweet tw) {
// collect protected tweets separately; normal tweets are deduplicated by id
if (!tw.isProtected()) {
JTweet existingTweet = tweets.put(tw.getId(), tw);
if (existingTweet != null) {
existingTweet.updateFrom(tw);
tweets.put(existingTweet.getId(), existingTweet);
}
} else
protectedTweets.add(tw);
}
@Override
public void innerThreadMethod() throws InterruptedException {
sw.start();
boolean delete = testing || feedCounter++ % 400 == 0;
// tweets can be updated from another thread (failed tweets)
Collection<JTweet> res = update(new ArrayList<JTweet>(tweets.values()), createRemoveOlderThan().toDate(), delete);
tweetCounter += res.size();
feededTweets.set(res.size());
sw.stop();
if (tweetCounter > getBatchSize()) {
logger.info("Updated " + tweetCounter + " tweets "
+ tweetCounter / sw.getSeconds() + " per sec. Remaining:"
+ getTodoObjects().size());
logger.info("sw1:" + sw1.getSeconds() + "\t sw2:" + sw2.getSeconds()
+ "\t sw3:" + sw3.getSeconds() + "\t sw4:" + sw4.getSeconds());
tweetCounter = 0;
sw = new StopWatch();
}
res.addAll(protectedTweets);
for (AnyExecutor<JTweet> exec : commitListener) {
for (JTweet tw : res) {
exec.execute(tw);
}
}
protectedTweets.clear();
tweets.clear();
}
/**
* Warning: this is not real-time! Recently indexed tweets may not be visible yet.
*/
public List<JTweet> findByUrl(String url) {
SearchRequestBuilder srb = createSearchBuilder();
srb.setSearchType(SearchType.QUERY_AND_FETCH);
srb.setQuery(QueryBuilders.filteredQuery(QueryBuilders.matchAllQuery(),
FilterBuilders.orFilter(
FilterBuilders.termFilter("dest_url_1_s", url),
FilterBuilders.termFilter("orig_url_1_s", url))));
return collectObjects(srb.execute().actionGet());
}
public boolean tooOld(Date dt) {
return dt.getTime() < System.currentTimeMillis()
- ElasticTweetSearch.OLDEST_DT_IN_MILLIS;
}
@Override
public void deleteAll(String indexName, String indexType) {
protectedTweets.clear();
tweets.clear();
super.deleteAll(indexName, indexType);
}
}