return sb.toString();
}
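/**
 * Detects near-duplicate tweets by comparing the Jaccard index of their
 * noise-free term sets, both against tweets already stored in the index
 * and against the other tweets in the given map.
 *
 * @return the tweets that were updated with duplicate information
 */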
public Collection<JTweet> findDuplicates(Map<Long, JTweet> tweets) {
final Set<JTweet> updatedTweets = new LinkedHashSet<JTweet>();
TermCreateCommand termCommand = new TermCreateCommand();
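// minimum Jaccard index for two tweets to be treated as duplicates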
final double JACC_BORDER = 0.7;
for (JTweet currentTweet : tweets.values()) {
if (currentTweet.isRetweet())
continue;
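// build a similar-tweet query against the index, restricted to the latest 24 hours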
JetwickQuery reqBuilder = new SimilarTweetQuery(currentTweet, false).addLatestDateFilter(24);
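// skip tweets with too few significant terms for a meaningful comparison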
if (currentTweet.getTextTerms().size() < 3)
continue;
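// number of duplicates found for the current tweet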
int dups = 0;
try {
// find duplicates already stored in the index
for (JTweet simTweet : collectObjects(query(reqBuilder))) {
if (simTweet.getTwitterId().equals(currentTweet.getTwitterId()))
continue;
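// make sure the candidate's noise-free terms are calculated before comparing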
termCommand.calcTermsWithoutNoise(simTweet);
if (TermCreateCommand.calcJaccardIndex(currentTweet.getTextTerms(), simTweet.getTextTerms())
>= JACC_BORDER) {
currentTweet.addDuplicate(simTweet.getTwitterId());
dups++;
}
}
} catch (Exception ex) {
logger.error("Error while findDuplicate query execution", ex);
}
// find duplicates within the given tweets map
for (JTweet simTweet : tweets.values()) {
if (simTweet.getTwitterId().equals(currentTweet.getTwitterId()) || simTweet.isRetweet())
continue;
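// only mark older tweets as duplicates of the newer one, so a pair is not recorded twice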
if (currentTweet.getCreatedAt().getTime() < simTweet.getCreatedAt().getTime())
continue;
termCommand.calcTermsWithoutNoise(simTweet);
if (TermCreateCommand.calcJaccardIndex(currentTweet.getTextTerms(), simTweet.getTextTerms())
>= JACC_BORDER) {
currentTweet.addDuplicate(simTweet.getTwitterId());
dups++;
}