Package de.jetwick.tw.cmd

Source Code of de.jetwick.tw.cmd.TermCreateCommand

/**
* Copyright (C) 2010 Peter Karich <jetwick_@_pannous_._info>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*         http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.jetwick.tw.cmd;

import de.jetwick.data.UrlEntry;
import de.jetwick.data.JTweet;
import de.jetwick.util.AnyExecutor;
import de.jetwick.tw.TweetDetector;
import de.jetwick.util.StopWatch;
import java.util.Iterator;
import java.util.List;
import java.util.Map.Entry;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
* @author Peter Karich, peat_hal 'at' users 'dot' sourceforge 'dot' net
*/
public class TermCreateCommand implements AnyExecutor<JTweet> {

    private Logger logger = LoggerFactory.getLogger(getClass());
    private boolean termRemoving = true;
    private StopWatch sw1 = new StopWatch();
    private StopWatch sw2 = new StopWatch();
    private StopWatch sw3 = new StopWatch();
    private StopWatch sw4 = new StopWatch();

    public TermCreateCommand() {
        //http://en.wikipedia.org/wiki/Phonetic_algorithm
        //http://en.wikipedia.org/wiki/Approximate_string_matching
        //http://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence
        //BUT we are using a technic from TextProfileSignature:
        // create a list of tokens and their frequency, separated by spaces, in the order of decreasing frequency.
        // This list is then submitted to an MD5 hash calculation.
        // Only suited for short strings: JaroWinklerDistance, LevensteinDistance,new NGramDistance.
        // Use relative termMinFrequency!!
    }

    public TermCreateCommand setSw1(StopWatch sw1) {
        this.sw1 = sw1;
        return this;
    }

    public TermCreateCommand setSw2(StopWatch sw2) {
        this.sw2 = sw2;
        return this;
    }

    public TermCreateCommand setSw3(StopWatch sw3) {
        this.sw3 = sw3;
        return this;
    }

    public TermCreateCommand setSw4(StopWatch sw4) {
        this.sw4 = sw4;
        return this;
    }

    public TermCreateCommand(boolean termRemoving) {
        this.termRemoving = termRemoving;
    }

    @Override
    public JTweet execute(JTweet tw) {
        // HINT: do only modify the current tweets' quality!
        double qual = tw.getQuality();
        calcTermsWithoutNoise(tw);

        int maxTerms = 0;
        for (Entry<String, Integer> entry : tw.getTextTerms().getSortedFreqLimit(0.05f)) {
            if (entry.getValue() > maxTerms)
                maxTerms = entry.getValue();
        }

        // term occurs more than one time on the current tweet?
        if (maxTerms > 4) {
            qual = Math.max(0, 100 - maxTerms * 8);
//            tw.addQualAction("MT,");
        }

        // now calculate quality via comparing to existing tweets
        StringFreqMap otherTerms = new StringFreqMap();
        StringFreqMap otherLangs = new StringFreqMap();
        tw.setQuality((int) qual);
        qual = checkSpamInExistingTweets(tw, otherTerms, otherLangs);
        tw.setQuality((int) qual);

        // prepare indexing and remove terms which do NOT occur in other tweets (i.e. are 'unimportant')
        if (termRemoving) {
            Iterator<String> iter = tw.getTextTerms().keySet().iterator();
            while (iter.hasNext()) {
                String term = iter.next();
                Integer integ = otherTerms.get(term);
                if (integ == null || integ < 1)
                    iter.remove();
            }
        }

        // language detection from tw.getLanguages()
        tw.setLanguage(detectLanguage(tw, otherLangs));
        return tw;
    }

    public String detectLanguage(JTweet tweet, StringFreqMap languages) {
        if (tweet.getLanguages().size() > 0) {
            List<Entry<String, Integer>> list = tweet.getLanguages().getSorted();
            int index = 0;

            Entry<String, Integer> lEntry = list.get(index);
            if (tweet.getLanguages().size() > index + 1 && TweetDetector.UNKNOWN_LANG.equals(lEntry.getKey())) {
                index++;
                lEntry = list.get(index);
            }

            if (tweet.getLanguages().size() > index + 1 && !TweetDetector.UNKNOWN_LANG.equals(lEntry.getKey())) {
                if (lEntry.getValue() - 1 < list.get(index + 1).getValue())
                    // the second language seems also important
                    return TweetDetector.UNKNOWN_LANG;
            }

            if (languages.containsKey(lEntry.getKey()) || lEntry.getValue() > 2)
                return lEntry.getKey();
        }
        return TweetDetector.UNKNOWN_LANG;
    }

    public double checkSpamInExistingTweets(JTweet currentTweet,
            StringFreqMap mergedTerms, StringFreqMap mergedLangs) {
        double qual = currentTweet.getQuality();

        sw1.start();
        StringFreqMap urlMap = new StringFreqMap();
        for (JTweet older : currentTweet.getFromUser().getOwnTweets()) {
            for (UrlEntry entry : older.getUrlEntries()) {
                urlMap.inc(entry.getResolvedUrl(), 1);
            }
        }
        sw1.stop();

        sw2.start();
        boolean sameUrl = false;
        for (JTweet older : currentTweet.getFromUser().getOwnTweets()) {
            if (older == currentTweet)
                continue;

            sw3.start();
            // create tags to decide if tags of currentTweet are important
            calcTermsWithoutNoise(older);
            sw3.stop();
            // count only one term per tweet
            mergedTerms.addOne2All(older.getTextTerms());
            // count languages as they appear
            mergedLangs.addValue2All(older.getLanguages());

            // compare only to older tweets           
            if (currentTweet.getCreatedAt().getTime() <= older.getCreatedAt().getTime())
                continue;

            // we don't need the signature because we have the jaccard index
            // performance improvement of comparison: use int[] instead of byte[]
//            if (Arrays.equals(tw.getTextSignature(), older.getTextSignature()))
//                qual = qual * 0.7;

            double ji = calcJaccardIndex(currentTweet.getTextTerms(), older.getTextTerms());

            if (ji >= 0.8) {
                // nearly equal terms
                qual *= JTweet.QUAL_BAD / 100.0;
//                currentTweet.addQualAction("JB," + older.getTwitterId() + ",");
            } else if (ji >= 0.6 && currentTweet.getQualReductions() < 3) {
                // e.g. if 3 of 4 terms occur in both tweets
                qual *= JTweet.QUAL_LOW / 100.0;
//                currentTweet.addQualAction("JL," + older.getTwitterId() + ",");
            }

            if (!sameUrl) {
                sw4.start();
                for (UrlEntry entry : older.getUrlEntries()) {
                    Integer urlCounts = urlMap.get(entry.getResolvedUrl());
                    if (urlCounts != null) {
                        if ((urlCounts == 2 || urlCounts == 3) && currentTweet.getQualReductions() < 3) {
                            sameUrl = true;
                            qual *= JTweet.QUAL_LOW / 100.0;
//                            currentTweet.addQualAction("UL," + older.getTwitterId() + ",");
                        } else if (urlCounts > 3) {
                            sameUrl = true;
                            // tweeted about the identical url title!
                            qual *= JTweet.QUAL_BAD / 100.0;
//                            currentTweet.addQualAction("UB," + older.getTwitterId() + ",");
                        }
                    }
                }
                sw4.stop();
            }
        }
        sw2.stop();
        return qual;
    }

    public static double calcJaccardIndex(StringFreqMap map1, StringFreqMap map2) {
        int a = map1.andSize(map2);
        double b = map1.orSize(map2);
        return a / b;
    }

    public void calcTermsWithoutNoise(JTweet tw) {
        if (tw.getTextTerms().size() > 0)
            return;

        TweetDetector extractor = new TweetDetector().runOne(tw.getText());
        tw.setTextTerms(extractor.getTerms());
        tw.setLanguages(extractor.getLanguages());

        // create text signature, sort against frequency
//        int termsAtOnce = 2;
//        int packs = 3;
//        int maxTerms = packs * termsAtOnce;
//        Signature sig = null;
//        List<Signature> sigs = new ArrayList<Signature>();
//        List<Entry<String, Integer>> frequentTerms = tw.getTextTerms().getSortedTermLimited(maxTerms);
////        Set<String> sortedTerms = new TreeSet<String>();
////        for (Entry<String, Integer> e : frequentTerms) {
////            sortedTerms.add(e.getKey());
////        }
//
//        // now sort the remaining terms alphabetically to compensate term mixure
//        // TODO: it would be better to sort terms alphabetically only if they have equal count
//        int counter = 0;
//        for (Entry<String, Integer> term : frequentTerms) {
////          for (String term : sortedTerms) {
//            if (counter++ % termsAtOnce == 0) {
//                //sig = new MD5Signature();
//                // we can convert this easily to long because it is only 64 bit
//                sig = new Lookup3Signature();
//                sigs.add(sig);
//            }
//
//            sig.add(term.getKey());
////            sig.add(term);
//        }
//
//        for (Signature tmpSig : sigs) {
//            tw.addTextSignature(Helper.byteArray2long(tmpSig.getSignature()));
//        }
    }
}
TOP

Related Classes of de.jetwick.tw.cmd.TermCreateCommand

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.