Package de.jetwick.util

Source Code of de.jetwick.util.GenericUrlResolver

/*
*  Copyright 2010 Peter Karich jetwick_@_pannous_._info
*
*  Licensed under the Apache License, Version 2.0 (the "License");
*  you may not use this file except in compliance with the License.
*  You may obtain a copy of the License at
*
*       http://www.apache.org/licenses/LICENSE-2.0
*
*  Unless required by applicable law or agreed to in writing, software
*  distributed under the License is distributed on an "AS IS" BASIS,
*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*  See the License for the specific language governing permissions and
*  limitations under the License.
*/
package de.jetwick.util;

import com.google.inject.Inject;
import de.jetwick.tw.*;
import de.jetwick.data.JTweet;
import de.jetwick.data.UrlEntry;
import de.jetwick.es.ElasticTweetSearch;
import de.jetwick.snacktory.HtmlFetcher;
import de.jetwick.snacktory.JResult;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Map;
import java.util.concurrent.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import org.elasticsearch.common.cache.CacheBuilder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
* This class takes the URLs from the article index and resolves them. Additionally,
* and more importantly, it stores the extracted text and title in the article index.
*
* @author Peter Karich, jetwick_@_pannous_._info
*/
public class GenericUrlResolver extends MyThread implements AnyExecutor<JTweet> {

    private final Logger logger = LoggerFactory.getLogger(getClass());
    private int resolveThreads = 5;
    private int resolveTimeout = 500;
    private ExecutorService service;
    private long testWait = -1;
    protected BlockingQueue<JTweet> resolverQueue;
    @Inject
    private ElasticTweetSearch tweetSearch;
    private UrlTitleCleaner urlTitleCleaner = new UrlTitleCleaner();
    @Inject
    private HtmlFetcher fetcher;
    private final Map<String, JTweet> unresolvedCache;
    private final Map<String, Object> tooOldMap;
    private static final Object OBJECT = new Object();
    private AtomicInteger counter = new AtomicInteger(0);
    private AtomicInteger emptyTitleCounter = new AtomicInteger(0);
    private AtomicLong start = new AtomicLong(System.nanoTime());

//    public GenericUrlResolver() {
//        this(600);
//    }
    public GenericUrlResolver(int queueSize) {
        super("generic-url-resolver");
        unresolvedCache = createGenericCache(5000, 24 * 60);
        tooOldMap = createGenericCache(500, 24 * 60);
        resolverQueue = new LinkedBlockingQueue<JTweet>(queueSize);
    }

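    /**
     * Creates a bounded, thread-safe map backed by a CacheBuilder cache: it holds
     * at most 'count' entries and evicts entries that have not been accessed for
     * 'minutes' minutes.
     */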
    public static <K, V> Map<K, V> createGenericCache(int count, int minutes) {
        // do NOT use .softKeys(), otherwise keys are compared with == which
        // is bad for keys created via 'new Long'
        return (ConcurrentMap<K, V>) CacheBuilder.newBuilder().concurrencyLevel(20).maximumSize(count).
                expireAfterAccess(minutes, TimeUnit.MINUTES).build().asMap();
    }

    public GenericUrlResolver setHtmlFetcher(HtmlFetcher fetcher) {
        this.fetcher = fetcher;
        return this;
    }

    public GenericUrlResolver setTest(long testWait) {
        this.testWait = testWait;
        return this;
    }

    public void setResolveTimeout(int resolveTimeout) {
        this.resolveTimeout = resolveTimeout;
    }

    public int getResolveTimeout() {
        return resolveTimeout;
    }

    public GenericUrlResolver setResolveThreads(int resolveThreads) {
        this.resolveThreads = resolveThreads;
        return this;
    }

    public ExecutorService getService() {
        if (service == null)
            service = Executors.newFixedThreadPool(resolveThreads);

        return service;
    }

    public BlockingQueue<JTweet> getInputQueue() {
        return resolverQueue;
    }

    JTweet findUrlInCache(String url) {
        return unresolvedCache.get(url);
    }

    int getUnresolvedSize() {
        return unresolvedCache.size();
    }

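    /**
     * Starts 'resolveThreads' worker tasks that repeatedly call executeResolve()
     * until it returns false; in test mode (testWait > 0) the workers are
     * cancelled after 'testWait' milliseconds.
     */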
    @Override
    public void run() {
        Collection<Callable<Object>> workerCollection = new ArrayList<Callable<Object>>(resolveThreads);
        for (int i = 0; i < resolveThreads; i++) {
            final int tmp = i;
            workerCollection.add(new Callable<Object>() {

                @Override
                public Object call() throws Exception {
                    try {
                        while (true) {
                            if (!executeResolve(tmp))
                                break;
                        }
                        logger.info(getName() + " stopped");
                    } catch (Throwable ex) {
                        logger.error("url resolver " + tmp + "died", ex);
                    }
                    return null;
                }
            });
        }
        try {
            if (testWait > 0)
                getService().invokeAll(workerCollection, testWait, TimeUnit.MILLISECONDS);
            else
                getService().invokeAll(workerCollection);

            logger.warn("FINISHED " + getName() + " testWait:" + testWait);
        } catch (InterruptedException ex) {
            logger.info(getName() + " was interrupted:" + ex.getMessage());
        }
    }

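    /**
     * Entry point for incoming tweets: tweets that are too old are remembered in
     * tooOldMap and skipped, tweets without a URL are fed straight to the tweet
     * index, and all others are handed to putObject() for URL resolution.
     */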
    public void queueObject(JTweet tw) {
        // if tweet is persistent we need to queue it
        boolean directlyQueueIt = false;
        String url = tw.getUrl();
        if (tweetSearch.tooOld(tw.getCreatedAt())) {
            tooOldMap.put(url, OBJECT);
            unresolvedCache.remove(url);
            directlyQueueIt = true;
        } else {
            if (Helper.isEmpty(url))
                tweetSearch.queueObject(tw);
            else if (tooOldMap.containsKey(url)) {
                logger.warn("(2) Skipped too old tweet: " + url);
                directlyQueueIt = true;
            } else {
                putObject(tw);
            }
        }
        if (!directlyQueueIt && tw.isPersistent())
            tweetSearch.queueObject(tw);
    }

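    /**
     * Queues a tweet for URL resolution unless it is already known: tweets that
     * are already indexed (by id or URL) are fed to the tweet index directly,
     * and duplicates detected via unresolvedCache are re-queued to the index
     * instead of being resolved again.
     */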
    void putObject(JTweet tw) {
        if (isTweetInIndex(tw)) {
            // no need to queue again to the article index as we queue if the article already exists on every resolve
            unresolvedCache.remove(tw.getUrl());
            canRemoveOrigUrl(tw);
            tweetSearch.queueObject(tw);
        } else {
            if (canRemoveOrigUrl(tw)) {
                tweetSearch.queueObject(tw);
                return;
            }

            String url = tw.getUrl();
            boolean alreadyExistent = false;
            for (int i = 0; i < 2; i++) {
                JTweet old = unresolvedCache.put(url, tw);
                if (old != null) {
                    if (tw.getTwitterId() == old.getTwitterId())
                        tw.updateFrom(old);

                    tweetSearch.queueObject(tw);
                    alreadyExistent = true;
                    break;
                }

                String tmp = getFirstOrigUrl(tw);
                if (Helper.isEmpty(tmp) || tmp.equals(url))
                    break;
                url = tmp;
                // try again for original url
            }

            if (!alreadyExistent)
                try {
                    resolverQueue.put(tw);
                } catch (InterruptedException ex) {
                    logger.error("Couldn't put article:" + tw.getUrl(), ex);
                }
        }
    }

    private String getFirstOrigUrl(JTweet tw) {
        if (tw.getUrlEntries().size() > 0)
            return tw.getUrlEntries().iterator().next().getOriginalUrl(tw);

        return null;
    }

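    /**
     * Takes one tweet from resolverQueue, resolves its (possibly shortened) URL
     * and, unless the resolved URL is already indexed, fetches the page to fill
     * the first UrlEntry with the resolved URL, title, snippet and domain.
     * Failures reduce the tweet's quality; in every case the tweet is finally
     * queued to the tweet index. Returns false when taking from the queue fails,
     * which shuts the worker down.
     */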
    public boolean executeResolve(final int thread) {
        JTweet tweet = null;
        try {
            tweet = resolverQueue.take();
        } catch (Exception ex) {
            if (thread == 0)
                logger.warn("url resolver " + thread + " died " + ex.getMessage());
            return false;
        }

        String origUrl = tweet.getUrl();
        String url = origUrl;
        try {
            boolean doFetch = true;
            String resUrl = fetcher.getResolvedUrl(url, resolveTimeout);
            if (!Helper.isEmpty(resUrl) && resUrl.length() > url.length()) {
                url = resUrl;
                // check if resolved url already exists
                if (exists(resUrl)) {
                    unresolvedCache.remove(resUrl);
                    doFetch = false;
                }
            }
            if (doFetch) {
                JResult res = fetcher.fetchAndExtract(url, resolveTimeout, false);

                // set resolved url
                if (tweet.getUrlEntries().size() > 0) {
                    UrlEntry ue = tweet.getUrlEntries().iterator().next();
                    ue.setResolvedUrl(res.getUrl());
                    ue.setResolvedTitle(res.getTitle());
                    ue.setResolvedSnippet(res.getText());
                    ue.setResolvedDomain(Helper.extractDomain(url));
                }

                if (urlTitleCleaner.contains(res.getTitle()))
                    tweet.setQuality(20);

                if (res.getTitle().isEmpty())
                    emptyTitleCounter.addAndGet(1);
                counter.addAndGet(1);
                if (thread < 3) {
                    float secs = (System.nanoTime() - start.get()) / 1e+9f;
                    logger.info(thread + "| " + counter.get() / secs + " entries/sec"//, secs:" + secs
                            + ", feeded:" + counter
                            + ", resolverQueue.size:" + resolverQueue.size()
                            + ", unresolved.size:" + unresolvedCache.size()
                            + ", tooOld.size:" + tooOldMap.size()
                            + ", empty titles:" + emptyTitleCounter);
                }
            }

        } catch (Exception ex) {
            //logger.info("Error while resolveAndFetch url:" + art.getUrl() + " Error:" + Helper.getMsg(ex));
            tweet.setQuality(Math.round(tweet.getQuality() * 0.8f));
        } finally {
            // always feed the article even if there was an error           
            tweetSearch.queueObject(tweet);

            // real time get ensures that we have at least the url in the article index (not so for origURL!)
            unresolvedCache.remove(tweet.getUrl());

            // DISABLED for now as
//            if (!checkAgainQueue.offer(art))
//                logger.error("checkAgainQueue full. Skipped:" + art.getUrl());
        }
        return true;
    }

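    /** Returns true if the tweet is already indexed, either by twitter id or by its URL. */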
    boolean isTweetInIndex(JTweet tw) {
        JTweet existing = tweetSearch.findByTwitterId(tw.getTwitterId());
        if (existing != null)
            return true;

        return exists(tw.getUrl());
    }

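    /**
     * Returns true if any of the tweet's original URLs (UrlEntry.getOriginalUrl)
     * is already in the index; such URLs are also removed from unresolvedCache.
     */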
    boolean canRemoveOrigUrl(JTweet tw) {
        boolean remove = false;
        for (UrlEntry as : tw.getUrlEntries()) {
            String oUrl = as.getOriginalUrl(tw);
            // is original url already in index?
            if (oUrl != null && exists(oUrl)) {
                unresolvedCache.remove(oUrl);
                remove = true;
            }
        }
        return remove;
    }

    boolean exists(String url) {
        return !tweetSearch.findByUrl(url).isEmpty();
    }

    @Override
    public JTweet execute(JTweet tweet) {
        queueObject(tweet);
        return tweet;
    }

    public void setTweetSearch(ElasticTweetSearch tweetSearch) {
        this.tweetSearch = tweetSearch;
    }

    public ElasticTweetSearch getTweetSearch() {
        return tweetSearch;
    }
}
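
Below is a minimal usage sketch; it is not part of the original file. It assumes that MyThread behaves like java.lang.Thread and that a configured ElasticTweetSearch, HtmlFetcher and JTweet (here called tweetSearch, fetcher and tweet) exist elsewhere; those names are illustrative only.

    // hypothetical wiring of the resolver, assuming MyThread extends java.lang.Thread
    GenericUrlResolver resolver = new GenericUrlResolver(600);
    resolver.setTweetSearch(tweetSearch);   // normally provided via Guice @Inject
    resolver.setHtmlFetcher(fetcher);
    resolver.setResolveThreads(5);
    resolver.setResolveTimeout(500);
    resolver.start();                       // runs the resolve workers in the background
    resolver.queueObject(tweet);            // same effect as resolver.execute(tweet)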