Package de.alpengeist

Source Code of de.alpengeist.DataCrawler

package de.alpengeist;

import org.apache.commons.lang.StringUtils;
import org.neo4j.graphdb.Direction;
import org.neo4j.graphdb.GraphDatabaseService;
import org.neo4j.graphdb.Node;
import org.neo4j.graphdb.Relationship;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import twitter4j.ResponseList;
import twitter4j.TwitterException;
import twitter4j.User;
import twitter4j.internal.org.json.JSONException;
import twitter4j.internal.org.json.JSONObject;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;

import static de.alpengeist.Prop.*;

public class DataCrawler extends TwitterCrawler {
    private Logger log = LoggerFactory.getLogger(DataCrawler.class);

    private YahooPlaces places = new YahooPlaces();
    private int placesCountdown;

    public DataCrawler() throws IOException {
        super();

    }

    public void setPlacesCountdown(int max) {
        this.placesCountdown = max;
    }

    public void updateData(final long twitterId) {
        neo.transaction(new NeoFeeder.Trx() {
            @Override
            public void execute(GraphDatabaseService db) throws Exception {
                Node root = findUser(twitterId);
                // Extrawurst for start node
                updateRoot(root);
                try {
                    updateFriends(root, 1);
                } catch (TwitterException e) {
                    // save the work, just report and finish
                    log.warn("Twitter error", e);
                } finally {
                    dataCache.writeCache();
                }
            }
        });
    }

    private void updateRoot(Node node) throws TwitterException {
        if (dataCache.hasNode(node)) {
            dataCache.fillNode(node);
            fillInGeodata(node);
            dataCache.putNode(node);
        } else {
            fillInDataSingleNode(node);
        }
    }

    private void fillInDataSingleNode(Node node) throws TwitterException {
        Map<Long, Node> single = new HashMap<Long, Node>();
        single.put(TWITTER_ID.getL(node), node);
        fillInData(single);
    }

    // processes all friends of node and fills in their data if they have no name yet
    // we could update all friends, but that would eat our Twitter request every time and successive
    // node data enrichment would not be possible
    private void updateFriends(Node node, int depth) throws TwitterException {
        if (depth < maxdepth) {
            Map<Long, Node> friendsWithoutData = new HashMap<Long, Node>();
            for (Relationship rel : node.getRelationships(TwitterRelationship.FRIEND, Direction.OUTGOING)) {
                Node friend = rel.getEndNode();
                if (StringUtils.isEmpty(NAME.getS(friend))) {
                    if (dataCache.hasNode(friend)) {
                        dataCache.fillNode(friend);
                        fillInGeodata(friend);
                        dataCache.putNode(friend);
                        neo.addToNameIndex(friend);
                    } else {
                        friendsWithoutData.put(TWITTER_ID.getL(friend), friend);
                    }
                }
            }
            try {
                if (!(friendsWithoutData.isEmpty() || isRateLimitExceeded))
                    fillInData(friendsWithoutData);
            } catch (TwitterException e) {
                handleTwitterException(e);
            }
            // recurse into friends
            for (Relationship rel : node.getRelationships(TwitterRelationship.FRIEND, Direction.OUTGOING)) {
                updateFriends(rel.getEndNode(), depth + 1);
            }
        }
    }

    private void handleTwitterException(TwitterException e) {
        switch (e.getStatusCode()) {
            case 400:
                // most likely request rate limit; continue since cache can still feed following nodes
                isRateLimitExceeded = true;
                log.warn("twitter rate limit exceeded - skipping further requests");
                break;
            case 404:
                log.warn("some user does not exist anymore");
                break;
            default:
                // swallow it, later inquiries may be satisfied by the cache
        }
    }

    private void fillInData(Map<Long, Node> nodes) throws TwitterException {
        ResponseList response = twitter().lookupUsers(twitterIds(nodes.keySet()));
        for (Object aResponse : response) {
            User twUser = (User) aResponse;
            log.info("filling data {}:{}", twUser.getId(), twUser.getScreenName());
            Node node = nodes.get(twUser.getId());
            NAME.set(node, twUser.getScreenName());
            LABEL.set(node, twUser.getScreenName());
            FULL_NAME.set(node, twUser.getName());
            TIMEZONE.set(node, twUser.getTimeZone());
            FRIEND_COUNT.set(node, (long) twUser.getFriendsCount());
            FOLLOWER_COUNT.set(node, (long) twUser.getFollowersCount());
            LOCATION.set(node, twUser.getLocation());

            fillInGeodata(node);
            neo.addToNameIndex(node);
            dataCache.putNode(node);

        }
    }

    private long[] twitterIds(Set<Long> set) {
        long[] result = new long[set.size()];
        int i = 0;
        for (Long n : set) {
            result[i] = n;
            i++;
        }
        return result;
    }

    private void fillInGeodata(Node node) throws TwitterException {
        if (placesCountdown > 0
                && StringUtils.isNotEmpty(LOCATION.getS(node))
                && !GEOQUALITY.exists(node)
                ) {
            try {
                JSONObject result = places.getPlace(LOCATION.getS(node));
                if (places.isOk(result)) {
                    if (places.getFound(result) > 0) {
                        LATITUDE.set(node, Double.parseDouble(places.getLatitude(result)));
                        LONGITUDE.set(node, Double.parseDouble(places.getLongitude(result)));
                        GEOQUALITY.set(node, places.getGeoQuality(result));
                        COUNTRY.set(node, places.getCountry(result));
                    }
                }
            } catch (JSONException e) {
                throw new TwitterException("JSON exception: " + e.getMessage());
            } finally {
                --placesCountdown;
            }
        }
    }

    public void logStatistics() {
        super.logStatistics();
        places.logStatistics();
    }
}
TOP

Related Classes of de.alpengeist.DataCrawler

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.