Package cc.twittertools.corpus.data

Source Code of cc.twittertools.corpus.data.HTMLStatusExtractor

package cc.twittertools.corpus.data;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLDecoder;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Locale;
import java.util.Map;
import java.util.TimeZone;

import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.gson.JsonObject;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class HTMLStatusExtractor {

    public SimpleDateFormat date_fmt = new SimpleDateFormat("EEE MMM d kk:mm:ss Z yyyy");

    public HTMLStatusExtractor() {
  date_fmt.setTimeZone(TimeZone.getTimeZone("UTC"));
    }

    public static Map<String, String> splitQuery(URL url)
  throws java.io.UnsupportedEncodingException {
  Map<String, String> query_pairs = new LinkedHashMap<String, String>();
  String query = url.getQuery();
  String[] pairs = query.split("&");
  for (String pair : pairs) {
      int idx = pair.indexOf("=");
      query_pairs.put(URLDecoder.decode(pair.substring(0, idx), "UTF-8"),
          URLDecoder.decode(pair.substring(idx + 1), "UTF-8"));
  }
  return query_pairs;
    }

    public JsonObject extractTweet(String html)
  throws java.net.MalformedURLException, java.io.UnsupportedEncodingException {
  JsonObject status = new JsonObject();

  Document doc = Jsoup.parse(html);
  Element tweet_div = doc.select("div.permalink-tweet").first();

  String tweet_text = tweet_div.select("p.tweet-text").first().text();
  status.addProperty("text", tweet_text);

  String tweet_id = tweet_div.attr("data-tweet-id");
  status.addProperty("id_str", tweet_id);
  status.addProperty("id", Long.parseLong(tweet_id));

  String timestamp = doc.select("span.js-short-timestamp").first().attr("data-time");
  Date created_at = new Date();
  created_at.setTime(Long.parseLong(timestamp) * 1000);
  status.addProperty("created_at", date_fmt.format(created_at));

  Elements js_stats_retweets = doc.select("li.js-stat-retweets");
  if (!js_stats_retweets.isEmpty()) {
      status.addProperty("retweeted", true);
      String count = js_stats_retweets.select("strong").first().text();
      status.addProperty("retweet_count", Long.parseLong(count));
  } else {
      status.addProperty("retweeted", false);
      status.addProperty("retweet_count", 0);
  }
  Elements js_stats_favs = doc.select("li.js-stat-favorites");
  status.addProperty("favorited", !js_stats_favs.isEmpty());
     

  // User subfield
  JsonObject user = new JsonObject();
  String user_id = tweet_div.attr("data-user-id");
  user.addProperty("id_str", user_id);
  user.addProperty("id", Long.parseLong(user_id));
  String screen_name = tweet_div.attr("data-screen-name");
  user.addProperty("screen_name", screen_name);
  String user_name = tweet_div.attr("data-name");
  user.addProperty("name", user_name);
 
  status.add("user", user);
 
  // Geo information
  Elements tweet_loc = doc.select("a.tweet-geo-text");
  if (!tweet_loc.isEmpty()) {
      JsonObject location = new JsonObject();
      Element loc = tweet_loc.first();
      // Adding http to avoid malformed URL exception
      URL url = new URL("http:" + loc.attr("href"));
      Map<String, String> query_params = HTMLStatusExtractor.splitQuery(url);
      // Loop over possible query parameters
      // http://asnsblues.blogspot.ch/2011/11/google-maps-query-string-parameters.html
      String lat_and_long = null;
      if ((lat_and_long = query_params.get("ll")) != null
    || (lat_and_long = query_params.get("sll")) != null
    || (lat_and_long = query_params.get("cbll")) != null
    || (lat_and_long = query_params.get("q")) != null) {
    String[] coordinates = lat_and_long.split(",");
    double latitude = Double.parseDouble(coordinates[0]);
    double longitude = Double.parseDouble(coordinates[1]);
    location.addProperty("latitude", latitude);
    location.addProperty("longitude", longitude);
      }
      location.addProperty("location_text", loc.text());
      status.add("location", location);
  }

  return status;
    }

    private static final String HTML_OPTION = "html";
   
    @SuppressWarnings("static-access")
    public static void main(String[] args) throws Exception {
  Options options = new Options();
  options.addOption(OptionBuilder.withArgName("path").hasArg()
        .withDescription("HTML file from twitter.com").create(HTML_OPTION));
 
  CommandLine cmdline = null;
  CommandLineParser parser = new GnuParser();
  try {
      cmdline = parser.parse(options, args);
  } catch (ParseException exp) {
      System.err.println("Error parsing command line: " + exp.getMessage());
      System.exit(-1);
  }
 
  if (!cmdline.hasOption(HTML_OPTION)) {
      HelpFormatter formatter = new HelpFormatter();
      formatter.printHelp(HTMLStatusExtractor.class.getName(), options);
      System.exit(-1);
  }
 
  String html_filename = cmdline.getOptionValue(HTML_OPTION);
  BufferedReader html_file = null;
  StringBuffer buf = new StringBuffer();
  try {
      html_file = new BufferedReader(new InputStreamReader(new FileInputStream(html_filename)));
      String line;
      while ((line = html_file.readLine()) != null) {
    buf.append(line);
    buf.append('\n');
      }
  } catch (IOException e) {
      e.printStackTrace();
  } finally {
      html_file.close();
  }

  HTMLStatusExtractor hse = new HTMLStatusExtractor();
  JsonObject json = hse.extractTweet(buf.toString());
  Gson gson = new GsonBuilder().setPrettyPrinting().create();
  System.out.println(gson.toJson(json));
    }
}
TOP

Related Classes of cc.twittertools.corpus.data.HTMLStatusExtractor

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.