Package uk.ac.cam.ha293.tweetlabel.classify

Source Code of uk.ac.cam.ha293.tweetlabel.classify.CalaisClassifier

package uk.ac.cam.ha293.tweetlabel.classify;

import java.io.IOException;
import java.io.StringReader;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;

import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

import com.clearforest.calais.common.CalaisJavaIf;
import com.clearforest.calais.common.StringUtils;

public class CalaisClassifier {

  private static String[] keys;
  private static int keyIndex;
  private static int requestCount;
  private static CalaisJavaIf calais;

  public CalaisClassifier() {
    init(); //NOTE: needs to be called from elsewhere too
  }
 
  public static void init() {
    keys = new String[13];
    keys[0] = "3xw5bayrkykgeseyydwxw4re";
    keys[1] = "amn8753bkhfhxf3mth3yv6ev";
    keys[2] = "sg2dn8yfy4uavasmnzehaffg";
    keys[3] = "v7kg9y8p388q3wjd4arsgtw2";
    keys[4] = "bkuf9tnx9xsb6hgjf4f2x845";
    keys[5] = "tnvpbfb94mxgvg9ycaygkqak";
    keys[6] = "6umcpjh6ufp6v3th6zw6yfgs";
    keys[7] = "zcuu786ap89euazm88b77kzm";
    keys[8] = "54dr8xsuy6z72m5p3x6dt5nq";
    keys[9] = "hk3nwmjzf5pg8qt79sac9yth";
    keys[10] = "jt5pyvr9f428f3qr4rsyh6dg";
    keys[11] = "sg87qnkr8c5jjw233ezejqwh";
    keys[12] = "fkrrhm7w8asp3dchddzvzzs3";
    keyIndex = 0;
    requestCount = 8600; //roughly where it left off
    calais = new CalaisJavaIf(keys[keyIndex]);
  }
     
  //Discards a lot of tweets - maybe better to amalgamate some?
  public static CalaisClassification classifyText(String text) {
   
    if(text.length() < 100) {
      //System.err.println("Tweet is <100 chars - too short for Calais");
      return null;
    }
   
    updateRequestCount();
   
    //Hacky key rotation scheme to hopefully bypass the time and rate limits...
    keyIndex++;
    if(keyIndex >= keys.length) keyIndex = 0;
    calais = new CalaisJavaIf(keys[keyIndex]);
       
    CalaisClassification result = new CalaisClassification();   
    String xmlResponse = StringUtils.unescapeHTML(calais.callEnlighten(text));
         
    DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
    DocumentBuilder documentBuilder = null;
    org.w3c.dom.Document parsedXML = null;
    try {
      documentBuilder = dbf.newDocumentBuilder();
     
      StringReader reader = new StringReader(xmlResponse);
      InputSource inputSource = new InputSource(reader);
      parsedXML = documentBuilder.parse(inputSource);
         
      //System.out.println(text);
     
      for(int i=0; i<3; i++) {
        String pairedString = null;;
        try {
          pairedString = parsedXML.getChildNodes().item(0).getChildNodes().item(2).getChildNodes().item(3+i).getTextContent();
        } catch(NullPointerException e) {
          //System.err.println("NullPointerException found, scores cannot be present in the XML");
          break;
        }
        if(!pairedString.startsWith("Calais")) {
          break;
        }
        pairedString = pairedString.substring(6);
        int scoreIndex = pairedString.indexOf("0.");
        if(pairedString.equals("Other")) {
          result.add("Other", 1.0);
          break;
        }
       
        //"Foolproof" check to make sure we do actually have a category String...
        if(scoreIndex == -1) {
          break;
        }
             
        String category = pairedString.substring(0, scoreIndex);
        String score = pairedString.substring(scoreIndex);
        try {
          result.add(category, Double.parseDouble(score));
        } catch(NumberFormatException e) {
          System.err.println("Got an NFE - "+category+", "+score);
        }
      }   
     
      return result;
     
    } catch (IOException e) {
      System.err.println("Couldn't parse the XML response - IOException");
    } catch (ParserConfigurationException e) {
      System.err.println("Couldn't parse the XML response - Parser Config Error");
    } catch (SAXException e) {
      System.err.println("Couldn't parse the XML response - SAX Error. Retrying...");
     
      //experimental retry
      return classifyText(text);
    }
    return null;
  }
 
  private static void updateRequestCount() {
    requestCount++;
    if(requestCount % 50 == 0) {
      System.out.println("Calais Requests Made: "+requestCount);
     
      if(requestCount % 49900 == 0) {
        updateKey();
      }
    }
  }
 
  private static boolean updateKey() {
    if(keyIndex < keys.length - 1) {
      System.out.println("Trying another Calais key...");
      keyIndex++;
      calais = new CalaisJavaIf(keys[keyIndex]);
      System.out.println("Using key "+keyIndex);
      return true;
    } else {
      System.out.println("Out of Calais keys to try");
      return false;
    }
  }
 
}
TOP

Related Classes of uk.ac.cam.ha293.tweetlabel.classify.CalaisClassifier

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.