// Source Code of maui.main.MauiTopicExtractor

package maui.main;


/*
 *    MauiTopicExtractor.java
 *    Copyright (C) 2001-2009 Eibe Frank, Olena Medelyan
 *
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStreamReader;
import java.io.ObjectInputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.Enumeration;
import java.util.HashSet;
import java.util.Vector;


import org.wikipedia.miner.model.Wikipedia;


import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.Utils;


import maui.filters.MauiFilter;
import maui.stemmers.*;
import maui.stopwords.*;




/**
 * Extracts topics from the documents in a given directory.
 * Assumes that the file names for the documents end with ".txt".
 * Puts extracted topics into corresponding files ending with
 * ".key" (if those are not already present). Optionally an encoding
 * for the documents/keyphrases can be defined (e.g. for Chinese
 * text). Documents for which ".key" exists are used for evaluation.
 *
 * Valid options are:<p>
 *
 * -l "directory name"<br>
 * Specifies name of directory.<p>
 *
 * -m "model name"<br>
 * Specifies name of model.<p>
 *
 * -v "vocabulary name"<br>
 * Specifies name of vocabulary.<p>
 *
 * -f "vocabulary format"<br>
 * Specifies format of vocabulary (text or skos).<p>
 *
 * -i "document language" <br>
 * Specifies document language (en, es, de, fr).<p>
 * 
 * -e "encoding"<br>
 * Specifies encoding.<p>
 *
 * -n <br>
 * Specifies number of phrases to be output (default: 5).<p>
 *
 * -t "name of class implementing stemmer"<br>
 * Sets stemmer to use (default: PorterStemmer). <p>
 *
 * -s "name of class implementing stopwords"<br>
 * Sets stopwords to use (default: StopwordsEnglish). <p>
 * 
 * -d<br>
 * Turns debugging mode on.<p>
 *
 * -b<br>
 * Build global dictionaries from the test set.<p>
 *
 * -a<br>
 * Also write stemmed phrase and score into ".key" file.<p>
 *
 * @author Eibe Frank (eibe@cs.waikato.ac.nz)
 * @version 1.0
 */
public class MauiTopicExtractor implements OptionHandler {
  
  /** Name of directory */
  String inputDirectoryName = null;
  
  /** Name of model */
  String modelName = null;
  
  /** Vocabulary name */
  String vocabularyName = "none";
  
  /** Format of the vocabulary */
  String vocabularyFormat = null;
  
  /** Document language */
  String documentLanguage = "en";
  
  /** Document encoding */
  String documentEncoding = "default";
  
  /** Debugging mode? */
  boolean debugMode = false;
  
  
  /** Maui filter object */
  private MauiFilter mauiFilter = null;
  
  /** Wikipedia object */
  private Wikipedia wikipedia = null;
  
  /** Name of the server with the mysql Wikipedia data */ 
  private String wikipediaServer = "localhost"; 
  
  /** Name of the database with Wikipedia data */
  private String wikipediaDatabase = "enwiki_20090306";
  
  /** Name of the directory with Wikipedia data in files */
  private String wikipediaDataDirectory = null;
  
  /** Should Wikipedia data be cached first? */
  private boolean cacheWikipediaData = false;
  
  /** The number of phrases to extract. */
  int topicsPerDocument = 10;
  
  /** Stemmer to be used */
  private Stemmer stemmer = new PorterStemmer();
  
  /** Llist of stopwords to be used */
  private Stopwords stopwords = new StopwordsEnglish();
  
  
  /** Also write stemmed phrase and score into .key file. */
  boolean additionalInfo = false;
  
  
  /** Build global dictionaries from the test set. */
  boolean buildGlobalDictionaries = false;
  
  public void setWikipedia(Wikipedia wikipedia) {
    this.wikipedia = wikipedia;
  }
  
  public String getWikipediaDatabase() {
    return wikipediaDatabase;
  }


  public void setWikipediaDatabase(String wikipediaDatabase) {
    this.wikipediaDatabase = wikipediaDatabase;
  }
  
  public String getWikipediaServer() {
    return wikipediaServer;
  }


  public void setWikipediaServer(String wikipediaServer) {
    this.wikipediaServer = wikipediaServer;
  }
  
  public String getWikipediaDataDirectory() {
    return wikipediaDataDirectory;
  }


  public void setWikipediaDataDirectory(String wikipediaDataDirectory) {
    this.wikipediaDataDirectory = wikipediaDataDirectory;
  }
  
  public boolean getCachWikipediaData() {
    return cacheWikipediaData;
  }


  public void setCachWikipediaData(boolean cacheWikipediaData) {
    this.cacheWikipediaData = cacheWikipediaData;
  }
  
  public boolean getAdditionalInfo() {
    return additionalInfo;
  }
  
  public void setAdditionalInfo(boolean additionalInfo) {
    this.additionalInfo = additionalInfo;
  }
  
  public boolean getBuildGlobalDictionaries() {    
    return buildGlobalDictionaries;
  }
  
  public void setBuildGlobal(boolean buildGlobalDictionaries) {  
    this.buildGlobalDictionaries = buildGlobalDictionaries;
  }
  


  public int getNumTopics() {
    return topicsPerDocument;
  }
  
  public void setNumTopics(int topicsPerDocument) {
    this.topicsPerDocument = topicsPerDocument;
  }
  
  public Stopwords getStopwords() {
    return stopwords;
  }
  
  public void setStopwords(Stopwords stopwords) {
    this.stopwords = stopwords;
  }
  
  public Stemmer getStemmer() {
    return stemmer;
  }
  
  public void setStemmer(Stemmer stemmer) {
    this.stemmer = stemmer;
  }
  
  public boolean getDebug() {
    return debugMode;
  }


  public void setDebug(boolean debugMode) {    
    this.debugMode = debugMode;
  }
  
  public String getEncoding() {
    return documentEncoding;
  }
  
  public void setEncoding(String documentEncoding) {
    this.documentEncoding = documentEncoding;
  }
  
  public String getVocabularyName() {
    return vocabularyName;
  }
  
  public void setVocabularyName(String vocabularyName) {
    this.vocabularyName = vocabularyName;
  }
  
  public String getDocumentLanguage() {
    return documentLanguage;
  }
  
  public void setDocumentLanguage(String documentLanguage) {
    this.documentLanguage = documentLanguage;
  }
  
  public String getVocabularyFormat() {
    return vocabularyFormat;
  }
  
  public void setVocabularyFormat(String vocabularyFormat) {
    this.vocabularyFormat = vocabularyFormat;
  }
  
  public String getModelName() {
    return modelName;
  }
  
  public void setModelName(String modelName) {
    this.modelName = modelName;
  }
  
  public String getDirName() {
    return inputDirectoryName;
  }
  
  public void setDirName(String inputDirectoryName) {
    this.inputDirectoryName = inputDirectoryName;
  }
  
  /**
   * Parses a given list of options controlling the behaviour of this object.
   * Valid options are:<p>
   *
   * -l "directory name"<br>
   * Specifies name of directory.<p>
   *
   * -m "model name"<br>
   * Specifies name of model.<p>
   *
   * -v "vocabulary name"<br>
   * Specifies vocabulary name.<p>
   * 
   * -f "vocabulary format"<br>
   * Specifies vocabulary format.<p>
   * 
   * -i "document language" <br>
   * Specifies document language.<p>
   * 
   * -e "encoding"<br>
   * Specifies encoding.<p>
   *
   * -n<br>
   * Specifies number of phrases to be output (default: 5).<p>
   *
   * -d<br>
   * Turns debugging mode on.<p>
   *
   * -b<br>
   * Builds global dictionaries for computing TFxIDF from the test collection.<p>
   *
   * -a<br>
   * Also write stemmed phrase and score into ".key" file.<p>
   *
   * @param options the list of options as an array of strings
   * @exception Exception if an option is not supported
   */
  public void setOptions(String[] options) throws Exception {
    
    String dirName = Utils.getOption('l', options);
    if (dirName.length() > 0) {
      setDirName(dirName);
    } else {
      setDirName(null);
      throw new Exception("Name of directory required argument.");
    }
    String modelName = Utils.getOption('m', options);
    if (modelName.length() > 0) {
      setModelName(modelName);
    } else {
      setModelName(null);
      throw new Exception("Name of model required argument.");
    }
    
    String vocabularyName = Utils.getOption('v', options);
    if (vocabularyName.length() > 0) {
      setVocabularyName(vocabularyName);
    } else {
      setVocabularyName(null);
      throw new Exception("Name of vocabulary required argument.");
    }
    
    String vocabularyFormat = Utils.getOption('f', options);
    
    if (!getVocabularyName().equals("none")) {
      if (vocabularyFormat.length() > 0) {
        if (vocabularyFormat.equals("skos") || vocabularyFormat.equals("text")) {
          setVocabularyFormat(vocabularyFormat);
        } else {
          throw new Exception("Unsupported format of vocabulary. It should be either \"skos\" or \"text\".");
        }
      } else {
        setVocabularyFormat(null);
        throw new Exception("If a controlled vocabulary is used, format of vocabulary required argument (skos or text).");
      }
    } else {
      setVocabularyFormat(null);
    }
    
    
    String encoding = Utils.getOption('e', options);
    if (encoding.length() > 0) {
      setEncoding(encoding);
    } else {
      setEncoding("default");
    }
    
    String documentLanguage = Utils.getOption('i', options);
    if (documentLanguage.length() > 0) {
      setDocumentLanguage(documentLanguage);
    } else {
      setDocumentLanguage("en");
    }
    
    
    String numPhrases = Utils.getOption('n', options);
    if (numPhrases.length() > 0) {
      setNumTopics(Integer.parseInt(numPhrases));
    } else {
      setNumTopics(5);
    }
    
    
    String stemmerString = Utils.getOption('t', options);
    if (stemmerString.length() > 0) {
      stemmerString = "maui.stemmers.".concat(stemmerString);
      setStemmer((Stemmer)Class.forName(stemmerString).newInstance());
    }
    
    String stopwordsString = Utils.getOption('s', options);
    if (stopwordsString.length() > 0) {
      stopwordsString = "maui.stopwords.".concat(stopwordsString);
      setStopwords((Stopwords)Class.forName(stopwordsString).newInstance());
    }


    
    setDebug(Utils.getFlag('d', options));
    setBuildGlobal(Utils.getFlag('b', options));
    setAdditionalInfo(Utils.getFlag('a', options));
    Utils.checkForRemainingOptions(options);
  }
  
  /**
   * Gets the current option settings.
   *
   * @return an array of strings suitable for passing to setOptions
   */
  public String [] getOptions() {
    
    String [] options = new String [21];
    int current = 0;
    
    options[current++] = "-l"; 
    options[current++] = "" + (getDirName());
    options[current++] = "-m"; 
    options[current++] = "" + (getModelName());
    options[current++] = "-v"; 
    options[current++] = "" + (getVocabularyName());
    options[current++] = "-f"; 
    options[current++] = "" + (getVocabularyFormat());
    options[current++] = "-e"; 
    options[current++] = "" + (getEncoding());
    options[current++] = "-i"; 
    options[current++] = "" + (getDocumentLanguage());
    options[current++] = "-n"; 
    options[current++] = "" + (getNumTopics());
    options[current++] = "-t"; 
    options[current++] = "" + (getStemmer().getClass().getName());    
    options[current++] = "-s"; 
    options[current++] = "" + (getStopwords().getClass().getName());


    if (getDebug()) {
      options[current++] = "-d";
    }
    
    if (getBuildGlobalDictionaries()) {
      options[current++] = "-b";
    }
    
    if (getAdditionalInfo()) {
      options[current++] = "-a";
    }
    
    while (current < options.length) {
      options[current++] = "";
    }
    return options;
  }
  
  /**
   * Returns an enumeration describing the available options.
   *
   * @return an enumeration of all the available options
   */
  public Enumeration<Option> listOptions() {
    
    Vector<Option> newVector = new Vector<Option>(13);
    
    newVector.addElement(new Option(
        "\tSpecifies name of directory.",
        "l", 1, "-l <directory name>"));
    newVector.addElement(new Option(
        "\tSpecifies name of model.",
        "m", 1, "-m <model name>"));
    newVector.addElement(new Option(
        "\tSpecifies vocabulary name.",
        "v", 1, "-v <vocabulary name>"));
    newVector.addElement(new Option(
        "\tSpecifies vocabulary format.",
        "f", 1, "-f <vocabulary format>"));
    newVector.addElement(new Option(
        "\tSpecifies encoding.",
        "e", 1, "-e <encoding>"));    
    newVector.addElement(new Option(
        "\tSpecifies document language (en (default), es, de, fr).",
        "i", 1, "-i <document language>"));
    newVector.addElement(new Option(
        "\tSpecifies number of phrases to be output (default: 5).",
        "n", 1, "-n"));
    newVector.addElement(new Option(
        "\tSet the stemmer to use (default: SremovalStemmer).",
        "t", 1, "-t <name of stemmer class>"));
    newVector.addElement(new Option(
        "\tSet the stopwords class to use (default: EnglishStopwords).",
        "s", 1, "-s <name of stopwords class>"));
    newVector.addElement(new Option(
        "\tTurns debugging mode on.",
        "d", 0, "-d"));
    newVector.addElement(new Option(
        "\tBuilds global dictionaries for computing TFIDF from the test collection.",
        "b", 0, "-b"));
    newVector.addElement(new Option(
        "\tAlso write stemmed phrase and score into \".key\" file.",
        "a", 0, "-a"));
    
    return newVector.elements();
  }
  
  /**
   * Collects the file names
   */
  public HashSet<String> collectStems() throws Exception {
    
    HashSet<String> stems = new HashSet<String>();
    
    try {
      File dir = new File(inputDirectoryName);
      
      for (String file : dir.list()) {
        if (file.endsWith(".txt")) {
          String stem = file.substring(0, file.length() - 4);
          
          if (!stems.contains(stem)) {
            stems.add(stem);
          }
        }
      }
    } catch (Exception e) {
      throw new Exception("Problem reading directory " + inputDirectoryName);
    }
    return stems;
  }
  
  /**
   * Builds the model from the files
   */
  public void extractKeyphrases(HashSet<String> fileNames) throws Exception {
    
    // Check whether there is actually any data
    if (fileNames.size() == 0) {
      throw new Exception("Couldn't find any data in " + inputDirectoryName);
    }
    
    mauiFilter.setVocabularyName(getVocabularyName());
    mauiFilter.setVocabularyFormat(getVocabularyFormat());
    mauiFilter.setDocumentLanguage(getDocumentLanguage());
    mauiFilter.setStemmer(getStemmer());
    mauiFilter.setStopwords(getStopwords());
    if (wikipedia != null) {
      mauiFilter.setWikipedia(wikipedia);
    } else {
      mauiFilter.setWikipedia(wikipediaServer, wikipediaDatabase, cacheWikipediaData, wikipediaDataDirectory);
    }
    if (!vocabularyName.equals("none") && !vocabularyName.equals("wikipedia")) {
      mauiFilter.loadThesaurus(getStemmer(),getStopwords());
    }
    
    FastVector atts = new FastVector(3);
    atts.addElement(new Attribute("filename", (FastVector) null));
    atts.addElement(new Attribute("doc", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    Instances data = new Instances("keyphrase_training_data", atts, 0);
    
    System.err.println("-- Extracting Keyphrases... ");
    
    Vector<Double> correctStatistics = new Vector<Double>();
    Vector<Double> precisionStatistics = new Vector<Double>();
    Vector<Double> recallStatistics = new Vector<Double>();
    
    for (String fileName : fileNames) {
    
      double[] newInst = new double[3];
      
      newInst[0] = (double)data.attribute(0).addStringValue(fileName); ;
      
      File documentTextFile = new File(inputDirectoryName + "/" + fileName + ".txt");
      File documentTopicsFile = new File(inputDirectoryName + "/" + fileName + ".key");
      
      try {


        InputStreamReader is;
        if (!documentEncoding.equals("default")) {
          is = new InputStreamReader(new FileInputStream(documentTextFile), documentEncoding);
        } else {
          is = new InputStreamReader(new FileInputStream(documentTextFile));
        }
        
        // Reading the file content
        StringBuffer txtStr = new StringBuffer();
        int c;
        while ((c = is.read()) != -1) {
          txtStr.append((char)c);
        }
        is.close();
        
        // Adding the text of the document to the instance
        newInst[1] = (double)data.attribute(1).addStringValue(txtStr.toString());
        
      } catch (Exception e) {
        System.err.println("Problem with reading " + documentTextFile);
        e.printStackTrace();
        newInst[1] = Instance.missingValue();
      }
      
      
      try {
        
        InputStreamReader is; 
        if (!documentEncoding.equals("default")) {
          is = new InputStreamReader(new FileInputStream(documentTopicsFile), documentEncoding);
        } else {
          is = new InputStreamReader(new FileInputStream(documentTopicsFile));
        }
        
        // Reading the content of the keyphrase file
        StringBuffer keyStr = new StringBuffer();
        int c;
        while ((c = is.read()) != -1) {
          keyStr.append((char)c);
        }      
        
        // Adding the topics to the file
        newInst[2] = (double)data.attribute(2).addStringValue(keyStr.toString());
        
      } catch (Exception e) {
        if (debugMode) {
          System.err.println("No existing topics for " + documentTextFile);
        }
        newInst[2] = Instance.missingValue();
      }
      
      
      data.add(new Instance(1.0, newInst));
      
      mauiFilter.input(data.instance(0));
      
      
      data = data.stringFreeStructure();
      if (debugMode) {
        System.err.println("-- Processing document: " + fileName);
      }
      Instance[] topRankedInstances = new Instance[topicsPerDocument];
      Instance inst;
      
      // Iterating over all extracted keyphrases (inst)
      while ((inst = mauiFilter.output()) != null) {
        
        int index = (int)inst.value(mauiFilter.getRankIndex()) - 1;
      
        if (index < topicsPerDocument) {
          topRankedInstances[index] = inst;
        }
      }
      
      if (debugMode) {
        System.err.println("-- Keyphrases and feature values:");
      }
      FileOutputStream out = null;
      PrintWriter printer = null; 


      if (!documentTopicsFile.exists()) {
        out = new FileOutputStream(documentTopicsFile);
        if (!documentEncoding.equals("default")) {
          printer = new PrintWriter(new OutputStreamWriter(out, documentEncoding));          
        } else {
          printer = new PrintWriter(out);
        }
      }
      
      double numExtracted = 0, numCorrect = 0;
      
      for (int i = 0; i < topicsPerDocument; i++) {
        if (topRankedInstances[i] != null) {
          if (!topRankedInstances[i].
              isMissing(topRankedInstances[i].numAttributes() - 1)) {
            numExtracted += 1.0;
          }
          if ((int)topRankedInstances[i].
              value(topRankedInstances[i].numAttributes() - 1) == 1) {
            numCorrect += 1.0;
          }
          if (printer != null) {
            printer.print(topRankedInstances[i].
                stringValue(mauiFilter.getOutputFormIndex()));
            
            if (additionalInfo) {
              printer.print("\t");
              printer.print(topRankedInstances[i].
                  stringValue(mauiFilter.getNormalizedFormIndex()));
              printer.print("\t");
              printer.print(Utils.
                  doubleToString(topRankedInstances[i].
                      value(mauiFilter.
                          getProbabilityIndex()), 4));
            }
            printer.println();
          }
          if (debugMode) {
            System.err.println(topRankedInstances[i]);
          }
        }
      }
      if (numExtracted > 0) {
        if (debugMode) {
          System.err.println("-- " + numCorrect + " correct");
        }
        double totalCorrect = mauiFilter.getTotalCorrect();
        correctStatistics.addElement(new Double(numCorrect));
        precisionStatistics.addElement(new Double(numCorrect/numExtracted));
        double recall = (double)numCorrect/totalCorrect;
        recallStatistics.addElement(new Double(numCorrect/totalCorrect));
        
      }
      if (printer != null) {
        printer.flush();
        printer.close();
        out.close();
      }
    }
    
    if (correctStatistics.size() != 0) {
      
    double[] st = new double[correctStatistics.size()];
    for (int i = 0; i < correctStatistics.size(); i++) {
      st[i] = correctStatistics.elementAt(i).doubleValue();
    }
    double avg = Utils.mean(st);
    double stdDev = Math.sqrt(Utils.variance(st));
    
    
    if (correctStatistics.size() == 1) {
      System.err.println("\n-- Evaluation results based on 1 document:");
        
    } else {
      System.err.println("\n-- Evaluation results based on " + correctStatistics.size() + " documents:");
    }
    System.err.println("Avg. number of correct keyphrases per document: " +
        Utils.doubleToString(avg, 2) + " +/- " + 
        Utils.doubleToString(stdDev, 2));
    
    
    st = new double[precisionStatistics.size()];
    for (int i = 0; i < precisionStatistics.size(); i++) {
      st[i] = precisionStatistics.elementAt(i).doubleValue();
    }
    double avgPrecision = Utils.mean(st);
    double stdDevPrecision = Math.sqrt(Utils.variance(st));
    
    System.err.println("Precision: " +
        Utils.doubleToString(avgPrecision, 2) + " +/- " + 
        Utils.doubleToString(stdDevPrecision, 2));
    
    st = new double[recallStatistics.size()];
    for (int i = 0; i < recallStatistics.size(); i++) {
      st[i] = recallStatistics.elementAt(i).doubleValue();
    }
    double avgRecall = Utils.mean(st);
    double stdDevRecall = Math.sqrt(Utils.variance(st));
    
    System.err.println("Recall: " +
        Utils.doubleToString(avgRecall, 2) + " +/- " + 
        Utils.doubleToString(stdDevRecall, 2));
    
    double fMeasure = 2*avgRecall*avgPrecision/(avgRecall + avgPrecision);
    System.err.println("F-Measure: " + Utils.doubleToString(fMeasure, 2));
    
    System.err.println("");
    }
    mauiFilter.batchFinished();
  }
  
  
  
  /** 
   * Loads the extraction model from the file.
   */
  public void loadModel() throws Exception {
    
    BufferedInputStream inStream =
      new BufferedInputStream(new FileInputStream(modelName));
    ObjectInputStream in = new ObjectInputStream(inStream);
    mauiFilter = (MauiFilter)in.readObject();
    
    // If TFxIDF values are to be computed from the test corpus
    if (buildGlobalDictionaries == true) {
      if (debugMode) {
        System.err.println("-- The global dictionaries will be built from this test collection..");
      }
      mauiFilter.globalDictionary = null;      
    }
    in.close();
  }
  
  /**
   * The main method.  
   */
  public static void main(String[] ops) {
    
    MauiTopicExtractor topicExtractor = new MauiTopicExtractor();
    try {
      // Checking and Setting Options selected by the user:
      topicExtractor.setOptions(ops);      
      System.err.print("Extracting keyphrases with options: ");
      
      // Reading Options, which were set above and output them:
      String[] optionSettings = topicExtractor.getOptions();
      for (int i = 0; i < optionSettings.length; i++) {
        System.err.print(optionSettings[i] + " ");
      }
      System.err.println();
      
      // Loading selected Model:
      System.err.println("-- Loading the Model... ");
      topicExtractor.loadModel();
      
      // Extracting Keyphrases from all files in the selected directory
      topicExtractor.extractKeyphrases(topicExtractor.collectStems());
      
    } catch (Exception e) {
      
      // Output information on how to use this class
      e.printStackTrace();
      System.err.println(e.getMessage());
      System.err.println("\nOptions:\n");
      Enumeration<Option> en = topicExtractor.listOptions();
      while (en.hasMoreElements()) {
        Option option = (Option) en.nextElement();
        System.err.println(option.synopsis());
        System.err.println(option.description());
      }
    }
  }
}
// Source Code of maui.main.MauiTopicExtractor

// Related Classes of maui.main.MauiTopicExtractor