Source Code of maui.main.Examples

package maui.main;


/*
 *    MauiModelBuilder.java
 *    Copyright (C) 2009 Olena Medelyan
 *
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
import gnu.trove.TIntHashSet;


import java.io.File;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashSet;


import org.wikipedia.miner.model.Wikipedia;
import org.wikipedia.miner.util.ProgressNotifier;
import org.wikipedia.miner.util.text.CaseFolder;
import org.wikipedia.miner.util.text.TextProcessor;


import maui.stemmers.PorterStemmer;
import maui.stopwords.StopwordsEnglish;


/**
 * Demonstrates how to use Maui for three types of topic extraction 
 * 1. Keyphrase extraction - extracting significant phrases from 
 *  the document, also suitable for automatic tagging. 
 * 2. Term assignment - indexing documents with terms
 * from a controlled vocabulary in SKOS or text format. 
 * 3. Indexing with Wikipedia - indexing documents with 
 * terms from Wikipedia, also suitable for
 * keyphrase extraction and tagging, or any case where there is no controlled
 * vocabulary available, but consistency is required.
 * 
 * @author Olena Medelyan (olena@cs.waikato.ac.nz)
 * 
 */
public class Examples {


  private MauiTopicExtractor topicExtractor;
  private MauiModelBuilder modelBuilder;


  private Wikipedia wikipedia;




  public Examples (boolean cacheData) throws Exception {
    loadWikipedia(cacheData);
  }
  
  /**
   * Sets general parameters: debugging printout, language specific options
   * like stemmer, stopwords.
   * @throws Exception 
   */
  private void setGeneralOptions()  {
    modelBuilder.setDebug(true);
    modelBuilder.setStemmer(new PorterStemmer());
    modelBuilder.setStopwords(new StopwordsEnglish());
    modelBuilder.setDocumentLanguage("en");
    modelBuilder.setMaxPhraseLength(5);
    modelBuilder.setWikipedia(wikipedia);
    
    topicExtractor.setDebug(true);
    topicExtractor.setStemmer(new PorterStemmer());
    topicExtractor.setStopwords(new StopwordsEnglish());
    topicExtractor.setDocumentLanguage("en");
    topicExtractor.setNumTopics(10);
    topicExtractor.setWikipedia(wikipedia);
  }


  /**
   * Set true for all features that will be used
   */
  private void setFeatures() {
    modelBuilder.setBasicFeatures(false);
    modelBuilder.setKeyphrasenessFeature(true);
    modelBuilder.setFrequencyFeatures(false);
    modelBuilder.setPositionsFeatures(false);
    modelBuilder.setLengthFeature(false);
    modelBuilder.setNodeDegreeFeature(false);
    modelBuilder.setBasicWikipediaFeatures(false);
    modelBuilder.setAllWikipediaFeatures(false);
  }


  /**
   * Demonstrates how to perform automatic tagging. Also applicable to
   * keyphrase extraction.
   * 
   * @throws Exception
   */
  public void testAutomaticTagging() throws Exception {
    topicExtractor = new MauiTopicExtractor();
    modelBuilder = new MauiModelBuilder();
    setGeneralOptions();
    setFeatures();
    
    // Directories with train & test data
    String trainDir = "data/automatic_tagging/train";
    String testDir = "data/automatic_tagging/test";


    // name of the file to save the model
    String modelName = "test";


    // Settings for the model builder
    modelBuilder.setDirName(trainDir);
    modelBuilder.setModelName(modelName);
    
    
    // change to 1 for short documents
    modelBuilder.setMinNumOccur(3);


    // Run model builder
    HashSet<String> fileNames = modelBuilder.collectStems();
    modelBuilder.buildModel(fileNames);
    modelBuilder.saveModel();


    // Settings for topic extractor
    topicExtractor.setDirName(testDir);
    topicExtractor.setModelName(modelName);
  
    
    // Run topic extractor
    topicExtractor.loadModel();
    fileNames = topicExtractor.collectStems();
    topicExtractor.extractKeyphrases(fileNames);
  }


  /**
   * Demonstrates how to perform term assignment. Applicable to any vocabulary
   * in SKOS or text format.
   * 
   * @throws Exception
   */
  public void testTermAssignment() throws Exception {
    topicExtractor = new MauiTopicExtractor();
    modelBuilder = new MauiModelBuilder();
    setGeneralOptions();
    setFeatures();
    
    // Directories with train & test data
    String trainDir = "data/term_assignment/train";
    String testDir = "data/term_assignment/test";


    // Vocabulary
    String vocabulary = "agrovoc";
    String format = "skos";


    // name of the file to save the model
    String modelName = "test";
    HashSet<String> fileNames;


    // Settings for the model builder
    modelBuilder.setDirName(trainDir);
    modelBuilder.setModelName(modelName);
    modelBuilder.setVocabularyFormat(format);
    modelBuilder.setVocabularyName(vocabulary);
    
    // Run model builder
    fileNames = modelBuilder.collectStems();
    modelBuilder.buildModel(fileNames);
    modelBuilder.saveModel();


    // Settings for topic extractor
    topicExtractor.setDirName(testDir);
    topicExtractor.setModelName(modelName);
    topicExtractor.setVocabularyName(vocabulary);
    topicExtractor.setVocabularyFormat(format);
    
    // Run topic extractor
    topicExtractor.loadModel();
    fileNames = topicExtractor.collectStems();
    topicExtractor.extractKeyphrases(fileNames);
  }


  /**
   * Demonstrates how to perform topic indexing
   * with Wikipedia.
   * 
   * @throws Exception
   */
  public void testIndexingWithWikipedia() throws Exception {
    topicExtractor = new MauiTopicExtractor();
    modelBuilder = new MauiModelBuilder();
    setGeneralOptions();
    setFeatures();


    // Directories with train & test data
    String trainDir = "data/wikipedia_indexing/train";
    String testDir = "data/wikipedia_indexing/test";


    // Vocabulary
    String vocabulary = "wikipedia";
  
    // name of the file to save the model
    String modelName = "test";
    HashSet<String> fileNames;


    // Settings for the model builder
    modelBuilder.setDirName(trainDir);
    modelBuilder.setModelName(modelName);
    modelBuilder.setVocabularyName(vocabulary);
    
    // Run model builder
    fileNames = modelBuilder.collectStems();
    modelBuilder.buildModel(fileNames);
    modelBuilder.saveModel();


    // Settings for topic extractor
    topicExtractor.setDirName(testDir);
    topicExtractor.setModelName(modelName);
    topicExtractor.setVocabularyName(vocabulary);
    
    // Run topic extractor
    topicExtractor.loadModel();
    fileNames = topicExtractor.collectStems();
    topicExtractor.extractKeyphrases(fileNames);
  }


  private void loadWikipedia(boolean cacheData) throws Exception {


    wikipedia = new Wikipedia("localhost", "enwiki_20090306", "root", null);


    TextProcessor textProcessor = new CaseFolder();


    File dataDirectory = new File(
        "/Users/alyona/Data/wikipedia/data/20090306");
    
    if (cacheData) {
      ProgressNotifier progress = new ProgressNotifier(5);
      // cache tables that will be used extensively
      TIntHashSet validPageIds = wikipedia.getDatabase().getValidPageIds(
          dataDirectory, 2, progress);
      wikipedia.getDatabase().cachePages(dataDirectory, validPageIds,
          progress);
      wikipedia.getDatabase().cacheAnchors(dataDirectory, textProcessor,
          validPageIds, 2, progress);
      wikipedia.getDatabase().cacheInLinks(dataDirectory, validPageIds,
          progress);
    }
  }


  /**
   * Main method for running the three types of topic indexing. Comment out
   * the required one.
   * 
   * @param args
   * @throws Exception
   */
  public static void main(String[] args) throws Exception {


    Date todaysDate = new java.util.Date();
    SimpleDateFormat formatter = new SimpleDateFormat(
        "EEE, dd-MMM-yyyy HH:mm:ss");
    String formattedDate1 = formatter.format(todaysDate);


    Examples tester = new Examples(true);


  //  tester.testAutomaticTagging();
  //  tester.testTermAssignment();
    tester.testIndexingWithWikipedia();


    todaysDate = new java.util.Date();
    String formattedDate2 = formatter.format(todaysDate);
    System.err.print("Run from " + formattedDate1);
    System.err.println(" to " + formattedDate2);
  }


}
Source Code of maui.main.Examples

Related Classes of maui.main.Examples