// Package maui.main
//
// Source Code of maui.main.Examples

package maui.main;

/*
*    Examples.java
*    Copyright (C) 2009 Olena Medelyan
*
*    This program is free software; you can redistribute it and/or modify
*    it under the terms of the GNU General Public License as published by
*    the Free Software Foundation; either version 2 of the License, or
*    (at your option) any later version.
*
*    This program is distributed in the hope that it will be useful,
*    but WITHOUT ANY WARRANTY; without even the implied warranty of
*    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
*    GNU General Public License for more details.
*
*    You should have received a copy of the GNU General Public License
*    along with this program; if not, write to the Free Software
*    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
import gnu.trove.TIntHashSet;

import java.io.File;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashSet;

import org.wikipedia.miner.model.Wikipedia;
import org.wikipedia.miner.util.ProgressNotifier;
import org.wikipedia.miner.util.text.CaseFolder;
import org.wikipedia.miner.util.text.TextProcessor;

import maui.stemmers.PorterStemmer;
import maui.stopwords.StopwordsEnglish;

/**
* Demonstrates how to use Maui for three types of topic extraction
* 1. Keyphrase extraction - extracting significant phrases from
*  the document, also suitable for automatic tagging.
* 2. Term assignment - indexing documents with terms
* from a controlled vocabulary in SKOS or text format.
* 3. Indexing with Wikipedia - indexing documents with
* terms from Wikipedia, also suitable for
* keyphrase extraction and tagging, or any case where there is no controlled
* vocabulary available, but consistency is required.
*
* @author Olena Medelyan (olena@cs.waikato.ac.nz)
*
*/
public class Examples {

  private MauiTopicExtractor topicExtractor;
  private MauiModelBuilder modelBuilder;

  private Wikipedia wikipedia;


  public Examples (boolean cacheData) throws Exception {
    loadWikipedia(cacheData);
  }
 
  /**
   * Sets general parameters: debugging printout, language specific options
   * like stemmer, stopwords.
   * @throws Exception
   */
  private void setGeneralOptions()  {
    modelBuilder.setDebug(true);
    modelBuilder.setStemmer(new PorterStemmer());
    modelBuilder.setStopwords(new StopwordsEnglish());
    modelBuilder.setDocumentLanguage("en");
    modelBuilder.setMaxPhraseLength(5);
    modelBuilder.setWikipedia(wikipedia);
   
    topicExtractor.setDebug(true);
    topicExtractor.setStemmer(new PorterStemmer());
    topicExtractor.setStopwords(new StopwordsEnglish());
    topicExtractor.setDocumentLanguage("en");
    topicExtractor.setNumTopics(10);
    topicExtractor.setWikipedia(wikipedia);
  }

  /**
   * Set true for all features that will be used
   */
  private void setFeatures() {
    modelBuilder.setBasicFeatures(false);
    modelBuilder.setKeyphrasenessFeature(true);
    modelBuilder.setFrequencyFeatures(false);
    modelBuilder.setPositionsFeatures(false);
    modelBuilder.setLengthFeature(false);
    modelBuilder.setNodeDegreeFeature(false);
    modelBuilder.setBasicWikipediaFeatures(false);
    modelBuilder.setAllWikipediaFeatures(false);
  }

  /**
   * Demonstrates how to perform automatic tagging. Also applicable to
   * keyphrase extraction.
   *
   * @throws Exception
   */
  public void testAutomaticTagging() throws Exception {
    topicExtractor = new MauiTopicExtractor();
    modelBuilder = new MauiModelBuilder();
    setGeneralOptions();
    setFeatures();
   
    // Directories with train & test data
    String trainDir = "data/automatic_tagging/train";
    String testDir = "data/automatic_tagging/test";

    // name of the file to save the model
    String modelName = "test";

    // Settings for the model builder
    modelBuilder.setDirName(trainDir);
    modelBuilder.setModelName(modelName);
   
   
    // change to 1 for short documents
    modelBuilder.setMinNumOccur(3);

    // Run model builder
    HashSet<String> fileNames = modelBuilder.collectStems();
    modelBuilder.buildModel(fileNames);
    modelBuilder.saveModel();

    // Settings for topic extractor
    topicExtractor.setDirName(testDir);
    topicExtractor.setModelName(modelName);
 
   
    // Run topic extractor
    topicExtractor.loadModel();
    fileNames = topicExtractor.collectStems();
    topicExtractor.extractKeyphrases(fileNames);
  }

  /**
   * Demonstrates how to perform term assignment. Applicable to any vocabulary
   * in SKOS or text format.
   *
   * @throws Exception
   */
  public void testTermAssignment() throws Exception {
    topicExtractor = new MauiTopicExtractor();
    modelBuilder = new MauiModelBuilder();
    setGeneralOptions();
    setFeatures();
   
    // Directories with train & test data
    String trainDir = "data/term_assignment/train";
    String testDir = "data/term_assignment/test";

    // Vocabulary
    String vocabulary = "agrovoc";
    String format = "skos";

    // name of the file to save the model
    String modelName = "test";
    HashSet<String> fileNames;

    // Settings for the model builder
    modelBuilder.setDirName(trainDir);
    modelBuilder.setModelName(modelName);
    modelBuilder.setVocabularyFormat(format);
    modelBuilder.setVocabularyName(vocabulary);
   
    // Run model builder
    fileNames = modelBuilder.collectStems();
    modelBuilder.buildModel(fileNames);
    modelBuilder.saveModel();

    // Settings for topic extractor
    topicExtractor.setDirName(testDir);
    topicExtractor.setModelName(modelName);
    topicExtractor.setVocabularyName(vocabulary);
    topicExtractor.setVocabularyFormat(format);
   
    // Run topic extractor
    topicExtractor.loadModel();
    fileNames = topicExtractor.collectStems();
    topicExtractor.extractKeyphrases(fileNames);
  }

  /**
   * Demonstrates how to perform topic indexing
   * with Wikipedia.
   *
   * @throws Exception
   */
  public void testIndexingWithWikipedia() throws Exception {
    topicExtractor = new MauiTopicExtractor();
    modelBuilder = new MauiModelBuilder();
    setGeneralOptions();
    setFeatures();

    // Directories with train & test data
    String trainDir = "data/wikipedia_indexing/train";
    String testDir = "data/wikipedia_indexing/test";

    // Vocabulary
    String vocabulary = "wikipedia";
 
    // name of the file to save the model
    String modelName = "test";
    HashSet<String> fileNames;

    // Settings for the model builder
    modelBuilder.setDirName(trainDir);
    modelBuilder.setModelName(modelName);
    modelBuilder.setVocabularyName(vocabulary);
   
    // Run model builder
    fileNames = modelBuilder.collectStems();
    modelBuilder.buildModel(fileNames);
    modelBuilder.saveModel();

    // Settings for topic extractor
    topicExtractor.setDirName(testDir);
    topicExtractor.setModelName(modelName);
    topicExtractor.setVocabularyName(vocabulary);
   
    // Run topic extractor
    topicExtractor.loadModel();
    fileNames = topicExtractor.collectStems();
    topicExtractor.extractKeyphrases(fileNames);
  }

  private void loadWikipedia(boolean cacheData) throws Exception {

    wikipedia = new Wikipedia("localhost", "enwiki_20090306", "root", null);

    TextProcessor textProcessor = new CaseFolder();

    File dataDirectory = new File(
        "/Users/alyona/Data/wikipedia/data/20090306");
   
    if (cacheData) {
      ProgressNotifier progress = new ProgressNotifier(5);
      // cache tables that will be used extensively
      TIntHashSet validPageIds = wikipedia.getDatabase().getValidPageIds(
          dataDirectory, 2, progress);
      wikipedia.getDatabase().cachePages(dataDirectory, validPageIds,
          progress);
      wikipedia.getDatabase().cacheAnchors(dataDirectory, textProcessor,
          validPageIds, 2, progress);
      wikipedia.getDatabase().cacheInLinks(dataDirectory, validPageIds,
          progress);
    }
  }

  /**
   * Main method for running the three types of topic indexing. Comment out
   * the required one.
   *
   * @param args
   * @throws Exception
   */
  public static void main(String[] args) throws Exception {

    Date todaysDate = new java.util.Date();
    SimpleDateFormat formatter = new SimpleDateFormat(
        "EEE, dd-MMM-yyyy HH:mm:ss");
    String formattedDate1 = formatter.format(todaysDate);

    Examples tester = new Examples(true);

  //  tester.testAutomaticTagging();
  //  tester.testTermAssignment();
    tester.testIndexingWithWikipedia();

    todaysDate = new java.util.Date();
    String formattedDate2 = formatter.format(todaysDate);
    System.err.print("Run from " + formattedDate1);
    System.err.println(" to " + formattedDate2);
  }

}
// TOP
//
// Related Classes of maui.main.Examples
//
// TOP
// Copyright © 2018 www.massapi.com. All rights reserved.
// All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.