/*
* Copyright 2008-2011 Grant Ingersoll, Thomas Morton and Drew Farris
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* -------------------
* To purchase or learn more about Taming Text, by Grant Ingersoll, Thomas Morton and Drew Farris, visit
* http://www.manning.com/ingersoll
*/
package com.tamingtext.frankenstein;
import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.sentdetect.SentenceDetector;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.tokenize.SimpleTokenizer;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.util.Span;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.NumericField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * Parses the Frankenstein book (located in the test resources folder), identifies sentences,
 * and then indexes the paragraphs into Lucene.
 */
public class Frankenstein {
  protected RAMDirectory directory;//in-memory index of the book's paragraphs
  protected IndexSearcher searcher;
  protected SentenceDetector sentenceDetector;
  protected Map<String, NameFinderME> finders;//OpenNLP name finders keyed by entity type label
  protected Tokenizer tokenizer;
public static void main(String[] args) throws Exception {
//<start id="frank.start"/>
Frankenstein frankenstein = new Frankenstein();
frankenstein.init();
frankenstein.index();//<co id="frank.index"/>
String query = null;
while (true) {
query = getQuery();//<co id="frank.query"/>
if (query != null) {
Results results = frankenstein.search(query);//<co id="frank.search"/>
frankenstein.examineResults(results);//<co id="frank.exam"/>
displayResults(results);
} else {
break;
}
}
/*
<calloutlist>
<callout arearefs="frank.index"><para>Make the content searchable</para></callout>
<callout arearefs="frank.query"><para>Prompt the user for a query</para></callout>
<callout arearefs="frank.search"><para>Perform the search</para></callout>
<callout arearefs="frank.exam"><para>Parse the results and show interesting items</para></callout>
</calloutlist>
*/
//<end id="frank.start"/>
}
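  /**
   * For each matching paragraph, detect sentence boundaries, tokenize each sentence,
   * and run every configured name finder over the tokens, recording the entities found.
   */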
private void examineResults(Results results) {
for (Document match : results.matches) {
//we have a paragraph, let's break sentences and then do NER
String[] sentencesStr = sentenceDetector.sentDetect(match.get("paragraph"));
if (sentencesStr != null && sentencesStr.length > 0) {
Sentence[] sentences = new Sentence[sentencesStr.length];
results.sentences.put(match.get("id"), sentences);
//for each sentence, find named entities
for (int i = 0; i < sentencesStr.length; i++) {
sentences[i] = new Sentence(sentencesStr[i]);
String[] tokens = tokenizer.tokenize(sentencesStr[i]);
for (Map.Entry<String, NameFinderME> finder : finders.entrySet()) {
String label = finder.getKey();
Span[] names = finder.getValue().find(tokens);
//spans index into the tokens array
if (names != null && names.length > 0) {
List<String> values = new ArrayList<String>();
for (int j = 0; j < names.length; j++) {
                StringBuilder cb = new StringBuilder();//single-threaded use, so StringBuilder suffices
for (int ti = names[j].getStart(); ti < names[j].getEnd(); ti++) {
cb.append(tokens[ti]).append(" ");
}
values.add(cb.toString());
}
sentences[i].names.put(label, values);
}
}
}
}
}
}
/**
* Search for the queryStr in the text
*
* @param queryStr The query string
* @return The Results
* @throws IOException
* @throws ParseException
*/
private Results search(String queryStr) throws IOException, ParseException {
System.out.println("Searching for: " + queryStr);
if (searcher == null) {
searcher = new IndexSearcher(directory, true);
}
Results result = new Results();
QueryParser qp = new QueryParser(Version.LUCENE_36, "paragraph", new StandardAnalyzer(Version.LUCENE_36));
Query query = qp.parse(queryStr);
TopDocs topDocs = searcher.search(query, 20);
System.out.println("Found " + topDocs.totalHits + " total hits.");
for (int i = 0; i < topDocs.scoreDocs.length; i++) {
Document theDoc = searcher.doc(topDocs.scoreDocs[i].doc);
result.matches.add(theDoc);
}
return result;
}
/**
* Index the content of Frankenstein
*
* @throws IOException
*/
private void index() throws IOException {
System.out.println("Indexing Frankenstein");
    InputStream stream = getClass().getClassLoader().getResourceAsStream("frankenstein-gutenberg.txt");
    if (stream == null) {//fail fast if the book is not on the classpath
      throw new FileNotFoundException("frankenstein-gutenberg.txt is not on the classpath");
    }
    BufferedReader reader = new BufferedReader(new InputStreamReader(stream));
//let's index paragraphs at a time
IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_36, new StandardAnalyzer(Version.LUCENE_36));
directory = new RAMDirectory();
IndexWriter iw = new IndexWriter(directory, conf);
String line;
StringBuilder paraBuffer = new StringBuilder(2048);
int lines = 0;
int paragraphs = 0;
int paragraphLines = 0;
while ((line = reader.readLine()) != null) {
if (line.contains("End of the Project Gutenberg")) {//we are in the license section at the end of the book
break;
}
if (line.startsWith("#")) {//skip comments
continue;
}
//if the line is blank, we have a paragraph, so let's index it
if (line.matches("^\\s*$") && paraBuffer.length() > 0) {
Document doc = new Document();
//We can retrieve by paragraph number if we want
        String theString = paraBuffer.toString().trim();//trim() returns a new String; assign it or the call is a no-op
        if (theString.length() > 0) {
addMetadata(doc, lines, paragraphs, paragraphLines);
doc.add(new Field("paragraph", theString, Field.Store.YES, Field.Index.ANALYZED));//add the main content
iw.addDocument(doc);//Index the document
paragraphs++;
}
//reset some of our state
paraBuffer.setLength(0);//we are done w/ this paragraph
paragraphLines = 0;
      } else {
        paraBuffer.append(line).append(' ');
        paragraphLines++;//only count lines belonging to the current paragraph, so startLine stays accurate
      }
      lines++;
}
System.out.println("Processed " + lines + " lines. Paragraphs: " + paragraphs);
iw.close();
}
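  /**
   * Add bookkeeping fields to the document: a unique id plus the start line,
   * finish line and paragraph number of the paragraph being indexed.
   */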
private void addMetadata(Document doc, int lines, int paragraphs, int paragraphLines) {
doc.add(new Field("id", "frank_" + paragraphs, Field.Store.YES, Field.Index.NOT_ANALYZED));
NumericField startLine = new NumericField("startLine", Field.Store.YES, true);
startLine.setIntValue(lines - paragraphLines);
doc.add(startLine);
NumericField finishLine = new NumericField("finishLine", Field.Store.YES, true);
finishLine.setIntValue(lines);
doc.add(finishLine);
NumericField paragraphNumber = new NumericField("paragraphNumber", Field.Store.YES, true);
paragraphNumber.setIntValue(paragraphs);
doc.add(paragraphNumber);
}
/**
* Initialize OpenNLP libraries and other resources
* @throws IOException
*/
private void init() throws IOException {
System.out.println("Initializing Frankenstein");
File models = new File("./opennlp-models");
if (models.exists() == false) {
throw new FileNotFoundException("./opennlp-models");
}
System.setProperty("model.dir", "./opennlp-models");
System.setProperty("wordnet.dir", "./WordNet-3.0");
File modelFile = new File(models, "en-sent.bin");
InputStream modelStream = new FileInputStream(modelFile);
SentenceModel model = new SentenceModel(modelStream);
sentenceDetector = new SentenceDetectorME(model);
finders = new HashMap<String, NameFinderME>();
finders.put("Names", new NameFinderME(new TokenNameFinderModel(
new FileInputStream(getPersonModel()))));
finders.put("Dates", new NameFinderME(new TokenNameFinderModel(
new FileInputStream(getDateModel()))));
finders.put("Locations", new NameFinderME(new TokenNameFinderModel(
new FileInputStream(getLocationModel()))));
tokenizer = SimpleTokenizer.INSTANCE;
}
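  /**
   * Prompt the user for a query on standard input.
   *
   * @return The query string, or null if the input is empty (the signal to exit)
   * @throws IOException
   */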
private static String getQuery() throws IOException {
System.out.println("");
System.out.println("Type your query. Hit Enter to process the query (the empty string will exit the program):");
System.out.print('>');
System.out.flush();
BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
String line = in.readLine();
    if (line == null || line.length() == 0) {//length() can never be -1; empty input signals exit
return null;
}
return line;
}
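  /**
   * Print each match along with its location in the book, its sentences,
   * and any named entities found in them.
   */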
private static void displayResults(Results results) {
int k = 0;
for (Document document : results.matches) {
System.out.println("-----------------------------------");
System.out.println("Match: [" + k + "] Paragraph: " + document.get("paragraphNumber"));
System.out.println("Lines: " + document.get("startLine") + "-" + document.get("finishLine"));
System.out.println("\t" + document.get("paragraph"));
System.out.println("\t----- Sentences ----");
Sentence[] sentences = results.sentences.get(document.get("id"));
      if (sentences == null) {//no sentences were detected for this paragraph
        continue;
      }
for (int i = 0; i < sentences.length; i++) {
Sentence sentence = sentences[i];
System.out.println("\t\t[" + i + "] " + sentence.sentence);
if (sentence.names.isEmpty() == false) {
for (Map.Entry<String, List<String>> entry : sentence.names.entrySet()) {
System.out.println("\t\t>>>> " + entry.getKey());
            StringBuilder buff = new StringBuilder();//single-threaded use, so StringBuilder suffices
if (entry.getValue().isEmpty() == false) {
for (String val : entry.getValue()) {
buff.append(val.trim()).append(", ");
}
buff.setLength(buff.length() - 2);//drop the last comma and space
System.out.println("\t\t\t" + buff);
}
}
System.out.println("");
}
}
k++;
}
}
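  //----- Helpers that locate the OpenNLP models and WordNet via the model.dir and wordnet.dir system properties -----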
public static File getWordNetDir() {
String wordnetDir = System.getProperty("wordnet.dir");
return new File(wordnetDir);
}
public static File getWordNetDictionary() {
return new File(getWordNetDir(), "dict");
}
public static File getModelDir() {
String modelsDirProp = System.getProperty("model.dir");
return new File(modelsDirProp);
}
public static File getPersonModel() {
return new File(getModelDir(), "en-ner-person.bin");
}
public static File getDateModel() {
return new File(getModelDir(), "en-ner-date.bin");
}
public static File getLocationModel() {
return new File(getModelDir(), "en-ner-location.bin");
}
}
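/**
 * Holds the documents matching a query, plus the detected sentences keyed by document id.
 */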
class Results {
public List<Document> matches = new ArrayList<Document>();
public Map<String, Sentence[]> sentences = new HashMap<String, Sentence[]>();
}
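/**
 * A single sentence and the named entities found in it, keyed by entity type.
 */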
class Sentence {
public String sentence;
public Map<String, List<String>> names = new HashMap<String, List<String>>();
public Sentence(String sent) {
sentence = sent;
}
}