// Source code of com.tamingtext.texttamer.solr.NameFilterTest

/*
 * Copyright 2008-2011 Grant Ingersoll, Thomas Morton and Drew Farris
 *
 *    Licensed under the Apache License, Version 2.0 (the "License");
 *    you may not use this file except in compliance with the License.
 *    You may obtain a copy of the License at
 *
 *        http://www.apache.org/licenses/LICENSE-2.0
 *
 *    Unless required by applicable law or agreed to in writing, software
 *    distributed under the License is distributed on an "AS IS" BASIS,
 *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *    See the License for the specific language governing permissions and
 *    limitations under the License.
 * -------------------
 * To purchase or learn more about Taming Text, by Grant Ingersoll, Thomas Morton and Drew Farris, visit
 * http://www.manning.com/ingersoll
 */


package com.tamingtext.texttamer.solr;


import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.StringReader;


import junit.framework.TestCase;
import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.sentdetect.SentenceDetector;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;


import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.junit.BeforeClass;
import org.junit.Test;


import com.tamingtext.TamingTextTestJ4;




public class NameFilterTest extends TamingTextTestJ4 {
  private static final String input = 
  "The quick brown fox jumped over William Taft the President. " + 
  "There once was a man from New York City who had to catch the bus at 10:30 " +
  "in the morning of December 21, 1992 ";
  


  
  private static String[] modelName = {
      "date", "location", "money", "organization", 
      "percentage", "person", "time"
  };
  
  private static SentenceDetector detector;
  private static NameFinderME[] finder;


  @BeforeClass 
  public static void setupModels() throws IOException {


    File modelDir = getModelDir();
     
    finder = new NameFinderME[modelName.length];
    for (int i=0; i < modelName.length; i++) {
      finder[i] = new NameFinderME(new TokenNameFinderModel(
          new FileInputStream(
              new File(modelDir, "en-ner-" + modelName[i] + ".bin")
              )));
    }


    File modelFile = new File(modelDir, "en-sent.bin");
    InputStream modelStream = new FileInputStream(modelFile);
    SentenceModel model = new SentenceModel(modelStream);
    detector = new SentenceDetectorME(model);
  }
  
  String[] tokenStrings = {
      "The", "quick", "brown", "fox", "jumped", "over", "NE_person", "William",
      "NE_person", "Taft", "the", "President", ".", "There", "once", "was", "a",
      "man", "from", "NE_location", "New", "NE_location", "York", "NE_location", "City",
      "who", "had", "to", "catch", "the", "bus", "at", "NE_time", "10", "NE_time", ":",
      "NE_time", "30", "in", "the", "morning", "of", "NE_date", "December", "NE_date",
      "21", "NE_date", ",", "NE_date", "1992"
  };
  
  int[] positionIncrements = {
    1, 1, 1, 1, 1, 1, 1, 0, 
    1, 0, 1, 1, 1, 1, 1, 1, 1, 
    1, 1, 1, 0, 1, 0, 1, 0, 
    1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 
    1, 0, 1, 1, 1, 1, 1, 0, 1, 
    0, 1, 0, 1, 0
  };


  @Test public void testNameFilter() throws IOException {
    Reader in = new StringReader(input);
    Tokenizer tok = new SentenceTokenizer(in, detector);
    NameFilter nf = new NameFilter(tok, modelName, finder);


    CharTermAttribute cta;
    PositionIncrementAttribute pta;
    OffsetAttribute oa; 
    
    int pass = 0;
    
    while (pass < 2) { // test reuse.
      int pos = 0;
      int lastStart = 0;
      int lastEnd   = 0;
      
      while (nf.incrementToken()) {
        cta = (CharTermAttribute) nf.getAttribute(CharTermAttribute.class);
        pta = (PositionIncrementAttribute) nf.getAttribute(PositionIncrementAttribute.class);
        oa  = (OffsetAttribute) nf.getAttribute(OffsetAttribute.class);
        
        System.err.println("'" + cta.toString() + "'");
        System.err.println(pta.toString());
        System.err.println(oa.toString());
        System.err.println("--- pass: " + pass);
        
        TestCase.assertEquals(tokenStrings[pos], cta.toString());
        TestCase.assertEquals(positionIncrements[pos], pta.getPositionIncrement());
        
        if (pta.getPositionIncrement() == 0) {
          TestCase.assertEquals(lastStart, oa.startOffset());
          TestCase.assertEquals(lastEnd, oa.endOffset());
        }
        
        if (!cta.toString().startsWith("NE_")) {
          TestCase.assertEquals(input.substring(oa.startOffset(), oa.endOffset()), cta.toString());
        }
        
        lastStart = oa.startOffset();
        lastEnd   = oa.endOffset();
        
        pos++;
      }
      
      //if (pass == 1) nf.dumpState();
      nf.end();
      
      in.close();
      in = new StringReader(input);
      tok.reset(in);
      pass++;
    }
  }
}
// Source code of com.tamingtext.texttamer.solr.NameFilterTest

// Related classes of com.tamingtext.texttamer.solr.NameFilterTest