Examples of org.apache.ctakes.core.nlp.tokenizer.TokenizerPTB

Package org.apache.ctakes.core.nlp.tokenizer

Examples of org.apache.ctakes.core.nlp.tokenizer.TokenizerPTB

org.apache.ctakes.core.nlp.tokenizer.TokenizerPTB
A class used to break natural text into tokens following PTB rules. See Supplementary Guidelines for ETTB 2.0 dated April 6th, 2009. The token markup is external to the text and is not embedded. Character offset location is used to identify the boundaries of a token. @author Mayo Clinic

   * 
   * @throws FileNotFoundException
   * @throws IOException
   */
  private void initTokenizer() throws FileNotFoundException, IOException {
    this.tokenizer = new TokenizerPTB();
  }

View Full Code Here


  @Override
  public void initialize(UimaContext aContext) throws ResourceInitializationException {
    super.initialize(aContext);
    logger.info("Initializing " + this.getClass().getName());
    tokenizer = new TokenizerPTB();
    skipSegmentsSet = new HashSet<>();
    if(skipSegmentsArray != null){
      Collections.addAll(skipSegmentsSet, skipSegmentsArray);
    }
  }

View Full Code Here

    System.gc();


    if (args.length == 1) { // If no file of hyphenated words given
      try {
        directoryOfDelimitedFiles = args[0];
        tokenizer = new TokenizerPTB();
        new CreateAssertionLuceneIndexFromDelimitedFile(tokenizer);
      } catch (Exception e) {
        e.printStackTrace();
      }
    } else if (args.length == 3) { // else, use the file of hyphenated words
                    // during tokenization
      try {


        directoryOfDelimitedFiles = args[0];
        // ** hyphenated file no longer needed. using the new PTB
        // tokenizer instead. **
        // String hyphFileLoc = args[1];
        // int freqCutoff = Integer.parseInt(args[2]);
        // Map hyphMap = loadHyphMap(hyphFileLoc);
        // System.out.println("Processing hyphMap from : " +
        // hyphFileLoc);


        tokenizer = new TokenizerPTB();
        new CreateAssertionLuceneIndexFromDelimitedFile(tokenizer);
      } catch (Exception e) {
        e.printStackTrace();
      }
    } else {

View Full Code Here

    System.gc();


    if (args.length == 1) { // If no file of hyphenated words given
      try {
        directoryOfDelimitedFiles = args[0];
        tokenizer = new TokenizerPTB();
        new CreateLuceneIndexFromDelimitedFile(tokenizer);
      } catch (Exception e) {
        e.printStackTrace();
      }
    } else if (args.length == 3) { // else, use the file of hyphenated words
                    // during tokenization
      try {


        directoryOfDelimitedFiles = args[0];
        // ** hyphenated file no longer needed. using the new PTB
        // tokenizer instead. **
        // String hyphFileLoc = args[1];
        // int freqCutoff = Integer.parseInt(args[2]);
        // Map hyphMap = loadHyphMap(hyphFileLoc);
        // System.out.println("Processing hyphMap from : " +
        // hyphFileLoc);


        tokenizer = new TokenizerPTB();
        new CreateLuceneIndexFromDelimitedFile(tokenizer);
      } catch (Exception e) {
        e.printStackTrace();
      }
    } else {

View Full Code Here

   */
  private void configInit() throws ResourceAccessException {


    skipSegmentsSet = ParamUtil.getStringParameterValuesSet(PARAM_SEGMENTS_TO_SKIP, context); 


    tokenizer = new TokenizerPTB();


  }

View Full Code Here

TOP

Related Classes of org.apache.ctakes.core.nlp.tokenizer.TokenizerPTB

org.apache.ctakes.core.ae.TokenizerAnnotatorPTB

org.apache.ctakes.dictionary.assertion.CreateAssertionLuceneIndexFromDelimitedFile

org.apache.ctakes.dictionary.lookup.tools.CreateLuceneIndexFromDelimitedFile

org.apache.ctakes.typesystem.type.syntax.BaseToken

org.apache.ctakes.typesystem.type.syntax.ContractionToken

org.apache.ctakes.typesystem.type.syntax.NewlineToken

org.apache.ctakes.typesystem.type.syntax.NumToken

org.apache.ctakes.typesystem.type.syntax.PunctuationToken

org.apache.ctakes.typesystem.type.syntax.SymbolToken

org.apache.ctakes.typesystem.type.syntax.WordToken

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.