Package org.apache.ctakes.core.nlp.tokenizer

Examples of org.apache.ctakes.core.nlp.tokenizer.Token


    // Strip every end-of-line token from tokenList.
    // Removal is done in two passes: matching tokens are first collected
    // into removalList, and only after the iteration has finished are they
    // removed from tokenList, so tokenList is not modified while its
    // iterator is in use.
    List<Token> removalList = new ArrayList<Token>();
    Iterator<Token> tokenItr = tokenList.iterator();

    while (tokenItr.hasNext()) {
      Token token = tokenItr.next();
      // Only tokens whose type is Token.TYPE_EOL are scheduled for removal.
      if (token.getType() == Token.TYPE_EOL) {
        removalList.add(token);
      }
    }

    // NOTE(review): removeAll matches elements via equals(); presumably
    // Token equality is such that only the collected EOL tokens are
    // removed -- confirm against Token.equals.
    tokenList.removeAll(removalList);
View Full Code Here


      while (j > i) {

        StringBuffer candSB = new StringBuffer();
        for (int k = i; k <= j; k++) {
          Token currtok = (Token) inputtoks.get(k);
          candSB.append(" ");
          candSB.append(currtok.getText());
        }
        String cand = candSB.toString().trim();

        // Attempt to look up the candidate in the hyphen map
        if (iv_shouldbeHyphenMap.containsKey(cand.toLowerCase())) {

          // set the initial offsets
          orig_startOffset = ((Token) inputtoks.get(i)).getStartOffset();
          orig_endOffset = ((Token) inputtoks.get(j)).getEndOffset();
          new_startOffset = orig_startOffset;
          new_endOffset = orig_endOffset;

          // compile new text
          String newText = "";
          for (int k = i; k <= j; k++) {
            Token currtok = (Token) inputtoks.get(k);
            newText += currtok.getText() + "-";
          }
          newText = newText.substring(0, newText.length() - 1);

          // Get the new and old lengths of hyphenated spans
          int new_Length = newText.length();
View Full Code Here

    StringBuilder stemmedDesc = new StringBuilder();

    // get first word token and
    while (tokenItr.hasNext()) {
      tCount++;
      Token t = (Token) tokenItr.next();
      if (tCount == 1) {
        firstTokenText = t.getText(); // first token (aka "first word")
        tokenizedDesc.append(firstTokenText);
        if (this.lvgCmd != null) {
          firstTokenStem = stemToken(t);
          stemmedDesc.append(firstTokenStem);
        }
      } else { // use blank to separate tokens
        tokenizedDesc.append(" ").append(t.getText());
        // stem the next token, add it to the stemmed desc only if there
        // is a valid first word
        if (this.lvgCmd != null && firstTokenStem != null) {
          String stemmedWord = stemToken(t);
          stemmedDesc.append(" ").append(stemmedWord);
View Full Code Here

    // Strip every end-of-line token from tokenList.
    // Removal is done in two passes: matching tokens are first collected
    // into removalList, and only after the iteration has finished are they
    // removed from tokenList, so tokenList is not modified while its
    // iterator is in use.
    List<Token> removalList = new ArrayList<Token>();
    Iterator<Token> tokenItr = tokenList.iterator();

    while (tokenItr.hasNext()) {
      Token token = tokenItr.next();
      // Only tokens whose type is Token.TYPE_EOL are scheduled for removal.
      if (token.getType() == Token.TYPE_EOL) {
        removalList.add(token);
      }
    }

    // NOTE(review): removeAll matches elements via equals(); presumably
    // Token equality is such that only the collected EOL tokens are
    // removed -- confirm against Token.equals.
    tokenList.removeAll(removalList);
View Full Code Here

      while (j > i) {

        StringBuffer candSB = new StringBuffer();
        for (int k = i; k <= j; k++) {
          Token currtok = (Token) inputtoks.get(k);
          candSB.append(" ");
          candSB.append(currtok.getText());
        }
        String cand = candSB.toString().trim();

        // Attempt to look up the candidate in the hyphen map
        if (iv_shouldbeHyphenMap.containsKey(cand.toLowerCase())) {

          // set the initial offsets
          orig_startOffset = ((Token) inputtoks.get(i)).getStartOffset();
          orig_endOffset = ((Token) inputtoks.get(j)).getEndOffset();
          new_startOffset = orig_startOffset;
          new_endOffset = orig_endOffset;

          // compile new text
          String newText = "";
          for (int k = i; k <= j; k++) {
            Token currtok = (Token) inputtoks.get(k);
            newText += currtok.getText() + "-";
          }
          newText = newText.substring(0, newText.length() - 1);

          // Get the new and old lengths of hyphenated spans
          int new_Length = newText.length();
View Full Code Here

   * @param results
   */
  public static void printResults(String text, List results) {
    System.out.println("Text: " + text);
    for (int i = 0; i < results.size(); i++) {
      Token t = (Token) results.get(i);
      String typeStr = "";
      switch (t.getType()) {
      case Token.TYPE_WORD:
        typeStr = "word       ";
        break;
      case Token.TYPE_PUNCT:
        typeStr = "punctuation";
        break;
      case Token.TYPE_NUMBER:
        typeStr = "number     ";
        break;
      case Token.TYPE_EOL:
        typeStr = "end of line";
        break;
      case Token.TYPE_CONTRACTION:
        typeStr = "contraction";
        break;
      case Token.TYPE_SYMBOL:
        typeStr = "symbol     ";
        break;
      default:
        typeStr = "unknown    ";
      }

      String capsStr = "";
      switch (t.getCaps()) {
      case Token.CAPS_ALL:
        capsStr = "A";
        break;
      case Token.CAPS_NONE:
        capsStr = "N";
        break;
      case Token.CAPS_MIXED:
        capsStr = "M";
        break;
      case Token.CAPS_FIRST_ONLY:
        capsStr = "F";
        break;
      default:
        capsStr = "?";
      }

      String numPosStr = "";
      switch (t.getNumPosition()) {
      case Token.NUM_FIRST:
        numPosStr = "F";
        break;
      case Token.NUM_MIDDLE:
        numPosStr = "M";
        break;
      case Token.NUM_LAST:
        numPosStr = "L";
        break;
      case Token.NUM_NONE:
        numPosStr = "N";
        break;
      default:
        numPosStr = "?";
      }

      String intStr = "";
      if (t.isInteger()) {
        intStr = "Y";
      } else {
        intStr = "N";
      }

      System.out.println("Token:" + " type=[" + typeStr + "]" + " caps=["
          + capsStr + "]" + " npos=[" + numPosStr + "]" + " int=["
          + intStr + "]" + " offsets=[" + t.getStartOffset() + ","
          + t.getEndOffset() + "]" + "\t\t" + "text=["
          + text.substring(t.getStartOffset(), t.getEndOffset())
          + "]");
    }
  }
View Full Code Here

      // Tokenize the cue phrase and sort the resulting tokens with an
      // OffsetComparator before walking them.
      // NOTE(review): presumably OffsetComparator orders tokens by their
      // text offsets -- confirm against its implementation.
      List list = tokenizer.tokenize(cuePhrase);
      Collections.sort(list, new OffsetComparator());

      Iterator tokenItr = list.iterator();
      Token t;
      int tCount = 0; // 1-based position of the token currently being visited
      String firstTokenText = ""; // stays empty if there are no tokens
      String tokenizedCuePhrase = ""; // token texts joined by single blanks

      // Walk the sorted tokens, remembering the text of the first one and
      // building the blank-separated phrase from all of them.
      while (tokenItr.hasNext()) {
        tCount++;
        t = (Token) tokenItr.next();
        if (tCount == 1) {
          firstTokenText = t.getText(); // first token (aka
                          // "first word")
          tokenizedCuePhrase += t.getText();
        } else { // use blank to separate tokens
          tokenizedCuePhrase = tokenizedCuePhrase + " " + t.getText();
        }

      }

      // Add the first token's text to doc as a StringField named by
      // CUE_PHRASE_FIRST_WORD_FIELD_NAME, created with Field.Store.YES.
      doc.add(new StringField(CUE_PHRASE_FIRST_WORD_FIELD_NAME, firstTokenText, Field.Store.YES));
View Full Code Here

   * @param results
   */
  public static void printResults(String text, List results) {
    System.out.println("Text: " + text);
    for (int i = 0; i < results.size(); i++) {
      Token t = (Token) results.get(i);
      String typeStr = "";
      switch (t.getType()) {
      case Token.TYPE_WORD:
        typeStr = "word       ";
        break;
      case Token.TYPE_PUNCT:
        typeStr = "punctuation";
        break;
      case Token.TYPE_NUMBER:
        typeStr = "number     ";
        break;
      case Token.TYPE_EOL:
        typeStr = "end of line";
        break;
      case Token.TYPE_CONTRACTION:
        typeStr = "contraction";
        break;
      case Token.TYPE_SYMBOL:
        typeStr = "symbol     ";
        break;
      default:
        typeStr = "unknown    ";
      }

      String capsStr = "";
      switch (t.getCaps()) {
      case Token.CAPS_ALL:
        capsStr = "A";
        break;
      case Token.CAPS_NONE:
        capsStr = "N";
        break;
      case Token.CAPS_MIXED:
        capsStr = "M";
        break;
      case Token.CAPS_FIRST_ONLY:
        capsStr = "F";
        break;
      default:
        capsStr = "?";
      }

      String numPosStr = "";
      switch (t.getNumPosition()) {
      case Token.NUM_FIRST:
        numPosStr = "F";
        break;
      case Token.NUM_MIDDLE:
        numPosStr = "M";
        break;
      case Token.NUM_LAST:
        numPosStr = "L";
        break;
      case Token.NUM_NONE:
        numPosStr = "N";
        break;
      default:
        numPosStr = "?";
      }

      String intStr = "";
      if (t.isInteger()) {
        intStr = "Y";
      } else {
        intStr = "N";
      }

      System.out.println("Token:" + " type=[" + typeStr + "]" + " caps=["
          + capsStr + "]" + " npos=[" + numPosStr + "]" + " int=["
          + intStr + "]" + " offsets=[" + t.getStartOffset() + ","
          + t.getEndOffset() + "]" + "\t\t" + "text=["
          + text.substring(t.getStartOffset(), t.getEndOffset())
          + "]");
    }
  }
View Full Code Here

      // Tokenize the description and sort the resulting tokens with an
      // OffsetComparator before walking them.
      // NOTE(review): presumably OffsetComparator orders tokens by their
      // text offsets -- confirm against its implementation.
      List list = tokenizer.tokenize(desc);
      Collections.sort(list, new OffsetComparator());

      Iterator tokenItr = list.iterator();
      Token t;
      int tCount = 0; // 1-based position of the token currently being visited
      String firstTokenText = ""; // stays empty if there are no tokens
      String tokenizedDesc = ""; // token texts joined by single blanks

      // Walk the sorted tokens, remembering the text of the first one and
      // building the blank-separated description from all of them.
      while (tokenItr.hasNext()) {
        tCount++;
        t = (Token) tokenItr.next();
        if (tCount == 1) {
          firstTokenText = t.getText(); // first token (aka
                          // "first word")
          tokenizedDesc += t.getText();
        } else { // use blank to separate tokens
          tokenizedDesc = tokenizedDesc + " " + t.getText();
        }

      }

      doc.add(new Field(FirstWord, firstTokenText, Field.Store.YES,
View Full Code Here

    // Strip every end-of-line token from tokenList.
    // Removal is done in two passes: matching tokens are first collected
    // into removalList, and only after the iteration has finished are they
    // removed from tokenList, so tokenList is not modified while its
    // iterator is in use.
    List<Token> removalList = new ArrayList<Token>();
    Iterator<Token> tokenItr = tokenList.iterator();

    while (tokenItr.hasNext()) {
      Token token = tokenItr.next();
      // Only tokens whose type is Token.TYPE_EOL are scheduled for removal.
      if (token.getType() == Token.TYPE_EOL) {
        removalList.add(token);
      }
    }

    // NOTE(review): removeAll matches elements via equals(); presumably
    // Token equality is such that only the collected EOL tokens are
    // removed -- confirm against Token.equals.
    tokenList.removeAll(removalList);
View Full Code Here

TOP

Related Classes of org.apache.ctakes.core.nlp.tokenizer.Token

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle. Contact coftware#gmail.com.