Package org.apache.ctakes.core.nlp.tokenizer

Examples of org.apache.ctakes.core.nlp.tokenizer.Token


      while (j > i) {

        StringBuffer candSB = new StringBuffer();
        for (int k = i; k <= j; k++) {
          Token currtok = (Token) inputtoks.get(k);
          candSB.append(" ");
          candSB.append(currtok.getText());
        }
        String cand = candSB.toString().trim();

        // Attempt to look up the candidate in the hyphen map
        if (iv_shouldbeHyphenMap.containsKey(cand.toLowerCase())) {

          // set the initial offsets
          orig_startOffset = ((Token) inputtoks.get(i)).getStartOffset();
          orig_endOffset = ((Token) inputtoks.get(j)).getEndOffset();
          new_startOffset = orig_startOffset;
          new_endOffset = orig_endOffset;

          // compile new text
          String newText = "";
          for (int k = i; k <= j; k++) {
            Token currtok = (Token) inputtoks.get(k);
            newText += currtok.getText() + "-";
          }
          newText = newText.substring(0, newText.length() - 1);

          // Get the new and old lengths of hyphenated spans
          int new_Length = newText.length();
View Full Code Here


  public static List<BaseToken> convertTokens(List<Token> tokens) {
    List<BaseToken> baseTokens = new ArrayList<BaseToken>();

    for (int i = 0; i < tokens.size(); i++) {
      Token t = (Token) tokens.get(i);
      switch (t.getType()) {
      case Token.TYPE_WORD:
        WordToken wt = new WordTokenAdapter(t);
        baseTokens.add(wt);
        break;
      case Token.TYPE_PUNCT:
        PunctuationToken pt = new PunctuationTokenAdapter(t);
        baseTokens.add(pt);
        break;
      case Token.TYPE_NUMBER:
        if (t.isInteger()) {
          IntegerToken it = new IntegerTokenAdapter(t);
          baseTokens.add(it);
        } else {
          DecimalToken dt = new DecimalTokenAdapter(t);
          baseTokens.add(dt);
View Full Code Here

    /**
     * Convert from a JCas object into Java Tokenizer object.
     */
    public static Token convert(BaseToken bta)
    {
        Token token = new Token(bta.getBegin(), bta.getEnd());
        token.setText(bta.getCoveredText());

        if (bta instanceof WordToken)
        {
            WordToken wta = (WordToken) bta;
            token.setType(Token.TYPE_WORD);

            switch (wta.getCapitalization())
            {
            case TokenizerAnnotator.TOKEN_CAP_ALL:
                token.setCaps(Token.CAPS_ALL);
                break;
            case TokenizerAnnotator.TOKEN_CAP_FIRST_ONLY:
                token.setCaps(Token.CAPS_FIRST_ONLY);
                break;
            case TokenizerAnnotator.TOKEN_CAP_MIXED:
                token.setCaps(Token.CAPS_MIXED);
                break;
            case TokenizerAnnotator.TOKEN_CAP_NONE:
                token.setCaps(Token.CAPS_NONE);
                break;
            }

            switch (wta.getNumPosition())
            {
            case TokenizerAnnotator.TOKEN_NUM_POS_FIRST:
                token.setNumPosition(Token.NUM_FIRST);
                break;
            case TokenizerAnnotator.TOKEN_NUM_POS_MIDDLE:
                token.setNumPosition(Token.NUM_MIDDLE);
                break;
            case TokenizerAnnotator.TOKEN_NUM_POS_LAST:
                token.setNumPosition(Token.NUM_LAST);
                break;
            case TokenizerAnnotator.TOKEN_NUM_POS_NONE:
                token.setNumPosition(Token.NUM_NONE);
                break;
            }
        }
        else if (bta instanceof NumToken)
        {
            NumToken nta = (NumToken) bta;
            token.setType(Token.TYPE_NUMBER);

            if (nta.getNumType() == TokenizerAnnotator.TOKEN_NUM_TYPE_INTEGER)
            {
                token.setIsInteger(true);
            }
            else
            {
                token.setIsInteger(false);
            }
        }
        else if (bta instanceof PunctuationToken)
        {
            token.setType(Token.TYPE_PUNCT);
        }
        else if (bta instanceof NewlineToken)
        {
            token.setType(Token.TYPE_EOL);
        }
        else if (bta instanceof ContractionToken)
        {
            token.setType(Token.TYPE_CONTRACTION);
        }
        else if (bta instanceof SymbolToken)
        {
            token.setType(Token.TYPE_SYMBOL);
        }

        return token;
    }   
View Full Code Here

      throw new AnalysisEngineProcessException(e);
    }

    Iterator<Token> tokenItr = tokens.iterator();
    while (tokenItr.hasNext()) {
      Token token = (Token) tokenItr.next();

      // convert token into JCas object
      BaseToken bta = TokenConverter.convert(token, jcas, beginPos);

      bta.setTokenNumber(tokenCount);
View Full Code Here

TOP

Related Classes of org.apache.ctakes.core.nlp.tokenizer.Token

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.