Package org.apache.ctakes.core.ae

Source Code of org.apache.ctakes.core.ae.TokenConverter

/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*   http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied.  See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.ctakes.core.ae;

import org.apache.uima.jcas.JCas;

import org.apache.ctakes.core.nlp.tokenizer.Token;
import org.apache.ctakes.typesystem.type.syntax.BaseToken;
import org.apache.ctakes.typesystem.type.syntax.ContractionToken;
import org.apache.ctakes.typesystem.type.syntax.NewlineToken;
import org.apache.ctakes.typesystem.type.syntax.NumToken;
import org.apache.ctakes.typesystem.type.syntax.PunctuationToken;
import org.apache.ctakes.typesystem.type.syntax.SymbolToken;
import org.apache.ctakes.typesystem.type.syntax.WordToken;

/**
* Utilities methods for converting between Java Tokenizer objects and their
* equivalent JCas objects.
*
* @author Mayo Clinic
*/
public class TokenConverter
{
    /**
     * Converts from Java Tokenizer object into a JCas object.
     */
    public static BaseToken convert(Token t, JCas jcas, int beginPos)
    {
        BaseToken bta = null;

        int begin = beginPos + t.getStartOffset();
        int end = beginPos + t.getEndOffset();

        switch (t.getType())
        {
        case Token.TYPE_WORD:
            WordToken wta = new WordToken(jcas);
            wta.setBegin(begin);
            wta.setEnd(end);
            int cap = -1;
            switch (t.getCaps())
            {
            case Token.CAPS_ALL:
                cap = TokenizerAnnotator.TOKEN_CAP_ALL;
                break;
            case Token.CAPS_FIRST_ONLY:
                cap = TokenizerAnnotator.TOKEN_CAP_FIRST_ONLY;
                break;
            case Token.CAPS_MIXED:
                cap = TokenizerAnnotator.TOKEN_CAP_MIXED;
                break;
            case Token.CAPS_NONE:
                cap = TokenizerAnnotator.TOKEN_CAP_NONE;
                break;
            }

            int numPos = -1;
            switch (t.getNumPosition())
            {
            case Token.NUM_FIRST:
                numPos = TokenizerAnnotator.TOKEN_NUM_POS_FIRST;
                break;
            case Token.NUM_MIDDLE:
                numPos = TokenizerAnnotator.TOKEN_NUM_POS_MIDDLE;
                break;
            case Token.NUM_LAST:
                numPos = TokenizerAnnotator.TOKEN_NUM_POS_LAST;
                break;
            case Token.NUM_NONE:
                numPos = TokenizerAnnotator.TOKEN_NUM_POS_NONE;
                break;
            }
            wta.setCapitalization(cap);
            wta.setNumPosition(numPos);
            bta = wta;
            break;
        case Token.TYPE_NUMBER:
            NumToken nta = new NumToken(jcas);
            nta.setBegin(begin);
            nta.setEnd(end);
            if (t.isInteger())
            {
                nta.setNumType(TokenizerAnnotator.TOKEN_NUM_TYPE_INTEGER);
            }
            else
            {
                nta.setNumType(TokenizerAnnotator.TOKEN_NUM_TYPE_DECIMAL);
            }
            bta = nta;
            break;
        case Token.TYPE_PUNCT:
            PunctuationToken pta = new PunctuationToken(jcas);
            pta.setBegin(begin);
            pta.setEnd(end);
            bta = pta;
            break;
        case Token.TYPE_EOL:
            NewlineToken nlta = new NewlineToken(jcas);
            nlta.setBegin(begin);
            nlta.setEnd(end);
            bta = nlta;
            break;
        case Token.TYPE_CONTRACTION:
            ContractionToken cta = new ContractionToken(
                    jcas);
            cta.setBegin(begin);
            cta.setEnd(end);
            bta = cta;
            break;
        case Token.TYPE_SYMBOL:
            SymbolToken sta = new SymbolToken(jcas);
            sta.setBegin(begin);
            sta.setEnd(end);
            bta = sta;
            break;
        default:
        }

        return bta;
    }

    /**
     * Convert from a JCas object into Java Tokenizer object.
     */
    public static Token convert(BaseToken bta)
    {
        Token token = new Token(bta.getBegin(), bta.getEnd());
        token.setText(bta.getCoveredText());

        if (bta instanceof WordToken)
        {
            WordToken wta = (WordToken) bta;
            token.setType(Token.TYPE_WORD);

            switch (wta.getCapitalization())
            {
            case TokenizerAnnotator.TOKEN_CAP_ALL:
                token.setCaps(Token.CAPS_ALL);
                break;
            case TokenizerAnnotator.TOKEN_CAP_FIRST_ONLY:
                token.setCaps(Token.CAPS_FIRST_ONLY);
                break;
            case TokenizerAnnotator.TOKEN_CAP_MIXED:
                token.setCaps(Token.CAPS_MIXED);
                break;
            case TokenizerAnnotator.TOKEN_CAP_NONE:
                token.setCaps(Token.CAPS_NONE);
                break;
            }

            switch (wta.getNumPosition())
            {
            case TokenizerAnnotator.TOKEN_NUM_POS_FIRST:
                token.setNumPosition(Token.NUM_FIRST);
                break;
            case TokenizerAnnotator.TOKEN_NUM_POS_MIDDLE:
                token.setNumPosition(Token.NUM_MIDDLE);
                break;
            case TokenizerAnnotator.TOKEN_NUM_POS_LAST:
                token.setNumPosition(Token.NUM_LAST);
                break;
            case TokenizerAnnotator.TOKEN_NUM_POS_NONE:
                token.setNumPosition(Token.NUM_NONE);
                break;
            }
        }
        else if (bta instanceof NumToken)
        {
            NumToken nta = (NumToken) bta;
            token.setType(Token.TYPE_NUMBER);

            if (nta.getNumType() == TokenizerAnnotator.TOKEN_NUM_TYPE_INTEGER)
            {
                token.setIsInteger(true);
            }
            else
            {
                token.setIsInteger(false);
            }
        }
        else if (bta instanceof PunctuationToken)
        {
            token.setType(Token.TYPE_PUNCT);
        }
        else if (bta instanceof NewlineToken)
        {
            token.setType(Token.TYPE_EOL);
        }
        else if (bta instanceof ContractionToken)
        {
            token.setType(Token.TYPE_CONTRACTION);
        }
        else if (bta instanceof SymbolToken)
        {
            token.setType(Token.TYPE_SYMBOL);
        }

        return token;
    }   
}
TOP

Related Classes of org.apache.ctakes.core.ae.TokenConverter

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.