Examples of org.apache.stanbol.enhancer.engines.entitylinking.impl.ProcessingState.TokenData
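
The first excerpt shows the process() loop, which steps over the TokenData instances held by a ProcessingState and collects the search strings for the entity lookup from matchable tokens around the current one.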


    /**
     * Steps over the sentences, chunks, tokens of the {@link #sentences}
     */
    public void process() throws EntitySearcherException {
        //int debugedIndex = 0;
        while(state.next()) {
            TokenData token = state.getToken();
            if(log.isDebugEnabled()){
                log.debug("--- preocess Token {}: {} (lemma: {} | pos:{}) chunk: {}",
                    new Object[]{token.index,token.token.getSpan(),
                                 token.morpho != null ? token.morpho.getLemma() : "none",
                                 token.token.getAnnotations(POS_ANNOTATION),
                                 token.inChunk != null ?
                                         (token.inChunk.chunk + " "+ token.inChunk.chunk.getSpan()) :
                                             "none"});
            }
            List<String> searchStrings = new ArrayList<String>(linkerConfig.getMaxSearchTokens());
            searchStrings.add(token.getTokenText());
            //Determine the range we are allowed to search for tokens
            final int minIncludeIndex;
            final int maxIndcludeIndex;
            //NOTE: testing has shown that using Chunks to restrict the search for
            //      additional matchable tokens has a negative impact on recall.
            //      Because of that, this restriction is deactivated for now.
            boolean restrirctContextByChunks = false; //TODO: maybe make configurable
            if(token.inChunk != null && !textProcessingConfig.isIgnoreChunks() &&
                    restrirctContextByChunks){
                minIncludeIndex = Math.max(
                    state.getConsumedIndex()+1,
                    token.inChunk.startToken);
                maxIndcludeIndex = token.inChunk.endToken;
            } else {
                maxIndcludeIndex = state.getTokens().size() - 1;
                minIncludeIndex = state.getConsumedIndex() + 1;
            }
            int prevIndex,pastIndex; //search away from the currently active token
            int distance = 0;
            do {
                distance++;
                prevIndex = token.index-distance;
                pastIndex = token.index+distance;
                if(minIncludeIndex <= prevIndex){
                    TokenData prevToken = state.getTokens().get(prevIndex);
                    if(log.isDebugEnabled()){
                        log.debug("    {} {}:'{}' (lemma: {} | pos:{})",new Object[]{
                            prevToken.isMatchable? '+':'-',prevToken.index,
                            prevToken.token.getSpan(),
                            prevToken.morpho != null ? prevToken.morpho.getLemma() : "none",
                            prevToken.token.getAnnotations(POS_ANNOTATION)
                        });
                    }
                    if(prevToken.isMatchable){
                        searchStrings.add(0,prevToken.getTokenText());
                    }
                }
                if(maxIndcludeIndex >= pastIndex){
                    TokenData pastToken = state.getTokens().get(pastIndex);
                    if(log.isDebugEnabled()){
                        log.debug("    {} {}:'{}' (lemma: {} | pos:{})",new Object[]{
                            pastToken.isMatchable? '+':'-',pastToken.index,
                            pastToken.token.getSpan(),
                            pastToken.morpho != null ? pastToken.morpho.getLemma() : "none",
                            pastToken.token.getAnnotations(POS_ANNOTATION)
                        });
                    }
                    if(pastToken.isMatchable){
                        searchStrings.add(pastToken.getTokenText());
                    }
                }
            } while(searchStrings.size() < linkerConfig.getMaxSearchTokens() && distance <
                    linkerConfig.getMaxSearchDistance() &&
                    (prevIndex > minIncludeIndex || pastIndex < maxIndcludeIndex));
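
The expanding-window logic above is easier to follow in isolation. The sketch below is a minimal illustration of the same idea and not Stanbol API: SimpleToken stands in for TokenData, and the maxSearchTokens/maxSearchDistance parameters correspond to the EntityLinkerConfig getters used in the excerpt.

import java.util.ArrayList;
import java.util.List;

/**
 * Minimal sketch (not Stanbol API): collect search strings by moving
 * outwards from the current token one position at a time, adding
 * matchable tokens so that left-hand context ends up before and
 * right-hand context after the current token text.
 */
class SearchStringWindow {

    static class SimpleToken {            // stand-in for TokenData
        final String text;
        final boolean matchable;
        SimpleToken(String text, boolean matchable) {
            this.text = text;
            this.matchable = matchable;
        }
    }

    static List<String> collectSearchStrings(List<SimpleToken> tokens, int current,
            int minInclude, int maxInclude, int maxSearchTokens, int maxSearchDistance) {
        List<String> searchStrings = new ArrayList<>(maxSearchTokens);
        searchStrings.add(tokens.get(current).text);
        int distance = 0;
        int prevIndex;
        int pastIndex;
        do {
            distance++;
            prevIndex = current - distance;
            pastIndex = current + distance;
            if (prevIndex >= minInclude && tokens.get(prevIndex).matchable) {
                searchStrings.add(0, tokens.get(prevIndex).text); // keep text order
            }
            if (pastIndex <= maxInclude && tokens.get(pastIndex).matchable) {
                searchStrings.add(tokens.get(pastIndex).text);
            }
        } while (searchStrings.size() < maxSearchTokens
                && distance < maxSearchDistance
                && (prevIndex > minInclude || pastIndex < maxInclude));
        return searchStrings;
    }
}

Inserting left-hand tokens at position 0 keeps the collected search strings in text order, matching the behaviour of the excerpt.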


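The second excerpt matches the tokens of an entity label against the text: a forward pass walks the text tokens in order and compares them against the label tokens, and a backward pass afterwards looks for remaining label tokens that occur before the current token.
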
        int firstProcessableFoundIndex = -1;
        int lastFoundIndex = -1;
        int lastProcessableFoundIndex = -1;
        int firstFoundLabelIndex = -1;
        int lastfoundLabelIndex = -1;
        TokenData currentToken;
        String currentTokenText;
        int currentTokenLength;
        int notFound = 0;
        int matchedTokensNotWithinProcessableTokenSpan = 0;
        int foundTokensWithinCoveredProcessableTokens = 0;
        float minTokenMatchFactor = linkerConfig.getMinTokenMatchFactor();
        //search for matches in the correct order
        for(int currentIndex = state.getToken().index;
                currentIndex < state.getTokens().size()
                && search ;currentIndex++){
            currentToken = state.getTokens().get(currentIndex);
            if(currentToken.hasAlphaNumeric){
                currentTokenText = currentToken.getTokenText();
                if(!linkerConfig.isCaseSensitiveMatching()){
                    currentTokenText = currentTokenText.toLowerCase();
                }
                currentTokenLength = currentTokenText.length();
                boolean found = false;
                float matchFactor = 0f;
                //iteration starts at the next token after the last matched one
                //so it is OK to skip tokens in the label, but not within the text
                for(int i = lastfoundLabelIndex+1;!found && i < labelTokens.length;i ++){
                    String labelTokenText = labelTokens[i];
                    int labelTokenLength = labelTokenText.length();
                    float maxLength = currentTokenLength > labelTokenLength ? currentTokenLength : labelTokenLength;
                    float lengthDif = Math.abs(currentTokenLength - labelTokenLength);
                    if((lengthDif/maxLength)<=(1-minTokenMatchFactor)){ //this prevents unnecessary string comparison
                        int matchCount = compareTokens(currentTokenText, labelTokenText);
                        if(matchCount/maxLength >= minTokenMatchFactor){
                            lastfoundLabelIndex = i; //set the last found index to the current position
                            found = true; //set found to true -> stops iteration
                            matchFactor = matchCount/maxLength; //how good is the match
                            //remove the matched label token from the set so it cannot be
                            //matched again by the later random-order search
                            labelTokenSet.remove(labelTokenText);
                        }
                    }
                }
                if(!found){
                    //search for a match in the wrong order
                    //currently only exact matches (for testing)
                    if(found = labelTokenSet.remove(currentTokenText)){
                        matchFactor = 0.7f;
                    }
                }
                //int found = text.indexOf(currentToken.getText().toLowerCase());
                if(found){ //found
                    if(currentToken.isMatchable){
                        foundProcessableTokens++; //only count processable Tokens
                        if(firstProcessableFoundIndex < 0){
                            firstProcessableFoundIndex = currentIndex;
                        }
                        lastProcessableFoundIndex = currentIndex;
                        foundTokensWithinCoveredProcessableTokens++;
                        if(matchedTokensNotWithinProcessableTokenSpan > 0){
                            foundTokensWithinCoveredProcessableTokens = foundTokensWithinCoveredProcessableTokens +
                                    matchedTokensNotWithinProcessableTokenSpan;
                            matchedTokensNotWithinProcessableTokenSpan = 0;
                        }
                    } else {
                        matchedTokensNotWithinProcessableTokenSpan++;
                    }
                    foundTokens++;
                    foundTokenMatch = foundTokenMatch + matchFactor; //sum up the matches
                    if(firstFoundIndex < 0){
                        firstFoundIndex = currentIndex;
                        firstFoundLabelIndex = lastfoundLabelIndex;
                    }
                    lastFoundIndex = currentIndex;
                } else { //not found
                    notFound++;
                    if(currentToken.isMatchable || notFound > linkerConfig.getMaxNotFound()){
                        //stop as soon as a processable token is not found in the
                        //label, or the number of not-found tokens that are not
                        //processable exceeds the configured maximum
                        search = false;
                    }
                }
            } // else: tokens without alphanumeric characters are not processed
        }
        //search backwards for label tokens up to firstFoundLabelIndex if there
        //are unconsumed tokens in the sentence before the current token
        int currentIndex = state.getToken().index-1;
        int labelIndex = firstFoundLabelIndex-1;
        notFound = 0;
        matchedTokensNotWithinProcessableTokenSpan = 0;
        search = true;
        while(search && labelIndex >= 0 && currentIndex > state.getConsumedIndex()){
            String labelTokenText = labelTokens[labelIndex];
            if(labelTokenSet.contains(labelTokenText)){ //still not matched
                currentToken = state.getTokens().get(currentIndex);
                currentTokenText = currentToken.getTokenText();
                if(!linkerConfig.isCaseSensitiveMatching()){
                    currentTokenText = currentTokenText.toLowerCase();
                }
                currentTokenLength = currentTokenText.length();
                boolean found = false;
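
The per-token match test used in both passes (a length-difference pre-filter followed by comparing matchCount/maxLength against minTokenMatchFactor) can be condensed into a small sketch. compareCommonPrefix below is a hypothetical stand-in for the engine's compareTokens() method, whose implementation is not part of this excerpt; the surrounding checks mirror the ones shown above.

/**
 * Minimal sketch (not Stanbol API) of the token/label-token match test:
 * skip the character comparison if the length difference already rules
 * out a sufficient match, otherwise require matchCount/maxLength to
 * reach minTokenMatchFactor.
 */
class TokenMatch {

    /** Hypothetical stand-in for compareTokens(): length of the common prefix. */
    static int compareCommonPrefix(String a, String b) {
        int i = 0;
        int max = Math.min(a.length(), b.length());
        while (i < max && a.charAt(i) == b.charAt(i)) {
            i++;
        }
        return i;
    }

    /** Returns the match factor in [0..1], or -1 if the tokens do not match well enough. */
    static float matchFactor(String tokenText, String labelTokenText, float minTokenMatchFactor) {
        float maxLength = Math.max(tokenText.length(), labelTokenText.length());
        float lengthDif = Math.abs(tokenText.length() - labelTokenText.length());
        if (lengthDif / maxLength > 1 - minTokenMatchFactor) {
            return -1f; // pre-filter: even a perfect overlap could not reach the threshold
        }
        int matchCount = compareCommonPrefix(tokenText, labelTokenText);
        float factor = matchCount / maxLength;
        return factor >= minTokenMatchFactor ? factor : -1f;
    }
}

The pre-filter avoids the character comparison whenever the length difference alone already makes matchCount/maxLength fall below the threshold, which is exactly what the (lengthDif/maxLength) <= (1-minTokenMatchFactor) check in the excerpt achieves.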