Package org.apache.stanbol.enhancer.nlp.model

Examples of org.apache.stanbol.enhancer.nlp.model.Token
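The excerpts below show how Token instances are created from character offsets, how annotations (POS, NER, morphology, sentiment) are attached to them, and how they are read back. As an orientation, here is a minimal sketch of the core pattern. It is only a sketch: it assumes an AnalysedText instance 'at' for the analysed content already exists, and the offsets, tag and probability values are illustrative.

        //minimal sketch (assumption: an AnalysedText 'at' exists for a text starting with "Paris")
        Token paris = at.addToken(0, 5);                           //create a Token from character offsets
        paris.addAnnotation(NlpAnnotations.POS_ANNOTATION,         //attach a POS annotation ...
            Value.value(new PosTag("PN", Pos.ProperNoun), 0.95));  //... with an (illustrative) probability

        //read the information back
        Iterator<Token> tokens = at.getTokens();
        while(tokens.hasNext()){
            Token token = tokens.next();
            Value<PosTag> pos = token.getAnnotation(NlpAnnotations.POS_ANNOTATION);
            if(pos != null){
                log.debug("Token '{}' has POS {} (p={})", token.getSpan(), pos.value(), pos.probability());
            }
        }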


        while(sections.hasNext()){
            Section section = sections.next();
            //Tokenize section
            opennlp.tools.util.Span[] tokenSpans = tokenizer.tokenizePos(section.getSpan());
            for(int i=0;i<tokenSpans.length;i++){
                Token token = section.addToken(tokenSpans[i].getStart(), tokenSpans[i].getEnd());
                log.trace(" > add {}",token);
            }
        }
    }
View Full Code Here
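The excerpt above iterates over the Sections of an AnalysedText, tokenizes the text of each section with an OpenNLP tokenizer and registers every returned span as a Token via section.addToken(start, end).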


            for(PhraseBuilder pb : phraseBuilders){
                pb.nextSection(sentence);
            }
            Iterator<Token> tokens = sentence.getTokens();
            while(tokens.hasNext()){
                Token token = tokens.next();
                for(PhraseBuilder pb : phraseBuilders){
                    pb.nextToken(token);
                }
            }
        }
View Full Code Here
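Here the tokens of each sentence are streamed into a set of PhraseBuilder instances: nextSection(..) prepares the builders for the sentence and nextToken(..) feeds them one Token at a time.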

        return states[0];
    }
   
    @SuppressWarnings("unchecked") //varargs with generic types
    private void buildPhrase(Token token) {
        Token lastConsumedToken = null;
        if(valid){
            //search backwards for the first token matching an allowed end
            //category
            int endIndex = current.size()-1;
            while(endIndex > 0 && !checkCategories(current.get(endIndex),
View Full Code Here
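This fragment belongs to the phrase building itself: if the collected tokens form a valid phrase candidate, it searches backwards for the last token whose category is allowed as the end of the phrase.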

        OffsetAttribute offset = null;
        try {
          tokenStream.reset(); //required with Solr 4
            while (tokenStream.incrementToken()){
                offset = tokenStream.addAttribute(OffsetAttribute.class);
                Token token = at.addToken(offset.startOffset(), offset.endOffset());
                //Get the POS attribute and init the PosTag
                PartOfSpeechAttribute posAttr = tokenStream.addAttribute(PartOfSpeechAttribute.class);
                PosTag posTag = POS_TAG_SET.getTag(posAttr.getPartOfSpeech());
                if(posTag == null){
                    posTag = adhocTags.get(posAttr.getPartOfSpeech());
                    if(posTag == null){
                        posTag = new PosTag(posAttr.getPartOfSpeech());
                        adhocTags.put(posAttr.getPartOfSpeech(), posTag);
                        log.warn(" ... missing PosTag mapping for {}",posAttr.getPartOfSpeech());
                    }
                }
                //Sentence detection by POS tag
                if(sentStartOffset < 0){ //no sentence open: the previous token ended one, so this token starts a new sentence
                  sentStartOffset = offset.startOffset();
                }
                if(posTag.hasPos(Pos.Point)) {
                    Sentence sent = at.addSentence(sentStartOffset, offset.startOffset());
                    //add the sentence as context to the NerData instances
                    while(nerSentIndex < nerList.size()){
                        nerList.get(nerSentIndex).context = sent.getSpan();
                        nerSentIndex++;
                    }
                    sentStartOffset = -1;
                }
                //POS
                token.addAnnotation(POS_ANNOTATION, Value.value(posTag));
                //NER
                NerTag nerTag = NER_TAG_SET.getTag(posAttr.getPartOfSpeech());
                if(ner != null && (nerTag == null || !ner.tag.getType().equals(nerTag.getType()))){
                    //write NER annotation
                    Chunk chunk = at.addChunk(ner.start, ner.end);
                    chunk.addAnnotation(NlpAnnotations.NER_ANNOTATION, Value.value(ner.tag));
                    //NOTE that the fise:TextAnnotations are written later, based on the nerList
                    //clean up
                    ner = null;
                }
                if(nerTag != null){
                    if(ner == null){
                        ner = new NerData(nerTag, offset.startOffset());
                        nerList.add(ner);
                    }
                    ner.end = offset.endOffset();
                }
                BaseFormAttribute baseFormAttr = tokenStream.addAttribute(BaseFormAttribute.class);
                MorphoFeatures morpho = null;
                if(baseFormAttr != null && baseFormAttr.getBaseForm() != null){
                  morpho = new MorphoFeatures(baseFormAttr.getBaseForm());
                  morpho.addPos(posTag); //and add the posTag
                }
                InflectionAttribute inflectionAttr = tokenStream.addAttribute(InflectionAttribute.class);
                //NOTE: the inflection form and type are read but not (yet) used in this excerpt
                inflectionAttr.getInflectionForm();
                inflectionAttr.getInflectionType();
                if(morpho != null){ //if present add the morpho
                  token.addAnnotation(MORPHO_ANNOTATION, Value.value(morpho));
                }
            }
            //we still need to write the last sentence
            Sentence lastSent = null;
            if(offset != null && sentStartOffset >= 0 && offset.endOffset() > sentStartOffset){
View Full Code Here
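The longer excerpt above maps a Lucene TokenStream to the Stanbol NLP model: every Lucene token becomes a Token via at.addToken(offset.startOffset(), offset.endOffset()); POS tags are looked up in a tag set (with ad-hoc PosTag instances for unmapped tags), sentence boundaries are derived from tokens whose POS tag has Pos.Point, named entities are collected into Chunks with NER_ANNOTATIONs, and the base form plus POS tag are combined into a MorphoFeatures annotation.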

            while(sentences.hasNext()){
                // (1) get Tokens and POS information for the sentence
                Section sentence = sentences.next();
                Iterator<Token> tokens = sentence.getTokens();
                while(tokens.hasNext()){
                    Token token = tokens.next();
                    tokenList.add(token);
                    tokenTextList.add(token.getSpan());
                    Value<PosTag> posValue = token.getAnnotation(POS_ANNOTATION);
                    if(posValue == null){
                        throw new EngineException("Missing POS value for Token '"
                            + token.getSpan()+"' of ContentItem "+ci.getUri()
                            + " (Sentence: '"+sentence.getSpan()+"'). This may "
                            + "indicate that a POS tagging Engine is missing in "
                            + "the EnhancementChain or that the POS tagger used "
                            + "does not provide POS tags for every token!");
                    } else {
View Full Code Here
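In this excerpt existing POS annotations are consumed rather than written: the tokens of a sentence are collected together with their text, and a missing POS_ANNOTATION is treated as an error because a POS tagging engine earlier in the enhancement chain is a prerequisite for this engine.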

        int sentence = text.indexOf('.')+1;
        Sentence sent1 = analysedTextWithData.addSentence(0, sentence);
        expectedSentences.put(sent1, "The Stanbol enhancer can detect famous " +
            "cities such as Paris and people such as Bob Marley.");
       
        Token the = sent1.addToken(0, 3);
        expectedTokens.put(the, "The");
        the.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(
            new PosTag("PREP",Pos.Preposition), 0.85));
       
        Token stanbol = sent1.addToken(4,11);
        expectedTokens.put(stanbol, "Stanbol");
        stanbol.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(
            new PosTag("PN", Pos.ProperNoun),0.95));
        stanbol.addAnnotation(NlpAnnotations.SENTIMENT_ANNOTATION, Value.value(
            0.5));
       
        //use index to create Tokens
        int enhancerStart = sent1.getSpan().toString().indexOf("enhancer");
        Token enhancer = sent1.addToken(enhancerStart,enhancerStart+"enhancer".length());
        expectedTokens.put(enhancer, "enhancer");
        enhancer.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(
            new PosTag("PN", Pos.ProperNoun),0.95));
        enhancer.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(
            new PosTag("N", LexicalCategory.Noun),0.87));
        MorphoFeatures morpho = new MorphoFeatures("enhance");
        morpho.addCase(new CaseTag("test-case-1",Case.Comitative));
        morpho.addCase(new CaseTag("test-case-2",Case.Abessive));
        morpho.addDefinitness(Definitness.Definite);
        morpho.addPerson(Person.First);
        morpho.addPos(new PosTag("PN", Pos.ProperNoun));
        morpho.addGender(new GenderTag("test-gender", Gender.Masculine));
        morpho.addNumber(new NumberTag("test-number", NumberFeature.Plural));
        morpho.addTense(new TenseTag("test-tense", Tense.Present));
        morpho.addVerbForm(new VerbMoodTag("test-verb-mood", VerbMood.ConditionalVerb));
        enhancer.addAnnotation(NlpAnnotations.MORPHO_ANNOTATION, Value.value(morpho));

        //create a chunk
        Chunk stanbolEnhancer = analysedTextWithData.addChunk(stanbol.getStart(), enhancer.getEnd());
        expectedChunks.put(stanbolEnhancer, "Stanbol enhancer");
        stanbolEnhancer.addAnnotation(NlpAnnotations.NER_ANNOTATION, Value.value(
            new NerTag("organization", DBPEDIA_ORGANISATION)));
        stanbolEnhancer.addAnnotation(NlpAnnotations.PHRASE_ANNOTATION, Value.value(
            new PhraseTag("NP", LexicalCategory.Noun),0.98));
View Full Code Here
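The test fixture above demonstrates several ways to build up the model: tokens created from explicit character offsets, offsets computed with indexOf(..) on the sentence text, multiple alternative POS annotations on the same Token, a detailed MorphoFeatures annotation, and a Chunk spanning two tokens that carries NER and phrase annotations.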

        ci.getMetadata().add(new TripleImpl(ci.getUri(), DC_LANGUAGE, new PlainLiteralImpl("de")));
        Assert.assertEquals("de", EnhancementEngineHelper.getLanguage(ci));
       
        //Add some Tokens with POS annotations to test the usage of
        //existing POS annotations by the lemmatizer
        Token verbrachten = at.addToken(de_verbStart,de_verbStart+de_verb.length());
        verbrachten.addAnnotation(POS_ANNOTATION, Value.value(
            new PosTag("V",LexicalCategory.Verb), de_verbProb));
       
        Token schonen = at.addToken(de_adjectiveStart,de_adjectiveStart+de_adjective.length());
        schonen.addAnnotation(POS_ANNOTATION, Value.value(
            new PosTag("ADJ",LexicalCategory.Adjective), de_adjectiveProb));
       
        Token urlaub = at.addToken(de_nounStart,de_nounStart+de_noun.length());
        urlaub.addAnnotation(POS_ANNOTATION, Value.value(
            new PosTag("NC",LexicalCategory.Noun), de_nounProb));
       
        Assert.assertEquals("Can not enhance Test ContentItem",
            EnhancementEngine.ENHANCE_ASYNC,engine.canEnhance(ci));
        //compute the enhancements
        try {
            engine.computeEnhancements(ci);
        } catch (EngineException e) {
            RemoteServiceHelper.checkServiceUnavailable(e);
            return; //deactivate test
        }
        //now validate the enhancements
        boolean foundVerb = false;
        boolean foundAdjective = false;
        boolean foundNoun = false;
        for(Iterator<Token> tokens = at.getTokens(); tokens.hasNext();){
            Token token = tokens.next();
            log.info("Token: {}",token);
            List<Value<MorphoFeatures>> mfs = token.getAnnotations(NlpAnnotations.MORPHO_ANNOTATION);
            if(de_verb.equals(token.getSpan())){
                foundVerb = !mfs.isEmpty();
                validateMorphFeatureProbability(mfs,LexicalCategory.Verb,de_verbProb);
            } else if(de_adjective.equals(token.getSpan())){
                foundAdjective = !mfs.isEmpty();
                validateMorphFeatureProbability(mfs,LexicalCategory.Adjective,de_adjectiveProb);
            } else if(de_noun.equals(token.getSpan())){
                foundNoun = !mfs.isEmpty();
                validateMorphFeatureProbability(mfs,LexicalCategory.Noun,de_nounProb);
            }
            for(Value<MorphoFeatures> mf : mfs){
                log.info("  - {}",mf);
View Full Code Here
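This test prepares German Tokens with POS annotations, runs the lemmatizer engine and then iterates at.getTokens() to verify that MORPHO_ANNOTATION values were added for the annotated verb, adjective and noun.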

        //TODO: locking for AnalysedText not yet defined
//        ci.getLock().writeLock().lock();
//        try {
        Iterator<Token> tokens = analysedText.getTokens();
        while(tokens.hasNext()){
            Token token = tokens.next();
            boolean process = !adjectivesOnly;
            if(!process){ //check POS types
                Iterator<Value<PosTag>> posTags = token.getAnnotations(NlpAnnotations.POS_ANNOTATION).iterator();
                boolean ignore = false;
                while(!ignore && !process && posTags.hasNext()) {
                    Value<PosTag> value = posTags.next();
                    PosTag tag = value.value();
                    boolean state = classifier.isAdjective(tag) || classifier.isNoun(tag);
                    ignore = !state && value.probability() >= minPOSConfidence;
                    process = state && value.probability() >= (minPOSConfidence/2.0);
                }
            } //else process all tokens ... no POS tag checking needed
            if(process){
                double sentiment = classifier.classifyWord(token.getSpan());
                if(sentiment != 0.0){
                    token.addAnnotation(SENTIMENT_ANNOTATION, new Value<Double>(sentiment));
                } //else do not set sentiments with 0.0
            }
        }
//        } finally {
//            ci.getLock().writeLock().unlock();
View Full Code Here
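The sentiment engine above writes a SENTIMENT_ANNOTATION (a Double) for every processed token with a non-zero sentiment; depending on the adjectivesOnly setting, only tokens whose POS tag is an adjective or noun with sufficient confidence are classified. A minimal sketch of reading such values back (assuming the same analysedText variable as in the excerpt):

        Iterator<Token> it = analysedText.getTokens();
        while(it.hasNext()){
            Token token = it.next();
            Value<Double> sentiment = token.getAnnotation(NlpAnnotations.SENTIMENT_ANNOTATION);
            if(sentiment != null){ //only tokens with a non-zero sentiment carry the annotation
                log.debug("Token '{}' has sentiment {}", token.getSpan(), sentiment.value());
            }
        }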

            // get the tokens, words of the current sentence
            List<Token> tokens = new ArrayList<Token>(32);
            List<String> words = new ArrayList<String>(32);
            for(Iterator<Token> it = sentences.get(i).getTokens(); it.hasNext();){
                Token t = it.next();
                tokens.add(t);
                words.add(t.getSpan());
            }
            Span[] nameSpans = finder.find(words.toArray(new String[words.size()]));
            double[] probs = finder.probs();
            //int lastStartPosition = 0;
            for (int j = 0; j < nameSpans.length; j++) {
View Full Code Here
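Here the Token spans (texts) of a sentence are collected into a word list for an OpenNLP name finder; finder.find(..) returns Spans defined over token indices and finder.probs() the corresponding probabilities, while the parallel tokens list keeps the Token instances themselves.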

