Examples of org.apache.stanbol.enhancer.nlp.model.AnalysedText

Package org.apache.stanbol.enhancer.nlp.model

Examples of org.apache.stanbol.enhancer.nlp.model.AnalysedText

org.apache.stanbol.enhancer.nlp.model.AnalysedText
Provides access to NLP processing results of the text/plain {@link Blob} of a ContentItem. Intended to be {@link ContentItem#addPart(org.apache.clerezza.rdf.core.UriRef,Object) added as ContentPart} by using {@link #ANALYSED_TEXT_URI}. @see ContentItem#addPart(UriRef,Object)

     *          expected
     */
    @Override
    public void computeEnhancements(final ContentItem ci) throws EngineException {
        //get/create the AnalysedText
        final AnalysedText at = NlpEngineHelper.initAnalysedText(this, analysedTextFactory, ci);
        final Blob blob = at.getBlob();
        //send the text to the server
        final String language = getLanguage(this, ci, true);
        final HttpPost request = new HttpPost(analysisServiceUrl);
        request.addHeader(HttpHeaders.CONTENT_LANGUAGE, language);
        request.setEntity(new InputStreamEntity(
            blob.getStream(), blob.getContentLength(),
            ContentType.create(blob.getMimeType(), 
                blob.getParameter().get("charset"))));
        //execute the request
        try {
            AccessController.doPrivileged(new PrivilegedExceptionAction<AnalysedText>() {
                public AnalysedText run() throws ClientProtocolException, IOException {
                    return httpClient.execute(request, new AnalysisResponseHandler(at));
                }
            });
        } catch (PrivilegedActionException pae) {
            Exception e = pae.getException();
            if(e instanceof ClientProtocolException) {
                throw new EngineException(this, ci, "Exception while executing Request "
                    + "on RESTful NLP Analysis Service at "+analysisServiceUrl, e);
            } else if(e instanceof IOException) {
                throw new EngineException(this, ci, "Exception while executing Request "
                        + "on RESTful NLP Analysis Service at "+analysisServiceUrl, e);
            } else {
                throw RuntimeException.class.cast(e);
            }
        }
        Iterator<Span> spans = at.getEnclosed(EnumSet.of(SpanTypeEnum.Sentence,SpanTypeEnum.Chunk));
        Sentence context = null;
        MGraph metadata = ci.getMetadata();
        Language lang = new Language(language);
        LiteralFactory lf = LiteralFactory.getInstance();
        ci.getLock().writeLock().lock();
        try { //write TextAnnotations for Named Entities
            while(spans.hasNext()){
                Span span = spans.next();
                switch (span.getType()) {
                    case Sentence:
                        context = (Sentence)context;
                        break;
                    default:
                        Value<NerTag> nerAnno = span.getAnnotation(NER_ANNOTATION);
                        if(nerAnno != null){
                            UriRef ta = EnhancementEngineHelper.createTextEnhancement(ci, this);
                            //add span related data
                            metadata.add(new TripleImpl(ta, ENHANCER_SELECTED_TEXT, 
                                new PlainLiteralImpl(span.getSpan(), lang)));
                            metadata.add(new TripleImpl(ta, ENHANCER_START, 
                                lf.createTypedLiteral(span.getStart())));
                            metadata.add(new TripleImpl(ta, ENHANCER_END, 
                                lf.createTypedLiteral(span.getEnd())));
                            metadata.add(new TripleImpl(ta, ENHANCER_SELECTION_CONTEXT, 
                                new PlainLiteralImpl(context == null ?
                                        getDefaultSelectionContext(at.getSpan(), span.getSpan(), span.getStart()) :
                                            context.getSpan(), lang)));
                            //add the NER type
                            if(nerAnno.value().getType() != null){
                                metadata.add(new TripleImpl(ta,DC_TYPE,nerAnno.value().getType()));
                            }

View Full Code Here

     *          if the underlying process failed to work as
     *          expected
     */
    @Override
    public void computeEnhancements(ContentItem ci) throws EngineException {
        AnalysedText analysedText = getAnalysedText(this,ci, true);
        String language = getLanguage(this, ci, true);
        SentimentClassifier classifier = classifiers.get(language);
        if(classifier == null){
            throw new IllegalStateException("Sentiment Classifier for language '"
                + language +"' not available. As this is also checked in "
                + " canEnhance this may indicate an Bug in the used "
                + "EnhancementJobManager!");
        }
        //TODO: locking for AnalysedText not yet defined
//        ci.getLock().writeLock().lock();
//        try {
        Iterator<Token> tokens = analysedText.getTokens();
        while(tokens.hasNext()){
            Token token = tokens.next();
            boolean process = !adjectivesOnly;
            if(!process){ //check POS types
                Iterator<Value<PosTag>> posTags = token.getAnnotations(NlpAnnotations.POS_ANNOTATION).iterator();

View Full Code Here

     *          if the underlying process failed to work as
     *          expected
     */
    @Override
    public void computeEnhancements(ContentItem ci) throws EngineException {
        AnalysedText at = getAnalysedText(this, ci, true);
        String language = getLanguage(this, ci, true);
        isLangaugeConfigured(this, languageConfiguration, language, true);
        ChunkerME chunker = initChunker(language);
        if(chunker == null){
            return;
        }
        //init the Phrase TagSet
        TagSet<PhraseTag> tagSet = tagSetRegistry.getTagSet(language);
        if(tagSet == null){ 
        }
        if(tagSet == null){
            log.warn("No Phrase TagSet registered for Language '{}'. Will build an "
                + "adhoc set based on encountered Tags!",language);
            //for now only created to avoid checks for tagSet == null
            //TODO: in future we might want to automatically create posModels based
            //on tagged texts. However this makes no sense as long we can not
            //persist TagSets.
            tagSet = new TagSet<PhraseTag>("dummy", language);
        }
        //holds PosTags created for POS tags that where not part of the posModel
        //(will hold all PosTags in case tagSet is NULL
        Map<String,PhraseTag> adhocTags = languageAdhocTags.get(language);
        if(adhocTags == null){
            adhocTags = new HashMap<String,PhraseTag>();
            languageAdhocTags.put(language, adhocTags);
        }        
        ci.getLock().writeLock().lock();
        try {
            Iterator<? extends Section> sentences = at.getSentences();
            if(!sentences.hasNext()){ //no sentences ... iterate over the whole text
                sentences = Collections.singleton(at).iterator();
            }
            List<String> tokenTextList = new ArrayList<String>(64);
            List<String> posList = new ArrayList<String>(64);
            List<Token> tokenList = new ArrayList<Token>(64);
            //process each sentence seperatly
            while(sentences.hasNext()){
                // (1) get Tokens and POS information for the sentence
                Section sentence = sentences.next();
                Iterator<Token> tokens = sentence.getTokens();
                while(tokens.hasNext()){
                    Token token = tokens.next();
                    tokenList.add(token);
                    tokenTextList.add(token.getSpan());
                    Value<PosTag> posValue = token.getAnnotation(POS_ANNOTATION);
                    if(posValue == null){
                        throw new EngineException("Missing POS value for Token '"
                            + token.getSpan()+" of ContentItem "+ci.getUri()
                            + "(Sentence: '"+sentence.getSpan()+"'). This may "
                            + "indicate that a POS tagging Engine is missing in "
                            + "the EnhancementChain or that the used POS tagging "
                            + "does not provide POS tags for each token!");
                    } else {
                        posList.add(posValue.value().getTag());
                    }
                }
                String[] tokenStrings = tokenTextList.toArray(new String[tokenTextList.size()]);
                String[] tokenPos = posList.toArray(new String[tokenTextList.size()]);
                if(log.isTraceEnabled()){
                    log.trace("Tokens: {}"+Arrays.toString(tokenStrings));
                }
                tokenTextList.clear(); //free memory
                posList.clear(); //free memory
                
                // (2) Chunk the sentence
                
                String[] chunkTags = chunker.chunk(tokenStrings, tokenPos);
                double[] chunkProb = chunker.probs();
                if(log.isTraceEnabled()){
                    log.trace("Chunks: {}"+Arrays.toString(chunkTags));
                }
                tokenStrings = null; //free memory
                tokenPos = null; //free memory
                
                // (3) Process the results and write the Annotations
                double chunkProps = 0;
                int chunkTokenCount = 0;
                PhraseTag tag = null;
                int i;
                /*
                 * This assumes:
                 *  - 'B-{tag}' ... for start of a new chunk
                 *  - '???' ... anything other for continuing the current chunk
                 *  - 'O' ... no chunk (ends current chunk)
                 */
                for(i=0;i<tokenList.size();i++){
                    boolean start = chunkTags[i].charAt(0) == 'B';
                    boolean end = tag != null && (start || chunkTags[i].charAt(0) == 'O');
                    if(end){ //add the current phrase
                        //add at AnalysedText level, because offsets are absolute
                        //NOTE we are already at the next token when we detect the end
                        Chunk chunk = at.addChunk( 
                            tokenList.get(i-chunkTokenCount).getStart(), 
                            tokenList.get(i-1).getEnd());
                        chunk.addAnnotation(PHRASE_ANNOTATION, 
                            new Value<PhraseTag>(tag,
                                    chunkProps/(double)chunkTokenCount));
                        //reset the state
                        tag = null;
                        chunkTokenCount = 0;
                        chunkProps = 0;
                    }
                    if(start){ //create the new tag
                        tag = getPhraseTag(tagSet,adhocTags,
                            chunkTags[i].substring(2), language); //skip 'B-'
                        
                    }
                    if(tag != null){ //count this token for the current chunk
                        chunkProps = chunkProps + chunkProb[i];
                        chunkTokenCount++;
                    }
                }
                if(tag != null){
                    Chunk chunk = at.addChunk( 
                        tokenList.get(i-chunkTokenCount).getStart(), 
                        tokenList.get(i-1).getEnd());
                    chunk.addAnnotation(PHRASE_ANNOTATION, 
                        new Value<PhraseTag>(tag,
                                chunkProps/(double)chunkTokenCount));

View Full Code Here

            throw new IllegalStateException("For the language '"+language+"' of ContentItem "+ci.getUri() 
                + " no NER model is configured: This is also checked in the canEnhance "
                + "method! -> This indicated an Bug in the implementation of the "
                + "EnhancementJobManager!");
        }
        final AnalysedText at = AnalysedTextUtils.getAnalysedText(ci);
        //validate data in the AnalysedText
        final String text;
        if(at != null && at.getTokens().hasNext()){ //if the AnalysedText is present and tokens are present
            if(log.isDebugEnabled()){
                log.debug("computeEnhancements from AnalysedText ContentPart of ContentItem {}: text={}",
                    ci.getUri().getUnicodeString(), StringUtils.abbreviate(at.getSpan(), 100));
            }
            text = null;
        } else { //no AnalysedText with tokens ...
            //fallback to processing the plain text is still supported
            Entry<UriRef,Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES);

View Full Code Here

               ENHANCE_ASYNC : CANNOT_ENHANCE; 
    }


    @Override
    public void computeEnhancements(ContentItem ci) throws EngineException {
        AnalysedText at = NlpEngineHelper.getAnalysedText(this, ci, true);
        //configure the spanTypes based on the configuration
        EnumSet<Span.SpanTypeEnum> spanTypes = EnumSet.noneOf(SpanTypeEnum.class);
        if(writeNounPhraseSentiments){
            spanTypes.add(SpanTypeEnum.Chunk);
        }

View Full Code Here

     *          if the underlying process failed to work as
     *          expected
     */
    @Override
    public void computeEnhancements(ContentItem ci) throws EngineException {
        AnalysedText at = initAnalysedText(this,analysedTextFactory,ci);
        String language = getLanguage(this, ci, true);
        SentenceDetector sentenceDetector = getSentenceDetector(language);
        if(sentenceDetector != null){
            for(opennlp.tools.util.Span sentSpan : sentenceDetector.sentPosDetect(at.getSpan())) {
                //detect sentences and add it to the AnalyzedText.
                Sentence sentence = at.addSentence(sentSpan.getStart(), sentSpan.getEnd());
                log.trace(" > add {}",sentence);
            }
        } else {
            log.warn("SentenceDetector model for language {} is no longer available. "
                + "This might happen if the model becomes unavailable during enhancement. "

View Full Code Here

    @Test
    public void testEngine() throws EngineException {
        Assert.assertEquals(EnhancementEngine.ENHANCE_ASYNC, engine.canEnhance(contentItem));
        engine.computeEnhancements(contentItem);
        //assert the results
        AnalysedText at = AnalysedTextUtils.getAnalysedText(contentItem);
        Assert.assertNotNull(at);
        Assert.assertTrue(at.getTokens().hasNext()); //assert that tokens are present
    }

View Full Code Here

     *          if the underlying process failed to work as
     *          expected
     */
    @Override
    public void computeEnhancements(ContentItem ci) throws EngineException {
        AnalysedText at = initAnalysedText(this,analysedTextFactory,ci);
        String language = getLanguage(this, ci, true);
        
        Tokenizer tokenizer = getTokenizer(language);
        if(tokenizer == null){
            log.warn("Tokenizer for language {} is no longer available. "
                    + "This might happen if the model becomes unavailable during enhancement. "
                    + "If this happens more often it might also indicate an bug in the used "
                    + "EnhancementJobManager implementation as the availability is also checked "
                    + "in the canEnhance(..) method of this Enhancement Engine.");
            return;
        }
        //Try to use sentences for tokenizing
        Iterator<? extends Section> sections = at.getSentences();
        if(!sections.hasNext()){
            //if no sentences are annotated
            sections = Collections.singleton(at).iterator();
        }

View Full Code Here

        return ENHANCE_ASYNC;
    }
    
    @Override
    public void computeEnhancements(ContentItem ci) throws EngineException {
        AnalysedText at = getAnalysedText(this, ci, true);
        String language = getLanguage(this, ci, true);
        isLangaugeConfigured(this, languageConfig, language, true);
        List<LexicalEntry> terms;
        try {
            terms = this.client.performMorfologicalAnalysis(at.getSpan(), language);
        } catch (IOException e) {
            throw new EngineException("Error while calling the CELI Lemmatizer" 
                    + " service (configured URL: " + serviceURL + ")!", e);
        } catch (SOAPException e) {
            throw new EngineException("Error wile encoding/decoding the request/" + 
                    "response to the CELI lemmatizer service!", e);
        }
        Map<LexicalCategory,Double> tokenLexCats = new EnumMap<LexicalCategory,Double>(LexicalCategory.class);
        for(LexicalEntry term : terms){
            if(term.getTermReadings().isEmpty()){
                //TODO: maybe still add them and use the Lemmatizer as Tokenizer
                continue; //ignore terms without readings
            }
            //Add the LexicalEntry as Token to the Text. NOTE that if a
            //Token with the same start/end positions already exist this
            //Method returns the existing instance
            Token token = at.addToken(term.getFrom(), term.getTo());
            //Now try to get POS annotations for the Token
            for(Value<PosTag> posAnno : token.getAnnotations(NlpAnnotations.POS_ANNOTATION)){
                if(posAnno.value().isMapped()){
                    for(LexicalCategory cat :posAnno.value().getCategories()){
                        if(!tokenLexCats.containsKey(cat)){ //do not override with lover prob

View Full Code Here

        return ENHANCE_ASYNC;
    }
    
    @Override
    public void computeEnhancements(ContentItem ci) throws EngineException {
        AnalysedText at = getAnalysedText(this, ci, true);
        String language = getLanguage(this, ci, true);
        isLangaugeConfigured(this, languageConfig, language, true);
        List<SentimentExpression> seList;
        try {
          seList = this.client.extractSentimentExpressions(at.getSpan(), language);
        } catch (IOException e) {
            throw new EngineException("Error while calling the CELI Sentiment Analysis service (configured URL: " + serviceURL + ")!", e);
        } catch (SOAPException e) {
            throw new EngineException("Error wile encoding/decoding the request/response to the CELI Sentiment Analysis service!", e);
        }
        
        for(SentimentExpression se : seList){
            //Add the Sentiment Expression as Token to the Text. NOTE that if a Token with the same start/end positions already exist this
            //Method returns the existing instance
            Token token = at.addToken(se.getStartSnippet(),se.getEndSnippet());
            token.addAnnotation(NlpAnnotations.SENTIMENT_ANNOTATION, new Value<Double>(se.getSentimentPolarityAsDoubleValue()) );
        }
    }

View Full Code Here

0 1 2 3 4

TOP

Related Classes of org.apache.stanbol.enhancer.nlp.model.AnalysedText

org.apache.stanbol.enhancer.engines.celi.lemmatizer.impl.CeliAnalyzedTextLemmatizerEngine

org.apache.stanbol.enhancer.engines.celi.lemmatizer.impl.CeliAnalyzedTextLemmatizerEngineTest

org.apache.stanbol.enhancer.engines.celi.sentimentanalysis.impl.CeliAnalyzedTextSentimentAnalysisEngine

org.apache.stanbol.enhancer.engines.celi.sentimentanalysis.impl.CeliAnalyzedTextSentimentAnalysisEngineTest

org.apache.stanbol.enhancer.engines.entitycomention.EntityCoMentionEngine

org.apache.stanbol.enhancer.engines.entitylinking.engine.EntityLinkingEngine

org.apache.stanbol.enhancer.engines.kuromoji.impl.KuromojiNlpEngine

org.apache.stanbol.enhancer.engines.kuromoji.impl.TestKuromojiNlpEngine

org.apache.stanbol.enhancer.engines.lucenefstlinking.FstLinkingEngine

org.apache.stanbol.enhancer.engines.lucenefstlinking.FstLinkingEngineTest

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.