Package: org.apache.stanbol.enhancer.nlp.model

Usage examples of the org.apache.stanbol.enhancer.nlp.model.AnalysedText class


     *          if the underlying process failed to work as
     *          expected
     */
    @Override
    public void computeEnhancements(ContentItem ci) throws EngineException {
        final AnalysedText at = initAnalysedText(this,analysedTextFactory,ci);

        String language = getLanguage(this,ci,false);
        if(!("zh".equals(language) || (language != null && language.startsWith("zh-")))) {
            throw new IllegalStateException("The detected language is NOT 'zh'! "
                + "As this is also checked within the #canEnhance(..) method this "
                + "indicates an Bug in the used EnhancementJobManager implementation. "
                + "Please report this on the dev@apache.stanbol.org or create an "
                + "JIRA issue about this.");
        }
        if(!at.getSentences().hasNext()) { //no sentences  ... use this engine to detect
            //first the sentences
            TokenStream sentences = new SentenceTokenizer(new CharSequenceReader(at.getText()));
            try {
                while(sentences.incrementToken()){
                    OffsetAttribute offset = sentences.addAttribute(OffsetAttribute.class);
                    Sentence s = at.addSentence(offset.startOffset(), offset.endOffset());
                    if(log.isTraceEnabled()) {
                        log.trace("detected {}:{}",s,s.getSpan());
                    }
                }
            } catch (IOException e) {
                String message = String.format("IOException while reading from "
                    +"CharSequenceReader of AnalyzedText for ContentItem %s",ci.getUri());
                log.error(message,e);
                throw new EngineException(this, ci, message, e);
            }
        }
        //now the tokens
        TokenStream tokens = new WordTokenFilter(new AnalyzedTextSentenceTokenizer(at));
        try {
            while(tokens.incrementToken()){
                OffsetAttribute offset = tokens.addAttribute(OffsetAttribute.class);
                Token t = at.addToken(offset.startOffset(), offset.endOffset());
                log.trace("detected {}",t);
            }
        } catch (IOException e) {
            String message = String.format("IOException while reading from "
                +"CharSequenceReader of AnalyzedText for ContentItem %s",ci.getUri());
View Full Code Here


   
    @Test
    public void testEngine() throws IOException, EngineException {
        ContentItem ci = ciFactory.createContentItem(new StringSource(text));
        Assert.assertNotNull(ci);
        AnalysedText at = atFactory.createAnalysedText(ci, ci.getBlob());
        Assert.assertNotNull(at);
        ci.getMetadata().add(new TripleImpl(ci.getUri(), DC_LANGUAGE, new PlainLiteralImpl("it")));
        Assert.assertEquals("it", EnhancementEngineHelper.getLanguage(ci));
       
        Assert.assertEquals("Can not enhance Test ContentItem", EnhancementEngine.ENHANCE_ASYNC,engine.canEnhance(ci));
        //compute the enhancements
        try {
            engine.computeEnhancements(ci);
        } catch (EngineException e) {
            RemoteServiceHelper.checkServiceUnavailable(e);
            return; //deactivate test
        }
        //now validate the enhancements
        int sentimentExpressionCnt=0;
        for(Iterator<Token> tokens = at.getTokens(); tokens.hasNext();){
            Token token = tokens.next();
            log.info("Token: {}",token);
            List<Value<Double>> sentimentExpressionsList = token.getAnnotations(NlpAnnotations.SENTIMENT_ANNOTATION);
            if(sentimentExpressionsList!=null && sentimentExpressionsList.size()>0)
              sentimentExpressionCnt++;
View Full Code Here

                new Object[]{ getName(), ci.getUri(), language});
            return CANNOT_ENHANCE;
        }
        //we need a detected language, the AnalyzedText contentPart with
        //Tokens.
        AnalysedText at = getAnalysedText(this, ci, false);
        return at != null && at.getTokens().hasNext() ?
                ENHANCE_ASYNC : CANNOT_ENHANCE;
    }
View Full Code Here

    public void computeEnhancements(ContentItem ci) throws EngineException {
        log.trace(" enhance ci {}",ci.getUri());
        if(isOfflineMode() && !entitySearcher.supportsOfflineMode()){
            throw new EngineException(this,ci,"Offline mode is not supported by the used EntitySearcher!",null);
        }
        AnalysedText at = getAnalysedText(this, ci, true);
        log.debug("  > AnalysedText {}",at);
        String language = getLanguage(this, ci, true);
        if(log.isDebugEnabled()){
            log.debug("computeEnhancements for ContentItem {} language {} text={}",
                new Object []{ci.getUri().getUnicodeString(), language, StringUtils.abbreviate(at.getSpan(), 100)});
        }
        log.debug("  > Language {}",language);
        LanguageProcessingConfig languageConfig = textProcessingConfig.getConfiguration(language);
        if(languageConfig == null){
            throw new IllegalStateException("The language '"+language+"' is not configured "
View Full Code Here

       
    }

    @Override
    public void computeEnhancements(ContentItem ci) throws EngineException {
        final AnalysedText at = initAnalysedText(this,analysedTextFactory,ci);
        String language = getLanguage(this,ci,false);
        if(!("zh".equals(language) || (language != null && language.startsWith("zh-")))) {
            throw new IllegalStateException("The detected language is NOT 'zh'! "
                + "As this is also checked within the #canEnhance(..) method this "
                + "indicates an Bug in the used EnhancementJobManager implementation. "
                + "Please report this on the dev@apache.stanbol.org or create an "
                + "JIRA issue about this.");
        }
        PaodingAnalyzer pa;
        try {
            pa = AccessController.doPrivileged(new PrivilegedExceptionAction<PaodingAnalyzer>() {
                public PaodingAnalyzer run() throws Exception {
                    return new PaodingAnalyzer();
                }
            });
        } catch (PrivilegedActionException pae){
            Exception e = pae.getException();
            log.error("Unable to initialise PoadingAnalyzer",e);
            throw new EngineException("Unable to initialise PoadingAnalyzer",e);
        }
        TokenStream ts = pa.tokenStream("dummy", new CharSequenceReader(at.getText()));
        int lastEnd = 0;
        try {
            while(ts.incrementToken()){
                OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
                //when tokenizing labels we need to preserve all chars
                if(offset.startOffset() > lastEnd){ //add token for stopword
                    at.addToken(lastEnd,offset.startOffset());
                }
                at.addToken(offset.startOffset(), offset.endOffset());
                lastEnd = offset.endOffset();
            }
        } catch (IOException e) {
            log.warn("IOException while reading the parsed Text",e);
            throw new EngineException("IOException while reading the parsed Text",e);
View Full Code Here

                ENHANCE_ASYNC : CANNOT_ENHANCE;
    }

    @Override
    public void computeEnhancements(ContentItem ci) throws EngineException {
        AnalysedText at = getAnalysedText(this, ci, true);
        String lang = EnhancementEngineHelper.getLanguage(ci);
        Language language = lang == null ? null : new Language(lang);
        //now iterate over the AnalysedText data and create the RDF representation
        //TODO: make configureable
        boolean sentences = true;
        boolean phrases = true;
        boolean words = true;
       
        EnumSet<SpanTypeEnum> activeTypes = EnumSet.noneOf(SpanTypeEnum.class);
        if(sentences){
            activeTypes.add(SpanTypeEnum.Sentence);
        }
        if(phrases){
            activeTypes.add(SpanTypeEnum.Chunk);
        }
        if(words){
            activeTypes.add(SpanTypeEnum.Token);
        }
        MGraph metadata = ci.getMetadata();
        UriRef base = ci.getUri();
        ci.getLock().writeLock().lock();
        try {
            Iterator<Span> spans = at.getEnclosed(activeTypes);
            UriRef sentence = null;
            UriRef phrase = null;
            UriRef word = null;
            boolean firstWordInSentence = true;
            while(spans.hasNext()){
View Full Code Here

     *          if the underlying process failed to work as
     *          expected
     */
    @Override
    public void computeEnhancements(ContentItem ci) throws EngineException {
        AnalysedText at = initAnalysedText(this,analysedTextFactory,ci);
        String language = getLanguage(this, ci, true);
       
        POSTagger posTagger = getPOSTagger(language);
        if(posTagger == null){
            //this means that the POS tagger became unavailable in-between
            //the call to canEnhance and computeEnhancement
            throw new EngineException("PosTagger for langauge '"+language
                + "is not available.");
        }
        TagSet<PosTag> tagSet = tagSetRegistry.getTagSet(language);
        if(tagSet == null){
            log.warn("No POS TagSet registered for Language '{}'. Will build an "
                    + "adhoc set based on encountered Tags!",language);
            //for now only created to avoid checks for tagSet == null
            //TODO: in future we might want to automatically create posModels based
            //on tagged texts. However this makes no sense as long we can not
            //persist TagSets.
            tagSet = new TagSet<PosTag>("dummy", language);
        }
        //holds PosTags created for POS tags that where not part of the posModel
        //(will hold all PosTags in case tagSet is NULL
        Map<String,PosTag> adhocTags = languageAdhocTags.get(language);
        if(adhocTags == null){
                adhocTags =  new HashMap<String,PosTag>();
                languageAdhocTags.put(language, adhocTags);
        }
        //(1) Sentence detection
       
        //Try to read existing Sentence Annotations
        Iterator<Sentence> sentences = at.getSentences();
        List<Section> sentenceList;
        if(!sentences.hasNext()){
            //if non try to detect sentences
            log.trace(" > detect sentences for {}",at);
            sentenceList = detectSentences(at,language);
View Full Code Here

TOP

Related Classes of org.apache.stanbol.enhancer.nlp.model.AnalysedText

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., which is owned by Oracle Inc. Contact: coftware#gmail.com.