Package org.apache.stanbol.enhancer.nlp.model

Examples of org.apache.stanbol.enhancer.nlp.model.AnalysedText


     *          if the underlying process failed to work as
     *          expected
     */
    @Override
    public void computeEnhancements(ContentItem ci) throws EngineException {
        final AnalysedText at = initAnalysedText(this,analysedTextFactory,ci);

        String language = getLanguage(this,ci,false);
        if(!("zh".equals(language) || (language != null && language.startsWith("zh-")))) {
            throw new IllegalStateException("The detected language is NOT 'zh'! "
                + "As this is also checked within the #canEnhance(..) method this "
                + "indicates an Bug in the used EnhancementJobManager implementation. "
                + "Please report this on the dev@apache.stanbol.org or create an "
                + "JIRA issue about this.");
        }
        if(!at.getSentences().hasNext()) { //no sentences  ... use this engine to detect
            //first the sentences
            TokenStream sentences = new SentenceTokenizer(new CharSequenceReader(at.getText()));
            try {
                while(sentences.incrementToken()){
                    OffsetAttribute offset = sentences.addAttribute(OffsetAttribute.class);
                    Sentence s = at.addSentence(offset.startOffset(), offset.endOffset());
                    if(log.isTraceEnabled()) {
                        log.trace("detected {}:{}",s,s.getSpan());
                    }
                }
            } catch (IOException e) {
                String message = String.format("IOException while reading from "
                    +"CharSequenceReader of AnalyzedText for ContentItem %s",ci.getUri());
                log.error(message,e);
                throw new EngineException(this, ci, message, e);
            }
        }
        //now the tokens
        TokenStream tokens = new WordTokenFilter(new AnalyzedTextSentenceTokenizer(at));
        try {
          tokens.reset();
            while(tokens.incrementToken()){
                OffsetAttribute offset = tokens.addAttribute(OffsetAttribute.class);
                Token t = at.addToken(offset.startOffset(), offset.endOffset());
                log.trace("detected {}",t);
            }
        } catch (IOException e) {
            String message = String.format("IOException while reading from "
                +"CharSequenceReader of AnalyzedText for ContentItem %s",ci.getUri());
View Full Code Here


        return ENHANCE_ASYNC;
    }
   
    @Override
    public void computeEnhancements(ContentItem ci) throws EngineException {
        AnalysedText at = getAnalysedText(this, ci, true);
        String language = getLanguage(this, ci, true);
        isLangaugeConfigured(this, languageConfig, language, true);
        List<LexicalEntry> terms;
        try {
            terms = this.client.performMorfologicalAnalysis(at.getSpan(), language);
        } catch (IOException e) {
            throw new EngineException("Error while calling the CELI Lemmatizer"
                    + " service (configured URL: " + serviceURL + ")!", e);
        } catch (SOAPException e) {
            throw new EngineException("Error wile encoding/decoding the request/" +
                    "response to the CELI lemmatizer service!", e);
        }
        Map<LexicalCategory,Double> tokenLexCats = new EnumMap<LexicalCategory,Double>(LexicalCategory.class);
        for(LexicalEntry term : terms){
            if(term.getTermReadings().isEmpty()){
                //TODO: maybe still add them and use the Lemmatizer as Tokenizer
                continue; //ignore terms without readings
            }
            //Add the LexicalEntry as Token to the Text. NOTE that if a
            //Token with the same start/end positions already exist this
            //Method returns the existing instance
            Token token = at.addToken(term.getFrom(), term.getTo());
            //Now try to get POS annotations for the Token
            for(Value<PosTag> posAnno : token.getAnnotations(NlpAnnotations.POS_ANNOTATION)){
                if(posAnno.value().isMapped()){
                    for(LexicalCategory cat :posAnno.value().getCategories()){
                        if(!tokenLexCats.containsKey(cat)){ //do not override with lover prob
View Full Code Here

     *          if the underlying process failed to work as
     *          expected
     */
    @Override
    public void computeEnhancements(ContentItem ci) throws EngineException {
        final AnalysedText at = initAnalysedText(this,analysedTextFactory,ci);

        String language = getLanguage(this,ci,false);
        if(!("zh".equals(language) || (language != null && language.startsWith("zh-")))) {
            throw new IllegalStateException("The detected language is NOT 'zh'! "
                + "As this is also checked within the #canEnhance(..) method this "
                + "indicates an Bug in the used EnhancementJobManager implementation. "
                + "Please report this on the dev@apache.stanbol.org or create an "
                + "JIRA issue about this.");
        }
        //first the sentences
        TokenStream sentences = new SentenceTokenizer(new CharSequenceReader(at.getText()));
        try {
          sentences.reset();
            while(sentences.incrementToken()){
                OffsetAttribute offset = sentences.addAttribute(OffsetAttribute.class);
                Sentence s = at.addSentence(offset.startOffset(), offset.endOffset());
                if(log.isTraceEnabled()) {
                    log.trace("detected {}:{}",s,s.getSpan());
                }
            }
        } catch (IOException e) {
View Full Code Here

                + "are available", new Object[] {getName(), ci.getUri(), language});
                return CANNOT_ENHANCE;
        }
        // we need a detected language, the AnalyzedText contentPart with
        // Tokens.
        AnalysedText at = getAnalysedText(this, ci, false);
        return at != null && at.getTokens().hasNext() ? ENHANCE_ASYNC : CANNOT_ENHANCE;
    }
View Full Code Here

        return at != null && at.getTokens().hasNext() ? ENHANCE_ASYNC : CANNOT_ENHANCE;
    }

    @Override
    public void computeEnhancements(ContentItem ci) throws EngineException {
        AnalysedText at = getAnalysedText(this, ci, true);
        log.debug("  > AnalysedText {}", at);
        String language = getLanguage(this, ci, true);
        log.debug("  > Language {}", language);
        if (log.isDebugEnabled()) {
            log.debug("computeEnhancements for ContentItem {} language {} text={}", new Object[] {
                    ci.getUri().getUnicodeString(), language, StringUtils.abbreviate(at.getSpan(), 100)});
        }
        // TODO: we need to do the same for the the default matching language
        TaggingSession session;
        try {
            session = TaggingSession.createSession(indexConfig, language);
        } catch (CorpusException e) {
            throw new EngineException(this, ci, e);
        }
        long taggingStart = System.currentTimeMillis();
        final NavigableMap<int[],Tag> tags = new TreeMap<int[],Tag>(Tag.SPAN_COMPARATOR);
        try {
            //process the language of the document
            Corpus corpus = null;
            if(session.getLanguageCorpus() != null){
                corpus = session.getLanguageCorpus();
                long t = System.currentTimeMillis();
                int d = tag(at, session,corpus,tags);
                log.debug(" - {}: fst: {}ms (callback: {}ms)", new Object[]{
                        corpus.getIndexedField(), System.currentTimeMillis()-t, d
                });
            }
            if(session.getDefaultCorpus() != null){
                if(corpus == null){
                    corpus = session.getDefaultCorpus();
                }
                long t = System.currentTimeMillis();
                int d = tag(at, session, session.getDefaultCorpus(),tags);
                log.debug(" - {}: fst: {}ms (callback: {}ms)",new Object[]{
                        session.getDefaultCorpus().getIndexedField(),
                        System.currentTimeMillis()-t, d});
            }
            long taggingEnd = System.currentTimeMillis();
            if(corpus == null){
                throw new EngineException(this,ci,"No FST corpus found to process contentItem "
                    + "language '"+session.getLanguage()+"'!",null);
            } else {
                if(session.getLanguageCorpus() != null && session.getDefaultCorpus() != null){
                    log.debug(" - sum fst: {} ms", taggingEnd - taggingStart);
                }
            }
            int matches = match(at,tags.values());
            log.debug(" - loaded {} ({} loaded, {} cached, {} appended) Matches in {} ms",
                    new Object[]{matches, session.getSessionDocLoaded(),
                        session.getSessionDocCached(), session.getSessionDocAppended(),
                        System.currentTimeMillis()-taggingEnd});
            if(log.isDebugEnabled() && session.getDocumentCache() != null){
                log.debug("EntityCache Statistics: {}",
                    session.getDocumentCache().printStatistics());
            }
        } catch (IOException e) {
            throw new EngineException(this,ci,e);
        } finally {
            session.close();
        }
        if(log.isTraceEnabled()){
            log.trace("Tagged Entities:");
            for(Tag tag : tags.values()){
                log.trace("[{},{}]: {}", new Object[]{tag.getStart(),tag.getEnd(),tag.getMatches()});
            }
        }
        ci.getLock().writeLock().lock();
        try {
            writeEnhancements(ci,at.getSpan(),tags.values(),language);
        } finally {
            ci.getLock().writeLock().unlock();
        }
        tags.clear(); //help the GC
    }
View Full Code Here

     * @return the AnalysedText or <code>null</code> if not found.
     * @throws IllegalStateException if exception is <code>true</code> and the
     * {@link AnalysedText} could not be retrieved from the parsed {@link ContentItem}.
     */
    public static AnalysedText getAnalysedText(EnhancementEngine engine, ContentItem ci, boolean exception) {
        AnalysedText at;
        try {
            at = AnalysedTextUtils.getAnalysedText(ci);
        }catch (RuntimeException e) {
            log.warn("Unable to retrieve AnalysedText for ContentItem "
                + ci + "because of an "+e.getClass().getSimpleName()+" with message "
View Full Code Here

     * ContentPart is not yet present in the parsed {@link ContentItem}
     */
    public static AnalysedText initAnalysedText(EnhancementEngine engine,
                                                AnalysedTextFactory analysedTextFactory,
                                                ContentItem ci) throws EngineException {
        AnalysedText at = AnalysedTextUtils.getAnalysedText(ci);
        if(at == null){
            if(analysedTextFactory == null){
                throw new IllegalStateException("Unable to initialise AnalysedText"
                    + "ContentPart because the parsed AnalysedTextFactory is NULL");
            }
View Full Code Here

        Assert.assertTrue(serialized.contains("\"class\" : \"org.apache.stanbol.enhancer.nlp.ner.NerTag\""));
        Assert.assertTrue(serialized.contains("\"stanbol.enhancer.nlp.morpho\" : {"));
        Assert.assertTrue(serialized.contains("\"class\" : \"org.apache.stanbol.enhancer.nlp.morpho.MorphoFeatures\""));
        //deserialize
        AnalyzedTextParser parser = AnalyzedTextParser.getDefaultInstance();
        AnalysedText parsedAt = parser.parse(new ByteArrayInputStream(data), null,
            atFactory.createAnalysedText(textBlob.getValue()));
        Assert.assertEquals(analysedTextWithData, parsedAt);
        Iterator<Span> origSpanIt = analysedTextWithData.getEnclosed(EnumSet.allOf(SpanTypeEnum.class));
        Iterator<Span> parsedSpanIt = parsedAt.getEnclosed(EnumSet.allOf(SpanTypeEnum.class));
        while(origSpanIt.hasNext() && parsedSpanIt.hasNext()){
            Span orig = origSpanIt.next();
            Span parsed = parsedSpanIt.next();
            Assert.assertEquals(orig, parsed);
            Set<String> origKeys = orig.getKeys();
View Full Code Here

    String serialized = getSerializedString();
    Assert.assertTrue(serialized.contains(jsonCheckObama));
    Assert.assertTrue(serialized.contains(jsonCheckVisited));
    Assert.assertTrue(serialized.contains(jsonCheckChina));
   
    AnalysedText parsedAt = getParsedAnalysedText(serialized);
    assertAnalysedTextEquality(parsedAt);
  }
View Full Code Here

    String serialized = getSerializedString();
   
    Assert.assertTrue(serialized.contains(jsonCorefCheckObama));
    Assert.assertTrue(serialized.contains(jsonCorefCheckHe));
   
    AnalysedText parsedAt = getParsedAnalysedText(serialized);
    assertAnalysedTextEquality(parsedAt);
  }
View Full Code Here

TOP

Related Classes of org.apache.stanbol.enhancer.nlp.model.AnalysedText

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.