Package opennlp.tools.chunker

Examples of opennlp.tools.chunker.ChunkerME
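
The excerpts below show opennlp.tools.chunker.ChunkerME used inside enhancement engines, inside the OpenNLP parser itself, and in NP-extraction utilities. Before those, here is a minimal, self-contained sketch of the typical ChunkerME workflow: load a ChunkerModel, chunk a POS-tagged token sequence, and read back the BIO-style chunk tags. The model file name en-chunker.bin and the hard-coded tokens and POS tags are assumptions for illustration only.

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

import opennlp.tools.chunker.ChunkerME;
import opennlp.tools.chunker.ChunkerModel;

public class ChunkerQuickstart {

    public static void main(String[] args) throws IOException {
        // load a pre-trained chunker model (the file name is an assumption)
        try (InputStream in = new FileInputStream("en-chunker.bin")) {
            ChunkerME chunker = new ChunkerME(new ChunkerModel(in));

            // tokens and POS tags must be aligned, one tag per token
            String[] tokens  = {"Rockwell", "said", "the", "agreement", "calls", "."};
            String[] posTags = {"NNP", "VBD", "DT", "NN", "VBZ", "."};

            // returns one BIO-style chunk tag per token, e.g. B-NP, I-NP, O
            String[] chunkTags = chunker.chunk(tokens, posTags);
            for (int i = 0; i < tokens.length; i++) {
                System.out.println(tokens[i] + "\t" + posTags[i] + "\t" + chunkTags[i]);
            }
        }
    }
}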


    @Override
    public void computeEnhancements(ContentItem ci) throws EngineException {
        AnalysedText at = getAnalysedText(this, ci, true);
        String language = getLanguage(this, ci, true);
        isLangaugeConfigured(this, languageConfiguration, language, true);
        ChunkerME chunker = initChunker(language);
        if(chunker == null){
            return;
        }
        //init the Phrase TagSet
        TagSet<PhraseTag> tagSet = tagSetRegistry.getTagSet(language);
        if(tagSet == null){
            log.warn("No Phrase TagSet registered for Language '{}'. Will build an "
                + "adhoc set based on encountered Tags!",language);
            //for now only created to avoid checks for tagSet == null
            //TODO: in the future we might want to automatically create posModels based
            //on tagged texts. However this makes no sense as long as we cannot
            //persist TagSets.
            tagSet = new TagSet<PhraseTag>("dummy", language);
        }
        //holds PhraseTags created for chunk tags that were not part of the TagSet
        //(will hold all PhraseTags in case no TagSet was registered)
        Map<String,PhraseTag> adhocTags = languageAdhocTags.get(language);
        if(adhocTags == null){
            adhocTags = new HashMap<String,PhraseTag>();
            languageAdhocTags.put(language, adhocTags);
        }       
        ci.getLock().writeLock().lock();
        try {
            Iterator<? extends Section> sentences = at.getSentences();
            if(!sentences.hasNext()){ //no sentences ... iterate over the whole text
                sentences = Collections.singleton(at).iterator();
            }
            List<String> tokenTextList = new ArrayList<String>(64);
            List<String> posList = new ArrayList<String>(64);
            List<Token> tokenList = new ArrayList<Token>(64);
            //process each sentence separately
            while(sentences.hasNext()){
                // (1) get Tokens and POS information for the sentence
                Section sentence = sentences.next();
                Iterator<Token> tokens = sentence.getTokens();
                while(tokens.hasNext()){
                    Token token = tokens.next();
                    tokenList.add(token);
                    tokenTextList.add(token.getSpan());
                    Value<PosTag> posValue = token.getAnnotation(POS_ANNOTATION);
                    if(posValue == null){
                        throw new EngineException("Missing POS value for Token '"
                            + token.getSpan()+"' of ContentItem "+ci.getUri()
                            + "(Sentence: '"+sentence.getSpan()+"'). This may "
                            + "indicate that a POS tagging Engine is missing in "
                            + "the EnhancementChain or that the used POS tagging "
                            + "does not provide POS tags for each token!");
                    } else {
                        posList.add(posValue.value().getTag());
                    }
                }
                String[] tokenStrings = tokenTextList.toArray(new String[tokenTextList.size()]);
                String[] tokenPos = posList.toArray(new String[posList.size()]);
                if(log.isTraceEnabled()){
                    log.trace("Tokens: {}"+Arrays.toString(tokenStrings));
                }
                tokenTextList.clear(); //free memory
                posList.clear(); //free memory
               
                // (2) Chunk the sentence
               
                String[] chunkTags = chunker.chunk(tokenStrings, tokenPos);
                double[] chunkProb = chunker.probs();
                if(log.isTraceEnabled()){
                    log.trace("Chunks: {}"+Arrays.toString(chunkTags));
                }
                tokenStrings = null; //free memory
                tokenPos = null; //free memory
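The excerpt above breaks off right after the chunk tags are computed. For orientation: chunker.chunk(..) returns one BIO-style tag per token ("B-NP", "I-NP", "O", ...) and chunker.probs() returns the matching per-token probabilities, so the two arrays can be walked in parallel to group tokens into phrases. Below is a hedged sketch of that grouping, independent of the Stanbol classes used above (it only needs java.util.List and java.util.ArrayList); it is not the engine's actual continuation. The next excerpt then shows the tail of the initChunker(language) helper called at the top of computeEnhancements(..).

    // minimal sketch (not the Stanbol implementation): group BIO chunk tags
    // into phrase strings; tokens and chunkTags are parallel arrays as produced above
    static List<String> toPhrases(String[] tokens, String[] chunkTags) {
        List<String> phrases = new ArrayList<String>();
        StringBuilder current = null;
        for (int i = 0; i < chunkTags.length; i++) {
            if (chunkTags[i].startsWith("B-")) {            // a new chunk starts here
                if (current != null) {
                    phrases.add(current.toString());
                }
                current = new StringBuilder(tokens[i]);
            } else if (chunkTags[i].startsWith("I-") && current != null) {
                current.append(' ').append(tokens[i]);       // the open chunk continues
            } else if (current != null) {                    // "O" closes any open chunk
                phrases.add(current.toString());
                current = null;
            }
        }
        if (current != null) {
            phrases.add(current.toString());
        }
        return phrases;
    }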


        }
        if(model == null){
            log.trace("no Chunker Model for language {}",language);
            return null;
        } else {
            return new ChunkerME(model);
        }
    }

        return new POSTaggerME(new POSModel(
                getResourceAsStream(taggerModelFile)));
    }

    public static Chunker getDefaultChunker() throws IOException {
        return new ChunkerME(new ChunkerModel(
                getResourceAsStream(chunkerModelFile)));
    }
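A possible caller of the factory methods above, chaining the default POS tagger and chunker on an already tokenized sentence. The method name getDefaultPOSTagger() and the token array are assumptions for illustration; only getDefaultChunker() is shown in the excerpt.

    // hypothetical usage of the helpers above; getDefaultPOSTagger() is an assumed name
    String[] tokens = {"The", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog", "."};
    String[] posTags = getDefaultPOSTagger().tag(tokens);
    String[] chunkTags = getDefaultChunker().chunk(tokens, posTags);
    for (int i = 0; i < tokens.length; i++) {
        System.out.println(tokens[i] + "/" + posTags[i] + " -> " + chunkTags[i]);
    }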

  private int incompleteIndex;

  public Parser(ParserModel model, int beamSize, double advancePercentage) {
    this(model.getBuildModel(), model.getCheckModel(),
        new POSTaggerME(model.getParserTaggerModel(), 10, 0),
        new ChunkerME(model.getParserChunkerModel(),
            ChunkerME.DEFAULT_BEAM_SIZE,
            new ParserChunkerSequenceValidator(model.getParserChunkerModel()),
            new ChunkContextGenerator(ChunkerME.DEFAULT_BEAM_SIZE)),
            model.getHeadRules(), beamSize, advancePercentage);
  }

    //System.out.println("\n\nRR- nextractNPNGrams(...) method called! with text: " + intext + "\n\n");
    List<SurfaceFormOccurrence> npNgramSFLst = new ArrayList<SurfaceFormOccurrence>();
    SentenceDetectorME  sentenceDetector = new SentenceDetectorME((SentenceModel)sentenceModel);
    TokenizerME tokenizer = new TokenizerME((TokenizerModel)tokenModel);
    POSTaggerME posTagger = new POSTaggerME((POSModel)posModel);
    ChunkerME chunker = new ChunkerME((ChunkerModel)chunkModel);

    Span[] sentSpans = sentenceDetector.sentPosDetect(intext);
    for (Span sentSpan : sentSpans) {
      String sentence = sentSpan.getCoveredText(intext).toString();
      int start = sentSpan.getStart();
      Span[] tokSpans = tokenizer.tokenizePos(sentence);
      String[] tokens = new String[tokSpans.length];
      // System.out.println("\n\nTokens:");
      for (int i = 0; i < tokens.length; i++) {
        tokens[i] = tokSpans[i].getCoveredText(sentence).toString();
        // System.out.println(tokens[i]);
      }
      String[] tags = posTagger.tag(tokens);
      Span[] chunks = chunker.chunkAsSpans(tokens, tags);
      for (Span chunk : chunks) {
        if ("NP".equals(chunk.getType())) {
          //Note: getStart()/getEnd() of chunk Spans only give the start and end token indexes of the chunk.
          //The actual start/end character positions of the chunk in the sentence have to be looked up in the
          //token spans (tokSpans). Those are offsets from the beginning of the sentence in question, so the
          //start position of the sentence needs to be added to get offsets into the original text.
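The excerpt is cut off at this point. The comment above refers to the usual translation from token indexes to character offsets: the chunk Span indexes into tokSpans, and the sentence start has to be added to obtain offsets in the original text. A sketch of that calculation, assuming the same local variables (chunk, tokSpans, sentence, start) as in the loop above; it is not the original continuation of the method.

        // sketch only: translate the chunk's token indexes into character offsets
        int begin = tokSpans[chunk.getStart()].getStart();     // offset within the sentence
        int end   = tokSpans[chunk.getEnd() - 1].getEnd();     // getEnd() is exclusive, so use the previous token
        String surfaceForm = sentence.substring(begin, end);   // the NP text itself
        int offsetInText = start + begin;                      // absolute offset in the input text
        // surfaceForm and offsetInText would then be used to build the SurfaceFormOccurrence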

    POSTaggerME posTagger = new POSTaggerME(posModel);
    return posTagger.tag(tokens);
  }

  private String[] chunk(String[] tokens, String[] posTags) {
    ChunkerME chunker = new ChunkerME(chunkerModel);
    return chunker.chunk(tokens, posTags);
  }
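As an alternative to the raw BIO tags returned by chunk(..), ChunkerME also offers chunkAsSpans(..), which already groups the tokens into typed spans; Span.spansToStrings(..) then yields the phrase texts directly. A small sketch under the same assumptions as the helper above (chunkerModel is a loaded ChunkerModel, tokens and posTags are parallel arrays); the method name is made up for illustration.

  // sketch of a span-based variant of the chunk(..) helper above;
  // requires import opennlp.tools.util.Span
  private String[] chunkPhrases(String[] tokens, String[] posTags) {
    ChunkerME chunker = new ChunkerME(chunkerModel);
    Span[] spans = chunker.chunkAsSpans(tokens, posTags);  // token-index spans, typed e.g. "NP"
    return Span.spansToStrings(spans, tokens);             // join the covered tokens into phrase strings
  }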


  private int[] attachments;

  public Parser(ParserModel model, int beamSize, double advancePercentage) {
    this(model.getBuildModel(), model.getAttachModel(), model.getCheckModel(),
        new POSTaggerME(model.getParserTaggerModel()),
        new ChunkerME(model.getParserChunkerModel(),
        ChunkerME.DEFAULT_BEAM_SIZE,
        new ParserChunkerSequenceValidator(model.getParserChunkerModel()),
        new ChunkContextGenerator(ChunkerME.DEFAULT_BEAM_SIZE)),
        model.getHeadRules(),
        beamSize, advancePercentage);
  }
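Both Parser constructors shown here (the chunking variant earlier and this attach/tree-insert variant) wire a ChunkerME into the parsing pipeline together with a ParserChunkerSequenceValidator. Application code normally does not call these constructors directly but goes through ParserFactory, which selects the right parser implementation from the model. A minimal sketch, assuming a parser model file named en-parser-chunking.bin:

import java.io.FileInputStream;
import java.io.InputStream;

import opennlp.tools.parser.Parser;
import opennlp.tools.parser.ParserFactory;
import opennlp.tools.parser.ParserModel;

public class ParserQuickstart {
    public static void main(String[] args) throws Exception {
        try (InputStream in = new FileInputStream("en-parser-chunking.bin")) {  // file name assumed
            ParserModel model = new ParserModel(in);
            Parser parser = ParserFactory.create(model);  // picks the chunking or tree-insert parser
            // parser.parse(..) can now be used; it runs the embedded ChunkerME internally
        }
    }
}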

