/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.tools.chunker;

import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import opennlp.tools.ml.BeamSearch;
import opennlp.tools.ml.EventTrainer;
import opennlp.tools.ml.SequenceTrainer;
import opennlp.tools.ml.TrainerFactory;
import opennlp.tools.ml.TrainerFactory.TrainerType;
import opennlp.tools.ml.model.Event;
import opennlp.tools.ml.model.MaxentModel;
import opennlp.tools.ml.model.SequenceClassificationModel;
import opennlp.tools.ml.model.TrainUtil;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.Sequence;
import opennlp.tools.util.SequenceValidator;
import opennlp.tools.util.Span;
import opennlp.tools.util.TrainingParameters;

/**
 * A maximum-entropy-based chunker. Such a chunker can be used to find flat
 * structures, such as noun phrases or named entities, in a sequence of tokens
 * and their part-of-speech tags.
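 *
 * <p>
 * A minimal usage sketch; the model file {@code en-chunker.bin} and the example
 * tokens and tags below are placeholders, not part of this API:
 * <pre>{@code
 * InputStream modelIn = new FileInputStream("en-chunker.bin");
 * ChunkerModel model = new ChunkerModel(modelIn);
 * modelIn.close();
 *
 * ChunkerME chunker = new ChunkerME(model);
 *
 * String[] tokens = { "Rockwell", "said", "the", "agreement", "calls" };
 * String[] posTags = { "NNP", "VBD", "DT", "NN", "VBZ" };
 *
 * // one chunk tag per token, e.g. B-NP, I-NP, B-VP, O, ...
 * String[] chunkTags = chunker.chunk(tokens, posTags);
 * Span[] chunkSpans = chunker.chunkAsSpans(tokens, posTags);
 * }</pre>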
*/
public class ChunkerME implements Chunker {
public static final int DEFAULT_BEAM_SIZE = 10;
private Sequence bestSequence;
/**
* The model used to assign chunk tags to a sequence of tokens.
*/
protected SequenceClassificationModel<String> model;
private ChunkerContextGenerator contextGenerator;
private SequenceValidator<String> sequenceValidator;
/**
* Initializes the current instance with the specified model and
* the specified beam size.
*
* @param model The model for this chunker.
* @param beamSize The size of the beam that should be used when decoding sequences.
   * @param sequenceValidator The {@link SequenceValidator} that determines whether the outcome
   *          is valid for the preceding sequence. This can be used to implement constraints
   *          on what sequences are valid.
   * @param contextGenerator The {@link ChunkerContextGenerator} used to create the features
   *          passed to the model.
   * @deprecated Use {@link #ChunkerME(ChunkerModel, int)} instead and use the
   *             {@link ChunkerFactory} to configure the {@link SequenceValidator}
   *             and {@link ChunkerContextGenerator}.
*/
@Deprecated
public ChunkerME(ChunkerModel model, int beamSize, SequenceValidator<String> sequenceValidator,
ChunkerContextGenerator contextGenerator) {
this.sequenceValidator = sequenceValidator;
this.contextGenerator = contextGenerator;
if (model.getChunkerSequenceModel() != null) {
this.model = model.getChunkerSequenceModel();
}
else {
this.model = new opennlp.tools.ml.BeamSearch<String>(beamSize,
model.getChunkerModel(), 0);
}
}
/**
* Initializes the current instance with the specified model and
* the specified beam size.
*
* @param model The model for this chunker.
* @param beamSize The size of the beam that should be used when decoding sequences.
   * @param sequenceValidator The {@link SequenceValidator} that determines whether the outcome
* is valid for the preceding sequence. This can be used to implement constraints
* on what sequences are valid.
* @deprecated Use {@link #ChunkerME(ChunkerModel, int)} instead
* and use the {@link ChunkerFactory} to configure the {@link SequenceValidator}.
*/
@Deprecated
public ChunkerME(ChunkerModel model, int beamSize,
SequenceValidator<String> sequenceValidator) {
this(model, beamSize, sequenceValidator,
new DefaultChunkerContextGenerator());
}
/**
* Initializes the current instance with the specified model and
* the specified beam size.
*
* @param model The model for this chunker.
* @param beamSize The size of the beam that should be used when decoding sequences.
*
* @deprecated beam size is now stored inside the model
*/
@Deprecated
public ChunkerME(ChunkerModel model, int beamSize) {
contextGenerator = model.getFactory().getContextGenerator();
sequenceValidator = model.getFactory().getSequenceValidator();
if (model.getChunkerSequenceModel() != null) {
this.model = model.getChunkerSequenceModel();
}
else {
this.model = new opennlp.tools.ml.BeamSearch<String>(beamSize,
model.getChunkerModel(), 0);
}
}
/**
* Initializes the current instance with the specified model.
* The default beam size is used.
*
   * @param model The model for this chunker.
*/
public ChunkerME(ChunkerModel model) {
this(model, DEFAULT_BEAM_SIZE);
}
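
  /**
   * Generates chunk tags for the given sequence of tokens and part-of-speech tags.
   *
   * @deprecated Use {@link #chunk(String[], String[])} instead.
   */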
@Deprecated
public List<String> chunk(List<String> toks, List<String> tags) {
bestSequence =
model.bestSequence(toks.toArray(new String[toks.size()]), new Object[] { tags.toArray(new String[tags.size()]) },
contextGenerator, sequenceValidator);
return bestSequence.getOutcomes();
}
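
  /**
   * Generates chunk tags for the given sequence of tokens and part-of-speech tags.
   *
   * @param toks The tokens of the sequence.
   * @param tags The part-of-speech tags for the tokens.
   * @return An array containing one chunk tag per input token.
   */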
public String[] chunk(String[] toks, String[] tags) {
bestSequence = model.bestSequence(toks, new Object[] {tags}, contextGenerator, sequenceValidator);
List<String> c = bestSequence.getOutcomes();
return c.toArray(new String[c.size()]);
}
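
  /**
   * Generates tagged chunk spans for the given sequence of tokens and part-of-speech tags.
   *
   * @param toks The tokens of the sequence.
   * @param tags The part-of-speech tags for the tokens.
   * @return An array of {@link Span}s, one per chunk found in the sequence.
   */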
public Span[] chunkAsSpans(String[] toks, String[] tags) {
String[] preds = chunk(toks, tags);
return ChunkSample.phrasesAsSpanList(toks, tags, preds);
}
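
  /**
   * Returns the best chunking sequences found for the given sentence.
   *
   * @deprecated Use {@link #topKSequences(String[], String[])} instead.
   */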
@Deprecated
public Sequence[] topKSequences(List<String> sentence, List<String> tags) {
return topKSequences(sentence.toArray(new String[sentence.size()]),
tags.toArray(new String[tags.size()]));
}
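
  /**
   * Returns the best chunking sequences found for the given sentence, at most
   * {@link #DEFAULT_BEAM_SIZE} of them, ordered by score.
   *
   * @param sentence The tokens of the sentence.
   * @param tags The part-of-speech tags for the tokens.
   * @return The best sequences found while decoding.
   */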
public Sequence[] topKSequences(String[] sentence, String[] tags) {
return model.bestSequences(DEFAULT_BEAM_SIZE, sentence,
new Object[] { tags }, contextGenerator, sequenceValidator);
}
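
  /**
   * Returns the best chunking sequences found for the given sentence whose score
   * is at least <code>minSequenceScore</code>.
   *
   * @param sentence The tokens of the sentence.
   * @param tags The part-of-speech tags for the tokens.
   * @param minSequenceScore The minimum score a sequence must have to be returned.
   * @return The best sequences found while decoding.
   */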
public Sequence[] topKSequences(String[] sentence, String[] tags, double minSequenceScore) {
return model.bestSequences(DEFAULT_BEAM_SIZE, sentence, new Object[] { tags }, minSequenceScore,
contextGenerator, sequenceValidator);
}
/**
* Populates the specified array with the probabilities of the last decoded sequence. The
* sequence was determined based on the previous call to <code>chunk</code>. The
   * specified array should be at least as large as the number of tokens in the previous call to <code>chunk</code>.
*
* @param probs An array used to hold the probabilities of the last decoded sequence.
*/
public void probs(double[] probs) {
bestSequence.getProbs(probs);
}
/**
* Returns an array with the probabilities of the last decoded sequence. The
* sequence was determined based on the previous call to <code>chunk</code>.
* @return An array with the same number of probabilities as tokens were sent to <code>chunk</code>
* when it was last called.
*/
public double[] probs() {
return bestSequence.getProbs();
}
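
  /**
   * Trains a new chunker model.
   *
   * <p>
   * A minimal training sketch; the sample stream <code>samples</code> is assumed to be
   * created elsewhere, for example from a {@link ChunkSampleStream} over training text:
   * <pre>{@code
   * ObjectStream<ChunkSample> samples = ...;
   * TrainingParameters params = TrainingParameters.defaultParams();
   * ChunkerModel model = ChunkerME.train("en", samples, params, new ChunkerFactory());
   * }</pre>
   *
   * @param lang The ISO language code of the training data.
   * @param in The stream of {@link ChunkSample}s to train on.
   * @param mlParams The machine learning {@link TrainingParameters}.
   * @param factory The {@link ChunkerFactory} that provides the context generator and sequence validator.
   * @return The trained {@link ChunkerModel}.
   * @throws IOException If reading from the sample stream fails.
   */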
public static ChunkerModel train(String lang, ObjectStream<ChunkSample> in,
TrainingParameters mlParams, ChunkerFactory factory) throws IOException {
String beamSizeString = mlParams.getSettings().get(BeamSearch.BEAM_SIZE_PARAMETER);
    int beamSize = ChunkerME.DEFAULT_BEAM_SIZE;
if (beamSizeString != null) {
beamSize = Integer.parseInt(beamSizeString);
}
Map<String, String> manifestInfoEntries = new HashMap<String, String>();
TrainerType trainerType = TrainerFactory.getTrainerType(mlParams.getSettings());
MaxentModel chunkerModel = null;
SequenceClassificationModel<String> seqChunkerModel = null;
if (TrainerType.EVENT_MODEL_TRAINER.equals(trainerType)) {
ObjectStream<Event> es = new ChunkerEventStream(in, factory.getContextGenerator());
EventTrainer trainer = TrainerFactory.getEventTrainer(mlParams.getSettings(),
manifestInfoEntries);
chunkerModel = trainer.train(es);
}
else if (TrainerType.SEQUENCE_TRAINER.equals(trainerType)) {
SequenceTrainer trainer = TrainerFactory.getSequenceModelTrainer(
mlParams.getSettings(), manifestInfoEntries);
      // TODO: This will probably cause issues, since the feature generator uses the outcomes array
ChunkSampleSequenceStream ss = new ChunkSampleSequenceStream(in, factory.getContextGenerator());
seqChunkerModel = trainer.train(ss);
}
else {
throw new IllegalArgumentException("Trainer type is not supported: " + trainerType);
}
if (chunkerModel != null) {
      return new ChunkerModel(lang, chunkerModel, beamSize, manifestInfoEntries, factory);
}
else {
return new ChunkerModel(lang, seqChunkerModel, manifestInfoEntries, factory);
}
}
/**
* @deprecated Use
   * {@link #train(String, ObjectStream, TrainingParameters, ChunkerFactory)}
* instead.
*/
public static ChunkerModel train(String lang, ObjectStream<ChunkSample> in,
ChunkerContextGenerator contextGenerator, TrainingParameters mlParams)
throws IOException {
Map<String, String> manifestInfoEntries = new HashMap<String, String>();
ObjectStream<Event> es = new ChunkerEventStream(in, contextGenerator);
MaxentModel maxentModel = TrainUtil.train(es, mlParams.getSettings(), manifestInfoEntries);
return new ChunkerModel(lang, maxentModel, manifestInfoEntries);
}
}