if (!entity.getClass().equals(EventMention.class)) {
List<BaseToken> tokens = JCasUtil.selectCovered(jCas, BaseToken.class, entity);
if (!tokens.isEmpty()) {
BaseToken lastToken = tokens.get(tokens.size() - 1);
String value = String.format("%s_%s", entity.getClass().getSimpleName(), entity.getTypeID());
endOfEntityFeatures.put(lastToken, new Feature("EndOf", value));
}
}
}
Random rand = new Random();
// try the SMOTE algorithm here to generate additional minority-class samples
SMOTEplus smote = new SMOTEplus((int) Math.ceil(this.smoteNumOfNeighbors));
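// Note: SMOTEplus is assumed here to follow the standard SMOTE recipe: for each
// minority-class (event) training instance it finds its smoteNumOfNeighbors
// nearest minority neighbors in feature space and synthesizes new instances by
// interpolating between the instance and a neighbor, reducing the event/non-event
// class imbalance seen by the classifier.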
// classify tokens within each sentence
for (Sentence sentence : JCasUtil.selectCovered(jCas, Sentence.class, segment)) {
List<BaseToken> tokens = JCasUtil.selectCovered(jCas, BaseToken.class, sentence);
// during training, the list of all outcomes for the tokens
List<String> outcomes;
if (this.isTraining()) {
List<EventMention> events = Lists.newArrayList();
for (EventMention event : JCasUtil.selectCovered(jCas, EventMention.class, sentence)) {
// keep only annotations of the exact EventMention type (skip its subclasses)
if (event.getClass().equals(EventMention.class)) {
events.add(event);
}
}
outcomes = this.eventChunking.createOutcomes(jCas, tokens, events);
}
// during prediction, the list of outcomes predicted so far
else {
outcomes = new ArrayList<String>();
}
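// Illustrative example (assuming eventChunking produces standard BIO labels):
// for the tokens [Patient, denies, chest, pain] with "chest pain" covered by a
// gold EventMention, createOutcomes yields [O, O, B, I]; at prediction time the
// list starts empty and is presumably appended to as each token is classified.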
// get BIO entity tags for each entity type
int[] entityTypeIDs = new int[] {
CONST.NE_TYPE_ID_ANATOMICAL_SITE,
CONST.NE_TYPE_ID_DISORDER,
CONST.NE_TYPE_ID_DRUG,
CONST.NE_TYPE_ID_FINDING,
CONST.NE_TYPE_ID_PROCEDURE,
CONST.NE_TYPE_ID_UNKNOWN };
List<IdentifiedAnnotation> entities;
if (this.isTraining()) {
entities = Lists.newArrayList();
for (IdentifiedAnnotation entity : JCasUtil.selectCovered(jCas, IdentifiedAnnotation.class, sentence)) {
if (!entity.getClass().equals(EventMention.class)) {
entities.add(entity);
}
}
} else {
entities = JCasUtil.selectCovered(jCas, IdentifiedAnnotation.class, sentence);
}
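// Note: during training the gold events (exact EventMention type) are excluded,
// presumably so that entity-based features are built only from the other named
// entities; at prediction time the CAS is not expected to contain such gold
// events, so all covered IdentifiedAnnotations are used directly.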
List<ChunkingExtractor> chunkingExtractors = Lists.newArrayList();
for (int typeID : entityTypeIDs) {
Predicate<IdentifiedAnnotation> hasTypeID = hasEntityType(typeID);
List<IdentifiedAnnotation> subEntities = Lists.newArrayList(Iterables.filter(entities, hasTypeID));
chunkingExtractors.add(new ChunkingExtractor("EntityTag", this.entityChunking, jCas, tokens, subEntities));
}
// add extractor for phrase chunks
List<Chunk> chunks = JCasUtil.selectCovered(jCas, Chunk.class, sentence);
chunkingExtractors.add(new ChunkingExtractor("PhraseTag", this.phraseChunking, jCas, tokens, chunks));
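// Each ChunkingExtractor above is expected to emit features derived from the BIO
// chunk labels in a small window around the current token, prefixed with the given
// name ("EntityTag" per semantic type, "PhraseTag" for the phrase chunks); the
// exact feature names depend on ChunkingExtractor's implementation.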
// extract features for all tokens
int tokenIndex = -1;
int nChunkLabelsBefore = 2;
int nChunkLabelsAfter = 2;
int nPreviousClassifications = 2;
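// Window sizes: nChunkLabelsBefore/nChunkLabelsAfter are presumably passed to the
// chunking extractors so each token sees the chunk labels of the two tokens on
// either side, and nPreviousClassifications exposes the outcomes of the two
// preceding tokens as features (see the loop below).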
for (BaseToken token : tokens) {
++tokenIndex;
List<Feature> features = new ArrayList<Feature>();
// features from previous classifications
for (int i = nPreviousClassifications; i > 0; --i) {
int index = tokenIndex - i;
String previousOutcome = index < 0 ? "O" : outcomes.get(index);
features.add(new Feature("PreviousOutcome_" + i, previousOutcome));
}
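// e.g. with nPreviousClassifications = 2, the third token of a sentence whose
// first two tokens were labeled O and B gets PreviousOutcome_2 = O and
// PreviousOutcome_1 = B; indices before the sentence start fall back to "O".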
// features from token attributes
features.addAll(this.tokenFeatureExtractor.extract(jCas, token));