//int lineNumber = 1;
for (Annotation currentAnnotation : getAnnotationsByType().get(AnnotationType.ASSERTION))
{
// Line on which the current annotation begins. It is used below as textLookup[lineNumber-1],
// i.e. it is treated as a 1-based line number.
final int lineNumber = currentAnnotation.getBegin().getLine();
AssertionAnnotation currentAssertionAnnotation = (AssertionAnnotation)currentAnnotation;
// Only assertions whose concept type is PROBLEM produce a training instance.
if (!currentAssertionAnnotation.getConceptType().equals(ConceptType.PROBLEM))
{
// skip this one
continue;
}
TrainingInstance trainingInstance = new TrainingInstance();
// Annotations the indexer holds for this line, looked up with the line number cast to long.
// NOTE(review): the result is not null-checked here, yet it is iterated further down when the
// cue-word-ordering features are enabled -- confirm the indexer always has an entry for this line.
List<Annotation> allLineAnnotations = indexer.getAnnotationByLine().get((long) lineNumber);
trainingInstance.setFilename(getTextFilename());
trainingInstance.setLineNumber(lineNumber);
trainingInstance.setAssertAnnotateForTI(currentAssertionAnnotation); //link training instance to corresponding assertion
trainingInstance.setAnnotationsForLine(allLineAnnotations); //list of annotations for the line this training instance is on
trainingInstance.setTokensForLine(textLookup[lineNumber-1]); //token string for the line this training instance is on
// Expected value: the lower-cased string form of the assertion value, or "" when no value is set.
// NOTE(review): toLowerCase() without a Locale argument uses the JVM default locale -- confirm
// that locale-sensitive lower-casing is acceptable for these labels.
AssertionValue assertionValue = currentAssertionAnnotation.getAssertionValue();
String assertionValueString = (assertionValue == null) ? "" : assertionValue.toString().toLowerCase();
trainingInstance.setExpectedValue(assertionValueString);
// Surface text of the concept; read by the concept-text feature that follows.
String conceptText = currentAssertionAnnotation.getConceptText();
// Feature: the concept phrase, as rendered by MedFactsRunner.constructConceptPhraseFeature.
if (checkForEnabledFeature("conceptTextFeature"))
{
    trainingInstance.addFeature(MedFactsRunner.constructConceptPhraseFeature(conceptText));
}
// Feature: the concept's "pseudo head", i.e. the token sitting at the concept's end location.
// (An earlier regex-based head extraction via conceptHeadPattern was disabled in favour of
// this positional lookup.)
if (checkForEnabledFeature("conceptPseudoHeadFeature"))
{
    final Location pseudoHeadLocation = currentAssertionAnnotation.getEnd();
    // The location's line number is used minus one to index textLookup.
    final int pseudoHeadLineIndex = pseudoHeadLocation.getLine() - 1;
    final int pseudoHeadTokenIndex = pseudoHeadLocation.getTokenOffset();
    trainingInstance.addFeature(
            constructConceptHeadFeature(textLookup[pseudoHeadLineIndex][pseudoHeadTokenIndex]));
}
// Begin/end coordinates of the concept under classification. These locals are read by the
// feature sections that follow.
Location conceptBeginLocation = currentAssertionAnnotation.getBegin();
int conceptBeginLine = conceptBeginLocation.getLine();
int conceptBeginTokenOffset = conceptBeginLocation.getTokenOffset();
Location conceptEndLocation = currentAssertionAnnotation.getEnd();
int conceptEndTokenOffset = conceptEndLocation.getTokenOffset();
// Tokens of the line on which the concept begins (line number minus one indexes textLookup).
// NOTE(review): the end token offset is applied to this same line below, which presumably
// assumes a concept never spans more than one line -- confirm against the annotation format.
String currentLine[] = textLookup[conceptBeginLine-1];
// One "concept_unigram_<token>" feature per token from the begin offset through the end offset
// (inclusive), with the token passed through StringHandling.escapeStringForFeatureName.
if (checkForEnabledFeature("conceptUnigrams")) {
for (int k = conceptBeginTokenOffset; k <= conceptEndTokenOffset; k++) {
trainingInstance.addFeature("concept_unigram_" + StringHandling.escapeStringForFeatureName(currentLine[k]));
}
}
// Word features for the context to the left of the concept span, one feature per list entry.
if (checkForEnabledFeature("wordLeftFeature"))
{
    for (String leftWordFeature : FeatureUtility.constructWordLeftFeatureList(
            conceptBeginTokenOffset, conceptEndTokenOffset, currentLine))
    {
        trainingInstance.addFeature(leftWordFeature);
    }
}
// Word features for the context to the right of the concept span, one feature per list entry.
if (checkForEnabledFeature("wordRightFeature"))
{
    for (String rightWordFeature : FeatureUtility.constructWordRightFeatureList(
            conceptBeginTokenOffset, conceptEndTokenOffset, currentLine))
    {
        trainingInstance.addFeature(rightWordFeature);
    }
}
// Part-of-speech context features. FeatureUtility receives the concept's line number minus one
// together with the indexer.
if (checkForEnabledFeature("posRightFeature"))
{
    trainingInstance.addFeature(
            FeatureUtility.constructPosRightFeatureList(
                    conceptBeginTokenOffset,
                    conceptEndTokenOffset,
                    (conceptBeginLine - 1),
                    currentLine,
                    indexer));
}
if (checkForEnabledFeature("posLeftFeature"))
{
    trainingInstance.addFeature(
            FeatureUtility.constructPosLeftFeatureList(
                    conceptBeginTokenOffset,
                    conceptEndTokenOffset,
                    (conceptBeginLine - 1),
                    currentLine,
                    indexer));
}
if (checkForEnabledFeature("cueWord_DEFINITE_left_2")) {
for (int i = 1; i < 3; i++) {
int relPos = conceptBeginTokenOffset - i;
if (relPos >= 0) {
if (currentLine[relPos].equals("her") || currentLine[relPos].equals("his") || currentLine[relPos].equals("patient's") ||
currentLine[relPos].equals("your") || currentLine[relPos].equals("this")) {
trainingInstance.addFeature("cueWord_DEFINITE_left_2");
}
}
}
}
// "CWS_left_<type>_<type>...": the types of the line's cue words that belong to the CWS cue-word
// set and begin before the concept, in the cue words' natural (Comparable) order.
if (checkForEnabledFeature("cueWordOrderingsLeft")) {
    final List<CueWordAnnotation> cueWordsLeftOfConcept = new ArrayList<CueWordAnnotation>();
    for (Annotation lineAnnotation : allLineAnnotations) {
        if (!(lineAnnotation instanceof CueWordAnnotation)) {
            continue;
        }
        final CueWordAnnotation candidateCueWord = (CueWordAnnotation) lineAnnotation;
        if (inCWSCueWordSet(candidateCueWord.getCueWordType())
                && candidateCueWord.getBegin().getTokenOffset() < conceptBeginTokenOffset) {
            cueWordsLeftOfConcept.add(candidateCueWord);
        }
    }
    if (!cueWordsLeftOfConcept.isEmpty()) {
        Collections.sort(cueWordsLeftOfConcept);
        final StringBuilder leftOrderingFeature = new StringBuilder("CWS_left");
        for (CueWordAnnotation orderedCueWord : cueWordsLeftOfConcept) {
            leftOrderingFeature.append("_").append(orderedCueWord.getCueWordType());
        }
        trainingInstance.addFeature(leftOrderingFeature.toString());
    }
}
// "CWS_right_<type>_<type>...": same construction for cue words beginning after the concept's
// end offset.
if (checkForEnabledFeature("cueWordOrderingsRight")) {
    final List<CueWordAnnotation> cueWordsRightOfConcept = new ArrayList<CueWordAnnotation>();
    for (Annotation lineAnnotation : allLineAnnotations) {
        if (!(lineAnnotation instanceof CueWordAnnotation)) {
            continue;
        }
        final CueWordAnnotation candidateCueWord = (CueWordAnnotation) lineAnnotation;
        if (inCWSCueWordSet(candidateCueWord.getCueWordType())
                && candidateCueWord.getBegin().getTokenOffset() > conceptEndTokenOffset) {
            cueWordsRightOfConcept.add(candidateCueWord);
        }
    }
    if (!cueWordsRightOfConcept.isEmpty()) {
        Collections.sort(cueWordsRightOfConcept);
        final StringBuilder rightOrderingFeature = new StringBuilder("CWS_right");
        for (CueWordAnnotation orderedCueWord : cueWordsRightOfConcept) {
            rightOrderingFeature.append("_").append(orderedCueWord.getCueWordType());
        }
        trainingInstance.addFeature(rightOrderingFeature.toString());
    }
}
//logger.info(String.format("lineNumber: %d%n", lineNumber);
String tokensOnCurrentLine[] = textLookup[lineNumber-1];
for (int currentTokenOffset=0; currentTokenOffset < tokensOnCurrentLine.length; currentTokenOffset++)
{
String currentToken = tokensOnCurrentLine[currentTokenOffset];
List<Annotation> annotationsAtCurrentPosition = indexer.findAnnotationsForPosition(lineNumber, currentTokenOffset);
int scopeCount = 0;
if (annotationsAtCurrentPosition != null)
for (Annotation a : annotationsAtCurrentPosition)
{
// Positional features for a concept annotation found at the current token position, relative
// to the concept under classification: "concept_<type>_left" when it begins before our concept,
// otherwise "concept_<type>_right"; the "_3" variants are added when the gap between the two
// spans is below 4 token offsets.
if (checkForEnabledFeature("concepts") && (a instanceof ConceptAnnotation)) {
    final ConceptAnnotation neighbourConcept = (ConceptAnnotation) a;
    final String neighbourFeaturePrefix = "concept_" + neighbourConcept.getConceptType().toString();
    final int neighbourBeginOffset = neighbourConcept.getBegin().getTokenOffset();
    final int neighbourEndOffset = neighbourConcept.getEnd().getTokenOffset();
    if (neighbourBeginOffset < conceptBeginTokenOffset) {
        trainingInstance.addFeature(neighbourFeaturePrefix + "_left");
        if (conceptBeginTokenOffset - neighbourEndOffset < 4) {
            trainingInstance.addFeature(neighbourFeaturePrefix + "_left_3");
        }
    } else {
        // Emission order on this side is "_right_3" first, then "_right".
        if (neighbourBeginOffset - conceptEndTokenOffset < 4) {
            trainingInstance.addFeature(neighbourFeaturePrefix + "_right_3");
        }
        trainingInstance.addFeature(neighbourFeaturePrefix + "_right");
    }
}
if (a instanceof ScopeAnnotation)
{
ScopeAnnotation scope = (ScopeAnnotation)a;
scopeCount++;
if (checkForEnabledFeature("scope"))
{
trainingInstance.addFeature("scope");
}
if (checkForEnabledFeature("inScope"))
{
trainingInstance.addFeature("in_scope_" + currentToken);
}
if (checkForEnabledFeature("inScopeId"))
{
trainingInstance.addFeature("in_scope_id_" + scope.getScopeId() + "_" + currentToken);
}
}
// A cue annotation at the current token position: positional features relative to the concept
// ("cue_<subtype>_left|right", plus "_3" variants below a distance of 4 token offsets) and the
// token-level "in_cue_" features.
if (a instanceof CueAnnotation)
{
    final CueAnnotation coveringCue = (CueAnnotation) a;
    if (checkForEnabledFeature("cue"))
    {
        final String cueFeaturePrefix = "cue_" + coveringCue.getCueSubType().toString();
        final int cueBeginOffset = coveringCue.getBegin().getTokenOffset();
        final boolean cueBeginsBeforeConcept = cueBeginOffset < conceptBeginTokenOffset;
        if (cueBeginsBeforeConcept) {
            trainingInstance.addFeature(cueFeaturePrefix + "_left");
            // Left distance is measured from the cue's begin to the concept's begin.
            if (conceptBeginTokenOffset - cueBeginOffset < 4) {
                trainingInstance.addFeature(cueFeaturePrefix + "_left_3");
            }
        } else {
            // Right distance is measured from the concept's end to the cue's end.
            final int cueEndOffset = coveringCue.getEnd().getTokenOffset();
            trainingInstance.addFeature(cueFeaturePrefix + "_right");
            if (cueEndOffset - conceptEndTokenOffset < 4) {
                trainingInstance.addFeature(cueFeaturePrefix + "_right_3");
            }
        }
    }
    if (checkForEnabledFeature("inCue"))
    {
        trainingInstance.addFeature("in_cue_" + currentToken);
    }
    if (checkForEnabledFeature("inCueForScopeId"))
    {
        trainingInstance.addFeature(
                "in_cue_for_scope_id_" + coveringCue.getScopeIdReference() + "_" + currentToken);
    }
}
// Features derived from a cue-word annotation found at the current token position.
//
// Positional families (text, type, class) say where the cue word sits relative to the concept
// under classification: "_left" when it begins before the concept, "_right" when it begins after
// the concept's end, "_within" otherwise; "_left_3" / "_right_3" are added as well when the
// distance to the concept boundary is below 4 token offsets. Value families record the cue
// word's text, type and class on their own.
if (a instanceof CueWordAnnotation)
{
    final CueWordAnnotation cueWord = (CueWordAnnotation) a;
    final String escapedCueWordText = escapeFeatureName(cueWord.getCueWordText());

    // The type may be absent (the type-value feature has always guarded against that); when it
    // is, type-based features are skipped instead of failing on a null dereference.
    final String cueWordType =
            (cueWord.getCueWordType() == null) ? null : cueWord.getCueWordType().toString();

    // Escape the class whenever one is present, independently of which class feature is enabled.
    // Tying this to "cueWordClassValue" would make "cueWordClassPositional" concatenate a null
    // and emit "cueWordClassPositional_null_..." whenever only the positional feature is on.
    final String cueWordClass = cueWord.getCueWordClass();
    final boolean cueWordClassIsNotEmpty = (cueWordClass != null) && (!cueWordClass.isEmpty());
    final String escapedCueWordClass =
            cueWordClassIsNotEmpty ? escapeFeatureName(cueWordClass) : null;

    final boolean emitTextPositional = checkForEnabledFeature("cueWordTextPositional");
    final boolean emitTypePositional =
            checkForEnabledFeature("cueWordTypePositional") && (cueWordType != null);
    final boolean emitClassPositional =
            checkForEnabledFeature("cueWordClassPositional") && cueWordClassIsNotEmpty;

    if (emitTextPositional || emitTypePositional || emitClassPositional)
    {
        // The relative position is the same for all three families, so work it out once.
        final int cueWordBegin = cueWord.getBegin().getTokenOffset();
        final int cueWordEnd = cueWord.getEnd().getTokenOffset();
        final String sideSuffix;
        final String nearSuffix; // null when the cue word is not within the 3-token window
        if (cueWordBegin < conceptBeginTokenOffset) {
            sideSuffix = "_left";
            nearSuffix = ((conceptBeginTokenOffset - cueWordBegin) < 4) ? "_left_3" : null;
        } else if (cueWordBegin > conceptEndTokenOffset) {
            sideSuffix = "_right";
            nearSuffix = ((cueWordEnd - conceptEndTokenOffset) < 4) ? "_right_3" : null;
        } else {
            sideSuffix = "_within";
            nearSuffix = null;
        }

        // Emission order is kept: text, then type, then class; side feature before "_3" feature.
        if (emitTextPositional)
        {
            trainingInstance.addFeature("cueWordTextPositional_" + escapedCueWordText + sideSuffix);
            if (nearSuffix != null) {
                trainingInstance.addFeature("cueWordTextPositional_" + escapedCueWordText + nearSuffix);
            }
        }
        if (emitTypePositional)
        {
            trainingInstance.addFeature("cueWordTypePositional_" + cueWordType + sideSuffix);
            if (nearSuffix != null) {
                trainingInstance.addFeature("cueWordTypePositional_" + cueWordType + nearSuffix);
            }
        }
        if (emitClassPositional)
        {
            trainingInstance.addFeature("cueWordClassPositional_" + escapedCueWordClass + sideSuffix);
            if (nearSuffix != null) {
                trainingInstance.addFeature("cueWordClassPositional_" + escapedCueWordClass + nearSuffix);
            }
        }
    }

    if (checkForEnabledFeature("cueWordTextValue"))
    {
        trainingInstance.addFeature("cueWordTextValue_" + escapedCueWordText);
    }
    if (checkForEnabledFeature("cueWordTypeValue") && cueWordType != null)
    {
        trainingInstance.addFeature("cueWordTypeValue_" + cueWordType);
    }
    if (checkForEnabledFeature("cueWordClassValue") && cueWordClassIsNotEmpty)
    {
        trainingInstance.addFeature("cueWordClassValue_" + escapedCueWordClass);
    }
}
// "zone_<name>" for a zone annotation at the current token position, with the zone name passed
// through escapeFeatureName.
if (checkForEnabledFeature("zone") && (a instanceof ZoneAnnotation))
{
    final ZoneAnnotation coveringZone = (ZoneAnnotation) a;
    trainingInstance.addFeature("zone_" + escapeFeatureName(coveringZone.getZoneName()));
}
}
if (scopeCount > 0)
{
if (checkForEnabledFeature("scopeCountNumber"))
{
trainingInstance.addFeature("scope_count_" + scopeCount);
}
if (checkForEnabledFeature("scopeCountEvenOrOdd"))
{
boolean scopeCountIsEven = (scopeCount % 2) == 0;
trainingInstance.addFeature("scope_count_" + (scopeCountIsEven ? "even" : "odd"));
}
}
}
//Features based on negation and speculation scopes enclosing the text of the entire training instance -Alex Yeh
int enclosingNegationScopeCnt = 0;
int enclosingSpeculationScopeCnt = 0;
AssertionAnnotation assertForTI = trainingInstance.getAssertAnnotateForTI();
//Count number of enclosing negation and speculation scopes
for (ScopeAnnotation enclosingScope : assertForTI.getEnclosingScopes())
{
CueAnnotation cueForScope = enclosingScope.getCueForScope();
CueSubType scopeType = cueForScope.getCueSubType();
if (scopeType == CueSubType.NEGATION) enclosingNegationScopeCnt++;
else if (scopeType == CueSubType.SPECULATION) enclosingSpeculationScopeCnt++;