return aggregateBuilder;
}
protected AggregateBuilder getXMIWritingPreprocessorAggregateBuilder()
throws Exception {
AggregateBuilder aggregateBuilder = new AggregateBuilder();
aggregateBuilder.add(UriToDocumentTextAnnotator.getDescription());
// read manual annotations into gold view
aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
ViewCreatorAnnotator.class,
ViewCreatorAnnotator.PARAM_VIEW_NAME,
GOLD_VIEW_NAME));
aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
ViewTextCopierAnnotator.class,
ViewTextCopierAnnotator.PARAM_SOURCE_VIEW_NAME,
CAS.NAME_DEFAULT_SOFA,
ViewTextCopierAnnotator.PARAM_DESTINATION_VIEW_NAME,
GOLD_VIEW_NAME));
switch (this.xmlFormat) {
case Anafora:
aggregateBuilder.add(
THYMEAnaforaXMLReader.getDescription(this.xmlDirectory),
CAS.NAME_DEFAULT_SOFA,
GOLD_VIEW_NAME);
break;
case Knowtator:
aggregateBuilder.add(
THYMEKnowtatorXMLReader.getDescription(this.xmlDirectory),
CAS.NAME_DEFAULT_SOFA,
GOLD_VIEW_NAME);
break;
case I2B2:
aggregateBuilder.add(
I2B2TemporalXMLReader.getDescription(this.xmlDirectory),
CAS.NAME_DEFAULT_SOFA,
GOLD_VIEW_NAME);
break;
}
// identify segments
if(this.xmlFormat == XMLFormat.I2B2){
aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(SimpleSegmentAnnotator.class));
}else{
aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(SegmentsFromBracketedSectionTagsAnnotator.class));
}
// identify sentences
aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
SentenceDetector.class,
SentenceDetector.SD_MODEL_FILE_PARAM,
"org/apache/ctakes/core/sentdetect/sd-med-model.zip"));
// identify tokens
aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(TokenizerAnnotatorPTB.class));
// merge some tokens
aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(ContextDependentTokenizerAnnotator.class));
// identify part-of-speech tags
aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
POSTagger.class,
TypeSystemDescriptionFactory.createTypeSystemDescription(),
TypePrioritiesFactory.createTypePriorities(Segment.class, Sentence.class, BaseToken.class),
POSTagger.POS_MODEL_FILE_PARAM,
"org/apache/ctakes/postagger/models/mayo-pos.zip"));
// identify chunks
aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
Chunker.class,
Chunker.CHUNKER_MODEL_FILE_PARAM,
FileLocator.locateFile("org/apache/ctakes/chunker/models/chunker-model.zip"),
Chunker.CHUNKER_CREATOR_CLASS_PARAM,
DefaultChunkCreator.class));
// identify UMLS named entities
// adjust NP in NP NP to span both
aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
ChunkAdjuster.class,
ChunkAdjuster.PARAM_CHUNK_PATTERN,
new String[] { "NP", "NP" },
ChunkAdjuster.PARAM_EXTEND_TO_INCLUDE_TOKEN,
1));
// adjust NP in NP PP NP to span all three
aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
ChunkAdjuster.class,
ChunkAdjuster.PARAM_CHUNK_PATTERN,
new String[] { "NP", "PP", "NP" },
ChunkAdjuster.PARAM_EXTEND_TO_INCLUDE_TOKEN,
2));
// add lookup windows for each NP
aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(CopyNPChunksToLookupWindowAnnotations.class));
// maximize lookup windows
aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
OverlapAnnotator.class,
"A_ObjectClass",
LookupWindowAnnotation.class,
"B_ObjectClass",
LookupWindowAnnotation.class,
"OverlapType",
"A_ENV_B",
"ActionType",
"DELETE",
"DeleteAction",
new String[] { "selector=B" }));
// add UMLS on top of lookup windows
aggregateBuilder.add(
UmlsDictionaryLookupAnnotator.createAnnotatorDescription()
);
// add lvg annotator
String[] XeroxTreebankMap = {
"adj|JJ",
"adv|RB",
"aux|AUX",
"compl|CS",
"conj|CC",
"det|DET",
"modal|MD",
"noun|NN",
"prep|IN",
"pron|PRP",
"verb|VB" };
String[] ExclusionSet = {
"and",
"And",
"by",
"By",
"for",
"For",
"in",
"In",
"of",
"Of",
"on",
"On",
"the",
"The",
"to",
"To",
"with",
"With" };
AnalysisEngineDescription lvgAnnotator = AnalysisEngineFactory.createPrimitiveDescription(
LvgAnnotator.class,
"UseSegments",
false,
"SegmentsToSkip",
new String[0],
"UseCmdCache",
false,
"CmdCacheFileLocation",
"/org/apache/ctakes/lvg/2005_norm.voc",
"CmdCacheFrequencyCutoff",
20,
"ExclusionSet",
ExclusionSet,
"XeroxTreebankMap",
XeroxTreebankMap,
"LemmaCacheFileLocation",
"/org/apache/ctakes/lvg/2005_lemma.voc",
"UseLemmaCache",
false,
"LemmaCacheFrequencyCutoff",
20,
"PostLemmas",
false,
"LvgCmdApi",
ExternalResourceFactory.createExternalResourceDescription(
LvgCmdApiResourceImpl.class,
new File(LvgCmdApiResourceImpl.class.getResource(
"/org/apache/ctakes/lvg/data/config/lvg.properties").toURI())));
aggregateBuilder.add(lvgAnnotator);
// add dependency parser
aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(ClearNLPDependencyParserAE.class));
// add semantic role labeler
aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(ClearNLPSemanticRoleLabelerAE.class));
// add gold standard parses to gold view, and adjust gold view to correct a few annotation mis-steps
if(this.treebankDirectory != null){
aggregateBuilder.add(THYMETreebankReader.getDescription(this.treebankDirectory));
aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(TimexAnnotationCorrector.class));
}else{
// add ctakes constituency parses to system view
aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(ConstituencyParser.class,
ConstituencyParser.PARAM_MODEL_FILENAME,
"org/apache/ctakes/constituency/parser/models/sharpacq-3.1.bin"));
// aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(BerkeleyParserWrapper.class,
// BerkeleyParserWrapper.PARAM_MODEL_FILENAME,
//
// "org/apache/ctakes/constituency/parser/models/thyme.gcg.4sm.bin"));
// "org/apache/ctakes/constituency/parser/models/thyme.4sm.bin"));
}
// write out the CAS after all the above annotations
aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
XMIWriter.class,
XMIWriter.PARAM_XMI_DIRECTORY,
this.xmiDirectory));
return aggregateBuilder;