/*
* FeatureLanguageAnalyserPR.java
*
* This file is free software, licensed under the
* GNU Library General Public License, Version 2, June 1991.
* See http://www.gnu.org/licenses/gpl-2.0.html
*
* $Id: $
*/
package at.ofai.gate.virtualdocuments;
import gate.Annotation;
import gate.AnnotationSet;
import gate.Controller;
import gate.Document;
import gate.Factory;
import gate.FeatureMap;
import gate.LanguageAnalyser;
import gate.corpora.DocumentImpl;
import gate.creole.AbstractLanguageAnalyser;
import gate.creole.ControllerAwarePR;
import gate.creole.ExecutionException;
import gate.creole.ResourceInstantiationException;
import gate.creole.metadata.CreoleParameter;
import gate.creole.metadata.CreoleResource;
import gate.creole.metadata.Optional;
import gate.creole.metadata.RunTime;
import gate.util.GateRuntimeException;
import gate.util.InvalidOffsetException;
import gate.util.Strings;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import javax.naming.InvalidNameException;
/**
 * This PR runs another language analyser on a temporary "virtual" document
 * whose text is assembled from the annotations of one type in the document
 * being processed.
 * <p>
 * For every input annotation (in document order) either the covered document
 * text or the value of one of its features is appended to the virtual
 * document, followed by the separator string. Each appended value is
 * annotated in the virtual document with the type and set given by the
 * virtual specification; that annotation carries a copy of the features of
 * the original annotation plus the feature {@code orig_id} holding the id of
 * the original annotation. After the wrapped analyser has run, the value of
 * the feature named in the virtual specification is copied from every such
 * annotation to the output feature of the original annotation it was created
 * from.
 * <p>
 * Runtime Parameters:
 * <ul>
 * <li>InputSpecification: the specification of annotation set, type and
 * feature name where to take the text for the virtual document from, in the
 * form [set:]type[.feature]. If no set is specified, the default annotation
 * set is assumed; if no feature is specified, the underlying document string
 * is used.
 * <li>VirtualSpecification: the specification of the annotation in the
 * virtual document that is created for each value, and the feature name
 * where the final value is taken from, in the form [set:]type.feature
 * <li>OutputFeature: the name of the feature to be created in the input
 * annotation from the feature in the virtual document created by the
 * virtualSpecification. The output feature will always be added to the
 * original annotation where we took the text from; if that feature already
 * exists, the value is overwritten.
 * <li>SeparatorString: the string appended after each value in the virtual
 * document. Escape sequences are resolved with {@code Strings.unescape}; an
 * empty or missing value means no separator.
 * <li>Debug: if true, the virtual document is kept instead of being deleted
 * after processing.
 * <li>LanguageAnalyser: the language analyser that is run on the virtual
 * document.
 * </ul>
 *
 * @author Johann Petrak
 */
@CreoleResource(name = "Feature Language Analyser PR",
    comment = "Create a virtual document from feature values, run PR and set feature from the result")
public class FeatureLanguageAnalyserPR
    extends AbstractLanguageAnalyser
    implements LanguageAnalyser, ControllerAwarePR
{
  public static final long serialVersionUID = 1L;

  /** Name of the feature that links a virtual annotation to its original annotation. */
  private static final String ORIG_ID_FEATURE = "orig_id";

  /**
   * Sets the input specification.
   *
   * @param ss specification in the form [set:]type[.feature]
   */
  @RunTime
  @CreoleParameter(
      comment = "The specification of where the text comes from in the form [set:]type[.feature]",
      defaultValue = "")
  public void setInputSpecification(String ss) {
    this.inputSpecification = ss;
  }

  /** @return the input specification as set by the user */
  public String getInputSpecification() {
    return inputSpecification;
  }

  private String inputSpecification;
  // Components of inputSpecification, filled in by startup().
  private String inputSpecificationSet;
  private String inputSpecificationType;
  private String inputSpecificationFeature;

  /**
   * Sets the virtual specification.
   *
   * @param ss specification in the form [set:]type.feature
   */
  @RunTime
  @CreoleParameter(
      comment = "The specification of the annotation and the feature to use in the virtual document ([set:]type.feature)",
      defaultValue = "")
  public void setVirtualSpecification(String ss) {
    this.virtualSpecification = ss;
  }

  /** @return the virtual specification as set by the user */
  public String getVirtualSpecification() {
    return virtualSpecification;
  }

  private String virtualSpecification;
  // Components of virtualSpecification, filled in by startup().
  private String virtualSpecificationSet;
  private String virtualSpecificationType;
  private String virtualSpecificationFeature;

  /**
   * Sets the name of the feature that receives the result in the original
   * annotation.
   *
   * @param name the feature name
   */
  @RunTime
  @CreoleParameter(
      comment = "the new feature to create in the original annotation",
      defaultValue = "")
  public void setOutputFeature(String name) {
    outputFeature = name;
  }

  /** @return the name of the output feature */
  public String getOutputFeature() {
    return outputFeature;
  }

  private String outputFeature;

  /**
   * Sets the separator appended after each value in the virtual document.
   *
   * @param str the separator, possibly containing escape sequences
   */
  @RunTime
  @CreoleParameter(
      comment = "String to insert between the text extracted from the input spec",
      defaultValue = "\\n")
  public void setSeparatorString(String str) {
    separatorString = str;
  }

  /** @return the separator string exactly as set (still escaped) */
  public String getSeparatorString() {
    return separatorString;
  }

  private String separatorString = "\\n";
  // The unescaped separator that is really written; computed by startup().
  private String actualSeparatorString = "\n";

  /**
   * Controls whether the virtual document survives processing.
   *
   * @param debug if true, the virtual document is not deleted
   */
  @RunTime
  @CreoleParameter(comment = "If true, keep virtual document", defaultValue = "false")
  public void setDebug(Boolean debug) {
    this.debug = debug;
  }

  /** @return the debug flag as set */
  public Boolean getDebug() {
    return this.debug;
  }

  private Boolean debug = false;

  /*
   * For now we do not do any backward mapping in addition to setting the
   * output feature
  @RunTime
  @Optional
  @CreoleParameter(
  comment = "A list of Annotation Set/type names to map back, default: all",
  defaultValue = "")
  public void setMapBackAnnotations(List<String> as) {
  this.mapBackAnnotations = as;
  }
  public List<String> getMapBackAnnotations() {
  return mapBackAnnotations;
  }
  private List<String> mapBackAnnotations;
  */

  /**
   * Sets the analyser that is run on the virtual document.
   *
   * @param theLA the wrapped language analyser
   */
  @RunTime
  @CreoleParameter(
      comment = "A Language Analyser PR that will be run on the virtual document")
  public void setLanguageAnalyser(LanguageAnalyser theLA) {
    this.languageAnalyser = theLA;
  }

  /** @return the wrapped language analyser */
  public LanguageAnalyser getLanguageAnalyser() {
    return languageAnalyser;
  }

  LanguageAnalyser languageAnalyser;

  /**
   * Builds the virtual document for the current document, runs the wrapped
   * analyser on it and copies the resulting feature values back to the
   * original annotations.
   *
   * @throws ExecutionException if no document is set, if a parameter is
   *         missing or malformed, or if the wrapped analyser fails
   * @throws GateRuntimeException if the document is not a
   *         {@code DocumentImpl} or the virtual document cannot be built
   */
  @Override
  public void execute()
      throws ExecutionException
  {
    if (document == null) {
      throw new ExecutionException("No document to process!");
    }
    // Validate and parse on every call: this is cheap and makes the PR work
    // whether or not a controller invoked controllerExecutionStarted().
    startup();
    fireStatusChanged("FeatureLanguageAnalyserPR processing: "
        + document.getName());
    if (!(document instanceof DocumentImpl)) {
      throw new GateRuntimeException("Can only handle DocumentImpl not " +
          document.getClass());
    }

    AnnotationSet anns =
        document.getAnnotations(inputSpecificationSet).get(inputSpecificationType);
    List<AnnotationSpec> annspecs = new ArrayList<AnnotationSpec>();
    String virtualText = buildVirtualText(anns, annspecs);

    Document newDoc = createVirtualDocument(virtualText);
    try {
      addVirtualAnnotations(newDoc, annspecs);
      languageAnalyser.setDocument(newDoc);
      languageAnalyser.execute();
      copyResultsBack(newDoc, anns);
    } finally {
      // Never leave the wrapped analyser pointing at the temporary document,
      // and release that document even when processing failed.
      languageAnalyser.setDocument(null);
      if (!Boolean.TRUE.equals(debug)) {
        Factory.deleteResource(newDoc);
      }
    }
    fireStatusChanged("FeatureLanguageAnalyserPR completed");
  }

  /**
   * Concatenates the input values in document order, each followed by the
   * separator, and records where each value ends up in the new text.
   *
   * @param anns the input annotations
   * @param annspecs receives one entry per value that was appended
   * @return the text of the virtual document
   */
  private String buildVirtualText(AnnotationSet anns, List<AnnotationSpec> annspecs) {
    StringBuilder text = new StringBuilder();
    for (Annotation ann : gate.Utils.inDocumentOrder(anns)) {
      String value;
      if (inputSpecificationFeature == null) {
        value = gate.Utils.stringFor(document, ann);
      } else {
        Object featureValue = ann.getFeatures().get(inputSpecificationFeature);
        if (featureValue == null) {
          // annotations without the feature contribute nothing
          continue;
        }
        // feature values may be arbitrary objects, not only strings
        value = featureValue.toString();
      }
      long from = text.length();
      text.append(value);
      annspecs.add(new AnnotationSpec(ann, from, text.length()));
      text.append(actualSeparatorString);
    }
    return text.toString();
  }

  /**
   * Creates the virtual document with the given text, taking over the
   * creation parameters, features and (suffixed) name of the current
   * document.
   *
   * @param text the content of the virtual document
   * @return the newly created document
   */
  private Document createVirtualDocument(String text) {
    FeatureMap theparms = Factory.newFeatureMap();
    theparms.put("collectRepositioningInfo", document.getCollectRepositioningInfo());
    theparms.put("encoding", ((DocumentImpl) document).getEncoding());
    theparms.put("markupAware", document.getMarkupAware());
    theparms.put("mimeType", ((DocumentImpl) document).getMimeType());
    theparms.put("preserveOriginalContent", document.getPreserveOriginalContent());
    theparms.put("stringContent", text);
    FeatureMap thefeats = Factory.newFeatureMap();
    thefeats.putAll(document.getFeatures());
    try {
      return (Document) Factory.createResource(
          "gate.corpora.DocumentImpl",
          theparms,
          thefeats,
          document.getName() + "_virtual");
    } catch (ResourceInstantiationException ex) {
      throw new GateRuntimeException(ex);
    }
  }

  /**
   * Adds one annotation of the virtual type per recorded value to the
   * virtual document. Each annotation gets a copy of the original features
   * plus the id of the original annotation.
   *
   * @param newDoc the virtual document
   * @param annspecs the recorded values and their offsets
   */
  private void addVirtualAnnotations(Document newDoc, List<AnnotationSpec> annspecs) {
    AnnotationSet newSet = newDoc.getAnnotations(virtualSpecificationSet);
    for (AnnotationSpec annspec : annspecs) {
      FeatureMap fm = Factory.newFeatureMap();
      fm.putAll(annspec.annotation.getFeatures());
      fm.put(ORIG_ID_FEATURE, annspec.annotation.getId());
      try {
        newSet.add(annspec.fromOffset, annspec.toOffset, virtualSpecificationType, fm);
      } catch (InvalidOffsetException ex) {
        throw new GateRuntimeException(
            "Invalid offset when creating annotation for virtual document: from/to/doclength: " +
            annspec.fromOffset + "/" + annspec.toOffset + "/" + newDoc.getContent().size(), ex);
      }
    }
  }

  /**
   * Copies the result feature of every annotation of the virtual type back
   * to the output feature of the original annotation it refers to.
   *
   * @param newDoc the processed virtual document
   * @param anns the input annotations of the original document
   */
  private void copyResultsBack(Document newDoc, AnnotationSet anns) {
    AnnotationSet virtualSet =
        newDoc.getAnnotations(virtualSpecificationSet).get(virtualSpecificationType);
    for (Annotation virtann : gate.Utils.inDocumentOrder(virtualSet)) {
      FeatureMap virtfeats = virtann.getFeatures();
      Object id = virtfeats.get(ORIG_ID_FEATURE);
      Annotation origann = null;
      if (id instanceof Integer) {
        origann = anns.get((Integer) id);
      }
      if (origann != null) {
        // the result is stored as is; it need not be a String
        origann.getFeatures().put(outputFeature, virtfeats.get(virtualSpecificationFeature));
      } else {
        // Not expected for the annotations created by this PR; it can only
        // be reached when the id feature is missing, altered or unknown.
        System.err.println("Could not find original annotation with id:" + id);
      }
    }
  }

  /** Nothing to do when controller execution is aborted. */
  @Override
  public void controllerExecutionAborted(Controller arg0, Throwable arg1)
      throws ExecutionException {
  }

  /** Nothing to do when controller execution finishes. */
  @Override
  public void controllerExecutionFinished(Controller arg0)
      throws ExecutionException {
  }

  /**
   * Validates the parameters when controller execution starts, so that a
   * configuration problem is reported before any document is processed.
   */
  @Override
  public void controllerExecutionStarted(Controller arg0)
      throws ExecutionException {
    startup();
  }

  /**
   * Validates the runtime parameters and splits the two specifications into
   * their components. The parsed state is only replaced after all checks
   * have passed.
   *
   * @throws ExecutionException if a required parameter is missing or a
   *         specification lacks a required component
   */
  public void startup() throws ExecutionException {
    if (getInputSpecification() == null || getInputSpecification().isEmpty()) {
      throw new ExecutionException("InputSpecification must be specified");
    }
    if (getOutputFeature() == null || getOutputFeature().isEmpty()) {
      throw new ExecutionException("OutputFeature must be specified");
    }
    if (getVirtualSpecification() == null || getVirtualSpecification().isEmpty()) {
      throw new ExecutionException("VirtualSpecification must be specified");
    }
    if (languageAnalyser == null) {
      throw new ExecutionException("Language Analyser PR not set!");
    }

    String[] inputFields = splitSpecification(getInputSpecification());
    if (inputFields[1].isEmpty()) {
      throw new ExecutionException("InputSpecification must include an annotation type");
    }
    if (inputFields[2] != null && inputFields[2].isEmpty()) {
      throw new ExecutionException("InputSpecification has an empty feature name");
    }
    String[] virtualFields = splitSpecification(getVirtualSpecification());
    if (virtualFields[1].isEmpty()) {
      throw new ExecutionException("VirtualSpecification must include an annotation type");
    }
    if (virtualFields[2] == null || virtualFields[2].isEmpty()) {
      throw new ExecutionException("VirtualSpecification must include a feature name");
    }

    inputSpecificationSet = inputFields[0];
    inputSpecificationType = inputFields[1];
    inputSpecificationFeature = inputFields[2];
    virtualSpecificationSet = virtualFields[0];
    virtualSpecificationType = virtualFields[1];
    virtualSpecificationFeature = virtualFields[2];
    if (separatorString != null && !separatorString.isEmpty()) {
      actualSeparatorString = Strings.unescape(separatorString);
    } else {
      actualSeparatorString = "";
    }
  }

  /**
   * Splits a specification of the form [set:]type[.name] into its three
   * components. The set ends at the first colon, the type at the first dot
   * after that.
   *
   * @param specification the specification to split, not null
   * @return an array of length 3 holding set, type and name; the entry of a
   *         component whose delimiter is absent is null
   */
  private static String[] splitSpecification(String specification) {
    String[] ret = new String[3];
    String rest = specification;
    int colon = rest.indexOf(':');
    if (colon >= 0) {
      ret[0] = rest.substring(0, colon);
      rest = rest.substring(colon + 1);
    }
    int dot = rest.indexOf('.');
    if (dot >= 0) {
      ret[1] = rest.substring(0, dot);
      ret[2] = rest.substring(dot + 1);
    } else {
      ret[1] = rest;
    }
    return ret;
  }

  /**
   * An original annotation together with the offsets its value occupies in
   * the virtual document.
   */
  private static final class AnnotationSpec {
    final Annotation annotation;
    final long fromOffset;
    final long toOffset;

    AnnotationSpec(Annotation ann, long from, long to) {
      annotation = ann;
      fromOffset = from;
      toOffset = to;
    }
  }
}