Package at.ofai.gate.virtualdocuments

Source Code of at.ofai.gate.virtualdocuments.FeatureLanguageAnalyserPR

/*
*  FeatureLanguageAnalyserPR.java
*
*  This file is free software, licenced under the
*  GNU Library General Public License, Version 2, June 1991.
*  See http://www.gnu.org/licenses/gpl-2.0.html
*
*  $Id: $
*/

package at.ofai.gate.virtualdocuments;

import gate.Annotation;
import gate.AnnotationSet;
import gate.Controller;
import gate.Document;
import gate.Factory;
import gate.FeatureMap;
import gate.LanguageAnalyser;
import gate.corpora.DocumentImpl;
import gate.creole.AbstractLanguageAnalyser;
import gate.creole.ControllerAwarePR;
import gate.creole.ExecutionException;
import gate.creole.ResourceInstantiationException;
import gate.creole.metadata.CreoleParameter;
import gate.creole.metadata.CreoleResource;
import gate.creole.metadata.Optional;
import gate.creole.metadata.RunTime;
import gate.util.GateRuntimeException;

import gate.util.InvalidOffsetException;
import gate.util.Strings;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;

import javax.naming.InvalidNameException;

/**
* This PR runs a PR on a virtual document that is created from the values
* of a feature of all the annotations of some given annotation type.
* Each value in the virtual document is originall annotated with a given
* annotation type in a given annotation set. After processing, a specific
* feature set in another given annotation type in a given annotation set will
* be used to set a new feature in the annotation where the original value
* was taken from.
* <p>
* Runtime Parameters:
* <ul>
* <li>InputSpecification: the specification of annotation set, type and
* feature name where to take the text for the virtual document from in the
* form [set:]type[.feature] If no set is specified, the default annotationset
* is assumed, if no feature is specified, the underlying document string is
* used.
* <li>VirtualSpecification: the specification of the annotation in the
* virtual document that is created for each value, and the feature name
* where the final value is taken from in the form [set:]type.feature
* <li>OutputFeature: the name of the feature to be created in the input
* annotation from the feature in the virtual document created by the
* virtualSpecification. The output feature will always be added to the original
* annotation where we took the text from, if that feature already exists,
* the value is overwritten.
* </ul>
*
* @author Johann Petrak
*/
@CreoleResource(name = "Feature Language Analyser PR",
        comment = "Create a virtual document from feature values, run PR and set feature from the result")
public class FeatureLanguageAnalyserPR
  extends AbstractLanguageAnalyser
  implements LanguageAnalyser, ControllerAwarePR
{

  public static final long serialVersionUID = 1L;
 
  @RunTime
  @CreoleParameter(
    comment = "The specification of where the text comes from in the form [set:]type[.feature]",
    defaultValue = "")
  public void setInputSpecification(String ss) {
    this.inputSpecification = ss;
  }
 
  public String getInputSpecification() {
    return inputSpecification;
  }
  private String inputSpecification;

  private String inputSpecificationSet;
  private String inputSpecificationType;
  private String inputSpecificationFeature;
 
  @RunTime
  @CreoleParameter(
    comment = "The specification of the annotation and the feature to use in the virtual document ([set:]type.feature)",
    defaultValue = "")
  public void setVirtualSpecification(String ss) {
    this.virtualSpecification = ss;
  }
 
  public String getVirtualSpecification() {
    return virtualSpecification;
  }
  private String virtualSpecification;

  private String virtualSpecificationSet;
  private String virtualSpecificationType;
  private String virtualSpecificationFeature;
 
  @RunTime
  @CreoleParameter(
    comment = "the new feature to create in the original annotation",
    defaultValue = "")
  public void setOutputFeature(String name) {
    outputFeature = name;
  }
  public String getOutputFeature() {
    return outputFeature;
  }
  private String outputFeature;
 
  @RunTime
  @CreoleParameter(
    comment = "String to insert between the text extracted from the input spec",
    defaultValue = "\\n")
  public void setSeparatorString(String str) {
    separatorString = str;
  }
  public String getSeparatorString() {
    return separatorString;
  }
  private String separatorString = "\\n";
  private String actualSeparatorString = "\n";

  @RunTime
  @CreoleParameter(comment = "If true, keep virtual document", defaultValue = "false")
  public void setDebug(Boolean debug) {
    this.debug = debug;
  }
  public Boolean getDebug() {
    return this.debug;
  }
  private Boolean debug = false;

  /*
   * For now we do not do any backward mapping in addition to setting the
   * output feature
  @RunTime
  @Optional
  @CreoleParameter(
    comment = "A list of Annotation Set/type names to map back, default: all",
    defaultValue = "")
  public void setMapBackAnnotations(List<String> as) {
    this.mapBackAnnotations = as;
  }
  public List<String> getMapBackAnnotations() {
    return mapBackAnnotations;
  }
  private List<String> mapBackAnnotations;
  */
 

  @RunTime
  @CreoleParameter(
    comment = "A Language Analyser PR that will be run on the virtual document")
  public void setLanguageAnalyser(LanguageAnalyser theLA) {
    this.languageAnalyser = theLA;
  }
  public LanguageAnalyser getLanguageAnalyser() {
    return languageAnalyser;
  }
  LanguageAnalyser languageAnalyser;


  @Override
  public void execute()
    throws ExecutionException
  {
    if (corpus == null) {
      startup();
    }
    fireStatusChanged("FeatureLanguageAnalyserPR processing: "
            + getDocument().getName());


    if (!(document instanceof DocumentImpl)) {
      throw new GateRuntimeException("Can only handle DocumentImpl not " +
          document.getClass());
    }
   
    // Get the annotations in document order
    AnnotationSet anns =
      document.getAnnotations(inputSpecificationSet).get(inputSpecificationType);
    List<Annotation> annlist = gate.Utils.inDocumentOrder(anns);
    List<AnnotationSpec> annspecs = new LinkedList<AnnotationSpec>();
    StringBuilder newText = new StringBuilder();
    long curoffset = 0;
    // System.err.println("Processing original annotations: "+anns.size());
    for(Annotation ann : annlist) {
      String txt;
      if(inputSpecificationFeature == null) {
        txt = gate.Utils.stringFor(document, ann);
        newText.append(txt);
        annspecs.add(new AnnotationSpec(ann,curoffset,curoffset+txt.length(),ann.getId()));
        curoffset += txt.length();
        newText.append(actualSeparatorString);
        curoffset += actualSeparatorString.length();
      } else {
        txt = (String)ann.getFeatures().get(inputSpecificationFeature);
        if(txt != null) {
          newText.append(txt);
          annspecs.add(new AnnotationSpec(ann,curoffset,curoffset+txt.length(),ann.getId()));
          curoffset += txt.length();
          newText.append(actualSeparatorString);
          curoffset += actualSeparatorString.length();
        }
      }
    }
   
    FeatureMap theparms = Factory.newFeatureMap();
    theparms.put("collectRepositioningInfo", document.getCollectRepositioningInfo());
    theparms.put("encoding", ((DocumentImpl) document).getEncoding());
    theparms.put("markupAware", document.getMarkupAware());
    theparms.put("mimeType", ((DocumentImpl) document).getMimeType());
    theparms.put("preserveOriginalContent", document.getPreserveOriginalContent());
    theparms.put("stringContent", newText.toString());
    FeatureMap thefeats = Factory.newFeatureMap();
    FeatureMap docfeats = document.getFeatures();
    thefeats.putAll(docfeats);

    String theName = document.getName();
    // create a copy of the current document
    Document newDoc;
    try {
      newDoc = (Document) Factory.createResource(
              "gate.corpora.DocumentImpl",
              theparms,
              thefeats,
              theName+"_virtual");
    } catch (ResourceInstantiationException ex) {
      throw new GateRuntimeException(ex);
    }

    // set the initial annotations in the virtual document
    AnnotationSet newSet = newDoc.getAnnotations(virtualSpecificationSet);
    for(AnnotationSpec annspec : annspecs) {
      FeatureMap fm = Factory.newFeatureMap();
      fm.putAll(annspec.annotation.getFeatures());
      fm.put("orig_id",annspec.origId);     
      try {
        newSet.add(annspec.fromOffset, annspec.toOffset, virtualSpecificationType, fm);
      } catch(InvalidOffsetException ex) {
        throw new GateRuntimeException(
          "Invalid offset when creating annotation for virtual document: from/to/doclength: "+
          annspec.fromOffset+"/"+annspec.toOffset+"/"+newDoc.getContent().size(),ex);
      }
    }
   
    languageAnalyser.setDocument(newDoc);
    languageAnalyser.execute();

    // Go through the annotations in the generated document and map the created
    // feature back to the original document
    List<Annotation> virtanns =
    gate.Utils.inDocumentOrder(
      newDoc.getAnnotations(virtualSpecificationSet).get(virtualSpecificationType));
    // System.err.println("Processing virtual annotations: "+virtanns.size());
    for(Annotation virtann : virtanns) {
      String value = (String)virtann.getFeatures().get(virtualSpecificationFeature);
      Integer id = (Integer)virtann.getFeatures().get("orig_id");
      Annotation origann = anns.get(id);
      if(origann != null) {
        origann.getFeatures().put(outputFeature, value);
      } else {
        // This should never happen!
        System.err.println("Could not find original annotation with id:"+id);
      }
    }
   
    virtanns = null;

    if(!debug) {
      Factory.deleteResource(newDoc);
    }
    fireStatusChanged("FeatureLanguageAnalyserPR completed");

  }

@Override
public void controllerExecutionAborted(Controller arg0, Throwable arg1)
    throws ExecutionException {
}

@Override
public void controllerExecutionFinished(Controller arg0)
    throws ExecutionException {
}

@Override
public void controllerExecutionStarted(Controller arg0)
    throws ExecutionException {
  startup();
}

public void startup() throws ExecutionException {
  if(getInputSpecification() == null || getInputSpecification().isEmpty()) {
    throw new ExecutionException("InputSpecification must be specified");
  }
  if(getOutputFeature() == null || getOutputFeature().isEmpty()) {
    throw new ExecutionException("OutputFeature must be specified");
  }
  if(getVirtualSpecification() == null || getVirtualSpecification().isEmpty()) {
    throw new ExecutionException("VirtualSpecification must be specified");
  }
  if(languageAnalyser == null) {
    throw new ExecutionException("Language Analyser PR not set!");
  }
 
  String specfields[];
  specfields = splitSpecification(getInputSpecification());
  inputSpecificationSet = specfields[0];
  inputSpecificationType = specfields[1];
  inputSpecificationFeature = specfields[2];
 
  specfields = splitSpecification(getVirtualSpecification());
  virtualSpecificationSet = specfields[0];
  virtualSpecificationType = specfields[1];
  virtualSpecificationFeature = specfields[2];
  if(virtualSpecificationFeature == null) {
    throw new ExecutionException("VirtualSpecification must include a feature name");
  }
  if(separatorString != null && !separatorString.isEmpty()) {
    actualSeparatorString = Strings.unescape(separatorString);
  } else {
    actualSeparatorString = "";
  }
 
}
 
// split a specification of the form [set:]type[.name] in its three
// components and return an array of length 3 with each component.
// If a component is missing, the corresponding array entry is set to null
private String[] splitSpecification(String specification) {
  String[] ret = new String[3];
  String[] tmp = null;
  tmp = specification.split(":",2);
  String tmpstr = null;
  if(tmp.length == 1) {
    ret[0] = null;
    tmpstr = tmp[0];
  } else {
    ret[0] = tmp[0];
    tmpstr = tmp[1];
  }
  tmp = tmpstr.split("\\.",2);
  if(tmp.length == 1) {
    ret[1] = tmp[0];
    ret[2] = null;
  } else {
    ret[1] = tmp[0];
    ret[2] = tmp[1];
  }
  return ret;
}

private static class AnnotationSpec {
  public Annotation annotation;
  public long fromOffset;
  public long toOffset;
  public int origId;
  public AnnotationSpec(Annotation ann, long from, long to, int id) {
    annotation = ann;
    fromOffset = from;
    toOffset = to;
    origId = id;
  }
}


}
TOP

Related Classes of at.ofai.gate.virtualdocuments.FeatureLanguageAnalyserPR

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.