// Package at.ofai.gate.extendedgazetteer

// Source Code of at.ofai.gate.extendedgazetteer.IndirectExtendedGazetteer

/*
* IndirectExtendedGazetteer.java
*
* $Id: IndirectExtendedGazetteer.java  $
*
*/
package at.ofai.gate.extendedgazetteer;

// TODO: allow extended config files: special lines in the def file to
// e.g. define which features are allowed at all, the feature string
// separator (instead of using the parameter), which features to intern,
// which features to interpret as uriRefs, the baseURI for all uriRefs etc.
// How: if the first line consists of a single non-whitespace character,
// this character is to be interpreted as the extended config line character.
// All lines starting with this character are extended config lines and follow
// the syntax of property files (should we use YAML?)
// inside an extended config line the "#" character is a comment so
// #
// ## this is a comment
//

// TODO: allow to specify the output Annotation type in addition to the output
// annotation set. It might also be interesting to overwrite this from
// the extended config file, potentially for each lst file separately:
// #
// # outputAnnotationType = Lookup1
// filename1.lst
// filename2.lst
// # outputAnnotationType = Lookup2

// TODO: optionally allow string interning - maybe just with extended config files?


// TODO: better case mapping by case-normalizing the document content string
// or the gazetteer input string instead of individual characters
// (after normalization, see below).
// TODO: do Unicode string normalization (optionally) using
// java.text.Normalizer (Normalizer.normalize("a\u0301", Normalizer.Form.NFC).equals("á"))
// for both the document string and the gazetteer strings.
// PROBLEM:  these two would change the length of the content string so
// we would have to map offsets.
// Maybe it would be better to instead make this gazetteer work as a
// flexible gazetteer by including part of the virtualdocuments logic:
// Optionally specify an annotation specification and do on-the-fly generation
// of the new content and mapping of the generated annotations.
// This would be useful in itself but also allow to preprocess the document
// so that e.g. the tokens get a "normalizedUCString" feature that has normalized
// Unicode and uppercase representation for the gazetteer



// NOTE: for storing lots of features without the need to fit into memory:
//   Possibly useful: Berkeley DB Java Edition
//   http://www.oracle.com/technetwork/database/berkeleydb/downloads/index.html?ssSourceSiteId=ocomen
//   or this: http://code.google.com/p/orient/
//

// NOTE: VR for default gazetteer cannot be overwritten/disabled

import at.ofai.gate.virtualdocuments.AnnotatedDocumentTransformer;
import java.util.*;

import gate.*;
import gate.corpora.DocumentImpl;
import gate.creole.*;
import gate.creole.metadata.CreoleParameter;
import gate.creole.metadata.CreoleResource;
import gate.creole.metadata.HiddenCreoleParameter;
import gate.creole.metadata.Optional;
import gate.creole.metadata.RunTime;
import gate.util.*;
import javax.naming.InvalidNameException;


/**
*  See documentation in the wiki:
* http://code.google.com/p/gateplugin-stringannotation/wiki/IndirectExtendedGazetteerOld
*
@author Johann Petrak
*/
@CreoleResource(
  name = "Indirect Extended List Gazetteer OLD",
  comment = "The Extended List Gazetteer running on a virtual document according to an annotation specification",
  icon="shefGazetteer.gif",
  helpURL="http://code.google.com/p/gateplugin-stringannotation/wiki/IndirectExtendedGazetteerOld" 
)
public class IndirectExtendedGazetteer
  extends AbstractExtendedGazetteer
  implements LanguageAnalyser, ControllerAwarePR {
  private static final long serialVersionUID = 1L;

  @RunTime
  @Optional
  @CreoleParameter(
    comment = "The input annotation set for which to process the specifications",
  defaultValue = "")
  public void setInputAnnotationSetName(String ias) {
    this.inputAnnotationSetName = ias;
  }
  public String getInputAnnotationSetName() {
    return inputAnnotationSetName;
  }
  private String inputAnnotationSetName = "";
 
 
  @RunTime
  @CreoleParameter(comment = "A list of annotation specifications",
      defaultValue = "")
  public void setAnnotationSpecifications(List<String> ss) {
    this.annotationSpecifications = ss;
 
  public List<String> getAnnotationSpecifications() {
    return annotationSpecifications;
  }
  private List<String> annotationSpecifications;
 
  @RunTime
  @CreoleParameter(comment = "Keep the virtual document(s) for debugging",
      defaultValue = "false")
  public void setDebug(Boolean parm) {
    debug = parm;
 
  public Boolean getDebug() {
    return debug;
  }
  private Boolean debug = false;
 
  @RunTime
  @CreoleParameter(comment = "Insert a single blank space between what is selected according to the annotation specifications",
      defaultValue = "true")
  public void setInsertSpace(Boolean parm) {
    insertSpace = parm;
 
  public Boolean getInsertSpace() {
    return insertSpace;
  }
  private Boolean insertSpace = true;
 

  @HiddenCreoleParameter
  @Override
  public void setContainingAnnotationSpec(String spec) {
    // just here to hide the parameter
  }
 
 
 
  private FeatureMap processingOptions = Factory.newFeatureMap();
 
  protected AnnotatedDocumentTransformer annotatedDocumentTransformer = null;
 
  @Override
  public void execute() throws ExecutionException {
    if(corpus == null) { startup(); }
    fireStatusChanged("IndirectLanguageAnalyserPR processing: "
            + getDocument().getName());


    if (!(document instanceof DocumentImpl)) {
      throw new GateRuntimeException("Can only handle DocumentImpl not " +
          document.getClass());
    }
    String newText = annotatedDocumentTransformer.getStringForDocument(
            getDocument(), inputAnnotationSetName);
    FeatureMap theparms = Factory.newFeatureMap();
    theparms.put("encoding", ((DocumentImpl) document).getEncoding());
    theparms.put("stringContent", newText);
    FeatureMap thefeats = Factory.newFeatureMap();
    FeatureMap docfeats = document.getFeatures();
    thefeats.putAll(docfeats);

    String theName = document.getName();
    // create a copy of the current document
    Document newDoc;
    try {
      newDoc = (Document) Factory.createResource(
              "gate.corpora.DocumentImpl",
              theparms,
              thefeats,
              theName+"_virtual");
    } catch (ResourceInstantiationException ex) {
      throw new GateRuntimeException(ex);
    }

    doExecute(newDoc);

    List<String> effectiveMapFromAnnsetNames = new ArrayList<String>();
    if(getAnnotationSetName() == null) {
      effectiveMapFromAnnsetNames.add("");     
    } else {
      effectiveMapFromAnnsetNames.add(getAnnotationSetName());
    }
    //System.out.println("Mapping back annotations from "+effectiveMapFromAnnsetNames);
    annotatedDocumentTransformer.addBackMappedAnnotations(
              document, newDoc,
              effectiveMapFromAnnsetNames);
   

    if(!debug) {
      Factory.deleteResource(newDoc);
    }
    fireStatusChanged("IndirectExtendedGazetteer completed");

  }   
 

  @Override
  public void controllerExecutionAborted(Controller arg0, Throwable arg1)
    throws ExecutionException {
  }

  @Override
  public void controllerExecutionFinished(Controller arg0)
    throws ExecutionException {
  }

  @Override
  public void controllerExecutionStarted(Controller arg0)
    throws ExecutionException {
    startup();
  }

  public void startup() throws ExecutionException {
    if (getAnnotationSpecifications() == null || getAnnotationSpecifications().size() == 0) {
      throw new ExecutionException("SourceSpecifications must not be empty");
    }
    if(insertSpace) {
      processingOptions.put("separator"," ");
    } else {
      if(processingOptions.containsKey("separator")) {
        processingOptions.remove("separator");
      }
    }
    try {
      annotatedDocumentTransformer =
        new AnnotatedDocumentTransformer(
        getAnnotationSpecifications(), processingOptions,
        false, true);
    } catch (InvalidNameException ex) {
      throw new ExecutionException(ex);
    }

  }

} // ExtendedGazetteer
// TOP
//
// Related Classes of at.ofai.gate.extendedgazetteer.IndirectExtendedGazetteer
//
// TOP
// Copyright © 2018 www.massapi.com. All rights reserved.
// All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.