Package uk.ac.cam.ch.wwmm.oscar3.recogniser.document

Examples of uk.ac.cam.ch.wwmm.oscar3.recogniser.document.ProcessingDocument
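
Most of the excerpts below share one pattern: a SciXML Document is tokenised into a ProcessingDocument via ProcessingDocumentFactory, and the resulting TokenSequence objects are then iterated token by token. The following minimal sketch (not taken from the Oscar3 sources; the method name printTokens is hypothetical) shows that pattern in isolation. It assumes Token and TokenSequence live in the same package as ProcessingDocument, and the boolean flags passed to makeTokenisedDocument simply mirror those used in the excerpts.

  // Minimal sketch (not from the Oscar3 sources): tokenise a document and walk its tokens.
  // Assumes nu.xom.Document, and Token/TokenSequence from the same package as ProcessingDocument.
  static void printTokens(Document doc) throws Exception {
    ProcessingDocument procDoc =
        ProcessingDocumentFactory.getInstance().makeTokenisedDocument(doc, false, false, false);
    for(TokenSequence tokSeq : procDoc.getTokenSequences()) {
      for(Token token : tokSeq.getTokens()) {
        System.out.println(token.getValue());   // surface string of each token
      }
    }
  }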


 
  /** Builds a NamedEntity spanning the given tokens, typed with a "GENIA-" prefix. */
  private static NamedEntity makeNE(List<Token> neTokens, String neType) {
    Token firstToken = neTokens.get(0);
    Token lastToken = neTokens.get(neTokens.size()-1);
    // The surface string is the substring of the token sequence from the first to the last token
    String surf = firstToken.getTokenSequence().getSubstring(firstToken.getId(), lastToken.getId());
    return new NamedEntity(neTokens, surf, "GENIA-" + neType);
  }


  Document annotateDoc(Document doc) throws Exception {
    ProcessingDocument procDoc = ProcessingDocumentFactory.getInstance().makeTokenisedDocument(doc, false, false, false);
    //NameRecogniser nr = new NameRecogniser();
    //nr.halfProcess(doc);
    //nr.makeTokenisers(false);
    Element safholder = new Element("saf");
    Document safDoc = new Document(safholder);
    for(TokenSequence t : procDoc.getTokenSequences()) {
      for(Token token : t.getTokens()) {
        //System.out.println(token.getValue());
        String value = token.getValue();
        value = value.toLowerCase();
        if(prwStrings.contains(value)) {

    /*NameRecogniser nr = new NameRecogniser();
    nr.halfProcess(doc);

    nr.makeTokenisers(false);*/
   
    ProcessingDocument procDoc = ProcessingDocumentFactory.getInstance().makeTokenisedDocument(doc, true, false, false);
   
    List<NamedEntity> entities = new ArrayList<NamedEntity>();

    Set<String> testNEs = new HashSet<String>();
   
    for(TokenSequence tokSeq : procDoc.getTokenSequences()) {
      Nodes neNodes = tokSeq.getElem().query(".//ne");
      for(int k=0;k<neNodes.size();k++) {
        Element neElem = (Element)neNodes.get(k);
        String neStr = "[NE:" + neElem.getAttributeValue("type") + ":" + neElem.getAttributeValue("xtspanstart") + ":" + neElem.getAttributeValue("xtspanend") + ":" + neElem.getValue() + "]";
        testNEs.add(neStr);

   
   * @param sourceDoc The source SciXML document.
   * @return A SAF XML document containing sentence annotations and per-token GENIA annotations.
   * @throws Exception
   */
  public Document runGenia(Document sourceDoc) throws Exception {
    ProcessingDocument procDoc = ProcessingDocumentFactory.getInstance().makeTokenisedDocument(sourceDoc, false, false, true);
    Document safDoc = new Document(new Element("saf"));
    for(List<Token> sentence : procDoc.getSentences()) {
      if(sentence.size() > 0) {
        Token first = sentence.get(0);
        Token last = sentence.get(sentence.size()-1);
        Element sentenceAnnot = SafTools.makeAnnot(first.getStartXPoint(), last.getEndXPoint(), "sentence");
        safDoc.getRootElement().appendChild(sentenceAnnot);
      }
    }
    for(TokenSequence ts : procDoc.getTokenSequences()) {
      for(Token t : ts.getTokens()) {
        Element safElem = SafTools.makeAnnot(t.getStartXPoint(), t.getEndXPoint(), "genia");
        SafTools.setSlot(safElem, "surface", t.getValue());
        SafTools.setSlot(safElem, "stem", t.getGeniaData()[1]);
        SafTools.setSlot(safElem, "tag", t.getGeniaData()[2]);

        for(int i=0;i<n.size();i++) XOMTools.removeElementPreservingText((Element)n.get(i));
        Document safDoc = InlineToSAF.extractSAFs(doc, copy, "foo");
        doc = copy;
       
       
        ProcessingDocument procDoc = ProcessingDocumentFactory.getInstance().makeTokenisedDocument(doc, true, false, false, safDoc);
        //NameRecogniser nr = new NameRecogniser();
        //nr.halfProcess(doc);
        //n = doc.query(XMLStrings.CHEMICAL_PLACES_XPATH);
        if(Oscar3Props.getInstance().verbose) System.out.println(f);
        for(TokenSequence tokSeq : procDoc.getTokenSequences()) {
          afterHyphen.addAll(tokSeq.getAfterHyphens());
          Map<String, List<List<String>>> neMap = tokSeq.getNes();
          List<List<String>> neList = new ArrayList<List<String>>();
          if(neMap.containsKey(NETypes.COMPOUND)) neList.addAll(neMap.get(NETypes.COMPOUND));
          if(neMap.containsKey(NETypes.ADJECTIVE)) neList.addAll(neMap.get(NETypes.ADJECTIVE));

          if(Oscar3Props.getInstance().verbose) System.out.println("NCM: " + ne.getValue());
        }
      }
    }
   
    ProcessingDocument procDoc = ProcessingDocumentFactory.getInstance().makeTokenisedDocument(doc, true, false, false);
   
    //NameRecogniser nr = new NameRecogniser();
    //nr.halfProcess(doc);
    //if(patternFeatures) {
    //  nr.findForReps(true);
    //} else {
      //nr.makeTokenisers(true);
    //}
    for(TokenSequence ts : procDoc.getTokenSequences()) {
      trainOnSentence(ts, domain);
    }
    if(Oscar3Props.getInstance().verbose) System.out.println(System.currentTimeMillis() - time);
  }

    for(int i=0;i<n.size();i++) n.get(i).detach();
    n = doc.query("//ne[@type='CPR']");
    for(int i=0;i<n.size();i++) XOMTools.removeElementPreservingText((Element)n.get(i));
   
   
    ProcessingDocument procDoc = ProcessingDocumentFactory.getInstance().makeTokenisedDocument(doc, true, false, false);
    //NameRecogniser nr = new NameRecogniser();
    //nr.halfProcess(doc);
    //if(patternFeatures) {
    //  nr.findForReps(true);
    //} else {
      //nr.makeTokenisers(true);
    //}
    for(TokenSequence ts : procDoc.getTokenSequences()) {
      cvFeatures(ts, domain);
    }
    if(Oscar3Props.getInstance().verbose) System.out.println(System.currentTimeMillis() - time);
  }

    //if(patternFeatures) {
    //  nr.findForReps(true);
    //} else {
      nr.makeTokenisers(true);
    //}*/
    ProcessingDocument procDoc = ProcessingDocumentFactory.getInstance().makeTokenisedDocument(doc, false, false, false);
   
    for(TokenSequence t : procDoc.getTokenSequences()) {
      trainOnSentence(t);
    }
    System.out.println(System.currentTimeMillis() - time);
  }

    //NameRecogniser nr = new NameRecogniser();
    //nr.halfProcess(doc);
    //nr.makeTokenisers(true);
    Set<String> testDataSections = new LinkedHashSet<String>();

    ProcessingDocument procDoc = ProcessingDocumentFactory.getInstance().makeTokenisedDocument(doc, false, false, false);
   
    for(TokenSequence tokSeq : procDoc.getTokenSequences()) {
      Nodes neNodes = tokSeq.getElem().query(".//ne");
      for(int k=0;k<neNodes.size();k++) {
        Element neElem = (Element)neNodes.get(k);
        String neStr = "["+ neElem.getAttributeValue("xtspanstart") + ":" + neElem.getAttributeValue("xtspanend") + "]";
        testDataSections.add(neStr);
      }
    }
    List<String> tdsl = new ArrayList<String>(testDataSections);
    Collections.sort(tdsl);
    System.out.println(tdsl);
     
    Set<String> results = new HashSet<String>();
    for(TokenSequence t : procDoc.getTokenSequences()) {
      results.addAll(testOnSentence(t));
    }
    List<String> rl = new ArrayList<String>(results);
    Collections.sort(rl);
    System.out.println(rl);
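
The excerpt above prints the gold-standard spans (testDataSections) and the recogniser output (results) side by side but stops before any comparison. A minimal sketch of how two such string sets could be scored is shown below; it uses only java.util and is not part of the Oscar3 sources (the method name score is hypothetical).

  // Hypothetical scoring of recognised spans against gold-standard spans (java.util only).
  static void score(Set<String> gold, Set<String> found) {
    Set<String> matched = new HashSet<String>(found);
    matched.retainAll(gold);                    // spans present in both sets
    double precision = found.isEmpty() ? 0.0 : matched.size() / (double) found.size();
    double recall = gold.isEmpty() ? 0.0 : matched.size() / (double) gold.size();
    System.out.println("precision=" + precision + " recall=" + recall);
  }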
