Package uk.ac.cam.ch.wwmm.oscar3.indexersearcher

Examples of uk.ac.cam.ch.wwmm.oscar3.indexersearcher.VectorCollector


   * @param sourceDoc The source SciXML document.
   * @return A SAF XML document containing various information.
   * @throws Exception
   */
  public Document runGenia(Document sourceDoc) throws Exception {
    ProcessingDocument procDoc = ProcessingDocumentFactory.getInstance().makeTokenisedDocument(sourceDoc, false, false, true);
    Document safDoc = new Document(new Element("saf"));
    for(List<Token> sentence : procDoc.getSentences()) {
      if(sentence.size() > 0) {
        Token first = sentence.get(0);
        Token last = sentence.get(sentence.size()-1);
        Element sentenceAnnot = SafTools.makeAnnot(first.getStartXPoint(), last.getEndXPoint(), "sentence");
        safDoc.getRootElement().appendChild(sentenceAnnot);
      }
    }
    for(TokenSequence ts : procDoc.getTokenSequences()) {
      for(Token t : ts.getTokens()) {
        Element safElem = SafTools.makeAnnot(t.getStartXPoint(), t.getEndXPoint(), "genia");
        SafTools.setSlot(safElem, "surface", t.getValue());
        SafTools.setSlot(safElem, "stem", t.getGeniaData()[1]);
        SafTools.setSlot(safElem, "tag", t.getGeniaData()[2]);
View Full Code Here


        for(int i=0;i<n.size();i++) XOMTools.removeElementPreservingText((Element)n.get(i));
        Document safDoc = InlineToSAF.extractSAFs(doc, copy, "foo");
        doc = copy;
       
       
        ProcessingDocument procDoc = ProcessingDocumentFactory.getInstance().makeTokenisedDocument(doc, true, false, false, safDoc);
        //NameRecogniser nr = new NameRecogniser();
        //nr.halfProcess(doc);
        //n = doc.query(XMLStrings.CHEMICAL_PLACES_XPATH);
        if(Oscar3Props.getInstance().verbose) System.out.println(f);
        for(TokenSequence tokSeq : procDoc.getTokenSequences()) {
          afterHyphen.addAll(tokSeq.getAfterHyphens());
          Map<String, List<List<String>>> neMap = tokSeq.getNes();
          List<List<String>> neList = new ArrayList<List<String>>();
          if(neMap.containsKey(NETypes.COMPOUND)) neList.addAll(neMap.get(NETypes.COMPOUND));
          if(neMap.containsKey(NETypes.ADJECTIVE)) neList.addAll(neMap.get(NETypes.ADJECTIVE));
View Full Code Here

          if(Oscar3Props.getInstance().verbose) System.out.println("NCM: " + ne.getValue());
        }
      }
    }
   
    ProcessingDocument procDoc = ProcessingDocumentFactory.getInstance().makeTokenisedDocument(doc, true, false, false);
   
  //NameRecogniser nr = new NameRecogniser();
    //nr.halfProcess(doc);
    //if(patternFeatures) {
    //  nr.findForReps(true);
    //} else {
      //nr.makeTokenisers(true);
    //}
    for(TokenSequence ts : procDoc.getTokenSequences()) {
      trainOnSentence(ts, domain);
    }
    if(Oscar3Props.getInstance().verbose) System.out.println(System.currentTimeMillis() - time);
  }
View Full Code Here

    for(int i=0;i<n.size();i++) n.get(i).detach();
    n = doc.query("//ne[@type='CPR']");
    for(int i=0;i<n.size();i++) XOMTools.removeElementPreservingText((Element)n.get(i));
   
   
    ProcessingDocument procDoc = ProcessingDocumentFactory.getInstance().makeTokenisedDocument(doc, true, false, false);
    //NameRecogniser nr = new NameRecogniser();
    //nr.halfProcess(doc);
    //if(patternFeatures) {
    //  nr.findForReps(true);
    //} else {
      //nr.makeTokenisers(true);
    //}
    for(TokenSequence ts : procDoc.getTokenSequences()) {
      cvFeatures(ts, domain);
    }
    if(Oscar3Props.getInstance().verbose) System.out.println(System.currentTimeMillis() - time);
  }
View Full Code Here

    //if(patternFeatures) {
    //  nr.findForReps(true);
    //} else {
      nr.makeTokenisers(true);
    //}*/
    ProcessingDocument procDoc = ProcessingDocumentFactory.getInstance().makeTokenisedDocument(doc, false, false, false);
   
    for(TokenSequence t : procDoc.getTokenSequences()) {
      trainOnSentence(t);
    }
    System.out.println(System.currentTimeMillis() - time);
  }
View Full Code Here

    //NameRecogniser nr = new NameRecogniser();
    //nr.halfProcess(doc);
    //nr.makeTokenisers(true);
    Set<String> testDataSections = new LinkedHashSet<String>();

    ProcessingDocument procDoc = ProcessingDocumentFactory.getInstance().makeTokenisedDocument(doc, false, false, false);
   
    for(TokenSequence tokSeq : procDoc.getTokenSequences()) {
      Nodes neNodes = tokSeq.getElem().query(".//ne");
      for(int k=0;k<neNodes.size();k++) {
        Element neElem = (Element)neNodes.get(k);
        String neStr = "["+ neElem.getAttributeValue("xtspanstart") + ":" + neElem.getAttributeValue("xtspanend") + "]";
        testDataSections.add(neStr);
      }
    }
    List<String> tdsl = new ArrayList<String>(testDataSections);
    Collections.sort(tdsl);
    System.out.println(tdsl);
     
    Set<String> results = new HashSet<String>();
    for(TokenSequence t : procDoc.getTokenSequences()) {
      results.addAll(testOnSentence(t));
    }
    List<String> rl = new ArrayList<String>(results);
    Collections.sort(rl);
    System.out.println(rl);
View Full Code Here

   
   
   
    for(File file : files) {
      System.out.println(file);
      ProcessingDocument procDoc = ProcessingDocumentFactory.getInstance().makeTokenisedDocument(new Builder().build(file), false, false, false);
      Bag<String> b = new Bag<String>();
      for(TokenSequence ts : procDoc.getTokenSequences()) {
        for(Token t : ts.getTokens()) {
          String s = t.getValue().intern();
          if(!TermSets.getClosedClass().contains(s.toLowerCase())) b.add(s);
        }
      }
View Full Code Here

    for(int i=0;i<nodes.size();i++) {
      XOMTools.removeElementPreservingText((Element)nodes.get(i));
    }
    Document safDoc = InlineToSAF.extractSAFs(doc, sourceDoc, "foo");

    ProcessingDocument procDoc = ProcessingDocumentFactory.getInstance().makeTokenisedDocument(sourceDoc, false, false, false);

    //NameRecogniser nr = new NameRecogniser();
    //nr.halfProcess(sourceDoc);
    //nr.makeTokenisers(false);

    Nodes n = safDoc.query("/saf/annot[slot[@name='type']['PRW']]");

    Map<TokenSequence,Boolean> tokSeqs = new HashMap<TokenSequence,Boolean>();
    Map<TokenSequence,Bag<String>> tokSeqPRWs = new HashMap<TokenSequence,Bag<String>>();
   
    for(int i=0;i<n.size();i++) {
      Element e = (Element)n.get(i);
      Token token = procDoc.getTokenByStart(e.getAttributeValue("from"));
      if(token == null) token = procDoc.getTokenByEnd(e.getAttributeValue("to"));
      if(token == null) {
        System.out.println("Eeep!");
      } else {
        TokenSequence tokSeq = token.getTokenSequence();
        boolean isReact = "REACT".equals(SafTools.getSlotValue(e, "subtype"));
View Full Code Here

      for(int i=0;i<nodes.size();i++) {
        XOMTools.removeElementPreservingText((Element)nodes.get(i));
      }
      Document safDoc = InlineToSAF.extractSAFs(doc, sourceDoc, "foo");

      ProcessingDocument procDoc = ProcessingDocumentFactory.getInstance().makeTokenisedDocument(sourceDoc, false, false, false);
      //NameRecogniser nr = new NameRecogniser();
      //nr.halfProcess(sourceDoc);
      //nr.makeTokenisers(false);
      Set<String> tokenSet = new HashSet<String>();
      Bag<String> tokenBag = new Bag<String>();
      for(TokenSequence t : procDoc.getTokenSequences()) {
        //System.out.println(t.getSourceString());
        for(Token token : t.getTokens()) {
          //tokenSet.add("stem=" + stemmer.getStem(token.getValue().toLowerCase()));
          //tokenSet.add(token.getValue().toLowerCase());
          tokenBag.add(token.getValue().toLowerCase());
View Full Code Here

      //files = FileTools.getFilesFromDirectoryByName(new File("/home/ptc24/oscarworkspace/corpora/paperset1"), "source.xml");
      //files = FileTools.getFilesFromDirectoryByName(new File("/home/ptc24/newows/corpora/paperset1"), "source.xml");
      //files = FileTools.getFilesFromDirectoryByName(new File("/home/ptc24/newows/corpora/BioIE"), "source.xml");
      files = FileTools.getFilesFromDirectoryByName(new File("/home/ptc24/newows/corpora/roughPubMed"), "source.xml");
      //files = FileTools.getFilesFromDirectoryByName(new File("/scratch/pubmed/2005"), "source.xml");
      StringSource ss = new StringSource(files, false);
     
      Bag<String> wordCounts = new Bag<String>();
     
      ss.reset();
      for(String s : ss) {
        TokenSequence t = Tokeniser.getInstance().tokenise(s);
        for(String word : t.getTokenStringList()) {
          if(!word.matches(".*[a-z][a-z].*")) continue;
          word = StringTools.normaliseName(word);
View Full Code Here

TOP

Related Classes of uk.ac.cam.ch.wwmm.oscar3.indexersearcher.VectorCollector

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.