Package uk.ac.cam.ch.wwmm.oscar3.recogniser.document

Examples of uk.ac.cam.ch.wwmm.oscar3.recogniser.document.ProcessingDocument


      }
      File f = new File(fn);
      f = f.getParentFile();
      Document safDoc = new Builder().build(new File(f, "saf.xml"));
      Document sourceDoc = new Builder().build(new File(f, "source.xml"));     
      ProcessingDocument procDoc = ProcessingDocumentFactory.getInstance().makeTokenisedDocument(sourceDoc, false, false, false, safDoc);
      //NameRecogniser nr = new NameRecogniser();
      //nr.halfProcess(sourceDoc);
      //nr.buildTokenTables(safDoc.getRootElement(), false, false);

      for(Entry e : entriesByFileName.get(fn)) {
        Token t = procDoc.getTokenByStart(e.start);
        if(t != null) {
          List<String> features = featuresForToken(t);
          Event event = new Event(e.type, features.toArray(new String[0]));
          events.add(event);
        }
View Full Code Here


   
    //NameRecogniser nr = new NameRecogniser();
    //nr.halfProcess(sourceDoc);
    //nr.makeTokenisers(false);

    ProcessingDocument procDoc = ProcessingDocumentFactory.getInstance().makeTokenisedDocument(sourceDoc, false, false, false, safDoc);
   
    Nodes n = safDoc.query("/saf/annot[slot[@name='type']['PRW']]");

    Map<TokenSequence,Boolean> tokSeqs = new HashMap<TokenSequence,Boolean>();
    Map<TokenSequence,Bag<String>> tokSeqPRWs = new HashMap<TokenSequence,Bag<String>>();
   
    for(int i=0;i<n.size();i++) {
      Element e = (Element)n.get(i);
      Token token = procDoc.getTokenByStart(e.getAttributeValue("from"));
      if(token == null) token = procDoc.getTokenByEnd(e.getAttributeValue("to"));
      if(token == null) {
        System.out.println("Eeep!");
      } else {
        TokenSequence tokSeq = token.getTokenSequence();
        //boolean isReact = "REACT".equals(SafTools.getSlotValue(e, "subtype"));
View Full Code Here

    ResourceGetter rg = new ResourceGetter("uk/ac/cam/ch/wwmm/oscar3/test/testcard/resources/");
    String s = rg.getString("testcard.txt");
    assertTrue("Have testcard string", s != null && s.length() > 0);
    Document doc = TextToSciXML.textToSciXML(s);

    ProcessingDocument procDoc = ProcessingDocumentFactory.getInstance().makeTokenisedDocument(doc, false, false, false);
    assertTrue(procDoc != null);
    List<NamedEntity> neList;
    ChemicalEntityRecogniser cei = new PatternRecogniser();
    neList = cei.findNamedEntities(procDoc);
    assertTrue(neList != null);
View Full Code Here

    ResourceGetter rg = new ResourceGetter("uk/ac/cam/ch/wwmm/oscar3/test/testcard/resources/");
    String s = rg.getString("testcard.txt");
    assertTrue("Have testcard string", s != null && s.length() > 0);
    Document doc = TextToSciXML.textToSciXML(s);
   
    ProcessingDocument procDoc = ProcessingDocumentFactory.getInstance().makeTokenisedDocument(doc, false, false, false);
    assertTrue(procDoc != null);
    List<NamedEntity> neList;
    ChemicalEntityRecogniser cei = new MEMMRecogniser();
    neList = cei.findNamedEntities(procDoc);
    assertTrue(neList != null);
View Full Code Here

            Element e = (Element)n.get(i);
            e.addAttribute(new Attribute("type", "CHEMICAL"));
            //XOMTools.removeElementPreservingText((Element)n.get(i));
          }         
        }
        ProcessingDocument procDoc = ProcessingDocumentFactory.getInstance().makeTokenisedDocument(doc, true, false, false);

        //NameRecogniser nr = new NameRecogniser();
        //nr.halfProcess(doc);

        //if(memm.patternFeatures) {
        //  nr.findForReps(cheatTokenisation);         
        //} else {
          //nr.makeTokenisers(cheatTokenisation);
        //}
       
        int paperTestNEs = 0;
       
        List<Double> paperGoodProbs = new ArrayList<Double>();
        List<Double> paperBadProbs = new ArrayList<Double>();

        Map<NamedEntity,Double> confidences = new HashMap<NamedEntity,Double>();
       
        List<NamedEntity> entities;
       
        Set<String> testNEs = new LinkedHashSet<String>();

        for(TokenSequence tokSeq : procDoc.getTokenSequences()) {
          Nodes neNodes = tokSeq.getElem().query(".//ne");
          for(int k=0;k<neNodes.size();k++) {
            Element neElem = (Element)neNodes.get(k);
            if(filterType != null && !neElem.getAttributeValue("type").equals(filterType)) continue;
            if(antiFilterType != null && neElem.getAttributeValue("type").equals(antiFilterType)) continue;
View Full Code Here

      }*/
      //NameRecogniser nr = new NameRecogniser();
      //nr.halfProcess(xmlDoc);
      //nr.makeTokenisers(false);
     
      ProcessingDocument procDoc = ProcessingDocumentFactory.getInstance().makeTokenisedDocument(xmlDoc, false, false, false);
     
      //for(Tokeniser t : nr.getTokenisers()) {
        TokenStream ts = new Oscar3TokenStream(procDoc.getTokenSequences());
        ts = new Oscar3Filters(ts);
        luceneDoc.add(new Field("txt", ts, Field.TermVector.WITH_POSITIONS_OFFSETS));
       
      //}
     
View Full Code Here

   * @param sourceDoc The SciXML document to parse, now unlikely to be altered.
   * @param safDoc An (empty) SAF XML document.
   * @throws Exception if processing the document fails.
   */
  private void processDocumentInternal(Document sourceDoc, Document safDoc) throws Exception {
    ProcessingDocument procDoc = ProcessingDocumentFactory.getInstance().makeTokenisedDocument(new Document((Element)XOMTools.safeCopy(sourceDoc.getRootElement())), false, false, false);
    List<NamedEntity> neList;
    ChemicalEntityRecogniser cer = null;
    try
    {
      cer = (ChemicalEntityRecogniser) Class.forName(Oscar3Props.getInstance().chemicalEntityRecogniser).newInstance();
View Full Code Here

      //files = FileTools.getFilesFromDirectoryByName(new File("/home/ptc24/oscarworkspace/corpora/paperset1"), "source.xml");
      //files = FileTools.getFilesFromDirectoryByName(new File("/home/ptc24/newows/corpora/paperset1"), "source.xml");
      //files = FileTools.getFilesFromDirectoryByName(new File("/home/ptc24/newows/corpora/BioIE"), "source.xml");
      files = FileTools.getFilesFromDirectoryByName(new File("/home/ptc24/newows/corpora/roughPubMed"), "source.xml");
      //files = FileTools.getFilesFromDirectoryByName(new File("/scratch/pubmed/2005"), "source.xml");
      StringSource ss = new StringSource(files, false);
     
      Bag<String> wordCounts = new Bag<String>();
     
      ss.reset();
      for(String s : ss) {
        TokenSequence t = Tokeniser.getInstance().tokenise(s);
        for(String word : t.getTokenStringList()) {
          if(!word.matches(".*[a-z][a-z].*")) continue;
          word = StringTools.normaliseName(word);
View Full Code Here

    Set<String> engWords = new HashSet<String>(NGramBuilder.getInstance().engSet);
   
    List<File> files = new ArrayList<File>();
    files = FileTools.getFilesFromDirectoryByName(new File("/home/ptc24/newows/corpora/roughPubMed"), "source.xml");

    StringSource ss = new StringSource(files, false);
   
    Bag<String> wordCounts = new Bag<String>();
   
    ss.reset();
    for(String s : ss) {
      TokenSequence t = Tokeniser.getInstance().tokenise(s);
      for(String word : t.getTokenStringList()) {
        if(!word.matches(".*[a-z][a-z].*")) continue;
        word = StringTools.normaliseName(word);
View Full Code Here

        neByLastToken.put(ne.getLastToken(), ne);
      }
    }

    for(NamedEntity ne : entities) {
      Token prev = ne.getFirstToken().getNAfter(-1);
      Token next = ne.getLastToken().getNAfter(1);
      if(prev != null && next != null && prev.getValue().equals("(") && next.getValue().equals(")")) {
        Token prev2 = ne.getFirstToken().getNAfter(-2);
        if(prev2 != null) {
          String surf = ne.getSurface();
          if(surf.matches(".*[A-Z]s") || prev2.getValue().endsWith("s")) surf = surf.substring(0, surf.length()-1);
          List<String> featuresForAbbrev;
          if(abbrevFeatures.containsKey(surf)) {
            featuresForAbbrev = abbrevFeatures.get(surf);
          } else {
            featuresForAbbrev = new ArrayList<String>();
View Full Code Here

TOP

Related Classes of uk.ac.cam.ch.wwmm.oscar3.recogniser.document.ProcessingDocument

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.