Package uk.ac.cam.ch.wwmm.oscar3.recogniser.document
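
This package contains OSCAR3's tokenised-document classes: ProcessingDocument, its factory, and the Token and TokenSequence types used by the recogniser.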

Examples of uk.ac.cam.ch.wwmm.oscar3.recogniser.document.ProcessingDocument
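
The snippets below all follow the same basic pattern: build a XOM Document, hand it to the ProcessingDocumentFactory singleton's makeTokenisedDocument method, then walk the resulting TokenSequences and their Tokens. The following minimal sketch shows just that pattern; the class name TokenDump and the command-line argument are illustrative, and the imports assume Token and TokenSequence live alongside ProcessingDocument in the same package, as the snippets suggest.

    import java.io.File;

    import nu.xom.Builder;
    import nu.xom.Document;

    import uk.ac.cam.ch.wwmm.oscar3.recogniser.document.ProcessingDocument;
    import uk.ac.cam.ch.wwmm.oscar3.recogniser.document.ProcessingDocumentFactory;
    import uk.ac.cam.ch.wwmm.oscar3.recogniser.document.Token;
    import uk.ac.cam.ch.wwmm.oscar3.recogniser.document.TokenSequence;

    public class TokenDump {
      public static void main(String[] args) throws Exception {
        // Parse the source XML and tokenise it; most snippets below pass
        // (false, false, false) for the three boolean options.
        Document doc = new Builder().build(new File(args[0]));
        ProcessingDocument procDoc = ProcessingDocumentFactory.getInstance()
            .makeTokenisedDocument(doc, false, false, false);
        // Walk every token of every token sequence and print its value.
        for(TokenSequence ts : procDoc.getTokenSequences()) {
          for(Token t : ts.getTokens()) {
            System.out.println(t.getValue());
          }
        }
      }
    }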

    // Tokenise each file and count every token that is not a closed-class
    // (stop) word.
    for(File file : files) {
      System.out.println(file);
      ProcessingDocument procDoc = ProcessingDocumentFactory.getInstance().makeTokenisedDocument(new Builder().build(file), false, false, false);
      Bag<String> b = new Bag<String>();
      for(TokenSequence ts : procDoc.getTokenSequences()) {
        for(Token t : ts.getTokens()) {
          String s = t.getValue().intern();
          if(!TermSets.getClosedClass().contains(s.toLowerCase())) b.add(s);
        }
      }


    // Strip the inline annotation elements (keeping their text content),
    // then extract the annotations as a standoff (SAF) document.
    for(int i=0;i<nodes.size();i++) {
      XOMTools.removeElementPreservingText((Element)nodes.get(i));
    }
    Document safDoc = InlineToSAF.extractSAFs(doc, sourceDoc, "foo");

    ProcessingDocument procDoc = ProcessingDocumentFactory.getInstance().makeTokenisedDocument(sourceDoc, false, false, false);

    //NameRecogniser nr = new NameRecogniser();
    //nr.halfProcess(sourceDoc);
    //nr.makeTokenisers(false);

    // Select the SAF annotations whose "type" slot is PRW.
    Nodes n = safDoc.query("/saf/annot[slot[@name='type']='PRW']");

    Map<TokenSequence,Boolean> tokSeqs = new HashMap<TokenSequence,Boolean>();
    Map<TokenSequence,Bag<String>> tokSeqPRWs = new HashMap<TokenSequence,Bag<String>>();
   
    for(int i=0;i<n.size();i++) {
      Element e = (Element)n.get(i);
      // Map the annotation's character offsets back to a token, trying the
      // start offset first and falling back to the end offset.
      Token token = procDoc.getTokenByStart(e.getAttributeValue("from"));
      if(token == null) token = procDoc.getTokenByEnd(e.getAttributeValue("to"));
      if(token == null) {
        System.out.println("Warning: annotation does not align with any token");
      } else {
        TokenSequence tokSeq = token.getTokenSequence();
        boolean isReact = "REACT".equals(SafTools.getSlotValue(e, "subtype"));

      for(int i=0;i<nodes.size();i++) {
        XOMTools.removeElementPreservingText((Element)nodes.get(i));
      }
      Document safDoc = InlineToSAF.extractSAFs(doc, sourceDoc, "foo");

      ProcessingDocument procDoc = ProcessingDocumentFactory.getInstance().makeTokenisedDocument(sourceDoc, false, false, false);
      //NameRecogniser nr = new NameRecogniser();
      //nr.halfProcess(sourceDoc);
      //nr.makeTokenisers(false);
      Set<String> tokenSet = new HashSet<String>();
      // Only tokenBag is populated below; the tokenSet additions are commented out.
      Bag<String> tokenBag = new Bag<String>();
      for(TokenSequence t : procDoc.getTokenSequences()) {
        //System.out.println(t.getSourceString());
        for(Token token : t.getTokens()) {
          //tokenSet.add("stem=" + stemmer.getStem(token.getValue().toLowerCase()));
          //tokenSet.add(token.getValue().toLowerCase());
          tokenBag.add(token.getValue().toLowerCase());

   
    // Load a paper's inline source and its standoff (SAF) annotations, then
    // tokenise the source.
    File inlineFile = new File(paperDir, "source.xml");
    File safFile = new File(paperDir, "saf.xml");
    Document doc = new Builder().build(inlineFile);
    Document safDoc = new Builder().build(safFile);
    ProcessingDocument procDoc = ProcessingDocumentFactory.getInstance().makeTokenisedDocument(doc, false, false, false);
    //NameRecogniser nr = new NameRecogniser();
    //nr.halfProcess(doc);
    //Nodes n = doc.query(XMLStrings.CHEMICAL_PLACES_XPATH);
    // Build token lattices from the document and the SAF annotations.
    DFARelationFinder relf = DFARelationFinder.getInstance();
    List<Lattice> lattices = Lattice.buildLattices(procDoc, safDoc.getRootElement());

    //Document doc = new Document((Element)(sourceDoc.getRootElement().copy()));
    // Work on a copy of the SAF document so the original is not modified.
    safDoc = new Document((Element)(safDoc.getRootElement().copy()));
   
    //NameRecogniser nr = new NameRecogniser();
    //nr.halfProcess(doc);
    ProcessingDocument procDoc = ProcessingDocumentFactory.getInstance().makeTokenisedDocument(doc, false, false, false);
   
    DFARelationFinder relf = DFARelationFinder.getInstance();
    List<Lattice> lattices = Lattice.buildLattices(procDoc, safDoc.getRootElement());
   
    for(Lattice lattice : lattices) {

        XOMTools.removeElementPreservingText((Element)nodes.get(i));
      }
      Document safDoc = InlineToSAF.extractSAFs(doc, sourceDoc, "foo");

     
      ProcessingDocument procDoc = ProcessingDocumentFactory.getInstance().makeTokenisedDocument(sourceDoc, false, false, false);
     
      //NameRecogniser nr = new NameRecogniser();
      //nr.halfProcess(sourceDoc);
      //nr.makeTokenisers(false);
      // This variant fills both the set of distinct tokens and the bag of counts.
      Set<String> tokenSet = new HashSet<String>();
      Bag<String> tokenBag = new Bag<String>();
      for(TokenSequence t : procDoc.getTokenSequences()) {
        //System.out.println(t.getSourceString());
        for(Token token : t.getTokens()) {
          tokenSet.add(token.getValue().toLowerCase());
          tokenBag.add(token.getValue().toLowerCase());
        }

    // Parallel structures: tokenList maps index -> token string,
    // tokenIndex maps token string -> index.
    tokenList = new ArrayList<String>();
    tokenIndex = new HashMap<String,Integer>();
   
    numdocs = 0;
    for(File f : files) {
      ProcessingDocument procDoc = ProcessingDocumentFactory.getInstance().makeTokenisedDocument(new Builder().build(f), false, false, false);
      for(TokenSequence ts : procDoc.getTokenSequences()) {
        if(ts.size() == 0) continue;
        for(Token t : ts.getTokens()) {
          String s = t.getValue().toLowerCase().intern();
          int tn = -1;
          if(tokenIndex.containsKey(s)) {
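            // (The snippet is truncated here; the following completion is a
            // sketch of the standard indexing idiom, not the original source.)
            tn = tokenIndex.get(s);
          } else {
            // Unseen token: append it and record its index.
            tn = tokenList.size();
            tokenList.add(s);
            tokenIndex.put(s, tn);
          }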

  // Build a random corpus from the given files (seeded for reproducibility).
  public void randomCorpus(List<File> files, int num, int seed) throws Exception {
    List<Integer> sizes = new ArrayList<Integer>();
    List<Integer> corpus = new ArrayList<Integer>();
    long time = System.currentTimeMillis();
    for(File f : files) {
      ProcessingDocument procDoc = ProcessingDocumentFactory.getInstance().makeTokenisedDocument(new Builder().build(f), false, false, false);
      int size = 0;
      for(TokenSequence ts : procDoc.getTokenSequences()) {
        if(ts.size() == 0) continue;
        for(Token t : ts.getTokens()) {
          size++;
          String s = t.getValue().toLowerCase().intern();
          int tn = -1;

   
    // Stem tokens with the Porter algorithm.
    Stemmer st = new Stemmer(new PorterStemmer());
   
    long time = System.currentTimeMillis();
    for(File f : files) {
      ProcessingDocument procDoc = ProcessingDocumentFactory.getInstance().makeTokenisedDocument(new Builder().build(f), false, false, false);
      Set<Integer> tokSet = new HashSet<Integer>();
      for(TokenSequence ts : procDoc.getTokenSequences()) {
        if(ts.size() == 0) continue;
        for(Token t : ts.getTokens()) {
          String s = t.getValue().toLowerCase().intern();
          // Only stem tokens that contain at least one letter; skip tokens
          // that stem to nothing.
          if(s.matches(".*[a-z].*")) s = st.getStem(s);
          if(s == null || s.length() == 0) continue;

      //NameRecogniser nr = new NameRecogniser();
     
      //nr.halfProcess(new Builder().build(new File(f.getParentFile(), "source.xml")));
      //List<Tokeniser> tokSeqs = nr.buildTokenTables(new Builder().build(new File(f.getParentFile(), "saf.xml")).getRootElement(), false, true);
     
      // Tokenise source.xml together with its standoff annotations from
      // saf.xml, using the overload that takes an extra SAF document.
      ProcessingDocument procDoc = ProcessingDocumentFactory.getInstance().makeTokenisedDocument(new Builder().build(new File(f.getParentFile(), "source.xml")), true, true, false,
          new Builder().build(new File(f.getParentFile(), "saf.xml")));
     
      for(TokenSequence tokSeq : procDoc.getTokenSequences()) {
        for(Token t : tokSeq.getTokens()) {
          String tokVal = t.getValue();
          tokVal = StringTools.normaliseName(tokVal);
          // Strings are immutable, so the result of replaceAll must be
          // assigned back.
          tokVal = tokVal.replaceAll("\\s+", "_");
         
