Package uk.ac.cam.ch.wwmm.oscar3.recogniser.document

Examples of uk.ac.cam.ch.wwmm.oscar3.recogniser.document.ProcessingDocument


      if(s.matches("\\S+")) {
        docFreq = ir.docFreq(new Term("txt", s));
      } else {
        PhraseQuery pq = new PhraseQuery();
        for(String ss : StringTools.arrayToList(s.split("\\s+"))) pq.add(new Term("txt", ss));
        VectorCollector vc = new VectorCollector();
        is.search(pq, vc);
        docFreq = vc.getResultsVector().size();
      }
      double idf = Math.log(numDocs) - Math.log(docFreq);
      tfIdf.put(s, tf.getCount(s) * idf);
    }
    for(String s : StringTools.getSortedList(tfIdf)) {
View Full Code Here


      } else {
        PhraseQuery pq = new PhraseQuery();
        for(String ss : StringTools.arrayToList(s.split("\\s+"))) pq.add(new Term("txt", ss));
        q = pq;
      }
      VectorCollector vc = new VectorCollector();
      is.search(q, vc);
      docFreq = vc.getResultsVector().size();
      double score;
      double expected = scaleFactor * docFreq;
      double excess = df.getCount(s) - expected;
      score = excess / clusterSize;       
      if(score > threshold) scores.put(s, score);
    }
    Stemmer st = new Stemmer(new EnglishStemmer());
    Map<String,List<String>> stems = st.wordsToStems(df.getSet());
    for(String stem : stems.keySet()) {
      List<String> words = stems.get(stem);
      if(words.size() > 1) {
        BooleanQuery bq = new BooleanQuery(true);
        for(String word : words) {
          bq.add(new BooleanClause(new TermQuery(new Term("txt", word)), Occur.SHOULD));
        }
        VectorCollector vc = new VectorCollector();
        is.search(bq, vc);
        double expected = scaleFactor * vc.getResultsVector().size();
        int overlap = overlapDocs(vc.getResultsVector(), cluster);
        double excess = overlap - expected;
        double score = excess / clusterSize;
        if(score > threshold) {
          df.add(stems.get(stem).toString(), overlap);
          scores.put(stems.get(stem).toString(), score);
        }
      }
    }
    Map<String,List<String>> termStems = ngtd.ngramsByStem();
    for(String stem : termStems.keySet()) {
      List<String> multiWords = termStems.get(stem);
      if(multiWords.size() > 1) {
        BooleanQuery bq = new BooleanQuery(true);
        for(String multiWord : multiWords) {
          PhraseQuery pq = new PhraseQuery();
          for(String ss : StringTools.arrayToList(multiWord.split("\\s+"))) pq.add(new Term("txt", ss));
          bq.add(new BooleanClause(pq, Occur.SHOULD));
        }
        VectorCollector vc = new VectorCollector();
        is.search(bq, vc);
        double expected = scaleFactor * vc.getResultsVector().size();
        int overlap = overlapDocs(vc.getResultsVector(), cluster);
        double excess = overlap - expected;
        double score = excess / clusterSize;
        if(score > threshold) {
          df.add(termStems.get(stem).toString(), overlap);
          scores.put(termStems.get(stem).toString(), score);
        }
      }
    }
    if(enriched) {
      for(String inchi : inchis) {
        Term luceneTerm = new Term("InChI", inchi);
        Query q = new TermQuery(luceneTerm);
        VectorCollector vc = new VectorCollector();
        is.search(q, vc);
        double expected = scaleFactor * vc.getResultsVector().size();
        int overlap = overlapDocs(vc.getResultsVector(), cluster);
        if(overlap < 2) continue;
        double excess = overlap - expected;
        double score = excess / clusterSize;
       
        if(score > threshold) {
          String s = "InChi: " + lci.getName(lci.hitsByInChI(inchi));
          scores.put(s, score);
          df.add(s, overlap);           
        }
      }
     
      Map<String,Set<String>> ontQs = OBOOntology.getInstance().queriesForIds(onts);
     
      for(String ontQ : ontQs.keySet()) {
        /*BooleanQuery bq = new BooleanQuery(true);
        if(ontQs.get(ontQ).size() > BooleanQuery.getMaxClauseCount()) continue;
        for(String ont : ontQs.get(ontQ)) {
          bq.add(new BooleanClause(new TermQuery(new Term("Ontology", ont)), Occur.SHOULD));
        }
        VectorCollector vc = new VectorCollector();
        is.search(bq, vc);*/
        VectorCollector vc = OntologyQueryCache.getResultsStatic(ontQ, ontQs.get(ontQ), is);
        Map<Integer,Float> results = vc.getResultsVector();
        double expected = scaleFactor * results.size();
        int overlap = overlapDocs(vc.getResultsVector(), cluster);
        if(overlap < 2) continue;
        double excess = overlap - expected;
        double score = excess / clusterSize;
        if(score > threshold) {
          String s = ontQ + " " + OBOOntology.getInstance().getNameForID(ontQ);
View Full Code Here

    String normWord = StringTools.normaliseName(word);
    if (!word.equals(normWord)) {
      contextable.add(makeWordFeature(normWord));
    }

    ExtractTrainingData etd = ExtractTrainingData.getInstance();
    makeWordFeatures(word, normWord, bigramable, etd);
    makeReactionFeatures(word, bigramable, contextable, etd);

    String wts = StringTools.removeTerminalS(normWord);
    contextable.add(WITHOUT_TERMINAL_S_FEATURE + wts);
View Full Code Here

  // I'm sure there's a nice analytic way of doing this. Ah well...
  public static void main(String[] args) {
    List<Double> positiveExamples = new ArrayList<Double>();
    List<Double> negativeExamples = new ArrayList<Double>();
   
    ExtractTrainingData etd1 = ExtractTrainingData.getInstance();
    List<File> sbFiles = new ArrayList<File>();
    sbFiles.addAll(FileTools.getFilesFromDirectoryByName(new File("/home/ptc24/newows/goodrsc"), "scrapbook.xml"));
    ExtractTrainingData etd2 = new ExtractTrainingData(sbFiles);
    Set<String> chem = new HashSet<String>(etd2.chemicalWords);
    //chem.removeAll(etd1.chemicalWords);
    for(String w : chem) {
      if(!NGramBuilder.getInstance().chemSet.contains(NGram.parseWord(w))) {
        double score = NGram.getInstance().testWord(w);
View Full Code Here

    }   
  }

  private boolean resolveVsNewPubChem(ProcessState state, String name) {
    try {
      NewPubChem npc = NewPubChem.getInstance();
      if(npc != null) {
        String [] results = npc.getShortestSmilesAndInChI(name);
        if(results == null) return false;
        state.smiles = results[0];
        state.inchi = results[1];
        if(state.smiles != null) setNEAttribute(state.ne, "SMILES", state.smiles);
        if(state.inchi != null) setNEAttribute(state.ne, "InChI", state.inchi);
View Full Code Here

      }
      return;
    }
   
    if(mode.equals("BuildPubChem")) {
      new NewPubChem().initialise();
      return;
    }
   
    if(mode.equals("Server")) {
      if(Oscar3Props.getInstance().serverType.equals("none")) {
View Full Code Here

    String type = a.type;
    //System.out.println(surface + " " + a.type);
    if(type.contains("_")) {
      type = type.split("_")[0];
    }
    NamedEntity ne = new NamedEntity(t.getTokens(a.startToken, endToken), surface, type);
    assert(collector instanceof NECollector);
    ((NECollector)collector).collect(ne);
    //System.out.println(surface + ": " + a.reps);
    if(a.type.startsWith("ONT")) {
      Set<String> ontIds = runAutToStateToOntIds.get(a.type).get(a.state);
      String s = OntologyTerms.idsForTerm(surface);
      if(s != null && s.length() > 0) {
        if(ontIds == null) ontIds = new HashSet<String>();
        ontIds.addAll(StringTools.arrayToList(s.split("\\s+")));       
      }
      ne.addOntIds(ontIds);
      //System.out.println(surface + "\t" + ontIds);
    }
    if(a.type.startsWith("CUST")) {
      Set<String> custTypes = runAutToStateToOntIds.get(a.type).get(a.state);
      ne.addCustTypes(custTypes);
      //System.out.println(surface + "\t" + ontIds);
    }
    //ne.setPattern(StringTools.collectionToString(a.getReps(), "_"));
  }
View Full Code Here

    String type = a.type;
    //System.out.println(surface + " " + a.type);
    if(type.contains("_")) {
      type = type.split("_")[0];
    }
    NamedEntity ne = new NamedEntity(t.getTokens(a.startToken, endToken), surface, type);
    assert(collector instanceof NECollector);
    ((NECollector)collector).collect(ne);
    //System.out.println(surface + ": " + a.reps);
    if(a.type.startsWith("ONT")) {
      Set<String> ontIds = runAutToStateToOntIds.get(a.type).get(a.state);
      String s = OntologyTerms.idsForTerm(StringTools.normaliseName(surface));
      if(s != null && s.length() > 0) {
        if(ontIds == null) ontIds = new HashSet<String>();
        ontIds.addAll(StringTools.arrayToList(s.split("\\s+")));       
      }
      ne.addOntIds(ontIds);
      //System.out.println(surface + "\t" + ontIds);
    }
    if(a.type.startsWith("CUST")) {
      //System.out.println(runAutToStateToOntIds.get(a.type));
      Set<String> custTypes = runAutToStateToOntIds.get(a.type).get(a.state);
      ne.addCustTypes(custTypes);
      //System.out.println(surface + "\t" + ontIds);
    }

    //ne.setPattern(StringTools.collectionToString(a.getReps(), "_"));
  }
View Full Code Here

        String value = token.getValue();
        value = value.toLowerCase();
        if(prwStrings.contains(value)) {
          List<Token> neTokens = new ArrayList<Token>();
          neTokens.add(token);
          NamedEntity ne = new NamedEntity(neTokens, token.getValue(), "PRW");
          safholder.appendChild(ne.toSAF());
          //System.out.println("**********");
        }
      }
      //System.out.println();
    }
 
View Full Code Here

          } else {
            featuresForAbbrev = new ArrayList<String>();
            abbrevFeatures.put(ne.getSurface(), featuresForAbbrev);
          }
          if(neByLastToken.containsKey(prev2)) {
            NamedEntity maybeAbbrev = neByLastToken.get(prev2);
            String abbrMode = "abbr1:";
            if(StringTools.testForAcronym(surf, maybeAbbrev.getSurface())) {
              abbrMode = "abbr2:";
            }
            if(surf.matches(".*\\s.*")) abbrMode += "wws:";
            for(double lthresh = -5.0;lthresh < 5.05;lthresh += 0.5) {
              double thresh = logitToProb(lthresh);
              if(maybeAbbrev.getConfidence() > thresh) {
                featuresForAbbrev.add(abbrMode + "abbr>" + thresh);
              } else {
                featuresForAbbrev.add(abbrMode + "abbr<" + thresh);
              }             
            }
View Full Code Here

TOP

Related Classes of uk.ac.cam.ch.wwmm.oscar3.recogniser.document.ProcessingDocument

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.