Package uk.ac.cam.ch.wwmm.oscar3.recogniser.document

Examples of uk.ac.cam.ch.wwmm.oscar3.recogniser.document.Tokeniser


    //pq.add(new Term("txt", "inhibitors"));
    Query q = new TermQuery(new Term("txt", queryTerm));
    //Query q = new StemQuery(new Term("txt", queryTerm), stemmerTools);

    for(int i=0;i<100;i++) {
      VectorCollector vc = new VectorCollector();
      is.search(q, vc);
      for(Integer j : new ArrayList<Integer>(vc.getResultsVector().keySet())) {
        if(vc.getResultsVector().get(j) < 0.2) vc.getResultsVector().remove(j);
      }
      Map<String,Double> scores = ClusterAnalyser.simpleExcessAnalyseCluster(vc.getResultsVector(), lis.getIndexReader(), 0.1);
      BooleanQuery bq = new BooleanQuery(false);
      List<String> terms = StringTools.getSortedList(scores);
      if(terms.size() > 10) terms = terms.subList(0, 10);
      for(String s : terms) {
        System.out.println(s + "\t" + scores.get(s));
View Full Code Here


      dff.put(i, docFreqs.get(dfl.get(i)) / nd);
    }
   
    for(int i=0;i<50;i++) {
      TermQuery tq = new TermQuery(new Term("txt", dfl.get(i)));
      VectorCollector vc = new VectorCollector();
      is.search(tq, vc);
      float vcs = vc.getResultsVector().size();
      Map<Integer,Float> stf = new HashMap<Integer,Float>();
     
      for(Integer j : vc.getResultsVector().keySet()) {
        TermFreqVector tv = ir.getTermFreqVector(j, "txt");
        String [] terms = tv.getTerms();
        for(int k=0;k<tv.size();k++) {
          String term = terms[k];
          if(termMap.containsKey(term)) {
View Full Code Here

    System.out.println(totalEntropy);
   
    List<String> termList = tfBag.getList().subList(0, 2000);
    for(String splitTerm : termList) {
      Query q = new TermQuery(new Term("txt", splitTerm));
      VectorCollector vc = new VectorCollector();
      is.search(q, vc);
      Bag<String> inBag = new Bag<String>();
      Bag<String> outBag = new Bag<String>();
      for(int i=0;i<ir.maxDoc();i++) {
        Bag<String> bag = inBag;
        if(!vc.getResultsVector().containsKey(i)) continue;
       
        //Bag<String> bag = outBag;
        //if(vc.getResultsVector().containsKey(i)) bag = inBag;
        TermFreqVector tv = ir.getTermFreqVector(i, "txt");
        String [] terms = tv.getTerms();
View Full Code Here

      Map<Integer,Integer> bestClusters = new HashMap<Integer,Integer>();
      Map<Integer,Float> bestClusterScores = new HashMap<Integer,Float>();
      List<Map<Integer,Float>> clusters = new ArrayList<Map<Integer,Float>>();
      for(int j=0;j<queries.size();j++) {
        clusters.add(new HashMap<Integer,Float>());
        VectorCollector vc = new VectorCollector();
        is.search(queries.get(j), vc);
        //System.out.println(vc.getResultsVector());
        for(Integer k : vc.getResultsVector().keySet()) {
          float score = vc.getResultsVector().get(k);
          if(score < 0.001) continue;
          if(!bestClusterScores.containsKey(k) || bestClusterScores.get(k) < score) {
            bestClusters.put(k, j);
            bestClusterScores.put(k, score);
          }
        }
      }
      for(Integer j : bestClusters.keySet()) {
        clusters.get(bestClusters.get(j)).put(j, bestClusterScores.get(j));
      }
      //for(Map<Integer,Float> cluster : clusters) System.out.println(cluster);
      queries.clear();
      for(int j=0;j<clusters.size();j++) {
        System.out.println("Size: " + clusters.get(j).size());
        /*if(i == 9) {
          for(Integer k : clusters.get(j).keySet()) {
            System.out.println(docFiles.get(k) + "\t" + bestClusterScores.get(k));
          }
        }*/
        //if(i == 9) ClusterAnalyser.excessAnalyseCluster(clusters.get(j), lis.getIndexReader(), 0.2, true);
        Map<String,Double> scores = ClusterAnalyser.simpleExcessAnalyseCluster(clusters.get(j), lis.getIndexReader(), 0.1);
        BooleanQuery bq = new BooleanQuery(false);
        List<String> terms = StringTools.getSortedList(scores);
        if(terms.size() > 20) terms = terms.subList(0, 20);
        for(String s : terms) {
          System.out.println(s + "\t" + scores.get(s));
          TermQuery tq = new TermQuery(new Term("txt", s));
          tq.setBoost(scores.get(s).floatValue());
          bq.add(new BooleanClause(tq, Occur.SHOULD));
        }
        System.out.println();
        queries.add(bq);
      }
      System.out.println();
    }
    List<Map<Integer,Float>> clusters = new ArrayList<Map<Integer,Float>>();
    final Map<Integer,Integer> clusterSizes = new HashMap<Integer,Integer>();
    for(int j=0;j<queries.size();j++) {
      VectorCollector vc = new VectorCollector();
      is.search(queries.get(j), vc);
      final Map<Integer,Float> cluster = new HashMap<Integer,Float>();
      //System.out.println(vc.getResultsVector());
      for(Integer k : vc.getResultsVector().keySet()) {
        float score = vc.getResultsVector().get(k);
        if(score < 0.2) continue;
        cluster.put(k, score);
      }
      clusters.add(cluster);
      clusterSizes.put(j, cluster.size());
View Full Code Here

      if(s.matches("\\S+")) {
        docFreq = ir.docFreq(new Term("txt", s));
      } else {
        PhraseQuery pq = new PhraseQuery();
        for(String ss : StringTools.arrayToList(s.split("\\s+"))) pq.add(new Term("txt", ss));
        VectorCollector vc = new VectorCollector();
        is.search(pq, vc);
        docFreq = vc.getResultsVector().size();
      }
      double idf = Math.log(numDocs) - Math.log(docFreq);
      tfIdf.put(s, tf.getCount(s) * idf);
    }
    for(String s : StringTools.getSortedList(tfIdf)) {
View Full Code Here

      } else {
        PhraseQuery pq = new PhraseQuery();
        for(String ss : StringTools.arrayToList(s.split("\\s+"))) pq.add(new Term("txt", ss));
        q = pq;
      }
      VectorCollector vc = new VectorCollector();
      is.search(q, vc);
      docFreq = vc.getResultsVector().size();
      double score;
      double expected = scaleFactor * docFreq;
      double excess = df.getCount(s) - expected;
      score = excess / clusterSize;       
      if(score > threshold) scores.put(s, score);
    }
    Stemmer st = new Stemmer(new EnglishStemmer());
    Map<String,List<String>> stems = st.wordsToStems(df.getSet());
    for(String stem : stems.keySet()) {
      List<String> words = stems.get(stem);
      if(words.size() > 1) {
        BooleanQuery bq = new BooleanQuery(true);
        for(String word : words) {
          bq.add(new BooleanClause(new TermQuery(new Term("txt", word)), Occur.SHOULD));
        }
        VectorCollector vc = new VectorCollector();
        is.search(bq, vc);
        double expected = scaleFactor * vc.getResultsVector().size();
        int overlap = overlapDocs(vc.getResultsVector(), cluster);
        double excess = overlap - expected;
        double score = excess / clusterSize;
        if(score > threshold) {
          df.add(stems.get(stem).toString(), overlap);
          scores.put(stems.get(stem).toString(), score);
        }
      }
    }
    Map<String,List<String>> termStems = ngtd.ngramsByStem();
    for(String stem : termStems.keySet()) {
      List<String> multiWords = termStems.get(stem);
      if(multiWords.size() > 1) {
        BooleanQuery bq = new BooleanQuery(true);
        for(String multiWord : multiWords) {
          PhraseQuery pq = new PhraseQuery();
          for(String ss : StringTools.arrayToList(multiWord.split("\\s+"))) pq.add(new Term("txt", ss));
          bq.add(new BooleanClause(pq, Occur.SHOULD));
        }
        VectorCollector vc = new VectorCollector();
        is.search(bq, vc);
        double expected = scaleFactor * vc.getResultsVector().size();
        int overlap = overlapDocs(vc.getResultsVector(), cluster);
        double excess = overlap - expected;
        double score = excess / clusterSize;
        if(score > threshold) {
          df.add(termStems.get(stem).toString(), overlap);
          scores.put(termStems.get(stem).toString(), score);
        }
      }
    }
    if(enriched) {
      for(String inchi : inchis) {
        Term luceneTerm = new Term("InChI", inchi);
        Query q = new TermQuery(luceneTerm);
        VectorCollector vc = new VectorCollector();
        is.search(q, vc);
        double expected = scaleFactor * vc.getResultsVector().size();
        int overlap = overlapDocs(vc.getResultsVector(), cluster);
        if(overlap < 2) continue;
        double excess = overlap - expected;
        double score = excess / clusterSize;
       
        if(score > threshold) {
          String s = "InChi: " + lci.getName(lci.hitsByInChI(inchi));
          scores.put(s, score);
          df.add(s, overlap);           
        }
      }
     
      Map<String,Set<String>> ontQs = OBOOntology.getInstance().queriesForIds(onts);
     
      for(String ontQ : ontQs.keySet()) {
        /*BooleanQuery bq = new BooleanQuery(true);
        if(ontQs.get(ontQ).size() > BooleanQuery.getMaxClauseCount()) continue;
        for(String ont : ontQs.get(ontQ)) {
          bq.add(new BooleanClause(new TermQuery(new Term("Ontology", ont)), Occur.SHOULD));
        }
        VectorCollector vc = new VectorCollector();
        is.search(bq, vc);*/
        VectorCollector vc = OntologyQueryCache.getResultsStatic(ontQ, ontQs.get(ontQ), is);
        Map<Integer,Float> results = vc.getResultsVector();
        double expected = scaleFactor * results.size();
        int overlap = overlapDocs(vc.getResultsVector(), cluster);
        if(overlap < 2) continue;
        double excess = overlap - expected;
        double score = excess / clusterSize;
        if(score > threshold) {
          String s = ontQ + " " + OBOOntology.getInstance().getNameForID(ontQ);
View Full Code Here

    int docTotal = ir.numDocs();
    for(String term : dfs.getSet()) {
      if(TermSets.getClosedClass().contains(term) || term.matches("[^A-Za-z]+")) continue;
      Term luceneTerm = new Term("txt", term);
      Query q = new TermQuery(luceneTerm);
      VectorCollector vc = new VectorCollector();
      is.search(q, vc);
      double score = similarity.similarity(cluster, vc.getResultsVector());
      if(score > threshold) {
        int overlap = overlapDocs(vc.getResultsVector(), cluster);
        if(overlap > 1) {
          scores.put(term, score);
          overlaps.put(term, overlap);           
        }
      }
    }
    for(String stem : stems.keySet()) {
      List<String> words = stems.get(stem);
      if(words.size() > 1) {
        BooleanQuery bq = new BooleanQuery(true);
        for(String word : words) {
          bq.add(new BooleanClause(new TermQuery(new Term("txt", word)), Occur.SHOULD));
        }
        VectorCollector vc = new VectorCollector();
        is.search(bq, vc);
        double score = similarity.similarity(cluster, vc.getResultsVector());
        if(score > threshold) {
          String s = words.toString();
          int overlap = overlapDocs(vc.getResultsVector(), cluster);
          if(overlap > 1) {
            scores.put(s, score);
            overlaps.put(s, overlap);           
          }
        }
      }
    }
    for(String stem : termStems.keySet()) {
      List<String> multiWords = termStems.get(stem);
      if(multiWords.size() > 1) {
        BooleanQuery bq = new BooleanQuery(true);
        for(String multiWord : multiWords) {
          PhraseQuery pq = new PhraseQuery();
          for(String ss : StringTools.arrayToList(multiWord.split("\\s+"))) pq.add(new Term("txt", ss));
          bq.add(new BooleanClause(pq, Occur.SHOULD));
        }
        VectorCollector vc = new VectorCollector();
        is.search(bq, vc);
        double score = similarity.similarity(cluster, vc.getResultsVector());
        if(score > threshold) {
          String s = multiWords.toString();
          int overlap = overlapDocs(vc.getResultsVector(), cluster);
          if(overlap > 1) {
            scores.put(s, score);
            overlaps.put(s, overlap);           
          }
        }
      }
    }
    for(String s : bs.getList()) {
      if(!s.matches(".*\\s+.*")) continue;
      PhraseQuery pq = new PhraseQuery();
      for(String ss : StringTools.arrayToList(s.split("\\s+"))) pq.add(new Term("txt", ss));
      VectorCollector vc = new VectorCollector();
      is.search(pq, vc);
      double score = similarity.similarity(cluster, vc.getResultsVector());
      if(score > threshold) {
        scores.put(s, score);
        overlaps.put(s, overlapDocs(vc.getResultsVector(), cluster));
      }
    }
   

    if(false) {
      for(String inchi : inchis) {
        Term luceneTerm = new Term("InChI", inchi);
        Query q = new TermQuery(luceneTerm);
        VectorCollector vc = new VectorCollector();
        is.search(q, vc);
        double score = similarity.similarity(cluster, vc.getResultsVector());
        if(score > threshold) {
          int overlap = overlapDocs(vc.getResultsVector(), cluster);
          if(overlap > 1) {
            String s = "InChi: " + lci.getName(lci.hitsByInChI(inchi));
            scores.put(s, score);
            overlaps.put(s, overlap);           
          }
        }
      }
     
      Map<String,Set<String>> ontQs = OBOOntology.getInstance().queriesForIds(onts);
     
      for(String ontQ : ontQs.keySet()) {
        BooleanQuery bq = new BooleanQuery(true);
        if(ontQs.get(ontQ).size() > BooleanQuery.getMaxClauseCount()) continue;
        for(String ont : ontQs.get(ontQ)) {
          bq.add(new BooleanClause(new TermQuery(new Term("Ontology", ont)), Occur.SHOULD));
        }
        VectorCollector vc = new VectorCollector();
        is.search(bq, vc);
        double score = similarity.similarity(cluster, vc.getResultsVector());
        if(score > threshold) {
          int overlap = overlapDocs(vc.getResultsVector(), cluster);
          if(overlap > 1) {
            String s = ontQ + " " + OBOOntology.getInstance().getNameForID(ontQ);
            scores.put(s, score);
            overlaps.put(s, overlap);           
          }
View Full Code Here

    //pq.add(new Term("txt", "aromatase"));
    //pq.add(new Term("txt", "inhibitors"));
    //Query q = new TermQuery(new Term("txt", queryTerm));
    Query q = new StemQuery(new Term("txt", queryTerm), stemmerTools);
    //q = pq;
    VectorCollector vc = new VectorCollector();
    is.search(q, vc);
    Map<String,Double> scores = simpleExcessAnalyseCluster(vc.getResultsVector(), lis.getIndexReader(), 0.01);
    for(String s : StringTools.getSortedList(scores)) {
      System.out.println(s + "\t" + scores.get(s));
    }

    //tfIdfAnalyseCluster(vc.getResultsVector(), lis.getIndexReader());
View Full Code Here

    String normWord = StringTools.normaliseName(word);
    if (!word.equals(normWord)) {
      contextable.add(makeWordFeature(normWord));
    }

    ExtractTrainingData etd = ExtractTrainingData.getInstance();
    makeWordFeatures(word, normWord, bigramable, etd);
    makeReactionFeatures(word, bigramable, contextable, etd);

    String wts = StringTools.removeTerminalS(normWord);
    contextable.add(WITHOUT_TERMINAL_S_FEATURE + wts);
View Full Code Here

  // I'm sure there's a nice analytic way of doing this. Ah well...
  public static void main(String[] args) {
    List<Double> positiveExamples = new ArrayList<Double>();
    List<Double> negativeExamples = new ArrayList<Double>();
   
    ExtractTrainingData etd1 = ExtractTrainingData.getInstance();
    List<File> sbFiles = new ArrayList<File>();
    sbFiles.addAll(FileTools.getFilesFromDirectoryByName(new File("/home/ptc24/newows/goodrsc"), "scrapbook.xml"));
    ExtractTrainingData etd2 = new ExtractTrainingData(sbFiles);
    Set<String> chem = new HashSet<String>(etd2.chemicalWords);
    //chem.removeAll(etd1.chemicalWords);
    for(String w : chem) {
      if(!NGramBuilder.getInstance().chemSet.contains(NGram.parseWord(w))) {
        double score = NGram.getInstance().testWord(w);
View Full Code Here

TOP

Related Classes of uk.ac.cam.ch.wwmm.oscar3.recogniser.document.Tokeniser

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.