Examples of uk.ac.cam.ch.wwmm.oscar3.chemnamedict.ChemNameDict$ChemRecord

Package uk.ac.cam.ch.wwmm.oscar3.chemnamedict

Examples of uk.ac.cam.ch.wwmm.oscar3.chemnamedict.ChemNameDict$ChemRecord

uk.ac.cam.ch.wwmm.oscar3.indexersearcher.LuceneIndexerSearcher
Name to structure dictionary, holds active data in memory, a replacement class for ChemNameDict. Improvements include the storage of orphan chemical names (i.e. those with no structure associated) and a simpler XML serialisation (with no need for ID numbers on everything). NewChemNameDict stores Chemical Records, Orphan Names, and Stopwords. Chemical Records must have an InChI identifier, may have a SMILES string, and an unlimited number of names and ontology identifiers. The InChI identifiers are unique; it is not possible to have two records with the same identifier. Orphan Names are names which have no InChI; a name can only be an Orphan Name if it does not appear as a name in any chemical record. Stopwords are things that the system should not recognise as chemical names. Currently, there is no checking to see if stopwords also appear in other name lists. Note that in chemical records, the aim is to associate names with InChIs, and ontology identifiers with InChIs, rather than to directly associate names with ontology identifiers. If you need to associate names directly with ontology identifiers, list them as orphan names here, and associate them with their identifiers in ontology.txt. @author ptc24

      System.out.println(s + "\t" + tfIdf.get(s));
    }  
  }
  
  public static Map<String,Double> excessAnalyseCluster(Map<Integer,Float> cluster, IndexReader ir, double threshold, boolean enriched) throws Exception {
    LuceneChemicalIndex lci = new LuceneIndexerSearcher(false).getLci();
    Set<String> inchis = new HashSet<String>();
    Set<String> onts = new HashSet<String>();
    
    List<File> clusterFiles = new ArrayList<File>();
    for(Integer i : cluster.keySet()) {
      clusterFiles.add(new File(ir.document(i).getField("filename").stringValue().replaceAll("markedup", "source")));
      if(enriched) {
        TermFreqVector tvf = ir.getTermFreqVector(i, "InChI");
        if(tvf != null) {
          String [] termArray = tvf.getTerms();
          for(int j=0;j<termArray.length;j++) {
            inchis.add(termArray[j]);
          }        
        }
        tvf = ir.getTermFreqVector(i, "Ontology");
        if(tvf != null) {
          String [] termArray = tvf.getTerms();
          for(int j=0;j<termArray.length;j++) {
            onts.add(termArray[j]);
          }        
        }        
      }
    }
    NGramTfDf ngtd = NGramTfDf.analyseFiles(clusterFiles);
    ngtd.calculateNGrams();
    Bag<String> df = ngtd.getDfBag(1);
    df.discardInfrequent(2);
    Map<String,Double> scores = new HashMap<String,Double>();
    int numDocs = ir.numDocs();
    int clusterSize = cluster.size();
    double scaleFactor = clusterSize * 1.0 / numDocs;
    IndexSearcher is = new IndexSearcher(ir);
    for(String s : df.getSet()) {
      //System.out.println(s);
      int docFreq = 0;
      Query q;
      if(s.matches("\\S+")) {
        TermQuery tq = new TermQuery(new Term("txt", s));
        q = tq;
        //docFreq = ir.docFreq(new Term("txt", s));
      } else {
        PhraseQuery pq = new PhraseQuery();
        for(String ss : StringTools.arrayToList(s.split("\\s+"))) pq.add(new Term("txt", ss));
        q = pq;
      }
      VectorCollector vc = new VectorCollector();
      is.search(q, vc);
      docFreq = vc.getResultsVector().size();
      double score;
      double expected = scaleFactor * docFreq;
      double excess = df.getCount(s) - expected;
      score = excess / clusterSize;        
      if(score > threshold) scores.put(s, score);
    }
    Stemmer st = new Stemmer(new EnglishStemmer());
    Map<String,List<String>> stems = st.wordsToStems(df.getSet());
    for(String stem : stems.keySet()) {
      List<String> words = stems.get(stem);
      if(words.size() > 1) {
        BooleanQuery bq = new BooleanQuery(true);
        for(String word : words) {
          bq.add(new BooleanClause(new TermQuery(new Term("txt", word)), Occur.SHOULD));
        }
        VectorCollector vc = new VectorCollector();
        is.search(bq, vc);
        double expected = scaleFactor * vc.getResultsVector().size();
        int overlap = overlapDocs(vc.getResultsVector(), cluster);
        double excess = overlap - expected;
        double score = excess / clusterSize;
        if(score > threshold) {
          df.add(stems.get(stem).toString(), overlap);
          scores.put(stems.get(stem).toString(), score);
        }
      }
    }
    Map<String,List<String>> termStems = ngtd.ngramsByStem();
    for(String stem : termStems.keySet()) {
      List<String> multiWords = termStems.get(stem);
      if(multiWords.size() > 1) {
        BooleanQuery bq = new BooleanQuery(true);
        for(String multiWord : multiWords) {
          PhraseQuery pq = new PhraseQuery();
          for(String ss : StringTools.arrayToList(multiWord.split("\\s+"))) pq.add(new Term("txt", ss));
          bq.add(new BooleanClause(pq, Occur.SHOULD));
        }
        VectorCollector vc = new VectorCollector();
        is.search(bq, vc);
        double expected = scaleFactor * vc.getResultsVector().size();
        int overlap = overlapDocs(vc.getResultsVector(), cluster);
        double excess = overlap - expected;
        double score = excess / clusterSize;
        if(score > threshold) {
          df.add(termStems.get(stem).toString(), overlap);
          scores.put(termStems.get(stem).toString(), score);
        }
      }
    }
    if(enriched) {
      for(String inchi : inchis) {
        Term luceneTerm = new Term("InChI", inchi);
        Query q = new TermQuery(luceneTerm);
        VectorCollector vc = new VectorCollector();
        is.search(q, vc);
        double expected = scaleFactor * vc.getResultsVector().size();
        int overlap = overlapDocs(vc.getResultsVector(), cluster);
        if(overlap < 2) continue;
        double excess = overlap - expected;
        double score = excess / clusterSize;
        
        if(score > threshold) {
          String s = "InChi: " + lci.getName(lci.hitsByInChI(inchi));
          scores.put(s, score);
          df.add(s, overlap);            
        }
      }

View Full Code Here

    return scores;
  }


  
  public static void analyseCluster(Map<Integer,Float> cluster, IndexReader ir, DocVectorSimilarity similarity, double threshold) throws Exception {
    LuceneChemicalIndex lci = new LuceneIndexerSearcher(false).getLci();
    List<File> clusterFiles = new ArrayList<File>();
    Bag<String> dfs = new Bag<String>();
    Set<String> inchis = new HashSet<String>();
    Set<String> onts = new HashSet<String>();
    for(Integer i : cluster.keySet()) {
      cluster.put(i, 1.0f);
      TermFreqVector tvf = ir.getTermFreqVector(i, "txt");
      String [] termArray = tvf.getTerms();
      for(int j=0;j<termArray.length;j++) {
        dfs.add(termArray[j]);
      }
      if(false) {
        tvf = ir.getTermFreqVector(i, "InChI");
        if(tvf != null) {
          termArray = tvf.getTerms();
          for(int j=0;j<termArray.length;j++) {
            inchis.add(termArray[j]);
          }        
        }
        tvf = ir.getTermFreqVector(i, "Ontology");
        if(tvf != null) {
          termArray = tvf.getTerms();
          for(int j=0;j<termArray.length;j++) {
            onts.add(termArray[j]);
          }        
        }        
      }
      


      clusterFiles.add(new File(ir.document(i).getField("filename").stringValue().replaceAll("markedup", "source")));
    }
    Stemmer st = new Stemmer(new EnglishStemmer());
    Map<String,List<String>> stems = st.wordsToStems(dfs.getSet());


    dfs.discardInfrequent(2);
    NGramTfDf ngtd = NGramTfDf.analyseFiles(clusterFiles);
    ngtd.calculateNGrams();
    Bag<String> bs = ngtd.getDfBag(2);
    bs.discardInfrequent(2);
    Map<String,List<String>> termStems = ngtd.ngramsByStem();


    Map<String,Double> scores = new HashMap<String,Double>();
    Map<String,Integer> overlaps = new HashMap<String,Integer>();
    IndexSearcher is = new IndexSearcher(ir);
    int docTotal = ir.numDocs();
    for(String term : dfs.getSet()) {
      if(TermSets.getClosedClass().contains(term) || term.matches("[^A-Za-z]+")) continue;
      Term luceneTerm = new Term("txt", term);
      Query q = new TermQuery(luceneTerm);
      VectorCollector vc = new VectorCollector();
      is.search(q, vc);
      double score = similarity.similarity(cluster, vc.getResultsVector());
      if(score > threshold) {
        int overlap = overlapDocs(vc.getResultsVector(), cluster);
        if(overlap > 1) {
          scores.put(term, score);
          overlaps.put(term, overlap);            
        }
      }
    }
    for(String stem : stems.keySet()) {
      List<String> words = stems.get(stem);
      if(words.size() > 1) {
        BooleanQuery bq = new BooleanQuery(true);
        for(String word : words) {
          bq.add(new BooleanClause(new TermQuery(new Term("txt", word)), Occur.SHOULD));
        }
        VectorCollector vc = new VectorCollector();
        is.search(bq, vc);
        double score = similarity.similarity(cluster, vc.getResultsVector());
        if(score > threshold) {
          String s = words.toString();
          int overlap = overlapDocs(vc.getResultsVector(), cluster);
          if(overlap > 1) {
            scores.put(s, score);
            overlaps.put(s, overlap);            
          }
        }
      }
    }
    for(String stem : termStems.keySet()) {
      List<String> multiWords = termStems.get(stem);
      if(multiWords.size() > 1) {
        BooleanQuery bq = new BooleanQuery(true);
        for(String multiWord : multiWords) {
          PhraseQuery pq = new PhraseQuery();
          for(String ss : StringTools.arrayToList(multiWord.split("\\s+"))) pq.add(new Term("txt", ss));
          bq.add(new BooleanClause(pq, Occur.SHOULD));
        }
        VectorCollector vc = new VectorCollector();
        is.search(bq, vc);
        double score = similarity.similarity(cluster, vc.getResultsVector());
        if(score > threshold) {
          String s = multiWords.toString();
          int overlap = overlapDocs(vc.getResultsVector(), cluster);
          if(overlap > 1) {
            scores.put(s, score);
            overlaps.put(s, overlap);            
          }
        }
      }
    }
    for(String s : bs.getList()) {
      if(!s.matches(".*\\s+.*")) continue;
      PhraseQuery pq = new PhraseQuery();
      for(String ss : StringTools.arrayToList(s.split("\\s+"))) pq.add(new Term("txt", ss));
      VectorCollector vc = new VectorCollector();
      is.search(pq, vc);
      double score = similarity.similarity(cluster, vc.getResultsVector());
      if(score > threshold) {
        scores.put(s, score);
        overlaps.put(s, overlapDocs(vc.getResultsVector(), cluster));
      }
    }
    


    if(false) {
      for(String inchi : inchis) {
        Term luceneTerm = new Term("InChI", inchi);
        Query q = new TermQuery(luceneTerm);
        VectorCollector vc = new VectorCollector();
        is.search(q, vc);
        double score = similarity.similarity(cluster, vc.getResultsVector());
        if(score > threshold) {
          int overlap = overlapDocs(vc.getResultsVector(), cluster);
          if(overlap > 1) {
            String s = "InChi: " + lci.getName(lci.hitsByInChI(inchi));
            scores.put(s, score);
            overlaps.put(s, overlap);            
          }
        }
      }

View Full Code Here


  /**
   * @param args
   */
  public static void main(String[] args) throws Exception {
    LuceneIndexerSearcher lis = new LuceneIndexerSearcher(false);
    IndexSearcher is = lis.getIndexSearcher();


    Stemmer stemmerTools = new Stemmer(new EnglishStemmer());
    
    //QueryParser qp = new Oscar3QueryParser("txt", new Oscar3Analyzer(), lis, false);
    //Query q = qp.parse("NaCl");
    
    String queryTerm = "lipid";
    //PhraseQuery pq = new PhraseQuery();
    //pq.add(new Term("txt", "aromatase"));
    //pq.add(new Term("txt", "inhibitors"));
    Query q = new TermQuery(new Term("txt", queryTerm));
    //Query q = new StemQuery(new Term("txt", queryTerm), stemmerTools);


    for(int i=0;i<100;i++) {
      VectorCollector vc = new VectorCollector();
      is.search(q, vc);
      for(Integer j : new ArrayList<Integer>(vc.getResultsVector().keySet())) {
        if(vc.getResultsVector().get(j) < 0.2) vc.getResultsVector().remove(j);
      }
      Map<String,Double> scores = ClusterAnalyser.simpleExcessAnalyseCluster(vc.getResultsVector(), lis.getIndexReader(), 0.1);
      BooleanQuery bq = new BooleanQuery(false);
      List<String> terms = StringTools.getSortedList(scores);
      if(terms.size() > 10) terms = terms.subList(0, 10);
      for(String s : terms) {
        System.out.println(s + "\t" + scores.get(s));

View Full Code Here

   */
  public static void main(String[] args) throws Exception {
    //System.out.println(StringTools.arrayToList(StandardAnalyzer.STOP_WORDS));
    //if(true) return;
    
    LuceneIndexerSearcher lis = new LuceneIndexerSearcher(false);


    String queryTerm = "cyp2d6";
    
    /*List<String> queryList = StringTools.arrayToList(new String[] {
    "cyp2d6", 
    "cyp3a4", 
    "dextromethorphan"});*/
    List<String> queryList = new ArrayList<String>();
    long time = System.currentTimeMillis();
    IndexReader ir = lis.getIndexReader();
    TermEnum termEnum = ir.terms();
    while(termEnum.next()) {
      Term t = termEnum.term();
      if(t.field().equals("txt") && termEnum.docFreq() > 20) queryList.add(t.text());
    }
    //queryList.addAll(lis.termsFromQuery(new TermQuery(new Term("txt", queryTerm))));
    System.out.println("All terms loaded: " + (System.currentTimeMillis() - time));
    
    time = System.currentTimeMillis();
    Map<String,Map<Integer,Float>> vectors = new HashMap<String,Map<Integer,Float>>();
    for(String query : queryList) {
      //System.out.println(query);
      vectors.put(query, lis.getScoresVectorForQuery(new TermQuery(new Term("txt", query))));
    }
    /*PhraseQuery pq = new PhraseQuery();
    pq.add(new Term("txt", "cyp2d6"));
    pq.add(new Term("txt", "inhibitors"));
    vectors.put("cyp2d6 inhibitors", lis.getScoresVectorForQuery(pq));*/

View Full Code Here

  
  /**
   * @param args
   */
  public static void main(String[] args) throws Exception{
    LuceneIndexerSearcher lis = new LuceneIndexerSearcher(false);
    IndexReader ir = lis.getIndexReader();
    
    int numDocs = ir.numDocs();
    TermEnum textEnum = ir.terms();
    Map<String,Integer> docFreqs = new HashMap<String,Integer>();
    while(textEnum.next()) {

View Full Code Here


  /**
   * @param args
   */
  public static void main(String[] args) throws Exception {
    LuceneIndexerSearcher lis = new LuceneIndexerSearcher(false);
    IndexSearcher is = lis.getIndexSearcher();
    IndexReader ir = lis.getIndexReader();
    
    TermEnum textEnum = ir.terms();
    Map<String,Integer> docFreqs = new HashMap<String,Integer>();
    float nd = ir.numDocs() * 1.0f;
    while(textEnum.next()) {

View Full Code Here

  
  /**
   * @param args
   */
  public static void main(String[] args) throws Exception {
    LuceneIndexerSearcher lis = new LuceneIndexerSearcher(false);
    IndexSearcher is = lis.getIndexSearcher();


    TreeNode tn = new TreeNode();
    
    tn.subCluster(is);

View Full Code Here

  /**
   * @param args
   */
  public static void main(String[] args) throws Exception {
    // TODO Auto-generated method stub
    LuceneIndexerSearcher lis = new LuceneIndexerSearcher(false);
    IndexReader ir = lis.getIndexReader();
    Directory dir = new RAMDirectory(ir.directory());
    ir.close();
    IndexSearcher is = new IndexSearcher(dir);
    ir = is.getIndexReader();

View Full Code Here


  /**
   * @param args
   */
  public static void main(String[] args) throws Exception {
    LuceneIndexerSearcher lis = new LuceneIndexerSearcher(false);
    IndexSearcher is = lis.getIndexSearcher();
    IndexReader ir = lis.getIndexReader();
    Bag<String> tfBag = new Bag<String>();
    for(int i=0;i<ir.maxDoc();i++) {
      TermFreqVector tv = ir.getTermFreqVector(i, "txt");
      String [] terms = tv.getTerms();
      int [] freqs = tv.getTermFrequencies();

View Full Code Here

  
  /**
   * @param args
   */
  public static void main(String[] args) throws Exception {
    LuceneIndexerSearcher lis = new LuceneIndexerSearcher(false);
    IndexReader ir = lis.getIndexReader();
    
    long allTime = System.currentTimeMillis();
    
    int numDocs = ir.numDocs();
    TermEnum textEnum = ir.terms();

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of uk.ac.cam.ch.wwmm.oscar3.chemnamedict.ChemNameDict$ChemRecord

dk.brics.automaton.Automaton

dk.brics.automaton.RunAutomaton

All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.