Package uk.ac.cam.ch.wwmm.ptc.experimental.termsimilarity

Examples of uk.ac.cam.ch.wwmm.ptc.experimental.termsimilarity.SimilarityExtractor


    List<File> files = new ArrayList<File>();     
    //files = FileTools.getFilesFromDirectoryByName(new File("/home/ptc24/oscarworkspace/corpora/paperset1"), "markedup.xml");
    files = FileTools.getFilesFromDirectoryByName(new File("/home/ptc24/oscarworkspace/corpora/newEnzyme"), "markedup.xml");

    FeatureVectorExtractor fve = new InlineFVE(files);
    SimilarityExtractor se = new SimilarityExtractor(fve, new CosSimilarity(), new TTestWeighting());

    System.out.println("Indexing...");
    Map<String,Integer> termIndex = new HashMap<String,Integer>();
    int termNo = 0;
    List<String> terms = se.getMostFrequentTerms(5000);
    for(String term : terms) {
      termIndex.put(term, termNo);
      termNo++;
    }
    int featureNo = 0;
    List<String> features = se.getFeatures();
    Map<String,Integer> featureIndex = new HashMap<String,Integer>();
    for(String feature : features) {
      featureIndex.put(feature, featureNo);
      featureNo++;
    }
    System.out.println("Indexed...");
    SVDHarness svdh = new SVDHarness(termNo, featureNo);
    for(int i=0;i<termNo;i++) {
      Map<String,Double> wv = se.getWeightVector(terms.get(i));
      for(String feature : wv.keySet()) {
        int fno = featureIndex.get(feature);
        svdh.set(fno, i, wv.get(feature));
      }
    }
    System.out.println("Harness ready...");
    svdh.svd(10);
    Matrix lm = svdh.getUt().transpose();
    System.out.println(lm.getRowDimension());
    System.out.println(lm.getColumnDimension());
    System.out.println(termNo);
    System.out.println(featureNo);
    double [] svals = svdh.getS();
    if(true) {
      for(int i=0;i<svals.length;i++) {
        System.out.println(svals[i]);
        for(int j=0;j<lm.getRowDimension();j++) {
          lm.set(j, i, lm.get(j, i) * svals[i]);
        }
      }     
    }

    for(int tn=0;tn<termNo;tn++) {
      System.out.println(terms.get(tn));
      Map<String,Double> cosines = new HashMap<String,Double>();
      for(int i=0;i<terms.size();i++) {
        double termScore = 0.0;
        double otherTermScore = 0.0;
        double product = 0.0;
       
        for(int j=0;j<svals.length;j++) {
          double tVal = lm.get(tn,j);
          double otVal = lm.get(i,j);
          termScore += tVal * tVal;
          otherTermScore += otVal * otVal;
          product += tVal * otVal;
        }
        double cosine = product / (Math.sqrt(termScore) * Math.sqrt(otherTermScore));
        if(cosine > 0.4) cosines.put(terms.get(i), cosine);
      }
      for(String s : StringTools.getSortedList(cosines)) {
        System.out.println("\t" + s + "\t" + cosines.get(s) + "\t" + se.getSimilarity(terms.get(tn), s));
      }

    }
  }
View Full Code Here

TOP

Related Classes of uk.ac.cam.ch.wwmm.ptc.experimental.termsimilarity.SimilarityExtractor

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.