Package ch.akuhn.hapax.index

Examples of ch.akuhn.hapax.index.TermDocumentMatrix


    /**
     * Creates a builder that works on the given matrix.
     *
     * @param tdm the term-document matrix stored as this builder's corpus
     */
    public CorpusBuilder(TermDocumentMatrix tdm) {
        this.corpus = tdm;
    }

    /**
     * Creates a builder whose corpus is a newly constructed
     * {@link TermDocumentMatrix}.
     */
    public CorpusBuilder() {
        this(new TermDocumentMatrix());
    }
View Full Code Here


        return this;
    }


    public TermDocumentMatrix makeTDM() {
        TermDocumentMatrix tdm = corpus;
        if (ignoreCase) tdm = tdm.toLowerCase();
        if (rejectRareTerms) tdm = tdm.rejectHapaxes();
        if (rejectStopwords) tdm = tdm.rejectStopwords();
        // TODO if (rejectCommonTerms) tdm = tdm.rejectCommonTerms();
        return tdm.weight(local, global);
    }
View Full Code Here

public class SmallDocumentsTest {

    @Test
    public void corpusWithSmallDocuments() {
        TermDocumentMatrix tdm = new TermDocumentMatrix();
        tdm.putDocument("m1", new Terms("Lorem ipsum dolor."));
        tdm.putDocument("m2", new Terms("Lorem ipsum dolor."));
        tdm.putDocument("m3", new Terms("Lorem ipsum dolor."));
        LatentSemanticIndex lsi = tdm.rejectAndWeight().createIndex();
        assertEquals(3, lsi.documentCount());
        assertEquals(3, lsi.rankDocumentsByQuery("Lorem").size());
    }
View Full Code Here

        assertEquals(3, lsi.rankDocumentsByQuery("Lorem").size());
    }
   
    @Test
    public void corpusWithoutDocuments() {
        TermDocumentMatrix tdm = new TermDocumentMatrix();
        LatentSemanticIndex lsi = tdm.rejectAndWeight().createIndex();
        assertEquals(0, lsi.documentCount());
        assertEquals(0, lsi.rankDocumentsByQuery("Lorem").size());
    }
View Full Code Here

        assertEquals(0, lsi.rankDocumentsByQuery("Lorem").size());
    }
   
    @Test
    public void corpusWithoutOneDocuments() {
        TermDocumentMatrix tdm = new TermDocumentMatrix();
        tdm.putDocument("m1", new Terms("Lorem ipsum dolor."));
        LatentSemanticIndex lsi = tdm.rejectAndWeight().createIndex();
        assertEquals(1, lsi.documentCount());
        assertEquals(1, lsi.rankDocumentsByQuery("Lorem").size());
    }
View Full Code Here

    public static void main(String[] args) throws FileNotFoundException {

     
     
        TermDocumentMatrix tdm = new TermDocumentMatrix();
        CorpusBuilderHelper importer = new CorpusBuilderHelper(tdm);
        importer.importAllFiles(new File("../ch.deif.meander"), ".java");
        importer.importAllFiles(new File("."), ".java");

        Out.puts(tdm);

        Out.puts(Get.head(tdm.documents()));
        Out.puts(tdm.getDocument(Get.head(tdm.documents())));
       
        Out.puts(tdm);

        Out.puts(tdm.density());

        tdm = tdm.rejectAndWeight();

        Out.puts(tdm);

        Out.puts(tdm.density());

        Out.puts(tdm.terms().sortedCounts());

        LatentSemanticIndex lsi = tdm.createIndex();

        Out.puts(first(10, lsi.rankDocumentsByTerm("bag")));
        Out.puts(first(10, lsi.rankTermsByTerm("bag")));
        Out.puts(first(10, lsi.rankDocumentsByTerm("codemap")));
        Out.puts(first(10, lsi.rankDocumentsByQuery("split string by lower- and upper-case")));
View Full Code Here

public class Examples {

  @Test
  public void hapaxExample() {
   
    TermDocumentMatrix tdm;
   
    tdm = Hapax.newCorpus()
        .useCamelCaseScanner()
        .rejectRareTerms()
        .rejectCommonTerms()
        .useTFIDF()
        .addFiles(".", ".java")
        .makeTDM();
   
    System.out.println(tdm);
   
    tdm = Hapax.newCorpus()
        .useLetterScanner()
        .ignoreCase()
        .rejectStopwords()
        .rejectRareTerms()
        .dontUseWeighting()
        .addDocument("c1", "Human machine interface for Lab ABC computer applications")
        .addDocument("c2", "A survey of user opinion of computer system response time")
        .addDocument("c3", "The EPS user interface management system")
        .addDocument("c4", "System and human system engineering testing of EPS")
        .addDocument("c5", "Relation of user-perceived response time to error measurement")
        .addDocument("m1", "The generation of random, binary, unordered trees")
        .addDocument("m2", "The intersection graph of paths in trees")
        .addDocument("m3", "Graph minors IV: Widths of trees and well-quasi-ordering")
        .addDocument("m4", "Graph minors: A survey")
        .makeTDM();
       
    System.out.println(tdm);
    
    LatentSemanticIndex lsi = tdm.createIndex(2);
   
    Ranking<String> ranking = lsi.rankDocumentsByQuery("human computer interaction");
   
    System.out.println(ranking.top(10));
   
View Full Code Here

        { "m4", "Graph minors: A survey" }};
   
   
    @Test
    public TermDocumentMatrix makeTermDocumentMatrix() {
        TermDocumentMatrix tdm = new TermDocumentMatrix();
        for (String[] each: DATA) {
            tdm.putDocument(each[0], new Terms(each[1]));
        }
        assertEquals(9, tdm.documentCount());
        assertEquals(45, tdm.termCount());
        return tdm;
    }
View Full Code Here

    }
   
    @Test
    @Given("#makeTermDocumentMatrix")
    public TermDocumentMatrix rejectStopWords(final TermDocumentMatrix matrix) {
        TermDocumentMatrix tdm = matrix;
        tdm = tdm.toLowerCase();
        assertEquals(9, tdm.documentCount());
        assertEquals(42, tdm.termCount());
        tdm = tdm.rejectHapaxes();
        assertEquals(9, tdm.documentCount());
        assertEquals(16, tdm.termCount());
        tdm = tdm.toLowerCase().rejectStopwords();
        assertEquals(9, tdm.documentCount());
        assertEquals(12, tdm.termCount());
        assertEquals(SORTED, Get.sorted(tdm.terms().elementSet()).toString());
        return tdm;
    }
View Full Code Here

TOP

Related Classes of ch.akuhn.hapax.index.TermDocumentMatrix

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., owned by Oracle Inc. Contact coftware#gmail.com.