Package edu.udo.cs.wvtool.config

Examples of edu.udo.cs.wvtool.config.WVTConfiguration


        // Initialize the WVTool
        WVTool wvt = new WVTool(false);

        // Initialize the configuration
        WVTConfiguration config = new WVTConfiguration();

        final WVTStemmer dummyStemmer = new DummyStemmer();
        final WVTStemmer porterStemmer = new PorterStemmerWrapper();

        config.setConfigurationRule(WVTConfiguration.STEP_STEMMER, new WVTConfigurationRule() {
            public Object getMatchingComponent(WVTDocumentInfo d) {

                if (d.getContentLanguage().equals("english"))
                    return porterStemmer;
                else
                    return dummyStemmer;
            }
        });

        WVTStemmer stemmer = new LovinsStemmerWrapper();

        config.setConfigurationRule(WVTConfiguration.STEP_STEMMER, new WVTConfigurationFact(stemmer));

        // Initialize the input list with two classes
        WVTFileInputList list = new WVTFileInputList(1);

       
        // Add entries
        list.addEntry(new WVTDocumentInfo("D:/crawler/data/testwww.nytimes.com/ref/membercenter/faq", "txt", "", "english", 1));

        // Generate the word list

        WVTWordList wordList = wvt.createWordList(list, config);

        // Prune the word list

        //wordList.pruneByFrequency(2, 5);

        // Store the word list in a file
        wordList.storePlain(new FileWriter("wordlist.txt"));

        // Create the word vectors

        // Set up an output filter (write sparse vectors to a file)
        FileWriter outFile = new FileWriter("wv.txt");
        WordVectorWriter wvw = new WordVectorWriter(outFile, true);

        config.setConfigurationRule(WVTConfiguration.STEP_OUTPUT, new WVTConfigurationFact(wvw));

        config.setConfigurationRule(WVTConfiguration.STEP_VECTOR_CREATION, new WVTConfigurationFact(new TFIDF()));

        // Create the vectors
        wvt.createVectors(list, config, wordList);

        // Alternatively: create word list and vectors together
View Full Code Here


        // Initialize the WVTool
        WVTool wvt = new WVTool(true);

        // Initialize the configuration
        WVTConfiguration config = new WVTConfiguration();

        final WVTStemmer dummyStemmer = new DummyStemmer();
        final WVTStemmer porterStemmer = new PorterStemmerWrapper();

        config.setConfigurationRule(WVTConfiguration.STEP_STEMMER, new WVTConfigurationRule() {
            public Object getMatchingComponent(WVTDocumentInfo d) {

                if (d.getContentLanguage().equals("english"))
                    return porterStemmer;
                else
                    return dummyStemmer;
            }
        });

        // Initialize the input list with two classes
        WVTFileInputList list = new WVTFileInputList(1);
       
        list.addEntry(
            new WVTDocumentInfo("D:/crawler/dump",
            "txt","","english",0));
        // Generate the word list
       
/*        WVTWordList wordList = wvt.createWordList(list, config);

        // Prune the word list
        wordList.pruneByFrequency(2, 5);
        wordList.storePlain(new FileWriter("wordlist.txt")); */
        WVTWordList wordList =
           new WVTWordList(
           new FileReader("mywordlist.txt"));

        // Set up an output filter (write sparse vectors to a file)
        FileWriter outFile = new FileWriter("wv.txt");
        //WordVectorWriter wvw = new WordVectorWriter()
        WordVectorWriter wvw = new WordVectorWriter(outFile, true);

        config.setConfigurationRule(WVTConfiguration.STEP_OUTPUT, new WVTConfigurationFact(wvw));

        config.setConfigurationRule(WVTConfiguration.STEP_VECTOR_CREATION, new WVTConfigurationFact(new TFIDF()));

        // Create the vectors
        wvt.createVectors(list, config, wordList);

        // Alternatively: create word list and vectors together
View Full Code Here

     * @return a WVTWordVector
     * @throws WVToolException
     */
    public WVTWordVector createVector(String text, WVTWordList wordList) throws WVToolException {

        WVTConfiguration config = new WVTConfiguration();

        config.setConfigurationRule(WVTConfiguration.STEP_VECTOR_CREATION, new WVTConfigurationFact(new TFIDF()));

        return createVector(text, new WVTDocumentInfo("", "", "", ""), config, wordList);

    }
View Full Code Here

        // Initialize the WVTool
        WVTool wvt = new WVTool(false);

        // Initialize the configuration
        WVTConfiguration config = new WVTConfiguration();

        final WVTStemmer dummyStemmer = new DummyStemmer();
        final WVTStemmer porterStemmer = new PorterStemmerWrapper();

        config.setConfigurationRule(WVTConfiguration.STEP_STEMMER, new WVTConfigurationFact(new ToLowerCaseConverter()));

        // Initialize the input list with two classes
        WVTFileInputList list = new WVTFileInputList(0);

        // Add entries
        list.addEntry(new WVTDocumentInfo("../data/german_utf8", "txt", "utf-8", "german"));
        list.addEntry(new WVTDocumentInfo("../data/german_iso", "txt", "iso-8859-1", "german"));

        // Generate the word list

        WVTWordList wordList = wvt.createWordList(list, config);

        // Prune the word list

        //wordList.pruneByFrequency(2, 5);

        // Alternative I: read an already created word list from a file
        // WVTWordList wordList2 =
        // new WVTWordList(new FileReader("/home/wurst/tmp/wordlisttest.txt"));

        // Alternative II: Use predefined dimensions
        // List dimensions = new Vector();
        // dimensions.add("atheist");
        // dimensions.add("christian");
        // wordList =
        // wvt.createWordList(list, config, dimensions, false);

        // Store the word list in a file
        wordList.storePlain(new FileWriter("wordlist.txt"));

        // Create the word vectors

        // Set up an output filter (write sparse vectors to a file)
        FileWriter outFile = new FileWriter("wv.txt");
        WordVectorWriter wvw = new WordVectorWriter(outFile, true);

        config.setConfigurationRule(WVTConfiguration.STEP_OUTPUT, new WVTConfigurationFact(wvw));

        config.setConfigurationRule(WVTConfiguration.STEP_VECTOR_CREATION, new WVTConfigurationFact(new TFIDF()));

        // Create the vectors
        wvt.createVectors(list, config, wordList);

        // Alternatively: create word list and vectors together
View Full Code Here

        // Initialize the WVTool
        WVTool wvt = new WVTool(false);

        // Initialize the configuration
        WVTConfiguration config = new WVTConfiguration();

        final WVTStemmer dummyStemmer = new DummyStemmer();
        final WVTStemmer porterStemmer = new PorterStemmerWrapper();

        config.setConfigurationRule(WVTConfiguration.STEP_STEMMER, new WVTConfigurationRule() {
            public Object getMatchingComponent(WVTDocumentInfo d) {

                if (d.getContentLanguage().equals("english"))
                    return porterStemmer;
                else
                    return dummyStemmer;
            }
        });

        WVTStemmer stemmer = new LovinsStemmerWrapper();

        config.setConfigurationRule(WVTConfiguration.STEP_STEMMER, new WVTConfigurationFact(stemmer));

        // Initialize the input list with two classes
        WVTFileInputList list = new WVTFileInputList(2);

        // Add entries
        list.addEntry(new WVTDocumentInfo("../data/alt.atheism", "txt", "", "german", 0));
        list.addEntry(new WVTDocumentInfo("../data/soc.religion.christian", "txt", "", "english", 1));

        // Generate the word list

        WVTWordList wordList = wvt.createWordList(list, config);

        // Prune the word list
        wordList.pruneByFrequency(2, 5);

        // Store the aml file
        WordList2AMLFile.storeWordList(wordList, new FileWriter("test_wv.aml"), true, "wv.dat");
        // Create the word vectors

        // Set up an output filter (write sparse vectors to a file)
        FileWriter outFile = new FileWriter("wv.dat");
        WordVectorWriter wvw = new WordVectorWriter(outFile, true, true, true, 1);

        config.setConfigurationRule(WVTConfiguration.STEP_OUTPUT, new WVTConfigurationFact(wvw));

        config.setConfigurationRule(WVTConfiguration.STEP_VECTOR_CREATION, new WVTConfigurationFact(new TFIDF()));

        // Create the vectors
        wvt.createVectors(list, config, wordList);

        // Close the output file
View Full Code Here

       // Initialize the WVTool
        WVTool wvt = new WVTool(false);

        // Initialize the configuration
        WVTConfiguration config = new WVTConfiguration();

        WVTStemmer stemmer = new WordNetHypernymStemmer();

        config.setConfigurationRule(WVTConfiguration.STEP_STEMMER, new WVTConfigurationFact(stemmer));

        // Initialize the input list with two classes
        WVTFileInputList list = new WVTFileInputList(2);

        // Add entries
        list.addEntry(new WVTDocumentInfo("../data/alt.atheism", "txt", "", "german", 0));
        list.addEntry(new WVTDocumentInfo("../data/soc.religion.christian", "txt", "", "english", 1));

        // Generate the word list

        WVTWordList wordList = wvt.createWordList(list, config);

        // Prune the word list
        wordList.pruneByFrequency(2, 5);

        // Store the word list in a file
        wordList.storePlain(new FileWriter("wordlist.txt"));

        // Set up an output filter (write sparse vectors to a file)
        FileWriter outFile = new FileWriter("wv.txt");
        WordVectorWriter wvw = new WordVectorWriter(outFile, true);

        config.setConfigurationRule(WVTConfiguration.STEP_OUTPUT, new WVTConfigurationFact(wvw));

        config.setConfigurationRule(WVTConfiguration.STEP_VECTOR_CREATION, new WVTConfigurationFact(new TFIDF()));

        // Create the vectors
        wvt.createVectors(list, config, wordList);

        // Alternatively: create word list and vectors together
View Full Code Here

        // Initialize the WVTool
        WVTool wvt = new WVTool(false);

        // Initialize the configuration
        WVTConfiguration config = new WVTConfiguration();

        final WVTStemmer dummyStemmer = new DummyStemmer();
        final WVTStemmer porterStemmer = new PorterStemmerWrapper();

        config.setConfigurationRule(WVTConfiguration.STEP_STEMMER, new WVTConfigurationFact(new ToLowerCaseConverter()));

        // Initialize the input list with two classes
        WVTFileInputList list = new WVTFileInputList(0);

        // Add entries
        list.addEntry(new WVTDocumentInfo("../data/pdf", "pdf", "", ""));

        // Generate the word list

        WVTWordList wordList = wvt.createWordList(list, config);

        // Prune the word list

        //wordList.pruneByFrequency(2, 5);

        // Alternative I: read an already created word list from a file
        // WVTWordList wordList2 =
        // new WVTWordList(new FileReader("/home/wurst/tmp/wordlisttest.txt"));

        // Alternative II: Use predefined dimensions
        // List dimensions = new Vector();
        // dimensions.add("atheist");
        // dimensions.add("christian");
        // wordList =
        // wvt.createWordList(list, config, dimensions, false);

        // Store the word list in a file
        wordList.storePlain(new FileWriter("wordlist.txt"));

        // Create the word vectors

        // Set up an output filter (write sparse vectors to a file)
        FileWriter outFile = new FileWriter("wv.txt");
        WordVectorWriter wvw = new WordVectorWriter(outFile, true);

        config.setConfigurationRule(WVTConfiguration.STEP_OUTPUT, new WVTConfigurationFact(wvw));

        config.setConfigurationRule(WVTConfiguration.STEP_VECTOR_CREATION, new WVTConfigurationFact(new TFIDF()));

        // Create the vectors
        wvt.createVectors(list, config, wordList);

        // Alternatively: create word list and vectors together
View Full Code Here

        // Initialize the WVTool
        WVTool wvt = new WVTool(false);

        // Initialize the configuration
        WVTConfiguration config = new WVTConfiguration();

        final WVTStemmer dummyStemmer = new DummyStemmer();
        final WVTStemmer porterStemmer = new PorterStemmerWrapper();

        config.setConfigurationRule(WVTConfiguration.STEP_STEMMER, new WVTConfigurationRule() {
            public Object getMatchingComponent(WVTDocumentInfo d) {

                if (d.getContentLanguage().equals("english"))
                    return porterStemmer;
                else
                    return dummyStemmer;
            }
        });

        WVTStemmer stemmer = new LovinsStemmerWrapper();

        config.setConfigurationRule(WVTConfiguration.STEP_STEMMER, new WVTConfigurationFact(stemmer));

        // Initialize the input list with two classes
        WVTFileInputList list = new WVTFileInputList(2);

        // Add entries
        list.addEntry(new WVTDocumentInfo("../data/alt.atheism", "txt", "", "german", 0));
        list.addEntry(new WVTDocumentInfo("../data/soc.religion.christian", "txt", "", "english", 1));

        // Generate the word list

        WVTWordList wordList = wvt.createWordList(list, config);

        // Prune the word list

        wordList.pruneByFrequency(2, 5);

        // Alternative I: read an already created word list from a file
        // WVTWordList wordList2 =
        // new WVTWordList(new FileReader("/home/wurst/tmp/wordlisttest.txt"));

        // Alternative II: Use predefined dimensions
        // List dimensions = new Vector();
        // dimensions.add("atheist");
        // dimensions.add("christian");
        // wordList =
        // wvt.createWordList(list, config, dimensions, false);

        // Store the word list in a file
        wordList.storePlain(new FileWriter("wordlist.txt"));

        // Create the word vectors

        // Set up an output filter (write sparse vectors to a file)
        FileWriter outFile = new FileWriter("wv.txt");
        WordVectorWriter wvw = new WordVectorWriter(outFile, true);

        config.setConfigurationRule(WVTConfiguration.STEP_OUTPUT, new WVTConfigurationFact(wvw));

        config.setConfigurationRule(WVTConfiguration.STEP_VECTOR_CREATION, new WVTConfigurationFact(new TFIDF()));

        // Create the vectors
        wvt.createVectors(list, config, wordList);

        // Alternatively: create word list and vectors together
View Full Code Here

        // Initialize the WVTool
        WVTool wvt = new WVTool(false);

        // Initialize the configuration
        WVTConfiguration config = new WVTConfiguration();

        WVTStemmer stemmer = new DictionaryStemmer(new FileReader("../data/sample_dictionary.txt"));

        config.setConfigurationRule(WVTConfiguration.STEP_STEMMER, new WVTConfigurationFact(stemmer));

        // Initialize the input list with two classes
        WVTFileInputList list = new WVTFileInputList(2);

        // Add entries
        list.addEntry(new WVTDocumentInfo("../data/alt.atheism", "txt", "", "german", 0));
        list.addEntry(new WVTDocumentInfo("../data/soc.religion.christian", "txt", "", "english", 1));

        // Generate the word list

        WVTWordList wordList = wvt.createWordList(list, config);

        // Prune the word list
        //wordList.pruneByFrequency(1, 5);

        // Store the word list in a file
        wordList.storePlain(new FileWriter("wordlist.txt"));

        // Create the word vectors

        // Set up an output filter (write sparse vectors to a file)
        FileWriter outFile = new FileWriter("wv.txt");
        WordVectorWriter wvw = new WordVectorWriter(outFile, true);

        config.setConfigurationRule(WVTConfiguration.STEP_OUTPUT, new WVTConfigurationFact(wvw));

        config.setConfigurationRule(WVTConfiguration.STEP_VECTOR_CREATION, new WVTConfigurationFact(new TFIDF()));

        // Create the vectors
        wvt.createVectors(list, config, wordList);

        // Alternatively: create word list and vectors together
View Full Code Here

        // Initialize the WVTool
        WVTool wvt = new WVTool(true);

        // Initialize the configuration
        WVTConfiguration config = new WVTConfiguration();

        final WVTStemmer dummyStemmer = new DummyStemmer();
        final WVTStemmer porterStemmer = new PorterStemmerWrapper();

        config.setConfigurationRule(WVTConfiguration.STEP_STEMMER, new WVTConfigurationRule() {
            public Object getMatchingComponent(WVTDocumentInfo d) {

                if (d.getContentLanguage().equals("english"))
                    return porterStemmer;
                else
                    return dummyStemmer;
            }
        });

        WVToolCrawler test = new WVToolCrawler("", "", "") {

            protected boolean vectorizePage(Page page) {

                String url = page.getURL().toExternalForm();

                return (url.indexOf("html")>-1);
            }

            public boolean shouldVisit(Link arg0) {

                return true;

            }
        };

        test.addRoot(new Link(new URL("http://www.bankersonline.com/")));
        test.setMaxDepth(2);

        // Initialize the input list with two classes
        WVTInputList list = new CrawledInputList(test);

        // Generate the word list

        WVTWordList wordList = wvt.createWordList(list, config);

        // Prune the word list
        wordList.pruneByFrequency(2, 5);

        wordList.storePlain(new FileWriter("wordlist.txt"));

        // Set up an output filter (write sparse vectors to a file)
        FileWriter outFile = new FileWriter("wv.txt");
        WordVectorWriter wvw = new WordVectorWriter(outFile, true);

        config.setConfigurationRule(WVTConfiguration.STEP_OUTPUT, new WVTConfigurationFact(wvw));

        config.setConfigurationRule(WVTConfiguration.STEP_VECTOR_CREATION, new WVTConfigurationFact(new TFIDF()));

        // Create the vectors
        wvt.createVectors(list, config, wordList);

        // Alternatively: create word list and vectors together
View Full Code Here

TOP

Related Classes of edu.udo.cs.wvtool.config.WVTConfiguration

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.