Examples of edu.udo.cs.wvtool.wordlist.WVTWordList

edu.udo.cs.wvtool.wordlist.WVTWordList
This class represents a word list. It is used to store information about individual words, to count words and to calculate the vectors. @author Michael Wurst @version $Id: WVTWordList.java,v 1.6 2007/05/22 18:07:27 mjwurst Exp $

        // Add entries
        list.addEntry(new WVTDocumentInfo("D:/crawler/data/testwww.nytimes.com/ref/membercenter/faq", "txt", "", "english", 1));


        // Generate the word list


        WVTWordList wordList = wvt.createWordList(list, config);


        // Prune the word list


        //wordList.pruneByFrequency(2, 5);


        // Store the word list in a file
        wordList.storePlain(new FileWriter("wordlist.txt"));


        // Create the word vectors


        // Set up an output filter (write sparse vectors to a file)
        FileWriter outFile = new FileWriter("wv.txt");

View Full Code Here

/*        WVTWordList wordList = wvt.createWordList(list, config);


        // Prune the word list
        wordList.pruneByFrequency(2, 5);
        wordList.storePlain(new FileWriter("wordlist.txt")); */
        WVTWordList wordList =
           new WVTWordList(
           new FileReader("mywordlist.txt"));


        // Set up an output filter (write sparse vectors to a file)
        FileWriter outFile = new FileWriter("wv.txt");
        //WordVectorWriter wvw = new WordVectorWriter()

View Full Code Here

     * @throws Exception
     */
    public WVTWordList createWordList(WVTInputList input, WVTConfiguration config, List initialWords, boolean addWords) throws WVToolException {


        // Initialize the word list
        WVTWordList wordList = new WVTWordList(initialWords, input.getNumClasses());
        wordList.setAppendWords(addWords);
        wordList.setUpdateOnlyCurrent(false);


        // Initialize pointers to components for the individual steps
        WVTDocumentLoader loader = null;
        WVTInputFilter infilter = null;
        WVTCharConverter charConverter = null;
        WVTTokenizer tokenizer = null;
        WVTWordFilter wordFilter = null;
        WVTStemmer stemmer = null;


        // Obtain an expanded list of all documents to consider
        Iterator inList = input.getEntries();


        // Get through the list
        while (inList.hasNext()) {


            WVTDocumentInfo d = (WVTDocumentInfo) inList.next();


            try {


                // Initialize all required components for this document


                loader = (WVTDocumentLoader) config.getComponentForStep(WVTConfiguration.STEP_LOADER, d);
                infilter = (WVTInputFilter) config.getComponentForStep(WVTConfiguration.STEP_INPUT_FILTER, d);
                charConverter = (WVTCharConverter) config.getComponentForStep(WVTConfiguration.STEP_CHAR_MAPPER, d);
                tokenizer = (WVTTokenizer) config.getComponentForStep(WVTConfiguration.STEP_TOKENIZER, d);
                wordFilter = (WVTWordFilter) config.getComponentForStep(WVTConfiguration.STEP_WORDFILTER, d);
                stemmer = (WVTStemmer) config.getComponentForStep(WVTConfiguration.STEP_STEMMER, d);


                // Process the document


                TokenEnumeration tokens = stemmer.stem(wordFilter.filter(tokenizer.tokenize(charConverter.convertChars(infilter.convertToPlainText(loader.loadDocument(d), d), d), d), d), d);


                while (tokens.hasMoreTokens()) {
                    wordList.addWordOccurance(tokens.nextToken());
                }


                wordList.closeDocument(d);
                loader.close(d);


            } catch (WVToolException e) {


                WVToolLogger.getGlobalLogger().logException("Problems processing document " + d.getSourceName(), e);

View Full Code Here


        // Currently somewhat inefficient: Simply reads all documents twice,
        // will be enhanced in future versions.


        // Generate the word list
        WVTWordList wordList = createWordList(input, config);


        // Prune the word list
        wordList.pruneByFrequency(DEFAULT_PRUNE_MIN, DEFAULT_PRUNE_MAX);


        // Create the word vectors
        createVectors(input, config, wordList);


    }

View Full Code Here

        list.addEntry(new WVTDocumentInfo("../data/german_utf8", "txt", "utf-8", "german"));
        list.addEntry(new WVTDocumentInfo("../data/german_iso", "txt", "iso-8859-1", "german"));


        // Generate the word list


        WVTWordList wordList = wvt.createWordList(list, config);


        // Prune the word list


        //wordList.pruneByFrequency(2, 5);


        // Alternative I: read an already created word list from a file
        // WVTWordList wordList2 =
        // new WVTWordList(new FileReader("/home/wurst/tmp/wordlisttest.txt"));


        // Alternative II: Use predefined dimensions
        // List dimensions = new Vector();
        // dimensions.add("atheist");
        // dimensions.add("christian");
        // wordList =
        // wvt.createWordList(list, config, dimensions, false);


        // Store the word list in a file
        wordList.storePlain(new FileWriter("wordlist.txt"));


        // Create the word vectors


        // Set up an output filter (write sparse vectors to a file)
        FileWriter outFile = new FileWriter("wv.txt");

View Full Code Here

        list.addEntry(new WVTDocumentInfo("../data/alt.atheism", "txt", "", "german", 0));
        list.addEntry(new WVTDocumentInfo("../data/soc.religion.christian", "txt", "", "english", 1));


        // Generate the word list


        WVTWordList wordList = wvt.createWordList(list, config);


        // Prune the word list
        wordList.pruneByFrequency(2, 5);


        // Store the aml file
        WordList2AMLFile.storeWordList(wordList, new FileWriter("test_wv.aml"), true, "wv.dat");
        // Create the word vectors

View Full Code Here

        list.addEntry(new WVTDocumentInfo("../data/alt.atheism", "txt", "", "german", 0));
        list.addEntry(new WVTDocumentInfo("../data/soc.religion.christian", "txt", "", "english", 1));


        // Generate the word list


        WVTWordList wordList = wvt.createWordList(list, config);


        // Prune the word list
        wordList.pruneByFrequency(2, 5);


        // Store the word list in a file
        wordList.storePlain(new FileWriter("wordlist.txt"));


        // Set up an output filter (write sparse vectors to a file)
        FileWriter outFile = new FileWriter("wv.txt");
        WordVectorWriter wvw = new WordVectorWriter(outFile, true);

View Full Code Here

        // Add entries
        list.addEntry(new WVTDocumentInfo("../data/pdf", "pdf", "", ""));


        // Generate the word list


        WVTWordList wordList = wvt.createWordList(list, config);


        // Prune the word list


        //wordList.pruneByFrequency(2, 5);


        // Alternative I: read an already created word list from a file
        // WVTWordList wordList2 =
        // new WVTWordList(new FileReader("/home/wurst/tmp/wordlisttest.txt"));


        // Alternative II: Use predefined dimensions
        // List dimensions = new Vector();
        // dimensions.add("atheist");
        // dimensions.add("christian");
        // wordList =
        // wvt.createWordList(list, config, dimensions, false);


        // Store the word list in a file
        wordList.storePlain(new FileWriter("wordlist.txt"));


        // Create the word vectors


        // Set up an output filter (write sparse vectors to a file)
        FileWriter outFile = new FileWriter("wv.txt");

View Full Code Here

        list.addEntry(new WVTDocumentInfo("../data/alt.atheism", "txt", "", "german", 0));
        list.addEntry(new WVTDocumentInfo("../data/soc.religion.christian", "txt", "", "english", 1));


        // Generate the word list


        WVTWordList wordList = wvt.createWordList(list, config);


        // Prune the word list


        wordList.pruneByFrequency(2, 5);


        // Alternative I: read an already created word list from a file
        // WVTWordList wordList2 =
        // new WVTWordList(new FileReader("/home/wurst/tmp/wordlisttest.txt"));


        // Alternative II: Use predefined dimensions
        // List dimensions = new Vector();
        // dimensions.add("atheist");
        // dimensions.add("christian");
        // wordList =
        // wvt.createWordList(list, config, dimensions, false);


        // Store the word list in a file
        wordList.storePlain(new FileWriter("wordlist.txt"));


        // Create the word vectors


        // Set up an output filter (write sparse vectors to a file)
        FileWriter outFile = new FileWriter("wv.txt");

View Full Code Here

        list.addEntry(new WVTDocumentInfo("../data/alt.atheism", "txt", "", "german", 0));
        list.addEntry(new WVTDocumentInfo("../data/soc.religion.christian", "txt", "", "english", 1));


        // Generate the word list


        WVTWordList wordList = wvt.createWordList(list, config);


        // Prune the word list
        //wordList.pruneByFrequency(1, 5);


        // Store the word list in a file
        wordList.storePlain(new FileWriter("wordlist.txt"));


        // Create the word vectors


        // Set up an output filter (write sparse vectors to a file)
        FileWriter outFile = new FileWriter("wv.txt");

View Full Code Here

0 1

TOP

Related Classes of edu.udo.cs.wvtool.wordlist.WVTWordList

crawler.text.CrawlWordVectorGenerator

crawler.text.Vectorizer

edu.udo.cs.wvtool.main.WVTool

WVToolCrawlerExample

WVToolDictionaryExample

WVToolExample

WVToolWordNetExample

WVToolYaleOutput

java.io.BufferedReader

java.util.StringTokenizer

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.