Examples of edu.udo.cs.wvtool.generic.loader.WVTDocumentLoader

Package edu.udo.cs.wvtool.generic.loader

Examples of edu.udo.cs.wvtool.generic.loader.WVTDocumentLoader

edu.udo.cs.wvtool.generic.loader.WVTDocumentLoader
This interface represents a mechanism by which a document is loaded. Loading refers to the operation of opening a stream to the source of the document. @author Michael Wurst @version $Id: WVTDocumentLoader.java,v 1.3 2007/05/20 18:06:03 mjwurst Exp $


    }


    public InputStream getInputStream(WVTDocumentInfo d, WVTConfiguration config) throws WVToolException {


        WVTDocumentLoader loader = null;
        loader = (WVTDocumentLoader) config.getComponentForStep(WVTConfiguration.STEP_LOADER, d);


        return loader.loadDocument(d);


    }

View Full Code Here


    }


    public Reader getReader(WVTDocumentInfo d, WVTConfiguration config) throws WVToolException {


        WVTDocumentLoader loader = null;
        WVTInputFilter infilter = null;


        loader = (WVTDocumentLoader) config.getComponentForStep(WVTConfiguration.STEP_LOADER, d);
        infilter = (WVTInputFilter) config.getComponentForStep(WVTConfiguration.STEP_INPUT_FILTER, d);


        return infilter.convertToPlainText(loader.loadDocument(d), d);


    }

View Full Code Here

        WVTWordList wordList = new WVTWordList(initialWords, input.getNumClasses());
        wordList.setAppendWords(addWords);
        wordList.setUpdateOnlyCurrent(false);


        // Initialize pointers to components for the individual steps
        WVTDocumentLoader loader = null;
        WVTInputFilter infilter = null;
        WVTCharConverter charConverter = null;
        WVTTokenizer tokenizer = null;
        WVTWordFilter wordFilter = null;
        WVTStemmer stemmer = null;


        // Obtain an expanded list of all documents to consider
        Iterator inList = input.getEntries();


        // Get through the list
        while (inList.hasNext()) {


            WVTDocumentInfo d = (WVTDocumentInfo) inList.next();


            try {


                // Intialize all required components for this document


                loader = (WVTDocumentLoader) config.getComponentForStep(WVTConfiguration.STEP_LOADER, d);
                infilter = (WVTInputFilter) config.getComponentForStep(WVTConfiguration.STEP_INPUT_FILTER, d);
                charConverter = (WVTCharConverter) config.getComponentForStep(WVTConfiguration.STEP_CHAR_MAPPER, d);
                tokenizer = (WVTTokenizer) config.getComponentForStep(WVTConfiguration.STEP_TOKENIZER, d);
                wordFilter = (WVTWordFilter) config.getComponentForStep(WVTConfiguration.STEP_WORDFILTER, d);
                stemmer = (WVTStemmer) config.getComponentForStep(WVTConfiguration.STEP_STEMMER, d);


                // Process the document


                TokenEnumeration tokens = stemmer.stem(wordFilter.filter(tokenizer.tokenize(charConverter.convertChars(infilter.convertToPlainText(loader.loadDocument(d), d), d), d), d), d);


                while (tokens.hasMoreTokens()) {
                    wordList.addWordOccurance(tokens.nextToken());
                }


                wordList.closeDocument(d);
                loader.close(d);


            } catch (WVToolException e) {


                WVToolLogger.getGlobalLogger().logException("Problems processing document " + d.getSourceName(), e);


                // close the input stream for this document


                loader.close(d);


                // If errors should not be skip throw an exception
                if (!skipErrors) {
                    throw new WVToolException("Problems processing document " + d.getSourceName(), e);

View Full Code Here


        wordList.setAppendWords(false);
        wordList.setUpdateOnlyCurrent(true);


        // Initialize pointers to components for the individual steps
        WVTDocumentLoader loader = null;
        WVTInputFilter infilter = null;
        WVTCharConverter charConverter = null;
        WVTTokenizer tokenizer = null;
        WVTWordFilter wordFilter = null;
        WVTStemmer stemmer = null;
        WVTVectorCreator vectorCreator = null;
        WVTOutputFilter outputFilter = null;


        // Obtain an expanded list of all documents to consider
        Iterator inList = input.getEntries();


        // Get through the list
        while (inList.hasNext()) {


            WVTDocumentInfo d = (WVTDocumentInfo) inList.next();


            try {


                // Intialize all required components for this document


                loader = (WVTDocumentLoader) config.getComponentForStep(WVTConfiguration.STEP_LOADER, d);
                infilter = (WVTInputFilter) config.getComponentForStep(WVTConfiguration.STEP_INPUT_FILTER, d);
                charConverter = (WVTCharConverter) config.getComponentForStep(WVTConfiguration.STEP_CHAR_MAPPER, d);
                tokenizer = (WVTTokenizer) config.getComponentForStep(WVTConfiguration.STEP_TOKENIZER, d);
                wordFilter = (WVTWordFilter) config.getComponentForStep(WVTConfiguration.STEP_WORDFILTER, d);
                stemmer = (WVTStemmer) config.getComponentForStep(WVTConfiguration.STEP_STEMMER, d);


                vectorCreator = (WVTVectorCreator) config.getComponentForStep(WVTConfiguration.STEP_VECTOR_CREATION, d);


                outputFilter = (WVTOutputFilter) config.getComponentForStep(WVTConfiguration.STEP_OUTPUT, d);


                // Process the document


                TokenEnumeration tokens = stemmer.stem(wordFilter.filter(tokenizer.tokenize(charConverter.convertChars(infilter.convertToPlainText(loader.loadDocument(d), d), d), d), d), d);


                while (tokens.hasMoreTokens()) {
                    wordList.addWordOccurance(tokens.nextToken());
                }


                outputFilter.write(vectorCreator.createVector(wordList.getFrequenciesForCurrentDocument(), wordList.getTermCountForCurrentDocument(), wordList, d));


                wordList.closeDocument(d);
                loader.close(d);


            } catch (WVToolException e) {


                // If an error occurs add it to the error log
                WVToolLogger.getGlobalLogger().logException("Problems processing document " + d.getSourceName(), e);


                // close the input stream for this document
                loader.close(d);


                // If errors should not be skip throw an exception
                if (!skipErrors)
                    throw new WVToolException("Problems processing document " + d.getSourceName(), e);

View Full Code Here

     * @throws WVToolException
     */
    public void iterateWords(WVTInputList input, WVTConfiguration config, WVToolWordListener listener) throws WVToolException {


        // Initialize pointers to components for the individual steps
        WVTDocumentLoader loader = null;
        WVTInputFilter infilter = null;
        WVTCharConverter charConverter = null;
        WVTTokenizer tokenizer = null;
        WVTWordFilter wordFilter = null;
        WVTStemmer stemmer = null;


        // Obtain an expanded list of all documents to consider
        Iterator inList = input.getEntries();


        // Get through the list
        while (inList.hasNext()) {


            WVTDocumentInfo d = (WVTDocumentInfo) inList.next();
            listener.openNewDocument(d);


            try {


                // Intialize all required components for this document


                loader = (WVTDocumentLoader) config.getComponentForStep(WVTConfiguration.STEP_LOADER, d);
                infilter = (WVTInputFilter) config.getComponentForStep(WVTConfiguration.STEP_INPUT_FILTER, d);
                charConverter = (WVTCharConverter) config.getComponentForStep(WVTConfiguration.STEP_CHAR_MAPPER, d);
                tokenizer = (WVTTokenizer) config.getComponentForStep(WVTConfiguration.STEP_TOKENIZER, d);
                wordFilter = (WVTWordFilter) config.getComponentForStep(WVTConfiguration.STEP_WORDFILTER, d);
                stemmer = (WVTStemmer) config.getComponentForStep(WVTConfiguration.STEP_STEMMER, d);


                // Process the document


                TokenEnumeration tokens = stemmer.stem(wordFilter.filter(tokenizer.tokenize(charConverter.convertChars(infilter.convertToPlainText(loader.loadDocument(d), d), d), d), d), d);


                while (tokens.hasMoreTokens()) {
                    listener.processWord(tokens.nextToken());
                }


                loader.close(d);


            } catch (WVToolException e) {


                // If an error occurs add it to the error log
                WVToolLogger.getGlobalLogger().logException("Problems processing document " + d.getSourceName(), e);


                // close the input stream for this document
                loader.close(d);


                // If errors should not be skip throw an exception
                if (!skipErrors)
                    throw new WVToolException("Problems processing document " + d.getSourceName(), e);

View Full Code Here

TOP

Related Classes of edu.udo.cs.wvtool.generic.loader.WVTDocumentLoader

edu.udo.cs.wvtool.main.WVTool

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact: coftware#gmail.com.