Package com.googlecode.gaal.preprocess.impl

Examples of com.googlecode.gaal.preprocess.impl.MultidocumentRegexTokenizer


    protected List<Interval> dstObjects;

    public VectorVisualizer(String srcFileName, String dstFileName, int windowSize) throws FileNotFoundException {
        FileReader srcReader = new FileReader(srcFileName);
        FileReader dstReader = new FileReader(dstFileName);
        Tokenizer<String> srcTokenizer = new MultidocumentRegexTokenizer(srcReader, STRING_REGEX,
                new LowerCaseNormalizer());
        Tokenizer<String> dstTokenizer = new MultidocumentRegexTokenizer(dstReader, STRING_REGEX,
                new LowerCaseNormalizer());
        srcCorpus = new TreeMapCorpus(srcTokenizer, SEPARATORS);
        dstCorpus = new TreeMapCorpus(dstTokenizer, SEPARATORS);
        // IntervalSetBuilder intervalSetBuilder = new SupermaximalSetBuilder();
        srcSequence = srcCorpus.sequence();
View Full Code Here


        Set<String> srcSeparators = new HashSet<String>(
                Arrays.asList(new String[] { ".", ",", ";", "", "(", ")", "of" }));
        Set<String> dstSeparators = new HashSet<String>(Arrays.asList(new String[] { ".", ",", ";", "(", ")", "von" }));
        FileReader srcReader = new FileReader(srcFileName);
        FileReader dstReader = new FileReader(dstFileName);
        Tokenizer<String> srcTokenizer = new MultidocumentRegexTokenizer(srcReader, STRING_REGEX, new StopWordRemover(
                srcStopWords, new LowerCaseNormalizer()));
        Tokenizer<String> dstTokenizer = new MultidocumentRegexTokenizer(dstReader, STRING_REGEX, new StopWordRemover(
                dstStopWords, new LowerCaseNormalizer()));
        Corpus<String> srcCorpus = new TreeMapCorpus(srcTokenizer, srcSeparators);
        Corpus<String> dstCorpus = new TreeMapCorpus(dstTokenizer, dstSeparators);

        double minSimilarity = 0.3;
View Full Code Here

                srcReader = new FileReader(srcFileName);
            } catch (FileNotFoundException e) {
                System.err.printf("can't open source file: ", e.getMessage());
                System.exit(1);
            }
            Tokenizer<String> srcTokenizer = new MultidocumentRegexTokenizer(srcReader, regex, new StopWordRemover(
                    srcStopWords, new LowerCaseNormalizer()));
            Corpus<String> srcCorpus = new TreeMapCorpus(srcTokenizer, srcSeparators, corpusSize);
            Corpus<String> dstCorpus = null;
            if (dstFileName != null) {
                try {
                    dstReader = new FileReader(dstFileName);
                } catch (FileNotFoundException e) {
                    System.err.printf("can't open target file: ", e.getMessage());
                    System.exit(1);
                }
                Tokenizer<String> dstTokenizer = new MultidocumentRegexTokenizer(dstReader, regex, new StopWordRemover(
                        dstStopWords, new LowerCaseNormalizer()));
                dstCorpus = new TreeMapCorpus(dstTokenizer, dstSeparators, corpusSize);
            }
            ANALYSER = new ConcurrentAnalyser(srcCorpus, dstCorpus);
        }
View Full Code Here

            srcReader = new FileReader(fileName);
        } catch (FileNotFoundException e) {
            System.err.printf("can't open source file: ", e.getMessage());
            System.exit(1);
        }
        Tokenizer<String> tokenizer = new MultidocumentRegexTokenizer(srcReader, regex, new StopWordRemover(stopWords,
                new LowerCaseNormalizer()));
        Iterator<Document<String>> docIter = tokenizer.iterator();
        int lineCounter = 0;
        while (docIter.hasNext()) {
            Document<String> doc = docIter.next();
            System.out.format("\nDocument #%d:\n", doc.getId());
            Iterator<String> tokIter = doc.iterator();
View Full Code Here

TOP

Related Classes of com.googlecode.gaal.preprocess.impl.MultidocumentRegexTokenizer

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.