Package: com.googlecode.gaal.preprocess.impl

Examples of com.googlecode.gaal.preprocess.impl.LowerCaseNormalizer


    public VectorVisualizer(String srcFileName, String dstFileName, int windowSize) throws FileNotFoundException {
        FileReader srcReader = new FileReader(srcFileName);
        FileReader dstReader = new FileReader(dstFileName);
        Tokenizer<String> srcTokenizer = new MultidocumentRegexTokenizer(srcReader, STRING_REGEX,
                new LowerCaseNormalizer());
        Tokenizer<String> dstTokenizer = new MultidocumentRegexTokenizer(dstReader, STRING_REGEX,
                new LowerCaseNormalizer());
        srcCorpus = new TreeMapCorpus(srcTokenizer, SEPARATORS);
        dstCorpus = new TreeMapCorpus(dstTokenizer, SEPARATORS);
        // IntervalSetBuilder intervalSetBuilder = new SupermaximalSetBuilder();
        srcSequence = srcCorpus.sequence();
        dstSequence = dstCorpus.sequence();
View Full Code Here


                Arrays.asList(new String[] { ".", ",", ";", "", "(", ")", "of" }));
        Set<String> dstSeparators = new HashSet<String>(Arrays.asList(new String[] { ".", ",", ";", "(", ")", "von" }));
        FileReader srcReader = new FileReader(srcFileName);
        FileReader dstReader = new FileReader(dstFileName);
        Tokenizer<String> srcTokenizer = new MultidocumentRegexTokenizer(srcReader, STRING_REGEX, new StopWordRemover(
                srcStopWords, new LowerCaseNormalizer()));
        Tokenizer<String> dstTokenizer = new MultidocumentRegexTokenizer(dstReader, STRING_REGEX, new StopWordRemover(
                dstStopWords, new LowerCaseNormalizer()));
        Corpus<String> srcCorpus = new TreeMapCorpus(srcTokenizer, srcSeparators);
        Corpus<String> dstCorpus = new TreeMapCorpus(dstTokenizer, dstSeparators);

        double minSimilarity = 0.3;
        int alignmentsNumber = 3;
View Full Code Here

            } catch (FileNotFoundException e) {
                System.err.printf("can't open source file: ", e.getMessage());
                System.exit(1);
            }
            Tokenizer<String> srcTokenizer = new MultidocumentRegexTokenizer(srcReader, regex, new StopWordRemover(
                    srcStopWords, new LowerCaseNormalizer()));
            Corpus<String> srcCorpus = new TreeMapCorpus(srcTokenizer, srcSeparators, corpusSize);
            Corpus<String> dstCorpus = null;
            if (dstFileName != null) {
                try {
                    dstReader = new FileReader(dstFileName);
                } catch (FileNotFoundException e) {
                    System.err.printf("can't open target file: ", e.getMessage());
                    System.exit(1);
                }
                Tokenizer<String> dstTokenizer = new MultidocumentRegexTokenizer(dstReader, regex, new StopWordRemover(
                        dstStopWords, new LowerCaseNormalizer()));
                dstCorpus = new TreeMapCorpus(dstTokenizer, dstSeparators, corpusSize);
            }
            ANALYSER = new ConcurrentAnalyser(srcCorpus, dstCorpus);
        }
        return ANALYSER;
View Full Code Here

        } catch (FileNotFoundException e) {
            System.err.printf("can't open source file: ", e.getMessage());
            System.exit(1);
        }
        Tokenizer<String> tokenizer = new MultidocumentRegexTokenizer(srcReader, regex, new StopWordRemover(stopWords,
                new LowerCaseNormalizer()));
        Iterator<Document<String>> docIter = tokenizer.iterator();
        int lineCounter = 0;
        while (docIter.hasNext()) {
            Document<String> doc = docIter.next();
            System.out.format("\nDocument #%d:\n", doc.getId());
View Full Code Here

        // String[] alphabet = { null, "a", "ā", "b", "g", "h", "j", "l", "m",
        // "n", "r", "s", "t", "y", "#" };
        // StringReader sr = new StringReader("caggtcagtcacggtatca#");
        // String[] alphabet = { null, "a", "c", "g", "t", "#" };

        Tokenizer<String> tokenizer = new RegexTokenizer(sr, "[\\W\\w]", new LowerCaseNormalizer());
        Corpus<String> corpus = new TreeMapCorpus(tokenizer, alphabet);
        System.out.println("alphabet size " + corpus.alphabetSize());
        System.out.println("text size " + corpus.sequence().size());
        System.out.println("text " + corpus.sequence());
        System.out.println("alphabet " + corpus.alphabet());
View Full Code Here

    }

    @Test
    public void testTokenizer() {
        StringReader sr = new StringReader(TINY_CORPUS);
        Tokenizer<String> tokenizer = new RegexTokenizer(sr, CHAR_REGEX, new LowerCaseNormalizer());
        for (Document<String> doc : tokenizer) {
            int index = 0;
            for (String token : doc) {
                String expected = TINY_CORPUS.substring(index, index + 1);
                if (!expected.equals(token)) {
View Full Code Here

    public void testAlphabetSize() throws FileNotFoundException {
        assertTrue(corpus.alphabetSize() == TINY_CORPUS_ALPHABET.length - 1);

        FileReader reader = new FileReader("data/tlg.txt");

        Tokenizer<String> tokenizer = new RegexTokenizer(reader, Analyser.STRING_REGEX, new LowerCaseNormalizer());
        Corpus<String> corpus1 = new TreeMapCorpus(tokenizer);
        reader = new FileReader("data/tlg.txt");

        tokenizer = new RegexTokenizer(reader, Analyser.STRING_REGEX, new LowerCaseNormalizer());
        Corpus<String> corpus2 = new TreeMapCorpus(tokenizer, new HashSet<String>());
        System.out.println(corpus1.alphabetSize() + " " + corpus2.alphabetSize());
        // assertTrue(corpus1.alphabetSize() == ALPHABET_SIZE);
        assertTrue(corpus1.alphabetSize() == corpus2.alphabetSize());
    }
View Full Code Here

    public static Corpus<String> createMississippiCorpus() {
        String text = "mississippi#";
        String[] alphabet = { null, "i", "m", "p", "s", "#" };
        StringReader sr = new StringReader(text);

        Tokenizer<String> tokenizer = new RegexTokenizer(sr, CHAR_REGEX, new LowerCaseNormalizer());
        return new TreeMapCorpus(tokenizer, alphabet);
    }
View Full Code Here

    public static Corpus<String> createMiningEngineeringCorpus() {
        String text = "mining␣engineering#";
        String[] alphabet = { null, "e", "g", "i", "m", "n", "r", "␣", "#" };
        StringReader sr = new StringReader(text);

        Tokenizer<String> tokenizer = new RegexTokenizer(sr, CHAR_REGEX, new LowerCaseNormalizer());
        return new TreeMapCorpus(tokenizer, alphabet);
    }
View Full Code Here

    }

    public static Corpus<String> createTinyCorpus() {
        StringReader sr = new StringReader(TINY_CORPUS);

        Tokenizer<String> tokenizer = new RegexTokenizer(sr, CHAR_REGEX, new LowerCaseNormalizer());
        return new TreeMapCorpus(tokenizer, TINY_CORPUS_ALPHABET);
    }
View Full Code Here

TOP

Related Classes of com.googlecode.gaal.preprocess.impl.LowerCaseNormalizer

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by Oracle Inc. Contact coftware#gmail.com.