Package com.googlecode.gaal.preprocess.impl

Examples of com.googlecode.gaal.preprocess.impl.RegexTokenizer


        // String[] alphabet = { null, "a", "ā", "b", "g", "h", "j", "l", "m",
        // "n", "r", "s", "t", "y", "#" };
        // StringReader sr = new StringReader("caggtcagtcacggtatca#");
        // String[] alphabet = { null, "a", "c", "g", "t", "#" };

        Tokenizer<String> tokenizer = new RegexTokenizer(sr, "[\\W\\w]", new LowerCaseNormalizer());
        Corpus<String> corpus = new TreeMapCorpus(tokenizer, alphabet);
        System.out.println("alphabet size " + corpus.alphabetSize());
        System.out.println("text size " + corpus.sequence().size());
        System.out.println("text " + corpus.sequence());
        System.out.println("alphabet " + corpus.alphabet());
View Full Code Here


    }

    @Test
    public void testTokenizer() {
        StringReader sr = new StringReader(TINY_CORPUS);
        Tokenizer<String> tokenizer = new RegexTokenizer(sr, CHAR_REGEX, new LowerCaseNormalizer());
        for (Document<String> doc : tokenizer) {
            int index = 0;
            for (String token : doc) {
                String expected = TINY_CORPUS.substring(index, index + 1);
                if (!expected.equals(token)) {
View Full Code Here

    public void testAlphabetSize() throws FileNotFoundException {
        assertTrue(corpus.alphabetSize() == TINY_CORPUS_ALPHABET.length - 1);

        FileReader reader = new FileReader("data/tlg.txt");

        Tokenizer<String> tokenizer = new RegexTokenizer(reader, Analyser.STRING_REGEX, new LowerCaseNormalizer());
        Corpus<String> corpus1 = new TreeMapCorpus(tokenizer);
        reader = new FileReader("data/tlg.txt");

        tokenizer = new RegexTokenizer(reader, Analyser.STRING_REGEX, new LowerCaseNormalizer());
        Corpus<String> corpus2 = new TreeMapCorpus(tokenizer, new HashSet<String>());
        System.out.println(corpus1.alphabetSize() + " " + corpus2.alphabetSize());
        // assertTrue(corpus1.alphabetSize() == ALPHABET_SIZE);
        assertTrue(corpus1.alphabetSize() == corpus2.alphabetSize());
    }
View Full Code Here

    public static Corpus<String> createMississippiCorpus() {
        String text = "mississippi#";
        String[] alphabet = { null, "i", "m", "p", "s", "#" };
        StringReader sr = new StringReader(text);

        Tokenizer<String> tokenizer = new RegexTokenizer(sr, CHAR_REGEX, new LowerCaseNormalizer());
        return new TreeMapCorpus(tokenizer, alphabet);
    }
View Full Code Here

    public static Corpus<String> createMiningEngineeringCorpus() {
        String text = "mining␣engineering#";
        String[] alphabet = { null, "e", "g", "i", "m", "n", "r", "␣", "#" };
        StringReader sr = new StringReader(text);

        Tokenizer<String> tokenizer = new RegexTokenizer(sr, CHAR_REGEX, new LowerCaseNormalizer());
        return new TreeMapCorpus(tokenizer, alphabet);
    }
View Full Code Here

    }

    public static Corpus<String> createTinyCorpus() {
        StringReader sr = new StringReader(TINY_CORPUS);

        Tokenizer<String> tokenizer = new RegexTokenizer(sr, CHAR_REGEX, new LowerCaseNormalizer());
        return new TreeMapCorpus(tokenizer, TINY_CORPUS_ALPHABET);
    }
View Full Code Here

    }

    public static Corpus<String> createSmallCorpus() throws FileNotFoundException {
        FileReader reader = new FileReader("data/tlg.txt");

        Tokenizer<String> tokenizer = new RegexTokenizer(reader, Analyser.STRING_REGEX, new LowerCaseNormalizer());
        return new TreeMapCorpus(tokenizer, SEPARATORS);
    }
View Full Code Here

    }

    public static Corpus<String> createLargeCorpus() throws FileNotFoundException {
        FileReader reader = new FileReader("data/moby.txt");

        Tokenizer<String> tokenizer = new RegexTokenizer(reader, Analyser.STRING_REGEX, new LowerCaseNormalizer());
        return new TreeMapCorpus(tokenizer, SEPARATORS);
    }
View Full Code Here

TOP

Related Classes of com.googlecode.gaal.preprocess.impl.RegexTokenizer

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.