Source Code of eu.hlavki.text.lemmagen.TrainTest

/*
 * Copyright 2013 Michal Hlavac <hlavki@hlavki.eu>.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package eu.hlavki.text.lemmagen;


import eu.hlavki.text.lemmagen.api.Lemmatizer;
import java.io.BufferedReader;
import org.junit.After;
import static org.junit.Assert.*;
import org.junit.Before;
import org.junit.Test;
import eu.hlavki.text.lemmagen.impl.DefaultLemmatizer;
import eu.hlavki.text.lemmagen.impl.LemmatizerSettings;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;


public class TrainTest {


    private static final String TEST_DICTIONARY = "/wfl-me-en.tbl";
    private static final String[] ACTUAL_WORDS = new String[]{"respond", "are", "uninflected", "items", "underlying", "singing"};
    private static final String[][] LEMMA_WORDS = new String[][]{
        {"respond", "be", "uninflect", "item", "underlie", "sing"}
    };


    @Before
    public void beforeTest() {
    }


    @After
    public void afterTest() {
    }


    @Test
    public void trainEnglish() {
        File tmpLemFile = null;
        try {
            tmpLemFile = File.createTempFile("lemmagen", ".lem");
            String format = "WLM";
            LemmatizerSettings settings = new LemmatizerSettings();
//            settings.setUseFromInRules(false);
//            settings.setMsdConsider(MsdConsideration.IGNORE);
//            settings.setMaxRulesPerNode(0);
//            settings.setBuildFrontLemmatizer(true);


            InputStream in = TrainTest.class.getResourceAsStream(TEST_DICTIONARY);
            BufferedReader br = new BufferedReader(new InputStreamReader(in, "UTF-8"));


            System.out.println("Building model...");
            DefaultLemmatizer lm = new DefaultLemmatizer(br, format, settings);
            lm.buildModel();


            System.out.println("Model built");


            System.out.println("Saving model...");
            LemmatizerFactory.saveToFile(lm, tmpLemFile);
            System.out.println("Model saved.");


            assertLemmaEquals(lm, ACTUAL_WORDS, LEMMA_WORDS);


            System.out.println("Clearing examples...");
            lm.clearExamples();
            System.out.println("Examples clear...");


            System.out.println("Reading model from file");
            lm = (DefaultLemmatizer) LemmatizerFactory.readFromFile(tmpLemFile);


            assertLemmaEquals(lm, ACTUAL_WORDS, LEMMA_WORDS);


        } catch (IOException e) {
            e.printStackTrace();
            fail(e.getMessage());
        } finally {
            if (tmpLemFile != null) tmpLemFile.delete();
        }
    }


    private static void assertLemmaEquals(Lemmatizer lm, String[] actual, String[][] expected) {
        for (int idx = 0; idx < actual.length; idx++) {
            CharSequence lemma = lm.lemmatize(actual[idx]);
            boolean result = false;
            StringBuilder sb = new StringBuilder("[");
            for (String[] row : expected) {
                result |= row[idx].equals(lemma);
                sb.append(row[idx]).append(", ");
            }
            sb.delete(sb.length() - 2, sb.length()).append("]");
            System.out.println("Lemma of " + actual[idx] + " is " + lemma + " and must be one of " + sb.toString());
            assertTrue(result);
        }
    }
}
Source Code of eu.hlavki.text.lemmagen.TrainTest

Related Classes of eu.hlavki.text.lemmagen.TrainTest