Package eu.hlavki.text.lemmagen

Source Code of eu.hlavki.text.lemmagen.TrainTest

/*
* Copyright 2013 Michal Hlavac <hlavki@hlavki.eu>.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*      http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package eu.hlavki.text.lemmagen;

import eu.hlavki.text.lemmagen.api.Lemmatizer;
import java.io.BufferedReader;
import org.junit.After;
import static org.junit.Assert.*;
import org.junit.Before;
import org.junit.Test;
import eu.hlavki.text.lemmagen.impl.DefaultLemmatizer;
import eu.hlavki.text.lemmagen.impl.LemmatizerSettings;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;

public class TrainTest {

    private static final String TEST_DICTIONARY = "/wfl-me-en.tbl";
    private static final String[] ACTUAL_WORDS = new String[]{"respond", "are", "uninflected", "items", "underlying", "singing"};
    private static final String[][] LEMMA_WORDS = new String[][]{
        {"respond", "be", "uninflect", "item", "underlie", "sing"}
    };

    @Before
    public void beforeTest() {
    }

    @After
    public void afterTest() {
    }

    @Test
    public void trainEnglish() {
        File tmpLemFile = null;
        try {
            tmpLemFile = File.createTempFile("lemmagen", ".lem");
            String format = "WLM";
            LemmatizerSettings settings = new LemmatizerSettings();
//            settings.setUseFromInRules(false);
//            settings.setMsdConsider(MsdConsideration.IGNORE);
//            settings.setMaxRulesPerNode(0);
//            settings.setBuildFrontLemmatizer(true);

            InputStream in = TrainTest.class.getResourceAsStream(TEST_DICTIONARY);
            BufferedReader br = new BufferedReader(new InputStreamReader(in, "UTF-8"));

            System.out.println("Building model...");
            DefaultLemmatizer lm = new DefaultLemmatizer(br, format, settings);
            lm.buildModel();

            System.out.println("Model built");

            System.out.println("Saving model...");
            LemmatizerFactory.saveToFile(lm, tmpLemFile);
            System.out.println("Model saved.");

            assertLemmaEquals(lm, ACTUAL_WORDS, LEMMA_WORDS);

            System.out.println("Clearing examples...");
            lm.clearExamples();
            System.out.println("Examples clear...");

            System.out.println("Reading model from file");
            lm = (DefaultLemmatizer) LemmatizerFactory.readFromFile(tmpLemFile);

            assertLemmaEquals(lm, ACTUAL_WORDS, LEMMA_WORDS);

        } catch (IOException e) {
            e.printStackTrace();
            fail(e.getMessage());
        } finally {
            if (tmpLemFile != null) tmpLemFile.delete();
        }
    }

    private static void assertLemmaEquals(Lemmatizer lm, String[] actual, String[][] expected) {
        for (int idx = 0; idx < actual.length; idx++) {
            CharSequence lemma = lm.lemmatize(actual[idx]);
            boolean result = false;
            StringBuilder sb = new StringBuilder("[");
            for (String[] row : expected) {
                result |= row[idx].equals(lemma);
                sb.append(row[idx]).append(", ");
            }
            sb.delete(sb.length() - 2, sb.length()).append("]");
            System.out.println("Lemma of " + actual[idx] + " is " + lemma + " and must be one of " + sb.toString());
            assertTrue(result);
        }
    }
}
TOP

Related Classes of eu.hlavki.text.lemmagen.TrainTest

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.