Package com.googlecode.gaal.analysis.impl

Source Code of com.googlecode.gaal.analysis.impl.ContextExtractorTest$NoContextFilter

package com.googlecode.gaal.analysis.impl;

import static org.junit.Assert.fail;

import java.io.FileNotFoundException;
import java.util.NavigableSet;
import java.util.TreeSet;

import org.junit.Before;
import org.junit.Test;

import com.googlecode.gaal.analysis.api.Context;
import com.googlecode.gaal.analysis.api.Filter;
import com.googlecode.gaal.analysis.impl.EmbeddedIntervalExtractor.EmbeddedContext;
import com.googlecode.gaal.data.api.Corpus;
import com.googlecode.gaal.data.api.IntSequence;
import com.googlecode.gaal.data.api.Multiset;
import com.googlecode.gaal.data.impl.ArraySequence;
import com.googlecode.gaal.data.impl.CorpusTest;
import com.googlecode.gaal.data.impl.HashMultiset;
import com.googlecode.gaal.suffix.api.EmbeddedSuffixTree.EmbeddedInterval;
import com.googlecode.gaal.suffix.api.IntervalTree.Interval;
import com.googlecode.gaal.suffix.api.LinearizedSuffixTree;
import com.googlecode.gaal.suffix.api.LinearizedSuffixTree.BinaryInterval;
import com.googlecode.gaal.suffix.impl.LinearizedSuffixTreeImpl;

public class ContextExtractorTest {

    private static final int WINDOW_SIZE = 5;

    private Corpus<String> corpus;
    private Iterable<EmbeddedInterval> embeddedExtractor;
    private Iterable<Context> nestedExtractor;
    private LinearizedSuffixTree lst;

    @Before
    public void setUp() throws Exception {
        this.corpus = CorpusTest.createSmallCorpus();
        lst = new LinearizedSuffixTreeImpl(corpus.sequence(), corpus.alphabetSize());
        embeddedExtractor = new EmbeddedContextExtractor(lst, corpus, new NonSingletonBwtSetBuilder(),
                new NoContextFilter<EmbeddedInterval>(), WINDOW_SIZE);
        nestedExtractor = new NestedMaximalityContextExtractor<String>(corpus, false);
    }

    @Test
    public void testPrint() throws FileNotFoundException {
        Iterable<EmbeddedInterval> contextExtractor = new EmbeddedContextExtractor(lst, corpus,
                new SupermaximalSetBuilder(), new NoContextFilter<EmbeddedInterval>(), WINDOW_SIZE);
        for (EmbeddedInterval embeddedInterval : contextExtractor) {
            Context context = new EmbeddedContext(embeddedInterval);
            System.out.printf("%s %s %s\n", corpus.toString(context.leftSequence(), " "),
                    corpus.toString(context.fillerSet(), " "), corpus.toString(context.rightSequence(), " "));
        }
    }

    @Test
    public void testEmbeddedContexts() {
        System.out.println("Embedded Suffix Tree Test");
        int counter = 0;
        for (EmbeddedInterval embeddedInterval : embeddedExtractor) {
            Context context = new EmbeddedContext(embeddedInterval);
            IntSequence left = context.leftSequence();
            IntSequence right = context.rightSequence();
            Multiset<IntSequence> expectedSet = fillerSet(left, right);
            Multiset<IntSequence> resultSet = context.fillerSet();
            if (!resultSet.equals(expectedSet)) {
                String leftString = corpus.toString(left, " ");
                String rightString = corpus.toString(right, " ");
                String expectedString = corpus.toString(expectedSet, " ");
                String resultString = corpus.toString(resultSet, " ");
                System.out.printf("%s %s %s\n", leftString, expectedString, rightString);
                System.err.printf("%s %s %s\n", leftString, resultString, rightString);
                fail("fill mismatch");
            }
            if (expectedSet.size() != context.fillerSetSize()) {
                System.err.printf("%d != %d\n", expectedSet.size(), context.fillerSetSize());
                fail("filler set size mismatch");
            }
            counter++;
        }
        System.out.printf("total contexts: %d\n", counter);
    }

    @Test
    public void testNestedContexts() {
        System.out.println("Nested Maximality Test");
        int counter = 0;
        for (Context context : nestedExtractor) {
            IntSequence left = context.leftSequence();
            IntSequence right = context.rightSequence();
            Multiset<IntSequence> expectedSet = fillerSet(left, right);
            Multiset<IntSequence> resultSet = context.fillerSet();
            for (IntSequence filler : resultSet) {
                int expectedQuantity = expectedSet.getQuantity(filler);
                int resultQuantity = resultSet.getQuantity(filler);
                if (expectedQuantity == 0) {
                    String leftString = corpus.toString(left, " ");
                    String rightString = corpus.toString(right, " ");
                    String expectedString = corpus.toString(expectedSet, " ");
                    String resultString = corpus.toString(resultSet, " ");
                    System.out.printf("%s %s %s\n", leftString, expectedString, rightString);
                    System.err.printf("%s %s %s\n", leftString, resultString, rightString);
                    fail("fill mismatch");
                } else if (expectedQuantity != resultQuantity) {
                    System.out.printf("%d != %d\n", expectedQuantity, resultQuantity);
                }
            }
            if (expectedSet.size() != context.fillerSetSize()) {
                System.out.printf("filler set size mismatch: %d != %d\n", expectedSet.size(), context.fillerSetSize());
            }
            counter++;
        }
        System.out.printf("total contexts: %d\n", counter);
    }

    private Multiset<IntSequence> fillerSet(IntSequence left, IntSequence right) {
        BinaryInterval leftInterval = lst.search(left);
        Multiset<IntSequence> expectedSet = null;
        if (leftInterval == null) {
            System.err.printf("left context not found: %s\n", left);
            fail("left context not found");
        } else {
            Interval rightInterval = lst.search(right);
            if (rightInterval == null) {
                System.err.printf("right context not found: %s\n", right);
                fail("right context not found");
            } else {
                NavigableSet<Integer> leftIndices = intervalIndices(leftInterval);
                int[] suffixTable = lst.getSuffixTable();
                expectedSet = new HashMultiset<IntSequence>();
                for (int i = rightInterval.left(); i <= rightInterval.right(); i++) {
                    int end = suffixTable[i];
                    Integer start = leftIndices.floor(end);
                    if (start != null && end - start < WINDOW_SIZE) {
                        IntSequence fillerSeq = lst.getSequence().subSequence(start, end);
                        boolean hasSpecial = false;
                        for (int j = 0; j < fillerSeq.size(); j++) {
                            if (corpus.isSeparator(fillerSeq.get(j))) {
                                hasSpecial = true;
                                break;
                            }
                        }
                        if (!hasSpecial)
                            expectedSet.add(fillerSeq);
                    }
                }
            }
        }
        return expectedSet;
    }

    public IntSequence toIntSequence(String string) {
        String[] stringArray = string.split(" ");
        int[] intArray = new int[stringArray.length];
        for (int i = 0; i < intArray.length; i++) {
            intArray[i] = corpus.toInt(stringArray[i]);
        }
        return new ArraySequence(intArray);
    }

    private NavigableSet<Integer> intervalIndices(BinaryInterval interval) {
        int lcp = interval.lcp();
        NavigableSet<Integer> intervalIndices = new TreeSet<Integer>();
        int[] suffixTable = lst.getSuffixTable();
        for (int i = interval.left(); i <= interval.right(); i++) {
            intervalIndices.add(suffixTable[i] + lcp);
        }
        return intervalIndices;
    }

    private static class NoContextFilter<C> implements Filter<C> {

        @Override
        public boolean passed(C context) {
            return true;
        }

    }
}
TOP

Related Classes of com.googlecode.gaal.analysis.impl.ContextExtractorTest$NoContextFilter

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.