package com.googlecode.gaal.suffix.impl;
import java.util.Map.Entry;
import java.util.SortedMap;
import java.util.TreeMap;
import org.junit.Before;
import org.junit.Test;
import com.googlecode.gaal.analysis.impl.ProperIntervalSetBuilder;
import com.googlecode.gaal.data.api.Corpus;
import com.googlecode.gaal.data.api.IntSequence;
import com.googlecode.gaal.data.api.IntervalSet;
import com.googlecode.gaal.data.api.Multiset;
import com.googlecode.gaal.data.impl.CorpusTest;
import com.googlecode.gaal.suffix.api.EmbeddedSuffixTree;
import com.googlecode.gaal.suffix.api.EmbeddedSuffixTree.EmbeddedInterval;
import com.googlecode.gaal.suffix.api.IntervalTree.Interval;
import com.googlecode.gaal.suffix.api.LinearizedSuffixTree;
import com.googlecode.gaal.suffix.api.LinearizedSuffixTree.BinaryInterval;
/**
 * Diagnostic test for {@link EmbeddedSuffixTreeImpl}.
 *
 * <p>Builds a linearized suffix tree over the corpus returned by
 * {@code CorpusTest.createMississippiCorpus()}, creates an embedded suffix tree
 * for every proper interval, and prints the intermediate data to standard
 * output. The test performs no assertions; it only fails if an exception is
 * thrown while building or printing.
 */
public class EmbeddedSuffixTreeImplTest {

    private final Corpus<String> corpus = CorpusTest.createMississippiCorpus();
    private final IntSequence sequence = corpus.sequence();
    private LinearizedSuffixTree lst;

    /** Creates a fresh linearized suffix tree over the corpus before each test. */
    @Before
    public void setUp() throws Exception {
        lst = new LinearizedSuffixTreeImpl(corpus.sequence(), corpus.alphabetSize());
    }

    /**
     * Prints the suffix table, then for every proper interval of the linearized
     * tree prints its candidate suffixes, builds an embedded suffix tree with a
     * window of 9, and prints label, size and filler set of each proper
     * embedded interval.
     */
    @Test
    public void testCreate() {
        printSuffixes();

        final int windowSize = 9;
        IntervalSet<BinaryInterval> outerIntervals = new ProperIntervalSetBuilder().buildIntervalSet(lst);

        for (BinaryInterval outer : outerIntervals) {
            System.out.printf("===interval: '%s'===\n", corpus.toString(outer.label(), ""));
            printIntervalSuffixes(outer, windowSize);

            EmbeddedSuffixTree embeddedTree = EmbeddedSuffixTreeImpl.create(lst, outer, windowSize, corpus);
            IntervalSet<EmbeddedInterval> innerIntervals =
                    new ProperIntervalSetBuilder().buildIntervalSet(embeddedTree);

            for (EmbeddedInterval inner : innerIntervals) {
                Multiset<IntSequence> fillers = inner.fillerSet();
                System.out.printf("\tembedded interval %s(%d) fillers: %s\n",
                        corpus.toString(inner.label(), ""),
                        inner.size(),
                        corpus.toString(fillers, ""));
            }
        }
    }

    /** Prints every suffix of the sequence in suffix-table order, prefixed by its start position. */
    private void printSuffixes() {
        System.out.println("Suffix Table:");
        int[] table = lst.getSuffixTable();
        for (int rank = 0; rank < table.length; rank++) {
            int position = table[rank];
            IntSequence tail = sequence.subSequence(position, sequence.size());
            System.out.printf("%2d| %s\n", position, corpus.toString(tail, ""));
        }
    }

    /**
     * Prints the suffixes that lie within {@code windowSize} positions after each
     * occurrence of the interval's label, followed by the suffixes finally chosen.
     *
     * <p>For every index of the interval the window starts at {@code index + lcp}.
     * Each position in the window is keyed by its entry in the inverse suffix
     * table; the map keeps, per key, the largest window start seen so far. A line
     * ending in {@code +} means the position was recorded (or its start replaced),
     * a line ending in {@code -} means an equal or larger start was already stored.
     *
     * @param interval   interval whose occurrences are inspected
     * @param windowSize maximum number of positions examined after each occurrence
     */
    private void printIntervalSuffixes(Interval interval, int windowSize) {
        IntSequence occurrences = interval.indices();
        int labelLength = interval.lcp();
        int[] suffixTable = lst.getSuffixTable();
        int[] inverseSuffixTable = lst.getInverseSuffixTable();
        final int sequenceLength = sequence.size();

        // inverse-suffix-table entry -> window start chosen for that suffix
        SortedMap<Integer, Integer> chosenStarts = new TreeMap<Integer, Integer>();

        final int occurrenceCount = interval.size();
        for (int n = 0; n < occurrenceCount; n++) {
            int windowStart = occurrences.get(n) + labelLength;
            int windowEnd = Math.min(windowStart + windowSize, sequenceLength);

            for (int pos = windowStart; pos < windowEnd; pos++) {
                IntSequence tail = sequence.subSequence(pos, sequenceLength);
                int rank = inverseSuffixTable[pos];
                Integer previousStart = chosenStarts.get(rank);

                boolean accepted = previousStart == null || previousStart < windowStart;
                if (accepted) {
                    // Record before printing, matching the order of side effects.
                    chosenStarts.put(rank, windowStart);
                }
                System.out.printf(accepted ? "%2d:%2d| %s+\n" : "%2d:%2d| %s-\n",
                        windowStart, pos, corpus.toString(tail, ""));
            }
        }

        System.out.println("Chosen Suffixes:");
        for (Entry<Integer, Integer> chosen : chosenStarts.entrySet()) {
            int suffixStart = suffixTable[chosen.getKey()];
            Integer fillerStart = chosen.getValue();

            IntSequence tail = sequence.subSequence(suffixStart, sequenceLength);
            // The filler is the stretch between the window start and the suffix itself.
            IntSequence filler = sequence.subSequence(fillerStart, suffixStart);

            System.out.printf("%2d:%2d| %s {%s}\n", fillerStart, suffixStart,
                    corpus.toString(tail, ""), corpus.toString(filler, ""));
        }
    }
}