}
protected boolean findHeadTailAndL1Patterns() {
List<TextRulerExampleDocument> docs = exampleDocuments.getDocuments();
TextRulerExampleDocument doc0 = docs.get(0);
TextRulerRulePattern head = new TextRulerRulePattern();
TextRulerRulePattern tail = new TextRulerRulePattern();
getPageHeadAndTailPortion(doc0, head, tail);
// One split of the page head pattern: the leading items stay in "head",
// the trailing items (copied and generalized by the caller) go into "l1".
final class HLCandidate {
  public TextRulerRulePattern head;

  public TextRulerRulePattern l1;

  HLCandidate() {
    // both parts start out empty and are filled by the enclosing method
    head = new TextRulerRulePattern();
    l1 = new TextRulerRulePattern();
  }
}
// a small optimization:
// find out the maximum possible length for l1 in doc0 since l1 is much
// smaller than the possible head length!
List<TextRulerRulePattern> interTupleSeparators = getInterTupleSepatators(doc0);
int shortestL1 = head.size() - 1;
for (TextRulerRulePattern its : interTupleSeparators)
shortestL1 = its.size() < shortestL1 ? its.size() : shortestL1;
List<HLCandidate> hlCandidates = new ArrayList<HLCandidate>();
// create a candidate for each separation of the head pattern into a
// remaining head part and an l1 suffix part:
for (int separator = head.size() - 1; separator > 0; separator--) {
HLCandidate c = new HLCandidate();
for (int i = 0; i < head.size(); i++) {
if (i < separator)
c.head.add(head.get(i));
else {
WienRuleItem it = (WienRuleItem) head.get(i).copy();
it.getWordConstraint().setGeneralizeLinkMarkUp(true);
c.l1.add(it);
}
}
hlCandidates.add(c);
TextRulerToolkit.log(c.head.size() + " vs. " + c.l1.size());
if (c.l1.size() >= shortestL1)
break;
}
long total = 0;
// get total h l1 t combination count:
long tCand = (tail.size() * (tail.size() + 1)) / 2;
for (HLCandidate c : hlCandidates) {
total += ((c.head.size() - 1) * (c.head.size())) / 2;
}
total *= tCand;
long current = 0;
int oldPercent = -1;
for (HLCandidate c : hlCandidates) {
// for each "candidate" which represents a l1 suffix pattern of the
// head tokens and a rest pattern for the h pattern,
// we have to create every sub pattern of the remaining h pattern as
// a h candidate:
TextRulerRulePattern l1 = c.l1;
TextRulerRulePattern h = null;
boolean l1Sucks = false;
for (int endI = c.head.size() - 1; endI > 0; endI--) {
for (int startI = endI; startI > 0; startI--) {
h = new TextRulerRulePattern();
for (int i = startI; i <= endI; i++)
h.add(c.head.get(i));
// now for each h candidate we have to create each t
// candidate:
TextRulerRulePattern t = null;
for (int tstartI = 0; tstartI < tail.size(); tstartI++) {
for (int tendI = tstartI; tendI < tail.size(); tendI++) {
int percent = Math.round(((float) current * 100 / total));
if (percent != oldPercent) {
oldPercent = percent;
if (percent > 100)
percent = 100;
// TextRulerToolkit.log(current+" / "+total);
sendStatusUpdateToDelegate("Testing C3, " + percent + "%",
TextRulerLearnerState.ML_RUNNING, false);
}
if (shouldAbort())
return false;
current++;
t = new TextRulerRulePattern();
for (int i = tstartI; i <= tendI; i++)
t.add(tail.get(i));
// now we have a possible candidate triple: h, t and
// l1:
constraint3ReturnType c3Result = testConstraint3(h, t, l1);