// Build FST:
BytesRef previousAnalyzed = null;
BytesRef analyzed = new BytesRef();
BytesRef surface = new BytesRef();
IntsRef scratchInts = new IntsRef();
ByteArrayDataInput input = new ByteArrayDataInput();
// Used to remove duplicate surface forms (but we
// still index the hightest-weight one). We clear
// this when we see a new analyzed form, so it cannot
// grow unbounded (at most 256 entries):
Set<BytesRef> seenSurfaceForms = new HashSet<>();
int dedup = 0;
while (reader.read(scratch)) {
input.reset(scratch.bytes, scratch.offset, scratch.length);
short analyzedLength = input.readShort();
analyzed.grow(analyzedLength+2);
input.readBytes(analyzed.bytes, 0, analyzedLength);
analyzed.length = analyzedLength;
long cost = input.readInt();
surface.bytes = scratch.bytes;
if (hasPayloads) {
surface.length = input.readShort();
surface.offset = input.getPosition();
} else {
surface.offset = input.getPosition();
surface.length = scratch.length - surface.offset;
}
if (previousAnalyzed == null) {
previousAnalyzed = new BytesRef();
previousAnalyzed.copyBytes(analyzed);
seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
} else if (analyzed.equals(previousAnalyzed)) {
dedup++;
if (dedup >= maxSurfaceFormsPerAnalyzedForm) {
// More than maxSurfaceFormsPerAnalyzedForm
// dups: skip the rest:
continue;
}
if (seenSurfaceForms.contains(surface)) {
continue;
}
seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
} else {
dedup = 0;
previousAnalyzed.copyBytes(analyzed);
seenSurfaceForms.clear();
seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
}
// TODO: I think we can avoid the extra 2 bytes when
// there is no dup (dedup==0), but we'd have to fix
// the exactFirst logic ... which would be sort of
// hairy because we'd need to special case the two
// (dup/not dup)...
// NOTE: must be byte 0 so we sort before whatever
// is next
analyzed.bytes[analyzed.offset+analyzed.length] = 0;
analyzed.bytes[analyzed.offset+analyzed.length+1] = (byte) dedup;
analyzed.length += 2;
Util.toIntsRef(analyzed, scratchInts);
//System.out.println("ADD: " + scratchInts + " -> " + cost + ": " + surface.utf8ToString());
if (!hasPayloads) {
builder.add(scratchInts, outputs.newPair(cost, BytesRef.deepCopyOf(surface)));
} else {
int payloadOffset = input.getPosition() + surface.length;
int payloadLength = scratch.length - payloadOffset;
BytesRef br = new BytesRef(surface.length + 1 + payloadLength);
System.arraycopy(surface.bytes, surface.offset, br.bytes, 0, surface.length);
br.bytes[surface.length] = PAYLOAD_SEP;
System.arraycopy(scratch.bytes, payloadOffset, br.bytes, surface.length+1, payloadLength);