}
// reset the TokenStream to the first token
tokenStream.reset();
final Token reusableToken = new Token();
for (Token nextToken = tokenStream.next(reusableToken); nextToken != null; nextToken = tokenStream.next(reusableToken)) {
tokens.add((Token) nextToken.clone()); // the vector will be built on commit.
fieldSetting.fieldLength++;
if (fieldSetting.fieldLength > maxFieldLength) {
break;
}
}
} else {
// untokenized: the whole field value is added as a single token
String fieldVal = field.stringValue();
Token token = new Token(0, fieldVal.length(), "untokenized");
token.setTermBuffer(fieldVal);
tokens.add(token);
fieldSetting.fieldLength++;
}
}
if (!field.isStored()) {
it.remove();
}
}
Map<FieldSetting, Map<String /*text*/, TermDocumentInformationFactory>> termDocumentInformationFactoryByTermTextAndFieldSetting = new HashMap<FieldSetting, Map<String /*text*/, TermDocumentInformationFactory>>();
termDocumentInformationFactoryByDocument.put(document, termDocumentInformationFactoryByTermTextAndFieldSetting);
// build term vector, term positions and term offsets
for (Map.Entry<Field, LinkedList<Token>> eField_Tokens : tokensByField.entrySet()) {
FieldSetting fieldSetting = fieldSettingsByFieldName.get(eField_Tokens.getKey().name());
Map<String, TermDocumentInformationFactory> termDocumentInformationFactoryByTermText = termDocumentInformationFactoryByTermTextAndFieldSetting.get(fieldSettingsByFieldName.get(eField_Tokens.getKey().name()));
if (termDocumentInformationFactoryByTermText == null) {
termDocumentInformationFactoryByTermText = new HashMap<String /*text*/, TermDocumentInformationFactory>();
termDocumentInformationFactoryByTermTextAndFieldSetting.put(fieldSettingsByFieldName.get(eField_Tokens.getKey().name()), termDocumentInformationFactoryByTermText);
}
int lastOffset = 0;
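// when this field setting's position is already past zero (e.g. an earlier field with the same name), advance it by the analyzer's position increment gap.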
if (fieldSetting.position > 0) {
// TODO: what if no analyzer is set and multiple fields with the same name are indexed without tokenization?
fieldSetting.position += analyzer.getPositionIncrementGap(fieldSetting.fieldName);
}
for (Token token : eField_Tokens.getValue()) {
TermDocumentInformationFactory termDocumentInformationFactory = termDocumentInformationFactoryByTermText.get(token.term());
if (termDocumentInformationFactory == null) {
termDocumentInformationFactory = new TermDocumentInformationFactory();
termDocumentInformationFactoryByTermText.put(token.term(), termDocumentInformationFactory);
}
//termDocumentInformationFactory.termFrequency++;
fieldSetting.position += (token.getPositionIncrement() - 1);
termDocumentInformationFactory.termPositions.add(fieldSetting.position++);
if (token.getPayload() != null && token.getPayload().length() > 0) {
termDocumentInformationFactory.payloads.add(token.getPayload().toByteArray());
fieldSetting.storePayloads = true;
} else {
termDocumentInformationFactory.payloads.add(null);
}
if (eField_Tokens.getKey().isStoreOffsetWithTermVector()) {
termDocumentInformationFactory.termOffsets.add(new TermVectorOffsetInfo(fieldSetting.offset + token.startOffset(), fieldSetting.offset + token.endOffset()));
lastOffset = fieldSetting.offset + token.endOffset();
}
}