int position = 0; // running token position, carried across all fields of this document
for (Fieldable field : doc.getFields())
{
ThriftTerm firstTerm = null;
// Indexed, tokenized fields: collect term frequencies, positions, and (optionally) offsets
if (field.isIndexed() && field.isTokenized())
{
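// Prefer a pre-analyzed token stream; otherwise analyze the reader or, failing that, the string value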
TokenStream tokens = field.tokenStreamValue();
if (tokens == null)
{
Reader tokReader = field.readerValue();
if (tokReader == null)
tokReader = new StringReader(field.stringValue());
tokens = analyzer.reusableTokenStream(field.name(), tokReader);
}
// collect term information per field
Map<Term, Map<ByteBuffer, List<Number>>> allTermInformation = new HashMap<Term, Map<ByteBuffer, List<Number>>>();
int lastOffset = 0;
if (position > 0)
{
position += analyzer.getPositionIncrementGap(field.name());
}
// Build the termPositions vector for all terms
tokens.reset(); // reset the TokenStream to the first token
// set up token attributes we are working on
// offsets
OffsetAttribute offsetAttribute = null;
if (field.isStoreOffsetWithTermVector())
offsetAttribute = (OffsetAttribute) tokens.addAttribute(OffsetAttribute.class);
// positions
// position increments are always tracked in recent Lucene versions
PositionIncrementAttribute posIncrAttribute = (PositionIncrementAttribute) tokens
.addAttribute(PositionIncrementAttribute.class);
// term as string
CharTermAttribute termAttribute = (CharTermAttribute) tokens.addAttribute(CharTermAttribute.class);
// Store the norms for this field per term per document rather
// than per field: this costs more on the write path but less on
// the read path.
int tokensInField = 0;
while (tokens.incrementToken())
{
tokensInField++;
Term term = new Term(field.name(), termAttribute.toString());
ThriftTerm tterm = new ThriftTerm(term.field()).setText(
ByteBuffer.wrap(term.text().getBytes("UTF-8"))).setIs_binary(false);
if (firstTerm == null)
firstTerm = tterm;
allIndexedTerms.addToTerms(tterm);
// fetch all collected information for this term
Map<ByteBuffer, List<Number>> termInfo = allTermInformation.get(term);
if (termInfo == null)
{
termInfo = new HashMap<ByteBuffer, List<Number>>();
allTermInformation.put(term, termInfo);
}
// term frequency
{
List<Number> termFrequency = termInfo.get(CassandraUtils.termFrequencyKeyBytes);
if (termFrequency == null)
{
termFrequency = new ArrayList<Number>();
termFrequency.add(0);
termInfo.put(CassandraUtils.termFrequencyKeyBytes, termFrequency);
}
// increment
termFrequency.set(0, termFrequency.get(0).intValue() + 1);
}
// position vector
{
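// advance by the analyzer-reported increment; increments > 1 indicate gaps (e.g., removed stopwords)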
position += (posIncrAttribute.getPositionIncrement() - 1);
List<Number> positionVector = termInfo.get(CassandraUtils.positionVectorKeyBytes);
if (positionVector == null)
{
positionVector = new ArrayList<Number>();
termInfo.put(CassandraUtils.positionVectorKeyBytes, positionVector);
}
positionVector.add(++position);
}
// term offsets
if (field.isStoreOffsetWithTermVector())
{
List<Number> offsetVector = termInfo.get(CassandraUtils.offsetVectorKeyBytes);
if (offsetVector == null)
{
offsetVector = new ArrayList<Number>();
termInfo.put(CassandraUtils.offsetVectorKeyBytes, offsetVector);
}
offsetVector.add(lastOffset + offsetAttribute.startOffset());
offsetVector.add(lastOffset + offsetAttribute.endOffset());
}
}
List<Number> bnorm = null;
if (!field.getOmitNorms())
{
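// recompute the Lucene length norm from the document and field boosts plus the token count, then encode it to a single byte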
bnorm = new ArrayList<Number>();
final FieldInvertState invertState = new FieldInvertState();
invertState.setBoost(doc.getBoost() * field.getBoost());
invertState.setLength(tokensInField);
final float norm = similarity.computeNorm(field.name(), invertState);
bnorm.add(similarity.encodeNormValue(norm)); // encode with the same Similarity used to compute the norm
}
for (Map.Entry<Term, Map<ByteBuffer, List<Number>>> term : allTermInformation.entrySet())
{
// Terms are stored under a unique key combination.
// This is required since Cassandra loads all columns
// for a key within a column family into memory.
ByteBuffer key = CassandraUtils.hashKeyBytes(indexNameBytes, CassandraUtils.delimeterBytes, term
.getKey().field().getBytes("UTF-8"), CassandraUtils.delimeterBytes, term.getKey().text()
.getBytes("UTF-8"));
// Mix in the norm for this field alongside each term:
// more writes, but faster reads.
if (!field.getOmitNorms())
{
term.getValue().put(CassandraUtils.normsKeyBytes, bnorm);
}
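// Write the per-term info (frequency, positions, offsets, norm) into the term vector row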
CassandraUtils.addMutations(workingMutations, CassandraUtils.termVecColumnFamily, docId, key,
new LucandraTermInfo(docNumber, term.getValue()).serialize());
// Store all terms under a row
CassandraUtils.addMutations(workingMutations, CassandraUtils.metaInfoColumnFamily,
CassandraUtils.createColumnName(term.getKey()), indexTermsKey,
ByteBufferUtil.EMPTY_BYTE_BUFFER);
}
}
// Untokenized (but indexed) fields go in without term positions
if (field.isIndexed() && !field.isTokenized())
{
ThriftTerm tterm = new ThriftTerm(field.name()).setText(
ByteBuffer.wrap(field.stringValue().getBytes("UTF-8"))).setIs_binary(false);
if (firstTerm == null)
firstTerm = tterm;
allIndexedTerms.addToTerms(tterm);
ByteBuffer key = CassandraUtils.hashKeyBytes(indexName.getBytes("UTF-8"),
CassandraUtils.delimeterBytes, field.name().getBytes("UTF-8"), CassandraUtils.delimeterBytes,
field.stringValue().getBytes("UTF-8"));
Map<ByteBuffer, List<Number>> termMap = new ConcurrentSkipListMap<ByteBuffer, List<Number>>();
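// untokenized terms carry no meaningful frequency or position data, so store empty placeholders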
termMap.put(CassandraUtils.termFrequencyKeyBytes, CassandraUtils.emptyArray);
termMap.put(CassandraUtils.positionVectorKeyBytes, CassandraUtils.emptyArray);
CassandraUtils.addMutations(workingMutations, CassandraUtils.termVecColumnFamily, docId, key,
new LucandraTermInfo(docNumber, termMap).serialize());
// Store all terms under a row
CassandraUtils.addMutations(workingMutations, CassandraUtils.metaInfoColumnFamily,
CassandraUtils.createColumnName(field), indexTermsKey, ByteBufferUtil.EMPTY_BYTE_BUFFER);
}
// Stores each field as a column under this doc key
if (field.isStored())
{
ThriftTerm tt = new ThriftTerm(field.name());
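// store numeric values in their native type rather than as strings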
if (field instanceof NumericField)
{
Number n = ((NumericField) field).getNumericValue();
switch (((NumericField) field).getDataType())
{
case LONG: tt.setLongVal(n.longValue()); break;
case INT: tt.setIntVal(n.intValue()); break;
case FLOAT: tt.setFloatVal(n.floatValue()); break;
case DOUBLE: tt.setDoubleVal(n.doubleValue()); break;
default: throw new IllegalStateException("Unknown numeric type in field: " + field);
}
}
byte[] value = field.isBinary() ? field.getBinaryValue() : field.stringValue().getBytes("UTF-8");
tt.setText(ByteBuffer.wrap(value)).setIs_binary(field.isBinary());
// logic to handle multiple fields w/ same name
DocumentMetadata currentValue = fieldCache.get(field.name());
if (currentValue == null)
{