Package lucandra.serializers.thrift

Examples of lucandra.serializers.thrift.ThriftTerm

        int position = 0;

        for (Fieldable field : doc.getFields())

            ThriftTerm firstTerm = null;

            // Indexed field
            if (field.isIndexed() && field.isTokenized())
                TokenStream tokens = field.tokenStreamValue();

                if (tokens == null)
                    Reader tokReader = field.readerValue();

                    if (tokReader == null)
                        tokReader = new StringReader(field.stringValue());

                    tokens = analyzer.reusableTokenStream(, tokReader);

                // collect term information per field
                Map<Term, Map<ByteBuffer, List<Number>>> allTermInformation = new HashMap<Term, Map<ByteBuffer, List<Number>>>();

                int lastOffset = 0;
                if (position > 0)
                    position += analyzer.getPositionIncrementGap(;

                // Build the termPositions vector for all terms

                tokens.reset(); // reset the TokenStream to the first token

                // set up token attributes we are working on

                // offsets
                OffsetAttribute offsetAttribute = null;
                if (field.isStoreOffsetWithTermVector())
                    offsetAttribute = (OffsetAttribute) tokens.addAttribute(OffsetAttribute.class);

                // positions
                // these are always gathered in later lucene versions
                PositionIncrementAttribute posIncrAttribute = (PositionIncrementAttribute) tokens

                // term as string
                CharTermAttribute termAttribute = (CharTermAttribute) tokens.addAttribute(CharTermAttribute.class);

                // store normalizations of field per term per document rather
                // than per field.
                // this adds more to write but less to read on other side
                Integer tokensInField = new Integer(0);

                while (tokens.incrementToken())
                    Term term = new Term(, termAttribute.toString());

                    ThriftTerm tterm = new ThriftTerm(term.field()).setText(

                    if (firstTerm == null)
                        firstTerm = tterm;


                    // fetch all collected information for this term
                    Map<ByteBuffer, List<Number>> termInfo = allTermInformation.get(term);

                    if (termInfo == null)
                        termInfo = new HashMap<ByteBuffer, List<Number>>();
                        allTermInformation.put(term, termInfo);

                    // term frequency
                        List<Number> termFrequency = termInfo.get(CassandraUtils.termFrequencyKeyBytes);

                        if (termFrequency == null)
                            termFrequency = new ArrayList<Number>();
                            termFrequency.add(new Integer(0));
                            termInfo.put(CassandraUtils.termFrequencyKeyBytes, termFrequency);

                        // increment
                        termFrequency.set(0, termFrequency.get(0).intValue() + 1);

                    // position vector
                        position += (posIncrAttribute.getPositionIncrement() - 1);

                        List<Number> positionVector = termInfo.get(CassandraUtils.positionVectorKeyBytes);

                        if (positionVector == null)
                            positionVector = new ArrayList<Number>();
                            termInfo.put(CassandraUtils.positionVectorKeyBytes, positionVector);


                    // term offsets
                    if (field.isStoreOffsetWithTermVector())

                        List<Number> offsetVector = termInfo.get(CassandraUtils.offsetVectorKeyBytes);
                        if (offsetVector == null)
                            offsetVector = new ArrayList<Number>();
                            termInfo.put(CassandraUtils.offsetVectorKeyBytes, offsetVector);

                        offsetVector.add(lastOffset + offsetAttribute.startOffset());
                        offsetVector.add(lastOffset + offsetAttribute.endOffset());


                List<Number> bnorm = null;
                if (!field.getOmitNorms())
                    bnorm = new ArrayList<Number>();

                    final FieldInvertState invertState = new FieldInvertState();
                    invertState.setBoost(doc.getBoost() * field.getBoost());
                    final float norm = similarity.computeNorm(, invertState);


                for (Map.Entry<Term, Map<ByteBuffer, List<Number>>> term : allTermInformation.entrySet())

                    // Terms are stored within a unique key combination
                    // This is required since cassandra loads all columns
                    // in a key/column family into memory
                    ByteBuffer key = CassandraUtils.hashKeyBytes(indexNameBytes, CassandraUtils.delimeterBytes, term
                            .getKey().field().getBytes("UTF-8"), CassandraUtils.delimeterBytes, term.getKey().text()

                    // Mix in the norm for this field alongside each term
                    // more writes but faster on read side.
                    if (!field.getOmitNorms())
                        term.getValue().put(CassandraUtils.normsKeyBytes, bnorm);

                    CassandraUtils.addMutations(workingMutations, CassandraUtils.termVecColumnFamily, docId, key,
                            new LucandraTermInfo(docNumber, term.getValue()).serialize());

                    // Store all terms under a row
                    CassandraUtils.addMutations(workingMutations, CassandraUtils.metaInfoColumnFamily,
                            CassandraUtils.createColumnName(term.getKey()), indexTermsKey,

            // Untokenized fields go in without a termPosition
            if (field.isIndexed() && !field.isTokenized())
                ThriftTerm tterm = new ThriftTerm(

                if (firstTerm == null)
                    firstTerm = tterm;


                ByteBuffer key = CassandraUtils.hashKeyBytes(indexName.getBytes("UTF-8"),
                        CassandraUtils.delimeterBytes,"UTF-8"), CassandraUtils.delimeterBytes,

                Map<ByteBuffer, List<Number>> termMap = new ConcurrentSkipListMap<ByteBuffer, List<Number>>();
                termMap.put(CassandraUtils.termFrequencyKeyBytes, CassandraUtils.emptyArray);
                termMap.put(CassandraUtils.positionVectorKeyBytes, CassandraUtils.emptyArray);

                CassandraUtils.addMutations(workingMutations, CassandraUtils.termVecColumnFamily, docId, key,
                        new LucandraTermInfo(docNumber, termMap).serialize());

                // Store all terms under a row
                CassandraUtils.addMutations(workingMutations, CassandraUtils.metaInfoColumnFamily,
                        CassandraUtils.createColumnName(field), indexTermsKey, ByteBufferUtil.EMPTY_BYTE_BUFFER);

            // Stores each field as a column under this doc key
            if (field.isStored())
                ThriftTerm tt = new ThriftTerm(;

                if (field instanceof NumericField)
                    Number n = ((NumericField) field).getNumericValue();
                    switch(((NumericField) field).getDataType())
                    case LONG: tt.setLongVal(n.longValue()); break;
                    case INT: tt.setIntVal(n.intValue()); break;
                    case FLOAT: tt.setFloatVal(n.floatValue()); break;
                    case DOUBLE: tt.setDoubleVal(n.doubleValue()); break;
                    default: throw new IllegalStateException("Unknown numeric type in field: "+field);

                byte[] value = field.isBinary() ? field.getBinaryValue() : field.stringValue().getBytes("UTF-8");

                // logic to handle multiple fields w/ same name
                DocumentMetadata currentValue = fieldCache.get(;
                if (currentValue == null)
View Full Code Here

        int position = 0;

        for (Fieldable field : doc.getFields())

            ThriftTerm firstTerm = null;

            // Indexed field
            if (field.isIndexed() && field.isTokenized())
                TokenStream tokens = field.tokenStreamValue();

                if (tokens == null)
                    Reader tokReader = field.readerValue();

                    if (tokReader == null)
                        tokReader = new StringReader(field.stringValue());

                    tokens = analyzer.reusableTokenStream(, tokReader);

                // collect term information per field
                Map<Term, Map<ByteBuffer, List<Number>>> allTermInformation = new HashMap<Term, Map<ByteBuffer, List<Number>>>();

                int lastOffset = 0;
                if (position > 0)
                    position += analyzer.getPositionIncrementGap(;

                // Build the termPositions vector for all terms

                tokens.reset(); // reset the TokenStream to the first token

                // set up token attributes we are working on

                // offsets
                OffsetAttribute offsetAttribute = null;
                if (field.isStoreOffsetWithTermVector())
                    offsetAttribute = (OffsetAttribute) tokens.addAttribute(OffsetAttribute.class);

                // positions
                // these are always gathered in later lucene versions
                PositionIncrementAttribute posIncrAttribute = (PositionIncrementAttribute) tokens

                // term as string
                CharTermAttribute termAttribute = (CharTermAttribute) tokens.addAttribute(CharTermAttribute.class);

                // store normalizations of field per term per document rather
                // than per field.
                // this adds more to write but less to read on other side
                Integer tokensInField = new Integer(0);

                while (tokens.incrementToken())
                    Term term = new Term(, termAttribute.toString());

                    ThriftTerm tterm = new ThriftTerm(term.field()).setText(

                    if (firstTerm == null)
                        firstTerm = tterm;


                    // fetch all collected information for this term
                    Map<ByteBuffer, List<Number>> termInfo = allTermInformation.get(term);

                    if (termInfo == null)
                        termInfo = new HashMap<ByteBuffer, List<Number>>();
                        allTermInformation.put(term, termInfo);

                    // term frequency
                        List<Number> termFrequency = termInfo.get(CassandraUtils.termFrequencyKeyBytes);

                        if (termFrequency == null)
                            termFrequency = new ArrayList<Number>();
                            termFrequency.add(new Integer(0));
                            termInfo.put(CassandraUtils.termFrequencyKeyBytes, termFrequency);

                        // increment
                        termFrequency.set(0, termFrequency.get(0).intValue() + 1);

                    // position vector
                        position += (posIncrAttribute.getPositionIncrement() - 1);

                        List<Number> positionVector = termInfo.get(CassandraUtils.positionVectorKeyBytes);

                        if (positionVector == null)
                            positionVector = new ArrayList<Number>();
                            termInfo.put(CassandraUtils.positionVectorKeyBytes, positionVector);


                    // term offsets
                    if (field.isStoreOffsetWithTermVector())

                        List<Number> offsetVector = termInfo.get(CassandraUtils.offsetVectorKeyBytes);
                        if (offsetVector == null)
                            offsetVector = new ArrayList<Number>();
                            termInfo.put(CassandraUtils.offsetVectorKeyBytes, offsetVector);

                        offsetVector.add(lastOffset + offsetAttribute.startOffset());
                        offsetVector.add(lastOffset + offsetAttribute.endOffset());


                List<Number> bnorm = null;
                if (!field.getOmitNorms())
                    bnorm = new ArrayList<Number>();

                    final FieldInvertState invertState = new FieldInvertState();
                    invertState.setBoost(doc.getBoost() * field.getBoost());
                    final float norm = similarity.computeNorm(, invertState);


                for (Map.Entry<Term, Map<ByteBuffer, List<Number>>> term : allTermInformation.entrySet())

                    // Terms are stored within a unique key combination
                    // This is required since cassandra loads all columns
                    // in a key/column family into memory
                    ByteBuffer key = CassandraUtils.hashKeyBytes(indexNameBytes, CassandraUtils.delimeterBytes, term
                            .getKey().field().getBytes("UTF-8"), CassandraUtils.delimeterBytes, term.getKey().text()

                    // Mix in the norm for this field alongside each term
                    // more writes but faster on read side.
                    if (!field.getOmitNorms())
                        term.getValue().put(CassandraUtils.normsKeyBytes, bnorm);

                    CassandraUtils.addMutations(workingMutations, CassandraUtils.termVecColumnFamily, docId, key,
                            new LucandraTermInfo(docNumber, term.getValue()).serialize());

                    // Store all terms under a row
                    CassandraUtils.addMutations(workingMutations, CassandraUtils.metaInfoColumnFamily,
                            CassandraUtils.createColumnName(term.getKey()), indexTermsKey,

            // Untokenized fields go in without a termPosition
            if (field.isIndexed() && !field.isTokenized())
                ThriftTerm tterm = new ThriftTerm(

                if (firstTerm == null)
                    firstTerm = tterm;


                ByteBuffer key = CassandraUtils.hashKeyBytes(indexName.getBytes("UTF-8"),
                        CassandraUtils.delimeterBytes,"UTF-8"), CassandraUtils.delimeterBytes,

                CassandraUtils.addMutations(workingMutations, CassandraUtils.termVecColumnFamily, docId, key,
                        new LucandraTermInfo(docNumber, emptyTermMap).serialize());

                // Store all terms under a row
                CassandraUtils.addMutations(workingMutations, CassandraUtils.metaInfoColumnFamily,
                        CassandraUtils.createColumnName(field), indexTermsKey, ByteBufferUtil.EMPTY_BYTE_BUFFER);

            // Stores each field as a column under this doc key
            if (field.isStored())
                ThriftTerm tt = new ThriftTerm(;

                if (field instanceof NumericField)
                    Number n = ((NumericField) field).getNumericValue();
                    switch(((NumericField) field).getDataType())
                    case LONG: tt.setLongVal(n.longValue()); break;
                    case INT: tt.setIntVal(n.intValue()); break;
                    case FLOAT: tt.setFloatVal(n.floatValue()); break;
                    case DOUBLE: tt.setDoubleVal(n.doubleValue()); break;
                    default: throw new IllegalStateException("Unknown numeric type in field: "+field);

                byte[] value = field.isBinary() ? field.getBinaryValue() : field.stringValue().getBytes("UTF-8");

                // logic to handle multiple fields w/ same name
                DocumentMetadata currentValue = fieldCache.get(;
                if (currentValue == null)
View Full Code Here


Related Classes of lucandra.serializers.thrift.ThriftTerm

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., owned by Oracle Inc. Contact