Source Code of org.leskes.elasticfacets.HashedStringsFacetCollector$ReaderAggregator

package org.leskes.elasticfacets;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.util.PriorityQueue;
import org.elasticsearch.ElasticSearchIllegalArgumentException;
import org.elasticsearch.common.logging.ESLogger;
import org.elasticsearch.common.logging.Loggers;
import org.elasticsearch.common.lucene.Lucene;
import org.elasticsearch.common.trove.set.hash.TIntHashSet;
import org.elasticsearch.index.cache.field.data.FieldDataCache;
import org.elasticsearch.index.field.data.FieldData;
import org.elasticsearch.index.field.data.FieldDataType;
import org.elasticsearch.index.mapper.MapperService;
import org.elasticsearch.script.SearchScript;
import org.elasticsearch.search.facet.AbstractFacetCollector;
import org.elasticsearch.search.facet.Facet;
import org.elasticsearch.search.facet.terms.TermsFacet;
import org.elasticsearch.search.facet.terms.support.EntryPriorityQueue;
import org.elasticsearch.search.internal.SearchContext;
import org.leskes.elasticfacets.fields.HashedStringFieldData;
import org.leskes.elasticfacets.fields.HashedStringFieldSettings;
import org.leskes.elasticfacets.fields.HashedStringFieldType;
import org.leskes.elasticfacets.utils.SizeSensitiveCacheRecycler;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;

/**
 * Facet collector that counts hashed string values per segment and merges the per-segment
 * counts into a top-N facet. Terms are aggregated by their hash; the original term text (or a
 * script result) is only resolved for the entries that make it into the final result.
 */
public class HashedStringsFacetCollector extends AbstractFacetCollector {

   final static ESLogger logger = Loggers
           .getLogger(HashedStringsFacetCollector.class);


   private final FieldDataCache fieldDataCache;

   private final String indexFieldName;

   private Analyzer fieldIndexAnalyzer;

   private final TermsFacet.ComparatorType comparatorType;

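   /**
    * Controls how each facet entry is rendered in the response: HASH leaves only the raw term
    * hash, TERM resolves the original term text from a matching document, and SCRIPT sets the
    * term to the result of a user-supplied script (which receives the hash as the "_hash" variable).
    */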
   public static enum OUTPUT_MODE {
      TERM, HASH, SCRIPT;

      public static OUTPUT_MODE fromString(String type) {
         if ("term".equals(type)) {
            return TERM;
         } else if ("hash".equals(type)) {
            return HASH;
         } else if ("script".equals(type)) {
            return SCRIPT;
         }
         throw new ElasticSearchIllegalArgumentException("No type argument match for hashed string facet output mode [" + type + "]");
      }
   }

   private final OUTPUT_MODE output_mode;

   private final int size;

   private int fetch_size;

   private final int numberOfShards;

   private final int minCount;

   private final FieldDataType fieldDataType;

   private HashedStringFieldData fieldData;

   private final List<ReaderAggregator> aggregators;

   private ReaderAggregator current;

   private final SearchContext context;

   private final SearchScript output_script;

   private final HashedStringFieldSettings.FieldTypeFactory hashedStringTypeFactory;

   long missing;
   long total;

   private final TIntHashSet excluded;
   private final TIntHashSet included;

   public HashedStringsFacetCollector(String facetName, String fieldName, int size, int fetch_size,
                                      TermsFacet.ComparatorType comparatorType, boolean allTerms,
                                      OUTPUT_MODE output_mode,
                                      TIntHashSet included, TIntHashSet excluded,
                                      String output_script, String output_scriptLang, SearchContext context,
                                      Map<String, Object> params, HashedStringFieldSettings.FieldTypeFactory loaderForField) {
      super(facetName);
      this.fieldDataCache = context.fieldDataCache();
      this.size = size;
      this.fetch_size = fetch_size;
      this.comparatorType = comparatorType;
      this.numberOfShards = context.numberOfShards();
      this.context = context;
      this.output_mode = output_mode;
      this.hashedStringTypeFactory = loaderForField;


      MapperService.SmartNameFieldMappers smartMappers = context.smartFieldMappers(fieldName);
      if (smartMappers == null || !smartMappers.hasMapper()) {
         throw new ElasticSearchIllegalArgumentException(
                 "Field [" + fieldName + "] doesn't have a type, can't run hashed string facet collector on it");
      }
      // if the field name carries an explicit type with a doc mapper, filter documents to that type
      if (smartMappers.explicitTypeInNameWithDocMapper()) {
         setFilter(context.filterCache().cache(smartMappers.docMapper().typeFilter()));
      }

      if (smartMappers.mapper().fieldDataType() != FieldDataType.DefaultTypes.STRING) {
         throw new ElasticSearchIllegalArgumentException(
                 "Field [" + fieldName + "] is not of string type, can't run hashed string facet collector on it");
      }

      if (TermsFacet.ComparatorType.TERM.id() == comparatorType.id()) {
         throw new ElasticSearchIllegalArgumentException("HashedStringsFacet doesn't support sorting by term.");
      }

      if (output_script != null) {
         this.output_script = context.scriptService().search(context.lookup(), output_scriptLang, output_script, params);
      } else {
         this.output_script = null;
      }


      this.indexFieldName = smartMappers.mapper().names().indexName();
      this.fieldDataType = smartMappers.mapper().fieldDataType();


      this.fieldIndexAnalyzer = smartMappers.mapper().indexAnalyzer();
      if (this.fieldIndexAnalyzer == null && smartMappers.docMapper() != null)
         this.fieldIndexAnalyzer = smartMappers.docMapper().indexAnalyzer();
      if (this.fieldIndexAnalyzer == null) this.fieldIndexAnalyzer = Lucene.STANDARD_ANALYZER;

      if (excluded == null || excluded.isEmpty()) {
         this.excluded = null;
      } else {
         this.excluded = excluded;
      }
      if (included == null || included.isEmpty()) {
         this.included = null;
      } else {
         this.included = included;
      }

      // entries are kept when count > minCount, so -1 keeps zero-count terms (allTerms)
      // while 0 requires at least one matching document
      if (allTerms) {
         minCount = -1;
      } else {
         minCount = 0;
      }

      this.aggregators = new ArrayList<ReaderAggregator>(context.searcher().subReaders().length);
   }

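   /**
    * Switch to the next segment reader: fold the finished per-segment aggregator into the running
    * totals and start a new aggregator over this segment's hashed field data.
    */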
   @Override
   protected void doSetNextReader(IndexReader reader, int docBase) throws IOException {
      if (current != null) {
         missing += current.missing;
         total += current.total;
         if (current.values.length > 0) {
            aggregators.add(current);
         }
      }
      fieldData = (HashedStringFieldData) fieldDataCache.cache(hashedStringTypeFactory.getTypeForField(indexFieldName),
              reader, indexFieldName);
      current = new ReaderAggregator(fieldData, docBase);
   }

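   /** Count the ordinals of the current doc into the per-segment aggregator. */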
   @Override
   protected void doCollect(int doc) throws IOException {
      fieldData.forEachOrdinalInDoc(doc, current);
   }


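   /**
    * Build the facet result: merge all per-segment aggregators through a priority queue ordered
    * by the current hash value, sum counts of equal hashes across segments, apply the
    * include/exclude sets, keep the top entries and resolve each surviving entry's term
    * according to the output mode.
    */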
   @Override
   public Facet facet() {
      if (current != null) {
         missing += current.missing;
         total += current.total;
         // if we have values for this one, add it
         if (current.values.length > 0) {
            aggregators.add(current);
         }
      }

      AggregatorPriorityQueue queue = new AggregatorPriorityQueue(aggregators.size());

      for (ReaderAggregator aggregator : aggregators) {
         if (aggregator.nextPosition()) {
            queue.add(aggregator);
         }
      }

      // with a single shard there is no reduce phase, so return only size entries instead of fetch_size
      int queue_size = numberOfShards == 1 ? size : fetch_size;

      // YUCK: the same merge logic is duplicated; this branch uses a size-bounded priority queue,
      // which is more efficient for small facet sizes
      if (queue_size < EntryPriorityQueue.LIMIT) {
         // optimized path: bounded priority queue holding at most queue_size entries
         EntryPriorityQueue ordered = new EntryPriorityQueue(queue_size, comparatorType.comparator());

         while (queue.size() > 0) {
            ReaderAggregator agg = queue.top();
            int value = agg.currentValue;
            int count = 0;
            int docId = agg.currentDocId;
            do {
               if (agg.currentCount != 0) {
                  count += agg.currentCount;
                  docId = agg.currentDocId;
               }

               if (agg.nextPosition()) {
                  agg = queue.updateTop();
               } else {
                  // we are done with this reader
                  queue.pop();
                  agg = queue.top();
               }
            } while (agg != null && value == agg.currentValue);

            assert (agg == null || value < agg.currentValue);

            if (count > minCount) {
               if (excluded != null && excluded.contains(value)) {
                  continue;
               }
               if (included != null && !included.contains(value)) {
                  continue;
               }
               HashedStringsFacet.HashedStringEntry entry = new HashedStringsFacet.HashedStringEntry(null, value, docId, count);
               ordered.insertWithOverflow(entry);
            }
         }
         HashedStringsFacet.HashedStringEntry[] list = new HashedStringsFacet.HashedStringEntry[ordered.size()];
         for (int i = ordered.size() - 1; i >= 0; i--) {
            HashedStringsFacet.HashedStringEntry entry = (HashedStringsFacet.HashedStringEntry) ordered.pop();
            loadTermIntoEntry(entry);
            list[i] = entry;


         }

         for (ReaderAggregator aggregator : aggregators) {
            aggregator.close();
         }

         return new HashedStringsFacet(facetName, comparatorType, size, Arrays.asList(list), missing, total);
      }

      // TODO
      throw new UnsupportedOperationException("Large facet sizes (> 5000) are not yet implemented by HashedStringsFacet");

//        BoundedTreeSet<InternalStringTermsFacet.StringEntry> ordered = new BoundedTreeSet<InternalStringTermsFacet.StringEntry>(comparatorType.comparator(), size);
//
//        while (queue.size() > 0) {
//            ReaderAggregator agg = queue.top();
//            String value = agg.current;
//            int count = 0;
//            do {
//                count += agg.counts[agg.position];
//                if (agg.nextPosition()) {
//                    agg = queue.updateTop();
//                } else {
//                    // we are done with this reader
//                    queue.pop();
//                    agg = queue.top();
//                }
//            } while (agg != null && value.equals(agg.current));
//
//            if (count > minCount) {
//                if (excluded != null && excluded.contains(value)) {
//                    continue;
//                }
//                if (matcher != null && !matcher.reset(value).matches()) {
//                    continue;
//                }
//                InternalStringTermsFacet.StringEntry entry = new InternalStringTermsFacet.StringEntry(value, count);
//                ordered.add(entry);
//            }
//        }
//
//
//        for (ReaderAggregator aggregator : aggregators) {
//            CacheRecycler.pushIntArray(aggregator.counts);
//        }
//
//        return new InternalStringTermsFacet(facetName, comparatorType, size, ordered, missing, total);
   }

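   /**
    * Fill in the term of a facet entry according to the configured output mode; for HASH the
    * entry is left with its hash only.
    */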
   private void loadTermIntoEntry(HashedStringsFacet.HashedStringEntry hashedEntry) {
      switch (output_mode) {
         case HASH:
            break;
         case TERM:
            String term = HashedStringFieldData.findTermInDoc(hashedEntry.getTermHash(), hashedEntry.getDocId(),
                    indexFieldName, fieldIndexAnalyzer, context);
            hashedEntry.setTerm(term);
            if (logger.isTraceEnabled())
               logger.trace("Converted hash entry: term={}, expected_hash={},real_hash={}, count={}, docId={}",
                       hashedEntry.term(), hashedEntry.getTermHash(), HashedStringFieldType.hashCode(hashedEntry.term()),
                       hashedEntry.count(), hashedEntry.getDocId());

            break;
         case SCRIPT:
            if (output_script == null)
               throw new ElasticSearchIllegalArgumentException(
                       "Hashed string facet output was set to script, but no script was supplied");
            int readerIndex = context.searcher().readerIndex(hashedEntry.getDocId());
            IndexReader subReader = context.searcher().subReaders()[readerIndex];
            int subDoc = hashedEntry.getDocId() - context.searcher().docStarts()[readerIndex];
            output_script.setNextReader(subReader);
            output_script.setNextDocId(subDoc);
            output_script.setNextVar("_hash", hashedEntry.getTermHash());

            Object value = output_script.unwrap(output_script.run());
            hashedEntry.setTerm((String) value);
            break;
      }
   }

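   /**
    * Per-segment aggregator: counts how many times each ordinal (hashed value) occurs in the
    * segment and remembers one docId per value so the term can be resolved later if needed.
    */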
   public static class ReaderAggregator implements FieldData.OrdinalInDocProc {

      final int[] values;
      final int[] counts;
      final int[] docIdsForValues; // for every value, remember one docId where it was encountered

      int position = 0; // index 0 is the "no value" sentinel; nextPosition() pre-increments past it
      int currentValue;
      int currentDocId;
      int currentCount;
      int total;
      int missing;
      int docBase;

      public ReaderAggregator(HashedStringFieldData fieldData, int docBase) {
         this.values = fieldData.values();
         this.counts = SizeSensitiveCacheRecycler.popIntArray(fieldData.values().length);
         this.docIdsForValues = SizeSensitiveCacheRecycler.popIntArray(fieldData.values().length);
         this.docBase = docBase;
      }

      public void close() {
         SizeSensitiveCacheRecycler.pushIntArray(counts);
         SizeSensitiveCacheRecycler.pushIntArray(docIdsForValues);
      }

      public void onOrdinal(int docId, int ordinal) {
         if (ordinal == 0) {
            missing++;
            return; // no value for this doc, nothing to count
         }
         if (counts[ordinal]++ == 0)
            docIdsForValues[ordinal] = docId + docBase;
         total++;
      }

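      /** Advance to the next distinct value of this segment; returns false when all values are consumed. */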
      public boolean nextPosition() {
         if (++position >= values.length) {
            return false;
         }
         currentValue = values[position];
         currentDocId = docIdsForValues[position];
         currentCount = counts[position];
         return true;
      }
   }


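   /**
    * Orders per-segment aggregators by their current hash value so facet() can merge values in
    * ascending hash order.
    */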
   public static class AggregatorPriorityQueue extends PriorityQueue<ReaderAggregator> {

      public AggregatorPriorityQueue(int size) {
         initialize(size);
      }

      @Override
      protected boolean lessThan(ReaderAggregator a, ReaderAggregator b) {
         return a.currentValue < b.currentValue;
      }
   }
}