/*
* Hibernate, Relational Persistence for Idiomatic Java
*
* Copyright (c) 2014, Red Hat, Inc. and/or its affiliates or third-party contributors as
* indicated by the @author tags or express copyright attribution
* statements applied by the authors. All third-party contributions are
* distributed under license by Red Hat, Inc.
*
* This copyrighted material is made available to anyone wishing to use, modify,
* copy, or redistribute it subject to the terms and conditions of the GNU
* Lesser General Public License, as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this distribution; if not, write to:
* Free Software Foundation, Inc.
* 51 Franklin Street, Fifth Floor
* Boston, MA 02110-1301 USA
*/
package org.hibernate.search.query.dsl.impl;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.queries.mlt.MoreLikeThis;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarities.TFIDFSimilarity;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.util.UnicodeUtil;
import org.hibernate.search.annotations.Store;
import org.hibernate.search.bridge.FieldBridge;
import org.hibernate.search.bridge.builtin.NumericFieldBridge;
import org.hibernate.search.bridge.util.impl.ContextualExceptionBridgeHelper;
import org.hibernate.search.engine.impl.DocumentBuilderHelper;
import org.hibernate.search.engine.metadata.impl.DocumentFieldMetadata;
import org.hibernate.search.engine.spi.DocumentBuilderIndexedEntity;
import org.hibernate.search.engine.spi.SearchFactoryImplementor;
import org.hibernate.search.exception.AssertionFailure;
import org.hibernate.search.query.engine.spi.EntityInfo;
import org.hibernate.search.query.engine.spi.HSQuery;
import org.hibernate.search.util.impl.PassThroughAnalyzer;
import org.hibernate.search.util.logging.impl.Log;
import org.hibernate.search.util.logging.impl.LoggerFactory;

import static org.hibernate.search.query.dsl.impl.ConnectedMoreLikeThisQueryBuilder.INPUT_TYPE.ENTITY;
import static org.hibernate.search.query.dsl.impl.ConnectedMoreLikeThisQueryBuilder.INPUT_TYPE.ID;
/**
* Class inspired and code copied from Apache Lucene MoreLikeThis class.
* Apache Lucene code copyright the Apache Software Foundation released under the
* Apache Software License 2.0.
*
* @author Emmanuel Bernard <emmanuel@hibernate.org>
*/
public class MoreLikeThisBuilder<T> {

	private static final Log log = LoggerFactory.make();

	private int minWordLen = MoreLikeThis.DEFAULT_MIN_WORD_LENGTH;
	private int maxNumTokensParsed = MoreLikeThis.DEFAULT_MAX_NUM_TOKENS_PARSED;
	private int maxWordLen = MoreLikeThis.DEFAULT_MAX_WORD_LENGTH;
	private Set<?> stopWords = MoreLikeThis.DEFAULT_STOP_WORDS;
	private DocumentBuilderIndexedEntity<T> documentBuilder;
	// We lower the min defaults to 1 because we don't merge the freq of *all* fields unlike the original MoreLikeThis
	// TODO: is that hurting performance? Could we guess "small fields" and only lower these?
	private int minTermFreq = 1; //MoreLikeThis.DEFAULT_MIN_TERM_FREQ;
	private int minDocFreq = 1; //MoreLikeThis.DEFAULT_MIN_DOC_FREQ;
	private int maxDocFreq = MoreLikeThis.DEFAULT_MAX_DOC_FREQ;
	private int maxQueryTerms = MoreLikeThis.DEFAULT_MAX_QUERY_TERMS;
	private boolean boost = MoreLikeThis.DEFAULT_BOOST;
	private float boostFactor = 1;
	private TFIDFSimilarity similarity;
	// Lucene document number of the entity used for comparison; null when the entity is not (or not yet) indexed
	private Integer documentNumber;
	private String[] compatibleFieldNames;
	private IndexReader indexReader;
	private FieldsContext fieldsContext;
	// either the entity instance or its identifier, depending on inputType
	private Object input;
	private QueryBuildingContext queryContext;
	private boolean excludeEntityCompared;
	private ConnectedMoreLikeThisQueryBuilder.INPUT_TYPE inputType;
	// query matching the compared entity by id; cached so maybeExcludeComparedEntity can reuse it
	private TermQuery findById;

	/**
	 * @param documentBuilder document builder of the targeted indexed entity
	 * @param searchFactory used to look up the index binding and its {@code Similarity}
	 * @throws org.hibernate.search.SearchException (via the logger) if the configured
	 *         similarity does not extend {@code TFIDFSimilarity}, which the idf-based
	 *         scoring below requires
	 */
	public MoreLikeThisBuilder( DocumentBuilderIndexedEntity<T> documentBuilder, SearchFactoryImplementor searchFactory ) {
		this.documentBuilder = documentBuilder;
		Similarity configuredSimilarity = searchFactory.getIndexBindings().get( documentBuilder.getBeanClass() ).getSimilarity();
		if ( configuredSimilarity instanceof TFIDFSimilarity ) {
			this.similarity = (TFIDFSimilarity) configuredSimilarity;
		}
		else {
			throw log.requireTFIDFSimilarity( documentBuilder.getBeanClass() );
		}
	}

	public MoreLikeThisBuilder<T> indexReader(IndexReader indexReader) {
		this.indexReader = indexReader;
		return this;
	}

	public MoreLikeThisBuilder<T> compatibleFieldNames(String... compatibleFieldNames) {
		this.compatibleFieldNames = compatibleFieldNames;
		return this;
	}

	public MoreLikeThisBuilder<T> otherMoreLikeThisContext(MoreLikeThisQueryContext moreLikeThisContext) {
		this.boost = moreLikeThisContext.isBoostTerms();
		this.boostFactor = moreLikeThisContext.getTermBoostFactor();
		this.excludeEntityCompared = moreLikeThisContext.isExcludeEntityUsedForComparison();
		return this;
	}

	/**
	 * Return a query that will return docs like the passed lucene document ID.
	 */
	public Query createQuery() {
		try {
			documentNumber = getLuceneDocumentIdFromIdAsTermOrNull( documentBuilder );
			return maybeExcludeComparedEntity( createQuery( retrieveTerms() ) );
		}
		catch (IOException e) {
			throw log.ioExceptionOnIndexOfEntity( e, documentBuilder.getBeanClass() );
		}
	}

	/**
	 * Try and retrieve the document id from the input. If failing and a backup approach exists, returns null.
	 * <p>
	 * Side effect: initializes {@link #findById} with a {@code TermQuery} on the id keyword field
	 * when an id could be extracted; it is later reused by {@link #maybeExcludeComparedEntity(Query)}.
	 *
	 * @return the Lucene document number of the matching indexed document, or {@code null}
	 *         when the id could not be determined (entity input) or no document matched (entity input)
	 * @throws org.hibernate.search.SearchException (via the logger) when the input is an id
	 *         and no indexed document matches it
	 */
	private Integer getLuceneDocumentIdFromIdAsTermOrNull(DocumentBuilderIndexedEntity<?> documentBuilder) {
		String id;
		if ( inputType == ID ) {
			id = documentBuilder.getIdBridge().objectToString( input );
		}
		else if ( inputType == ENTITY ) {
			// Try and extract the id, if failing the id will be null
			try {
				// I expect a two way bridge to return null from a null input, correct?
				id = documentBuilder.getIdBridge().objectToString( documentBuilder.getId( input ) );
			}
			catch (IllegalStateException e) {
				id = null;
			}
		}
		else {
			throw new AssertionFailure( "MoreLikeThis only supports an entity or its identifier as input; string and reader inputs are not supported" );
		}
		if ( id == null ) {
			return null;
		}
		findById = new TermQuery( new Term( documentBuilder.getIdKeywordName(), id ) );
		HSQuery query = queryContext.getFactory().createHSQuery();
		//can't use Arrays.asList for some obscure capture reason
		List<Class<?>> classes = new ArrayList<Class<?>>(1);
		classes.add( queryContext.getEntityType() );
		List<EntityInfo> entityInfos = query
				.luceneQuery( findById )
				.maxResults( 1 )
				.projection( HSQuery.DOCUMENT_ID )
				.targetedEntities( classes )
				.queryEntityInfos();
		if ( entityInfos.size() == 0 ) {
			if ( inputType == ID ) {
				throw log.entityWithIdNotFound( queryContext.getEntityType(), id );
			}
			else {
				// entity not (yet) indexed: fall back to building the document from the instance
				return null;
			}
		}
		return (Integer) entityInfos.iterator().next().getProjection()[0];
	}

	/**
	 * Wrap the query so the entity used for comparison is excluded from the results,
	 * when requested and when that entity is actually indexed.
	 * <p>
	 * Note: when {@code query} already is a {@code BooleanQuery} the MUST_NOT clause is
	 * added to it in place.
	 */
	private Query maybeExcludeComparedEntity(Query query) {
		// It would be better to attach a collector to exclude a document by its id
		// but at this stage we could have documents reordered and thus with a different id
		// Maybe a Filter would be more efficient?
		if ( excludeEntityCompared && documentNumber != null ) {
			BooleanQuery booleanQuery;
			if ( ! ( query instanceof BooleanQuery ) ) {
				booleanQuery = new BooleanQuery();
				booleanQuery.add( query, BooleanClause.Occur.MUST );
			}
			else {
				booleanQuery = (BooleanQuery) query;
			}
			booleanQuery.add(
					new ConstantScoreQuery( findById ),
					BooleanClause.Occur.MUST_NOT );
			return booleanQuery;
		}
		else {
			return query;
		}
	}

	/**
	 * Create the More Like This query from a PriorityQueue per targeted field.
	 *
	 * @param q one term queue per field, aligned with the iteration order of {@link #fieldsContext}
	 */
	private Query createQuery(List<PriorityQueue<Object[]>> q) {
		//In the original algorithm, the number of terms is limited to maxQueryTerms
		//In the current implementation, we do nbrOfFields * maxQueryTerms
		int length = fieldsContext.size();
		if ( length == 0 ) {
			throw new AssertionFailure( "Querying MoreLikeThis on 0 field." );
		}
		else if ( length == 1 ) {
			return createQuery( q.get( 0 ), fieldsContext.getFirst() );
		}
		else {
			BooleanQuery query = new BooleanQuery();
			//the fieldsContext indexes are aligned with the priority queue's
			Iterator<FieldContext> fieldsContextIterator = fieldsContext.iterator();
			for ( PriorityQueue<Object[]> queue : q ) {
				try {
					query.add( createQuery( queue, fieldsContextIterator.next() ), BooleanClause.Occur.SHOULD );
				}
				catch (BooleanQuery.TooManyClauses ignore) {
					// stop adding per-field clauses once the global clause limit is reached
					break;
				}
			}
			return query;
		}
	}

	/**
	 * Create the term query for a single field from its queue of scored terms.
	 *
	 * @param q scored terms for the field, highest score popped first; {@code null} when the
	 *          field was flagged incompatible for term retrieval
	 * @param fieldContext the field the terms belong to
	 */
	private Query createQuery(PriorityQueue<Object[]> q, FieldContext fieldContext) {
		if ( q == null ) {
			// The field was not compatible: explain why with the most specific error possible
			final FieldBridge fieldBridge = fieldContext.getFieldBridge() != null ? fieldContext.getFieldBridge() : documentBuilder.getBridge( fieldContext.getField() );
			if ( fieldBridge instanceof NumericFieldBridge ) {
				// we probably can do something here
				//TODO how to build the query where we don't have the value?
				throw log.numericFieldCannotBeUsedInMoreLikeThis( fieldContext.getField(), documentBuilder.getBeanClass() );
			}
			DocumentFieldMetadata fieldMetadata = documentBuilder.getTypeMetadata().getDocumentFieldMetadataFor(
					fieldContext.getField()
			);
			boolean hasTermVector = fieldMetadata.getTermVector() != Field.TermVector.NO;
			boolean isStored = fieldMetadata.getStore() != Store.NO;
			if ( ! ( hasTermVector || isStored ) ) {
				throw log.fieldNotStoredNorTermVectorCannotBeUsedInMoreLikeThis( fieldContext.getField(), documentBuilder.getBeanClass() );
			}
			boolean isIdOrEmbeddedId = fieldMetadata.isId() || fieldMetadata.isIdInEmbedded();
			if ( isIdOrEmbeddedId ) {
				throw log.fieldIdCannotBeUsedInMoreLikeThis( fieldContext.getField(), documentBuilder.getBeanClass() );
			}
			// NOTE(review): if none of the guards above fires we fall through with q == null and
			// the loop below throws NPE on q.pop(); presumably unreachable — confirm the
			// incompatibility causes are exhaustive
		}
		BooleanQuery query = new BooleanQuery();
		Object cur;
		int qterms = 0;
		float bestScore = 0;
		while ( ( cur = q.pop() ) != null ) {
			Object[] ar = (Object[]) cur;
			TermQuery tq = new TermQuery( new Term( (String) ar[1], (String) ar[0] ) );
			if ( boost ) {
				if ( qterms == 0 ) {
					// the first popped term has the highest score (see FreqQ ordering)
					bestScore = ( (Float) ar[2]);
				}
				float myScore = ( (Float) ar[2]);
				// scale boosts relative to the best term so the top term gets boostFactor
				tq.setBoost( boostFactor * myScore / bestScore );
			}
			try {
				query.add( tq, BooleanClause.Occur.SHOULD );
			}
			catch (BooleanQuery.TooManyClauses ignore) {
				break;
			}
			qterms++;
			if ( maxQueryTerms > 0 && qterms >= maxQueryTerms ) {
				break;
			}
		}
		// Apply field adjustments
		return fieldContext.getFieldCustomizer().setWrappedQuery( query ).createQuery();
	}

	/**
	 * Find words for a more-like-this query former.
	 * Store them per field name according to the order of fieldnames defined in {@link #fieldsContext}.
	 * If the field name is not compatible with term retrieval, the queue will be empty for that index.
	 */
	@SuppressWarnings("unchecked") // input is an entity of type T when documentNumber is null (ENTITY input)
	private List<PriorityQueue<Object[]>> retrieveTerms() throws IOException {
		int size = fieldsContext.size();
		// LinkedHashMap: the result list is built from entrySet() and createQuery(List) pairs each
		// queue with the FieldContext at the same position, so iteration order MUST follow the
		// fieldsContext insertion order (a plain HashMap would pair queues with the wrong fields)
		Map<String,Map<String, Int>> termFreqMapPerFieldname = new LinkedHashMap<String,Map<String, Int>>( size );
		final Fields vectors;
		Document maybeDocument = null;
		if ( documentNumber == null && size > 0 ) {
			//build the document from the entity instance
			//first build the list of fields we are interested in
			String[] fieldNames = new String[ size ];
			Iterator<FieldContext> fieldsContextIterator = fieldsContext.iterator();
			for ( int index = 0 ; index < size ; index++ ) {
				fieldNames[index] = fieldsContextIterator.next().getField();
			}
			//TODO should we keep the fieldToAnalyzerMap around to pass to the analyzer?
			Map<String,String> fieldToAnalyzerMap = new HashMap<String, String>( );
			//FIXME by calling documentBuilder we don't honor .comparingField("foo").ignoreFieldBridge(): probably not a problem in practice though
			maybeDocument = documentBuilder.getDocument( (T) input, null, fieldToAnalyzerMap, null, new ContextualExceptionBridgeHelper(), fieldNames );
			vectors = null;
		}
		else {
			vectors = indexReader.getTermVectors( documentNumber );
		}
		for ( FieldContext fieldContext : fieldsContext ) {
			String fieldName = fieldContext.getField();
			if ( isCompatibleField( fieldName ) ) {
				Map<String,Int> termFreqMap = new HashMap<String, Int>();
				termFreqMapPerFieldname.put( fieldName, termFreqMap );
				final Terms vector;
				if ( vectors != null ) {
					vector = vectors.terms( fieldName );
				}
				else {
					vector = null;
				}
				// field does not store term vector info
				if ( vector == null ) {
					if ( maybeDocument == null ) {
						// lazily load the stored document only when a field actually needs it
						maybeDocument = indexReader.document( documentNumber );
					}
					IndexableField[] fields = maybeDocument.getFields( fieldName );
					for ( IndexableField field : fields ) {
						//TODO numbers
						final String stringValue = DocumentBuilderHelper.extractStringFromFieldable( field );
						if ( stringValue != null ) {
							addTermFrequencies( new StringReader( stringValue ), termFreqMap, fieldContext );
						}
					}
				}
				else {
					addTermFrequencies( termFreqMap, vector );
				}
			}
			else {
				//place null as the field is not compatible
				termFreqMapPerFieldname.put( fieldName, null );
			}
		}
		List<PriorityQueue<Object[]>> results = new ArrayList<PriorityQueue<Object[]>>( size );
		for ( Map.Entry<String,Map<String,Int>> entry : termFreqMapPerFieldname.entrySet() ) {
			results.add( createQueue( entry.getKey(), entry.getValue() ) );
		}
		return results;
	}

	private boolean isCompatibleField(String fieldName) {
		for ( String compatibleFieldName : compatibleFieldNames ) {
			if ( compatibleFieldName.equals( fieldName ) ) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Create a PriorityQueue from a word->tf map.
	 *
	 * @param words a map of words keyed on the word(String) with Int objects as the values,
	 *              or {@code null} when the field name is not compatible (then {@code null} is returned)
	 */
	private PriorityQueue<Object[]> createQueue(String fieldName, Map<String, Int> words) throws IOException {
		if ( words == null ) {
			//incompatible field name
			return null;
		}
		// have collected all words in doc and their freqs
		int numDocs = indexReader.numDocs();
		FreqQ res = new FreqQ( words.size() ); // will order words by score
		for ( Map.Entry<String,Int> entry : words.entrySet() ) { // for every word
			String word = entry.getKey();
			int tf = entry.getValue().x; // term freq in the source doc
			if ( minTermFreq > 0 && tf < minTermFreq ) {
				continue; // filter out words that don't occur enough times in the source
			}
			// The original algorithm looks for all field names and finds the top frequency
			// and only consider this field for the query
			// "go through all the fields and find the largest document frequency"
			int freq = indexReader.docFreq( new Term( fieldName, word ) );
			if ( minDocFreq > 0 && freq < minDocFreq ) {
				continue; // filter out words that don't occur in enough docs
			}
			if ( freq > maxDocFreq ) {
				continue; // filter out words that occur in too many docs
			}
			if ( freq == 0 ) {
				continue; // index update problem?
			}
			float idf = similarity.idf( freq, numDocs );
			float score = tf * idf;
			// only really need 1st 3 entries, other ones are for troubleshooting
			res.insertWithOverflow(
					new Object[] {
							word, // the word
							fieldName, // the top field
							score, // overall score
							idf, // idf
							freq, // freq in all docs
							tf
					}
			);
		}
		return res;
	}

	/**
	 * Adds terms and frequencies found in vector into the Map termFreqMap
	 *
	 * @param termFreqMap a Map of terms and their frequencies
	 * @param vector List of terms and their frequencies for a doc/field
	 */
	private void addTermFrequencies(Map<String, Int> termFreqMap, Terms vector) throws IOException {
		final TermsEnum termsEnum = vector.iterator( null );
		final CharsRef spare = new CharsRef();
		BytesRef text;
		while ( ( text = termsEnum.next() ) != null ) {
			UnicodeUtil.UTF8toUTF16( text, spare );
			final String term = spare.toString();
			if ( isNoiseWord( term ) ) {
				continue;
			}
			final int freq = (int) termsEnum.totalTermFreq();
			// increment frequency
			Int cnt = termFreqMap.get( term );
			if ( cnt == null ) {
				cnt = new Int();
				termFreqMap.put( term, cnt );
				cnt.x = freq;
			}
			else {
				cnt.x += freq;
			}
		}
	}

	/**
	 * Adds term frequencies found by tokenizing text from reader into the Map words
	 *
	 * @param r a source of text to be tokenized
	 * @param termFreqMap a Map of terms and their frequencies
	 * @param fieldContext the targeted field; its name is passed to the analyzer for
	 *        per-field analysis, and {@code isIgnoreAnalyzer()} switches to a pass-through analyzer
	 */
	private void addTermFrequencies(Reader r, Map<String, Int> termFreqMap, FieldContext fieldContext)
			throws IOException {
		String fieldName = fieldContext.getField();
		Analyzer analyzer = queryContext.getQueryAnalyzer();
		if ( fieldContext.isIgnoreAnalyzer() ) {
			// essentially does the Reader to String conversion for us
			analyzer = PassThroughAnalyzer.INSTANCE;
		}
		TokenStream ts = analyzer.tokenStream( fieldName, r );
		try {
			int tokenCount = 0;
			// for every token
			CharTermAttribute termAtt = ts.addAttribute( CharTermAttribute.class );
			ts.reset();
			while ( ts.incrementToken() ) {
				String word = termAtt.toString();
				tokenCount++;
				if ( tokenCount > maxNumTokensParsed ) {
					break;
				}
				if ( isNoiseWord( word ) ) {
					continue;
				}
				// increment frequency (Int starts at 1)
				Int cnt = termFreqMap.get( word );
				if ( cnt == null ) {
					termFreqMap.put( word, new Int() );
				}
				else {
					cnt.x++;
				}
			}
			ts.end();
		}
		finally {
			IOUtils.closeWhileHandlingException( ts );
		}
	}

	/**
	 * determines if the passed term is likely to be of interest in "more like" comparisons
	 *
	 * @param term The word being considered
	 *
	 * @return true if should be ignored, false if should be used in further analysis
	 */
	private boolean isNoiseWord(String term) {
		int len = term.length();
		if ( minWordLen > 0 && len < minWordLen ) {
			return true;
		}
		if ( maxWordLen > 0 && len > maxWordLen ) {
			return true;
		}
		return stopWords != null && stopWords.contains( term );
	}

	public MoreLikeThisBuilder<T> fieldsContext(FieldsContext fieldsContext) {
		this.fieldsContext = fieldsContext;
		return this;
	}

	public MoreLikeThisBuilder<T> input(Object input) {
		this.input = input;
		return this;
	}

	public MoreLikeThisBuilder<T> queryContext(QueryBuildingContext queryContext) {
		this.queryContext = queryContext;
		return this;
	}

	// no-op kept for interface compatibility; the id is resolved from input/inputType instead
	public MoreLikeThisBuilder<T> idAsTerm(String idAsTerm) {
		return this;
	}

	public MoreLikeThisBuilder<T> inputType(ConnectedMoreLikeThisQueryBuilder.INPUT_TYPE inputType) {
		this.inputType = inputType;
		return this;
	}

	/**
	 * PriorityQueue that orders words by score.
	 * The comparison is inverted on purpose: {@code pop()} returns the HIGHEST scored
	 * entry first, which createQuery relies on to seed {@code bestScore}.
	 */
	private static class FreqQ extends PriorityQueue<Object[]> {
		FreqQ(int s) {
			super( s );
		}

		@Override
		protected boolean lessThan(Object[] aa, Object[] bb) {
			Float fa = (Float) aa[2];
			Float fb = (Float) bb[2];
			return fa > fb;
		}
	}

	/**
	 * Use for frequencies and to avoid renewing Integers.
	 */
	private static class Int {
		int x; // frequency counter, starts at 1

		Int() {
			x = 1;
		}

		@Override
		public String toString() {
			return "Int{" + x + '}';
		}
	}
}