// Source Code of it.unimi.dsi.mg4j.examples.RunQuery

package it.unimi.dsi.mg4j.examples;


/*     
* MG4J: Managing Gigabytes for Java
*
* Copyright (C) 2009-2010 Sebastiano Vigna 
*
*  This library is free software; you can redistribute it and/or modify it
*  under the terms of the GNU Lesser General Public License as published by the Free
*  Software Foundation; either version 3 of the License, or (at your option)
*  any later version.
*
*  This library is distributed in the hope that it will be useful, but
*  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
*  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
*  for more details.
*
*  You should have received a copy of the GNU Lesser General Public License
*  along with this program; if not, write to the Free Software
*  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
*/


import it.unimi.dsi.fastutil.objects.Object2ReferenceOpenHashMap;
import it.unimi.dsi.fastutil.objects.ObjectArrayList;
import it.unimi.dsi.fastutil.objects.Reference2DoubleOpenHashMap;
import it.unimi.dsi.fastutil.objects.Reference2ObjectMap;
import it.unimi.dsi.mg4j.document.HtmlDocumentFactory;
import it.unimi.dsi.mg4j.index.Index;
import it.unimi.dsi.mg4j.index.TermProcessor;
import it.unimi.dsi.mg4j.query.IntervalSelector;
import it.unimi.dsi.mg4j.query.QueryEngine;
import it.unimi.dsi.mg4j.query.SelectedInterval;
import it.unimi.dsi.mg4j.query.parser.SimpleParser;
import it.unimi.dsi.mg4j.search.DocumentIteratorBuilderVisitor;
import it.unimi.dsi.mg4j.search.score.BM25Scorer;
import it.unimi.dsi.mg4j.search.score.DocumentScoreInfo;


/** A very simple example that shows how to load a couple of indices and query them using
 * a {@linkplain QueryEngine query engine}. The first argument is the basename of an index (possibly produced
 * by an {@link HtmlDocumentFactory}) that has fields <code>title</code> and <code>text</code>.
 * The second argument is a query.
 * 
 * @author Sebastiano Vigna
 * @since 2.2
 */


public class RunQuery {


  public static void main( String arg[] ) throws Exception {
    /** First we open our indices. The booleans tell that we want random access to
     * the inverted lists, and we are going to use document sizes (for scoring--see below). */
    final Index text = Index.getInstance( arg[ 0 ] + "-text", true, true );
    final Index title = Index.getInstance( arg[ 0 ] + "-title", true, true );


    /* We need a map mapping index names to actual indices. Its keyset will be used by the
     * parser to distinguish correct index names (e.g., "text:foo title:bar"), and the mapping
     * itself will be used when transforming a query into a document iterator. We use a handy
     * fastutil array-based constructor. */
    Object2ReferenceOpenHashMap<String,Index> indexMap = 
      new Object2ReferenceOpenHashMap<String,Index>( new String[] { "text", "title" }, new Index[] { text, title } );
    
    /* We now need to map index names to term processors. This is necessary as any processing
     * applied during indexing must be applied at query time, too. */
    Object2ReferenceOpenHashMap<String, TermProcessor> termProcessors = 
      new Object2ReferenceOpenHashMap<String,TermProcessor>( new String[] { "text", "title" }, new TermProcessor[] { text.termProcessor, title.termProcessor } );
    
    /* To run a query in a simple way we need a query engine. The engine requires a parser
     * (which in turn requires the set of index names and a default index), a document iterator
     * builder, which needs the index map, a default index, and a limit on prefix query
     * expansion, and finally the index map. */
    QueryEngine engine = new QueryEngine(
      new SimpleParser( indexMap.keySet(), "text", termProcessors ),
      new DocumentIteratorBuilderVisitor( indexMap, text, 1000 ), 
      indexMap
      
    );


    /* Optionally, we can score the results. Here we use a state-of-art ranking 
     * function, BM25, which requires document sizes. */
    engine.score( new BM25Scorer() );
    
    /* Optionally, we can weight the importance of each index. To do so, we have to pass a map,
     * and again we use the handy fastutil constructor. Note that setting up a BM25F scorer
     * would give much better results, but we want to keep it simple. */
    engine.setWeights( new Reference2DoubleOpenHashMap<Index>( new Index[] { text, title }, new double[] { 1, 2 } ) );
    
    /* Optionally, we can use an interval selector to get intervals representing matches. */
    engine.intervalSelector = new IntervalSelector();
    
    /* We are ready to run our query. We just need a list to store its results. The list is made
     * of DocumentScoreInfo objects, which comprise a document id, a score, and possibly an
     * info field that is generic. Here the info field is a map from indices to arrays
     * of selected intervals. This part will be empty if we do not set an interval selector. */
    ObjectArrayList<DocumentScoreInfo<Reference2ObjectMap<Index, SelectedInterval[]>>> result = 
      new ObjectArrayList<DocumentScoreInfo<Reference2ObjectMap<Index,SelectedInterval[]>>>();


    /* The query engine can return any subsegment of the results of a query. Here we grab the first 20 results. */
    engine.process( arg[ 1 ], 0, 20, result );
    
    for( DocumentScoreInfo<Reference2ObjectMap<Index, SelectedInterval[]>> dsi : result ) {
      System.out.println( dsi.document + " " + dsi.score );
    }
  }
}
// Source Code of it.unimi.dsi.mg4j.examples.RunQuery

// Related Classes of it.unimi.dsi.mg4j.examples.RunQuery