package it.unimi.dsi.mg4j.search.visitor;
/*
* MG4J: Managing Gigabytes for Java
*
* Copyright (C) 2006-2010 Sebastiano Vigna
*
* This library is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by the Free
* Software Foundation; either version 3 of the License, or (at your option)
* any later version.
*
* This library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, see <http://www.gnu.org/licenses/>.
*
*/
import it.unimi.dsi.fastutil.Hash;
import it.unimi.dsi.fastutil.objects.Object2IntLinkedOpenHashMap;
import it.unimi.dsi.fastutil.objects.Object2IntMap;
import it.unimi.dsi.fastutil.objects.Reference2IntLinkedOpenHashMap;
import it.unimi.dsi.fastutil.objects.Reference2IntMap;
import it.unimi.dsi.fastutil.objects.Reference2ObjectMap;
import it.unimi.dsi.fastutil.objects.Reference2ObjectOpenHashMap;
import it.unimi.dsi.fastutil.objects.ReferenceSet;
import it.unimi.dsi.fastutil.objects.ReferenceSets;
import it.unimi.dsi.mg4j.index.Index;
import it.unimi.dsi.mg4j.index.IndexIterator;
import it.unimi.dsi.mg4j.search.DocumentIterator;
import java.io.IOException;
import org.apache.log4j.Logger;
/** A visitor collecting information about terms appearing
* in a {@link it.unimi.dsi.mg4j.search.DocumentIterator}.
*
* <P>The purpose of this visitor is that of exploring before iteration the structure
* of a {@link DocumentIterator} to count how many terms are actually used, and set up some
* preliminary access data. More precisely, we count the distinct pairs index/term
* appearing in all leaves of nonzero frequency (the latter
* condition is used to skip empty iterators). For this visitor to work, all leaves
* of nonzero frequency must return a non-<code>null</code> value on
* a call to {@link it.unimi.dsi.mg4j.index.IndexIterator#term()}.
*
* <p>During the visit, we keep track of which index/term pair have been already
* seen. Each pair is assigned an distinct <em>offset</em>—a number between
* zero and the overall number of distinct pairs—which is stored into
* each index iterator {@linkplain it.unimi.dsi.mg4j.index.IndexIterator#id() id}
* and is used afterwards to access quickly data about the pair. Note that duplicate index/term pairs
* get the same offset. The overall number of distinct pairs is returned
* by {@link #numberOfPairs()} after a visit.
*
* <p>During the visit, the indices actually appearing in some nonzero-frequency
* leaf are gathered; they are accessible as a vector returned
* by {@link #indices()}, and the map from positions in this vector to indices
* is inverted by {@link #indexMap()}. If you need to force some index to appear in {@link #indices()},
* there's a special {@link #prepare(ReferenceSet)} method.
*
* <p>The offset assigned to each pair index/term
* is returned by {@link #offset(Index, String)}. Should you need to know the terms
* associated to each index, they are returned by {@link #terms(Index)}.
*
* <p>The after a term collection, usually counters are set
* up by a visit of {@link it.unimi.dsi.mg4j.search.visitor.CounterSetupVisitor}.
*/
public class TermCollectionVisitor extends AbstractDocumentIteratorVisitor {
private final static Logger LOGGER = Logger.getLogger( TermCollectionVisitor.class );
private final static boolean DEBUG = false;
/** The map from indices to maps from terms to offsets. The map themselves are linked,
* so terms are always returned in the same order (the visit order). */
private final Reference2ObjectMap<Index,Object2IntMap<String>> index2termMap;
/** The overall number of pairs index/term. */
private int numberOfPairs;
/** The array of indices involved in this query, returned by {@link #indices()}. */
private Index[] index;
/** A map from indices to positions in {@link #index}. */
private final Reference2IntMap<Index> indexMap;
/** A map from terms (indistinctly belonging to some index) to term ids. */
private final Object2IntLinkedOpenHashMap<String> term2Id;
/** Creates a new term-collection visitor. */
public TermCollectionVisitor() {
index2termMap = new Reference2ObjectOpenHashMap<Index,Object2IntMap<String>>();
indexMap = new Reference2IntLinkedOpenHashMap<Index>( Hash.DEFAULT_INITIAL_SIZE, .5f );
term2Id = new Object2IntLinkedOpenHashMap<String>();
term2Id.defaultReturnValue( -1 );
}
@SuppressWarnings("unchecked")
public TermCollectionVisitor prepare() {
return prepare( ReferenceSets.EMPTY_SET );
}
public TermCollectionVisitor prepare( final ReferenceSet<Index> indices ) {
index = null;
index2termMap.clear();
indexMap.clear();
term2Id.clear();
numberOfPairs = 0;
// If indices is not empty, we do eager instantiation of the index set. This must be kept in sync with the lazy part.
int c = 0;
for( final Index i: indices ) {
indexMap.put( i, c++ );
final Object2IntMap<String> termMap = new Object2IntLinkedOpenHashMap<String>( Hash.DEFAULT_INITIAL_SIZE, .5f );
index2termMap.put( i, termMap );
termMap.defaultReturnValue( -1 );
}
return this;
}
public Boolean visit( final IndexIterator indexIterator ) throws IOException {
// TODO: the second condition should be checked elsewhere, maybe...
if ( indexIterator.frequency() > 0 && indexIterator.index().hasCounts) { // We skip empty iterators and indices without counts
final Index index = indexIterator.index();
final String term = indexIterator.term();
if ( term == null ) throw new NullPointerException( "This visitor needs a non-null term for each index iterator of nonzero frequency" );
if ( ! term2Id.containsKey( term ) ) term2Id.put( term, term2Id.size() );
if ( DEBUG ) LOGGER.debug( "Visiting leaf: index=" + index + ", term=" + term );
final Object2IntMap<String> termMap;
if ( ! indexMap.containsKey( index ) ) {
// This index has never been seen before
indexMap.put( index, indexMap.size() );
// Lazy instantiation of the term map. Please keep in sync with eager instantiation in prepare(ReferenceSet).
index2termMap.put( index, termMap = new Object2IntLinkedOpenHashMap<String>( Hash.DEFAULT_INITIAL_SIZE, .5f ) );
termMap.defaultReturnValue( -1 );
}
else termMap = index2termMap.get( index );
int offset = termMap.getInt( term );
if ( offset == -1 ) termMap.put( term, offset = numberOfPairs++ ); // Unknown index/term pair
indexIterator.id( offset );
if ( DEBUG ) LOGGER.debug( "Offset for index iterator " + indexIterator + ": " + offset );
}
return Boolean.TRUE;
}
/** Returns the number of distinct index/term pair corresponding to
* nonzero-frequency index iterators in the last visit.
*
* @return the number distinct index/term pair corresponding to
* nonzero-frequency index iterators.
*/
public int numberOfPairs() {
return numberOfPairs;
}
/** Returns the indices met during pair collection.
*
* <p>Note that the returned array does not include indices only associated
* to index iterators of zero frequency, unless {@link #prepare(ReferenceSet)} was
* called with a nonempty argument.
*
* @return the indices met during term collection.
*/
public Index[] indices() {
if ( index == null ) index = indexMap.keySet().toArray( new Index[ index2termMap.size() ] );
return index;
}
/** Returns a map from indices met during term collection to their position
* into {@link #indices()}.
*
* <p>Note that the returned map does not include as keys indices only associated
* to index iterators of zero frequency, unless {@link #prepare(ReferenceSet)} was
* called with a nonempty argument.
*
* @return a map from indices met during term collection to their position
* into {@link #indices()}.
*/
public Reference2IntMap<Index> indexMap() {
return indexMap;
}
/** Returns the terms associated to the given index.
*
* @param index an index.
* @return the terms associated to <code>index</code>, in the same order in which
* they appeared during the visit, skipping duplicates, if some nonzero-frequency iterator
* based on <code>index</code> was found; <code>null</code> otherwise.
*/
public String[] terms( final Index index ) {
final Object2IntMap<String> termMap = index2termMap.get( index );
return termMap == null ? null : termMap.keySet().toArray( new String[ termMap.size() ] );
}
/** Returns the a map associating terms appearing in the query with ids.
*
* @return a map from terms appearing in the query (in indices with counts) to ids.
*/
public Object2IntLinkedOpenHashMap<String> term2Id() {
return term2Id;
}
/** Returns the offset associated to a given pair index/term.
*
* @param index an index appearing in {@link #indices()}.
* @param term a term appearing in the array returned by {@link #terms(Index)} with argument <code>index</code>.
* @return the offset associated to the pair <code>index</code>/<code>term</code>.
*/
public int offset( final Index index, final String term ) {
return index2termMap.get( index ).getInt( term );
}
public String toString() {
return "[Leaves: " + numberOfPairs + "; " + index2termMap + "]";
}
}