Package it.unimi.dsi.mg4j.tool

Source Code of it.unimi.dsi.mg4j.tool.URLMPHVirtualDocumentResolver

package it.unimi.dsi.mg4j.tool;

/*    
* MG4J: Managing Gigabytes for Java
*
* Copyright (C) 2006-2010 Paolo Boldi
*
*  This library is free software; you can redistribute it and/or modify it
*  under the terms of the GNU Lesser General Public License as published by the Free
*  Software Foundation; either version 3 of the License, or (at your option)
*  any later version.
*
*  This library is distributed in the hope that it will be useful, but
*  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
*  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
*  for more details.
*
*  You should have received a copy of the GNU Lesser General Public License
*  along with this program; if not, see <http://www.gnu.org/licenses/>.
*
*/

import it.unimi.dsi.bits.TransformationStrategies;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.io.FileLinesCollection;
import it.unimi.dsi.io.LineIterator;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.logging.ProgressLogger;
import it.unimi.dsi.mg4j.document.Document;
import it.unimi.dsi.mg4j.util.MG4JClassParser;
import it.unimi.dsi.sux4j.mph.MWHCFunction;
import it.unimi.dsi.sux4j.mph.TwoStepsLcpMonotoneMinimalPerfectHashFunction;
import it.unimi.dsi.util.BloomFilter;
import it.unimi.dsi.util.ShiftAddXorSignedStringMap;
import it.unimi.dsi.util.StringMap;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Collection;

import org.apache.commons.lang.RandomStringUtils;
import org.apache.log4j.Logger;

import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.Switch;
import com.martiansoftware.jsap.UnflaggedOption;

/** A virtual-document resolver based on document URIs.
*
* <p>Instances of this class store in a {@link StringMap} instances
* all URIs from a collection, and consider a virtual-document specification a (possibly relative) URI. The
* virtual-document specification is resolved against the document URI, and then the perfect hash is used
* to retrieve the corresponding document.
*
* <p>This class provides a main method that helps in building serialised resolvers from URI lists.
* In case of pathological document collections with duplicate URIs (most notably, the GOV2 collection
* used for TREC evaluations), an option makes it possible to add random noise to duplicates, so that
* minimal perfect hash construction does not go into an infinite loop. It is a rather crude solution, but it
* is nonsensical to have duplicate URIs in the first place. Additional option include the kind of minimal perfect
* hash function you want to use (e.g., out of {@link it.unimi.dsi.sux4j}) and the number of bits used to sign them.
*/

public class URLMPHVirtualDocumentResolver implements VirtualDocumentResolver {
  private static final long serialVersionUID = 1L;

  private static final Logger LOGGER = Logger.getLogger( URLMPHVirtualDocumentResolver.class );
 
  /** The term map used by this resolver to associated URI strings to numbers. */
  private final StringMap<? extends CharSequence> url2DocumentPointer;
  /** The cached URI of the last argument to {@link #context(Document)}. */
  private transient URI documentURI;

  public URLMPHVirtualDocumentResolver( final StringMap<? extends CharSequence> url2DocumentPointer ) {
    this.url2DocumentPointer = url2DocumentPointer;
  }

  public void context( final Document document ) {
    try {
      documentURI = new URI( document.uri().toString() ).normalize();
    }
    catch ( URISyntaxException e ) {
      documentURI = null;
    }
  }

  public int resolve( final CharSequence virtualDocumentSpec ) {
    try {
      URI virtualURI = URI.create( virtualDocumentSpec.toString() ).normalize();
      if ( ! virtualURI.isAbsolute() ) {
        if ( documentURI == null ) return -1;
        virtualURI = documentURI.resolve( virtualURI );
      }

      // TODO discard opaque?
      return (int)url2DocumentPointer.getLong( virtualURI.toString() );
    } catch ( Exception e ) {
      return -1;
    }
  }


  public int numberOfDocuments() {
    return url2DocumentPointer.size();
  }

  private static void makeUnique( final BloomFilter filter, final MutableString uri ) {
    while( ! filter.add( uri ) ) {
      LOGGER.debug( "Duplicate URI " + uri );
      uri.append( '/' ).append( RandomStringUtils.randomAlphanumeric( 32 ) );
    }
  }


  public static void main( final String[] arg ) throws JSAPException, IOException {
    final SimpleJSAP jsap = new SimpleJSAP( URLMPHVirtualDocumentResolver.class.getName(), "Builds a URL document resolver from a sequence of URIs, extracted typically using ScanMetadata, using a suitable function. You can specify that the list is sorted, in which case it is possible to generate a resolver that occupies less space.",
        new Parameter[] {
          new Switch( "sorted", 's', "sorted", "URIs are sorted: use a monotone minimal perfect hash function." ),
          new Switch( "iso", 'i', "iso", "Use ISO-8859-1 coding internally (i.e., just use the lower eight bits of each character)." ),
          new FlaggedOption( "bufferSize", JSAP.INTSIZE_PARSER, "64Ki", JSAP.NOT_REQUIRED, 'b'"buffer-size", "The size of the I/O buffer used to read terms." ),
          new FlaggedOption( "class", MG4JClassParser.getParser(), JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'c', "class", "A class used to create the function from URIs to their ranks; defaults to it.unimi.dsi.sux4j.mph.MHWCFunction for non-sorted inputs, and to it.unimi.dsi.sux4j.mph.TwoStepsLcpMonotoneMinimalPerfectHashFunction for sorted inputs." ),
          new FlaggedOption( "width", JSAP.INTEGER_PARSER, Integer.toString( Long.SIZE ), JSAP.NOT_REQUIRED, 'w', "width", "The width, in bits, of the signatures used to sign the function from URIs to their rank." ),
          new FlaggedOption( "termFile", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'o', "offline", "Read terms from this file (without loading them into core memory) instead of standard input." ),
          new FlaggedOption( "uniqueUris", JSAP.INTSIZE_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'U', "unique-uris", "Force URIs to be unique by adding random garbage at the end of duplicates; the argument is an upper bound for the number of URIs that will be read, and will be used to create a Bloom filter." ),
          new UnflaggedOption( "resolver", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, "The filename for the resolver." )
    });
   
    JSAPResult jsapResult = jsap.parse( arg );
    if ( jsap.messagePrinted() ) return;
   
    final int bufferSize = jsapResult.getInt( "bufferSize" );
    final String resolverName = jsapResult.getString( "resolver" );
    //final Class<?> tableClass = jsapResult.getClass( "class" );
    final boolean iso = jsapResult.getBoolean( "iso" );
    String termFile = jsapResult.getString( "termFile" );
   
    BloomFilter filter = null;
    final boolean uniqueURIs = jsapResult.userSpecified( "uniqueUris" );
    if ( uniqueURIs ) filter = new BloomFilter( jsapResult.getInt( "uniqueUris" ) );
   
    final Collection<? extends CharSequence> collection;
    if ( termFile == null ) {
      ArrayList<MutableString> termList = new ArrayList<MutableString>();
      final ProgressLogger pl = new ProgressLogger();
      pl.itemsName = "URIs";
      final LineIterator termIterator = new LineIterator( new FastBufferedReader( new InputStreamReader( System.in, "UTF-8" ), bufferSize ), pl );
     
      pl.start( "Reading URIs..." );
      MutableString uri;
      while( termIterator.hasNext() ) {
        uri = termIterator.next();
        if ( uniqueURIs ) makeUnique( filter, uri );
        termList.add( uri.copy() );
      }
      pl.done();
     
      collection = termList;
    }
    else {
      if ( uniqueURIs ) {
        // Create temporary file with unique URIs
        final ProgressLogger pl = new ProgressLogger();
        pl.itemsName = "URIs";
        pl.start( "Copying URIs..." );
        final LineIterator termIterator = new LineIterator( new FastBufferedReader( new InputStreamReader( new FileInputStream( termFile ) ), bufferSize ), pl );
        File temp = File.createTempFile( URLMPHVirtualDocumentResolver.class.getName(), ".uniqueuris" );
        temp.deleteOnExit();
        termFile = temp.toString();
        final FastBufferedOutputStream outputStream = new FastBufferedOutputStream( new FileOutputStream( termFile ), bufferSize );
        MutableString uri;
        while( termIterator.hasNext() ) {
          uri = termIterator.next();
          makeUnique( filter, uri );
          uri.writeUTF8( outputStream );
          outputStream.write( '\n' );
        }
        pl.done();
        outputStream.close();
      }
      collection = new FileLinesCollection( termFile, "UTF-8" );
    }
    LOGGER.debug( "Building function..." );
    final int width = jsapResult.getInt( "width" );
    if ( jsapResult.getBoolean( "sorted" ) ) BinIO.storeObject( new URLMPHVirtualDocumentResolver( new ShiftAddXorSignedStringMap( collection.iterator(), new TwoStepsLcpMonotoneMinimalPerfectHashFunction<CharSequence>( collection, iso ? TransformationStrategies.prefixFreeIso() : TransformationStrategies.prefixFreeUtf16() ), width ) ), resolverName );
    else BinIO.storeObject( new URLMPHVirtualDocumentResolver( new ShiftAddXorSignedStringMap( collection.iterator(), new MWHCFunction<CharSequence>( collection, iso ? TransformationStrategies.iso() : TransformationStrategies.utf16() ), width ) ), resolverName );
    LOGGER.debug( " done." );
    }
}
TOP

Related Classes of it.unimi.dsi.mg4j.tool.URLMPHVirtualDocumentResolver

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.