Package it.unimi.dsi.io

Examples of it.unimi.dsi.io.FastBufferedReader


      final File sizesFile = new File( this.inputBasename[ i ] + DiskBasedIndex.SIZES_EXTENSION );
      writeSizes &= sizesFile.exists();
      someSizes |= sizesFile.exists();

      term[ i ] = new MutableString();
      termReader[ i ] = new FastBufferedReader( new InputStreamReader( new FileInputStream( this.inputBasename[ i ] + DiskBasedIndex.TERMS_EXTENSION ), "UTF-8" ) );
      if ( termReader[ i ].readLine( term[ i ] ) != null ) termQueue.enqueue( i ); // If the term list is nonempty, we enqueue it
    }

    if ( writeGlobCounts != someGlobCounts ) LOGGER.warn"Some (but not all) global-counts file missing" );
    if ( writeSizes != someSizes ) LOGGER.warn"Some (but not all) sizes file missing" );
View Full Code Here


    return super.parseProperty( key, values, metadata );
  }

  public PdfDocumentFactory() throws IOException {
    this.textStripper= new PDFTextStripper();
    this.wordReader = new FastBufferedReader();
  }
View Full Code Here

  }
 
  public PdfDocumentFactory( final Properties properties ) throws IOException, ConfigurationException {
    super( properties );
    this.textStripper= new PDFTextStripper();
    this.wordReader = new FastBufferedReader();
  }
View Full Code Here

  }

  public PdfDocumentFactory( final Reference2ObjectMap<Enum<?>,Object> defaultMetadata ) throws IOException {
    super( defaultMetadata );
    this.textStripper= new PDFTextStripper();
    this.wordReader = new FastBufferedReader();
  }
View Full Code Here

  }

  public PdfDocumentFactory( final String[] property ) throws IOException, ConfigurationException {
    super( property );
    this.textStripper= new PDFTextStripper();
    this.wordReader = new FastBufferedReader();
  }
View Full Code Here

    parser.setCallback( composedBuilder.compose() );

    Object o;
    try {
      o = defaultMetadata.get( PropertyBasedDocumentFactory.MetadataKeys.WORDREADER );
      wordReader = o == null ? new FastBufferedReader() : ObjectParser.fromSpec( o.toString(), WordReader.class, MG4JClassParser.PACKAGE );
    }
    catch ( Exception e ) {
      throw new RuntimeException( e );
    }
    text = new char[ DEFAULT_BUFFER_SIZE ];
View Full Code Here

    public Object content( final int field ) throws IOException {
      ensureFieldIndex( field );
      ensureParsed();
      switch( field ) {
        case 0: return new FastBufferedReader( textExtractor.text );
        case 1: return new FastBufferedReader( textExtractor.title );
        case 2: return anchorExtractor.anchors;
        default: throw new IllegalArgumentException();
      }
    }
View Full Code Here

  public void runTermsOnly() throws IOException {
    final ProgressLogger pl = new ProgressLogger( LOGGER, logInterval );
   
    final PrintWriter[] localTerms = new PrintWriter[ numIndices ];
    final int numTerms[] = new int[ numIndices ];
    final FastBufferedReader terms = new FastBufferedReader( new InputStreamReader( new FileInputStream( inputBasename + DiskBasedIndex.TERMS_EXTENSION ), "UTF-8" ) );
   
    for( int i = 0; i < numIndices; i++ ) localTerms[ i ] = new PrintWriter( new OutputStreamWriter( new FastBufferedOutputStream( new FileOutputStream( localBasename[ i ] + DiskBasedIndex.TERMS_EXTENSION ) ), "UTF-8" ) );

    // The current term
    final MutableString currTerm = new MutableString();
   
    pl.itemsName = "terms";
    pl.logInterval = logInterval;
    pl.start( "Partitioning index terms..." );

    int termNumber = 0, k;
   
    while( terms.readLine( currTerm ) != null ) {
      k = strategy.localIndex( termNumber ); // The local index for this term
      if ( numTerms[ k ] != strategy.localNumber( termNumber ) ) throw new IllegalStateException();
      numTerms[ k ]++;
      currTerm.println( localTerms[ k ] );
      pl.update();
      termNumber++;
    }

    terms.close();
    for( int i = 0; i < numIndices; i++ ) localTerms[ i ].close();

    pl.done();
  }
View Full Code Here

    final boolean isHighPerformance = BitStreamHPIndex.class.isAssignableFrom( Class.forName( new Properties( inputBasename + DiskBasedIndex.PROPERTIES_EXTENSION ).getString( Index.PropertyKeys.INDEXCLASS ) ) );
   
    final InputBitStream globalIndex = new InputBitStream( inputBasename + DiskBasedIndex.INDEX_EXTENSION, bufferSize );
    final long globalPositionsLength = new File( inputBasename + DiskBasedIndex.POSITIONS_EXTENSION ).length();
    final InputBitStream globalPositions = isHighPerformance ? new InputBitStream( inputBasename + DiskBasedIndex.POSITIONS_EXTENSION, bufferSize ) : null;
    final FastBufferedReader terms = new FastBufferedReader( new InputStreamReader( new FileInputStream( inputBasename + DiskBasedIndex.TERMS_EXTENSION ), "UTF-8" ) );
    final InputBitStream offsets = new InputBitStream( inputBasename + DiskBasedIndex.OFFSETS_EXTENSION );
   
    final File posNumBitsFile = new File( inputBasename + DiskBasedIndex.POSITIONS_NUMBER_OF_BITS_EXTENSION );
    final InputBitStream posNumBits = posNumBitsFile.exists() ? new InputBitStream( inputBasename + DiskBasedIndex.POSITIONS_NUMBER_OF_BITS_EXTENSION ) : null;
    final InputBitStream frequencies = new InputBitStream( inputBasename + DiskBasedIndex.FREQUENCIES_EXTENSION );
    final InputBitStream globCounts = new InputBitStream( inputBasename + DiskBasedIndex.GLOBCOUNTS_EXTENSION );
    offsets.readGamma();
   
    for( int i = 0; i < numIndices; i++ ) {
      localIndexStream[ i ] = new OutputBitStream( localBasename[ i ] + DiskBasedIndex.INDEX_EXTENSION, bufferSize );
      if ( isHighPerformance ) localPositionsStream[ i ] = new OutputBitStream( localBasename[ i ] + DiskBasedIndex.POSITIONS_EXTENSION, bufferSize );
      localFrequencies[ i ] = new OutputBitStream( localBasename[ i ] + DiskBasedIndex.FREQUENCIES_EXTENSION );
      localGlobCounts[ i ] = new OutputBitStream( localBasename[ i ] + DiskBasedIndex.GLOBCOUNTS_EXTENSION );
      localTerms[ i ] = new PrintWriter( new OutputStreamWriter( new FastBufferedOutputStream( new FileOutputStream( localBasename[ i ] + DiskBasedIndex.TERMS_EXTENSION ) ), "UTF-8" ) );
      localOffsets[ i ] = new OutputBitStream( localBasename[ i ] + DiskBasedIndex.OFFSETS_EXTENSION );
      if ( posNumBits != null ) localPosNumBits[ i ] = new OutputBitStream( localBasename[ i ] + DiskBasedIndex.POSITIONS_NUMBER_OF_BITS_EXTENSION );
      localOffsets[ i ].writeGamma( 0 );
    }

    // The current term
    final MutableString currTerm = new MutableString();
   
    pl.expectedUpdates = ( new File( inputBasename + DiskBasedIndex.INDEX_EXTENSION ).length() + ( isHighPerformance ? new File( inputBasename + DiskBasedIndex.POSITIONS_EXTENSION ).length() : 0 ) ) * 8;
    pl.itemsName = "bits";
    pl.logInterval = logInterval;
    pl.start( "Partitioning index..." );

    int termNumber = 0, k, prevK = -1, previousHeaderLength = 0, newHeaderLength = 0;
    long length, count, positionsOffset = 0;
    int res, frequency;
   
    while( terms.readLine( currTerm ) != null ) {
      k = strategy.localIndex( termNumber ); // The local index for this term
      if ( numTerms[ k ] != strategy.localNumber( termNumber ) ) throw new IllegalStateException();
      numTerms[ k ]++;
     
      if ( isHighPerformance ) {
        final long temp = globalIndex.readBits();
        positionsOffset = globalIndex.readLongDelta();
        previousHeaderLength = (int)( globalIndex.readBits() - temp );
        if ( prevK != -1 ) {
          length = positionsOffset - globalPositions.readBits();
          pl.count += length;
          while( length > 0 ) {
            res = (int)Math.min( bufferSize * 8, length );
            globalPositions.read( buffer, res );
            localPositionsStream[ prevK ].write( buffer, res );
            length -= res;
          }
        }
        newHeaderLength = localIndexStream[ k ].writeLongDelta( localPositionsStream[ k ].writtenBits() );
      }
     
     
      frequency = frequencies.readGamma();
      localFrequencies[ k ].writeGamma( frequency );
      numberOfPostings[ k ] += frequency;

      if ( posNumBits != null ) localPosNumBits[ k ].writeGamma( posNumBits.readGamma() );
     
      count = globCounts.readLongGamma();
      numberOfOccurrences[ k ] += count;
      localGlobCounts[ k ].writeLongGamma( count );
     
      currTerm.println( localTerms[ k ] );
     
      length = offsets.readLongGamma() - previousHeaderLength;
      localOffsets[ k ].writeLongGamma( length + newHeaderLength );
      pl.count += length + previousHeaderLength - 1;
     
      while( length > 0 ) {
        res = (int)Math.min( bufferSize * 8, length );
        globalIndex.read( buffer, res );
        localIndexStream[ k ].write( buffer, res );
        length -= res;
      }
     
      pl.update();
      prevK = k;
      termNumber++;
    }

    // We pour the last piece of positions
    if ( isHighPerformance ) {
      if ( prevK != -1 ) {
        length = globalPositionsLength * 8 - globalPositions.readBits();
        System.err.println( globalPositionsLength * 8 - globalPositions.readBits() );
        while( length > 0 ) {
          res = (int)Math.min( bufferSize * 8, length );
          globalPositions.read( buffer, res );
          localPositionsStream[ prevK ].write( buffer, res );
          length -= res;
        }
      }
    }

    pl.done();

    terms.close();
    offsets.close();
    frequencies.close();
    globCounts.close();
    globalIndex.close();
    if ( posNumBits != null ) posNumBits.close();
View Full Code Here

                    do {
                      text.append( word.readSelfDelimUTF8( rawContent ) );
                      if ( exact ) text.append( nonWord.readSelfDelimUTF8( rawContent ) );
                      else text.append( ' ' );
                    } while ( word.length() > 0 || ( exact && nonWord.length() > 0 ) );
                    fbr = new FastBufferedReader( text );
                    nextFieldToRead++;
                  }
                  return fbr.read( cbuf, off, len );
                }
              };
            }
          } catch ( IOException e ) {
            throw new RuntimeException( e );
          } catch (ClassNotFoundException e) {
            throw new RuntimeException( e );
          }
          return result;
        }

        public WordReader wordReader( final int field )  {
          ensureFieldIndex( field );
          if ( DEBUG ) LOGGER.debug( "Called wordReader(" + field + ")" );
          try {
            skipToField( field );
          } catch ( Exception e ) {
            throw new RuntimeException( e );
          }
          //logger.debug( "Asked for a new word reader for field " + fieldName( field ) );
          switch ( fieldType( field ) ) {
          case TEXT:
            return new WordReader() {
              private static final long serialVersionUID = 1L;
              public boolean next( final MutableString word, final MutableString nonWord ) throws IOException {
                try {
                  word.readSelfDelimUTF8( rawContent );
                }
                catch( EOFException e ) {
                  return false; // TODO: a bit raw
                }
                nonWord.length( 0 );
               
                if ( exact ) {
                  try {
                    nonWord.readSelfDelimUTF8( rawContent );
                  }
                  catch( EOFException e ) {
                    return true; // TODO: a bit raw
                  }
                }
                else nonWord.append( ' ' );

                final boolean goOn = word.length() != 0 || ( exact && nonWord.length() != 0 );
                if ( DEBUG ) LOGGER.debug( "Got word <" + word + "|" + nonWord + "> exact=" + exact + " returning " + goOn );
                if ( ! goOn ) nextFieldToRead++;
                return goOn;
              }
              public WordReader setReader( final Reader reader ) {
                return this;
              }
              public WordReader copy() {
                throw new UnsupportedOperationException();
              }
            };
          case VIRTUAL:
            return new FastBufferedReader();
          default:
            return null;
          }

        }
View Full Code Here

TOP

Related Classes of it.unimi.dsi.io.FastBufferedReader

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.