Package it.unimi.dsi.lang

Examples of it.unimi.dsi.lang.MutableString


    default: throw new IllegalArgumentException();
    }
  }
 
  public MarkingMutableString appendAndMark( final String s ) {
    return appendAndMark( new MutableString( s ) );
  }
View Full Code Here


  }
 
  public MarkingMutableString appendAndMark( final WordReader wordReader ) {
    //System.err.println( interval[ currInterval ] + "|" + new String( array, offset, length ) );
   
    MutableString word = new MutableString(), nonWord = new MutableString();
    try {
      while( wordReader.next( word, nonWord ) ) {
        if ( word.length() != 0 ) count++;
       
        if ( resume ) {
          while( currResumeInterval < interval.length && interval[ currResumeInterval ].interval.compareTo( count, leftRadius( currResumeInterval), rightRadius( currResumeInterval ) ) > 0 ) currResumeInterval++;
          if ( currResumeInterval == interval.length || ! interval[ currResumeInterval ].interval.contains( count, leftRadius( currResumeInterval), rightRadius( currResumeInterval ) ) ) {
            if ( ! skipping && oneCharOut ) append( marker.endOfBlock() );
            // There's nothing else we can do...
            if ( resume && currResumeInterval == interval.length ) return this;
            // Otherwise, we continue, but skipping.
            skipping = true;
            continue;
          }
         
          if ( skipping ) append( marker.startOfBlock() );
          skipping = false;
        }
       
        if ( word.length() !=0 ) {
          if ( ! marking && currMarkingInterval < interval.length && interval[ currMarkingInterval ].interval.contains( count ) ) {
            append( marker.startOfMark() );
            marking = true;
          }
         
          append( word );

          if ( marking && ( currMarkingInterval == interval.length || ! interval[ currMarkingInterval ].interval.contains( count + 1 ) ) ) {
            append( marker.endOfMark() );
            marking = false;
          }

          oneCharOut = true;
          if ( currMarkingInterval < interval.length && interval[ currMarkingInterval ].interval.compareTo( count + 1 ) > 0 ) currMarkingInterval++;
        }
       
        if ( nonWord.length() > 0 ) {
          oneCharOut = true;
          nonWord.squeezeWhitespace();
          append( escapeStrategy.escape( nonWord ) );
        }
      }
     
      if ( marking ) append( marker.endOfMark() );
View Full Code Here

  public Document document( int index ) throws IOException {
    ensureDocumentIndex( index );
    ensureFiles();
    documentsInputBitStream.position( docOffsets.getLong( index ) );
    final DataInputStream nonTextDataInputStream = hasNonText ? new DataInputStream( new FastBufferedInputStream( zipFile.getInputStream( zipFile.getEntry( Integer.toString( index ) ) ) ) ) : null;
    final MutableString uri = readSelfDelimitedUtf8String( documentsInputBitStream, new MutableString() );
    final MutableString title = readSelfDelimitedUtf8String( documentsInputBitStream, new MutableString() );

    return new AbstractDocument() {
      final MutableString fieldContent = new MutableString();
     
      @SuppressWarnings("unchecked")
      final Document fakeDocument = factory.getDocument( NullInputStream.getInstance(), Reference2ObjectMaps.EMPTY_MAP );
     
      int nextField = 0;

      public Object content( int field ) throws IOException {
        FieldType fieldType = factory.fieldType( field );

        if ( nextField > field ) throw new IllegalStateException();
        // Skip fields
        final MutableString s = new MutableString();
        int len;
        while( nextField < field ) {
          switch( fieldType ) {
          case TEXT:
            len = documentsInputBitStream.readDelta();
            if ( exact ) len *= 2;
            documentsInputBitStream.skipDeltas( len );
            break;
          case VIRTUAL:
            final int nfrag = nonTextDataInputStream.readInt();
            for ( int i = 0; i < 2 * nfrag; i++ ) MutableString.skipSelfDelimUTF8( nonTextDataInputStream );
            break;
          default:
            try { new ObjectInputStream( nonTextDataInputStream ).readObject(); } catch ( ClassNotFoundException e ) { throw new RuntimeException( e ); }
          }
          nextField++;
        }
       
        // Read field
        nextField++;

        switch( fieldType ) {
        case TEXT:
          len = documentsInputBitStream.readDelta();
          fieldContent.length( 0 );

          termsFrequencyKeeper.reset();
          if ( exact ) nonTermsFrequencyKeeper.reset();

          while( len-- != 0 ) {
            termsInputStream.position( termOffsets.getLong( termsFrequencyKeeper.decode( documentsInputBitStream.readDelta() ) ) );
            s.readSelfDelimUTF8( termsInputStream );
            fieldContent.append( s );
            if ( exact ) {
              nonTermsInputStream.position( nonTermOffsets.getLong( nonTermsFrequencyKeeper.decode( documentsInputBitStream.readDelta() ) ) );
              s.readSelfDelimUTF8( nonTermsInputStream );
              fieldContent.append( s );
            }
            else fieldContent.append( ' ');
          }
          return new FastBufferedReader( fieldContent );
        case VIRTUAL:
          final int nfrag = nonTextDataInputStream.readInt();
          MutableString doc = new MutableString();
          MutableString text = new MutableString();
          VirtualDocumentFragment[] fragArray = new VirtualDocumentFragment[ nfrag ];
          for ( int i = 0; i < nfrag; i++ ) {
            doc.readSelfDelimUTF8( (InputStream)nonTextDataInputStream );
            text.readSelfDelimUTF8( (InputStream)nonTextDataInputStream );
            fragArray[ i ] = new AnchorExtractor.Anchor( doc.copy(), text.copy() );
          }
          return new ObjectArrayList<VirtualDocumentFragment>( fragArray );

        default:
          try { return new ObjectInputStream( nonTextDataInputStream ).readObject(); } catch ( ClassNotFoundException e ) { throw new RuntimeException( e ); }
View Full Code Here

    FastBufferedOutputStream uriStream = null, titleStream = null;
   
    if ( jsapResult.userSpecified( "uris" ) ) uriStream = new FastBufferedOutputStream( new FileOutputStream( jsapResult.getString( "uris" ) ) );
    if ( jsapResult.userSpecified( "titles" ) ) titleStream = new FastBufferedOutputStream( new FileOutputStream( jsapResult.getString( "titles" ) ) );
   
    MutableString s = new MutableString();

    ProgressLogger progressLogger = new ProgressLogger( LOGGER, jsapResult.getLong( "logInterval" ), "documents" );
    if ( documentSequence instanceof DocumentCollection ) progressLogger.expectedUpdates = ((DocumentCollection)documentSequence).size();
    progressLogger.start( "Scanning..." );
   
    while( ( document = documentIterator.nextDocument() ) != null ) {
      if ( uriStream != null ) {
        s.replace( document.uri() );
        s.replace( LINE_TERMINATORS, SPACES );
        s.writeUTF8( uriStream );
        uriStream.write( '\n' );
      }
      if ( titleStream != null ) {
        s.replace( document.title() );
        s.replace( LINE_TERMINATORS, SPACES );
        s.writeUTF8( titleStream );
        titleStream.write( '\n' );
      }
      progressLogger.lightUpdate();
    }
   
View Full Code Here

    final long[] termFrequency = new long[ (int)collection.terms ];
    final long[] nonTermFrequency = collection.exact ? new long[ (int)collection.nonTerms ] : null;
    final InputBitStream documentsIbs = collection.documentsInputBitStream;
    final DocumentFactory factory = collection.factory;
    final boolean exact = collection.exact;
    final MutableString s = new MutableString();
    documentsIbs.position( 0 );
    for( int i = (int)collection.documents; i-- != 0; ) {
      readSelfDelimitedUtf8String( documentsIbs, s ); // Skip URI
      readSelfDelimitedUtf8String( documentsIbs, s ); // Skip title
      for( int f = factory.numberOfFields() - 1; f-- !=0; ) {
        int len = documentsIbs.readDelta();
        while( len-- != 0 ) {
          termFrequency[ documentsIbs.readDelta() ]++;
          if ( exact ) nonTermFrequency[ documentsIbs.readDelta() ]++;
        }
      }
    }
   
    int[] termPerm = new int[ termFrequency.length ];
    for( int i = termPerm.length; i-- != 0; ) termPerm[ i ] = i;
    IntArrays.quickSort( termPerm, 0, termPerm.length, new AbstractIntComparator() {
      public int compare( int arg0, int arg1 ) {
        return termFrequency[ arg1 ] - termFrequency[ arg0 ] < 0 ? -1 : termFrequency[ arg1 ] == termFrequency[ arg0 ] ? 0 : 1;
      }
    });
   
    int[] invTermPerm = new int[ termFrequency.length ];
    for( int i = invTermPerm.length; i-- != 0; ) invTermPerm[ termPerm[ i ] ] = i;
   
    int[] nonTermPerm = null, invNonTermPerm = null;
    if ( exact ) {
      nonTermPerm = new int[ termFrequency.length ];
      for( int i = nonTermPerm.length; i-- != 0; ) nonTermPerm[ i ] = i;
      IntArrays.quickSort( nonTermPerm, 0, nonTermPerm.length, new AbstractIntComparator() {
        public int compare( int arg0, int arg1 ) {
          return termFrequency[ arg1 ] - termFrequency[ arg0 ] < 0 ? -1 : termFrequency[ arg1 ] == termFrequency[ arg0 ] ? 0 : 1;
        }
      });
      invNonTermPerm = new int[ nonTermFrequency.length ];
      for( int i = invNonTermPerm.length; i-- != 0; ) invNonTermPerm[ nonTermPerm[ i ] ] = i;
    }

    File newDocumentsFile = File.createTempFile( SimpleCompressedDocumentCollection.class.getSimpleName(), "temp", new File( basename.toString() ).getParentFile() );
    OutputBitStream newDocumentsObs = new OutputBitStream( newDocumentsFile );
    documentsIbs.position( 0 );
    for( int i = (int)collection.documents; i-- != 0; ) {
      readSelfDelimitedUtf8String( documentsIbs, s ); // Skip URI
      SimpleCompressedDocumentCollectionBuilder.writeSelfDelimitedUtf8String( newDocumentsObs, s );
      readSelfDelimitedUtf8String( documentsIbs, s ); // Skip title
      SimpleCompressedDocumentCollectionBuilder.writeSelfDelimitedUtf8String( newDocumentsObs, s );
      for( int f = factory.numberOfFields() - 1; f-- !=0; ) {
        int len = documentsIbs.readDelta();
        newDocumentsObs.writeDelta( len );
        while( len-- != 0 ) {
          newDocumentsObs.writeDelta( invTermPerm[ documentsIbs.readDelta() ] );
          if ( exact ) newDocumentsObs.writeDelta( invNonTermPerm[ documentsIbs.readDelta() ] );
        }
      }
    }
    newDocumentsObs.close();
    new File( basename + DOCUMENTS_EXTENSION ).delete();
    newDocumentsFile.renameTo( new File( basename + DOCUMENTS_EXTENSION ) );
    newDocumentsObs = null;
    invTermPerm = invNonTermPerm = null;
   
    FastBufferedInputStream termsStream = new FastBufferedInputStream( new FileInputStream( basename + TERMS_EXTENSION ) ) ;
    MutableString term[] = new MutableString[ (int)collection.terms ];
    for( int i = 0; i < term.length; i++ ) term[ i ] = new MutableString().readSelfDelimUTF8( termsStream );
    termsStream.close();

    new FastBufferedOutputStream( new FileOutputStream( basename + TERMS_EXTENSION ) );
  }
View Full Code Here

   * @param additionalWhere an additional condition for the <samp>WHERE</samp> clause.
   * @return a complete query based on instance data and <code>additionalWhere</code>,
   */
 
  private String buildQuery( final String additionalWhere ) {
    final MutableString query = new MutableString();
    query.append( "SELECT " ).append( select );
    if ( where == null && additionalWhere != null ) query.append( " WHERE (" ).append( additionalWhere ).append( ")" );
    if ( where != null && additionalWhere == null ) query.append( " WHERE (" ).append( where ).append( ")" );
    if ( where != null && additionalWhere != null ) query.append( " WHERE (" ).append( where ).append( ") AND (" ).append( additionalWhere ).append( ")" );
    query.append( " ORDER BY 1" );
    return query.toString();
  }
View Full Code Here

  public int size() {
    return doc2id.length;
  }

  /**
   * Builds the document at the given index by passing the results of
   * <code>stream( index, title )</code> and <code>metadata( index, title )</code>
   * to <code>factory.getDocument()</code>.
   *
   * @param index the index of the requested document.
   * @return the document produced by the factory.
   * @throws IOException declared by this method; which of the calls raises it is not visible here.
   */
  public Document document( final int index ) throws IOException {
    // Fresh buffer shared by both helper calls below.
    final MutableString title = new MutableString();
    // Java evaluates arguments left to right, so stream() runs before metadata();
    // presumably stream() fills in title and metadata() then reads it -- TODO confirm.
    return factory.getDocument( stream( index, title ), metadata( index, title ) );
  }
View Full Code Here

    if ( cutPoint[ 0 ] != 0 ) throw new IllegalArgumentException( "The first cutpoint must be 0" );
    this.cutPoint = cutPoint;
    // Defensive copy
    this.k = cutPoint.length - 1;
    this.cutPointTerm = new MutableString[ k + 1 ];
    for( int i = 0; i < k; i++ ) this.cutPointTerm[ i ] = new MutableString( cutPointTerm[ i ] );
    this.cutPointTerm[ k ] = new MutableString( "\uFFFF" );
  }
View Full Code Here

    final InputBitStream globCounts = new InputBitStream( basename + DiskBasedIndex.GLOBCOUNTS_EXTENSION );
    long gc[] = new long[ numberOfTerms ];
    for( int t = 0; t < numberOfTerms; t++ ) gc[ t ] = globCounts.readLongGamma();
    globCounts.close();

    final MutableString line = new MutableString();
    MutableString number;
    final FastBufferedReader reader = new FastBufferedReader( new FileReader( statFile ) );
   
    boolean dumping = false;
    int f, q;
    reader.readLine( line );
    while( reader.readLine( line ) != null ) {
      if ( line.charAt( 0 ) == '#' ) {
        number = line.substring( 2 );
        f = Integer.parseInt( number.delete( number.indexOf( ' ' ), number.length() ).toString() );
        double freq = (double)gc[ f ] / numberOfoccurrences;
        if ( print ) System.out.println( line + " " + format( freq ) );
        else {
          if ( quantumBitLength != 0 ) {
            // We choose using the quantum bit length
            number = line.substring( 2 );
            number = number.substring( number.indexOf( ' ' ) + 1 );
            q = Integer.parseInt( number.delete( number.indexOf( ' ' ), number.length() ).toString() );
            dumping = q >= lowQbl && q <= highQbl;
          }
          else dumping = freq >= lowGlobFreq && freq <= highGlobFreq;
        }
        if ( dumping ) line.println( System.out );
View Full Code Here

      final ProgressLogger pl = new ProgressLogger();
      pl.itemsName = "URIs";
      final LineIterator termIterator = new LineIterator( new FastBufferedReader( new InputStreamReader( System.in, "UTF-8" ), bufferSize ), pl );
     
      pl.start( "Reading URIs..." );
      MutableString uri;
      while( termIterator.hasNext() ) {
        uri = termIterator.next();
        if ( uniqueURIs ) makeUnique( filter, uri );
        termList.add( uri.copy() );
      }
      pl.done();
     
      collection = termList;
    }
    else {
      if ( uniqueURIs ) {
        // Create temporary file with unique URIs
        final ProgressLogger pl = new ProgressLogger();
        pl.itemsName = "URIs";
        pl.start( "Copying URIs..." );
        final LineIterator termIterator = new LineIterator( new FastBufferedReader( new InputStreamReader( new FileInputStream( termFile ) ), bufferSize ), pl );
        File temp = File.createTempFile( URLMPHVirtualDocumentResolver.class.getName(), ".uniqueuris" );
        temp.deleteOnExit();
        termFile = temp.toString();
        final FastBufferedOutputStream outputStream = new FastBufferedOutputStream( new FileOutputStream( termFile ), bufferSize );
        MutableString uri;
        while( termIterator.hasNext() ) {
          uri = termIterator.next();
          makeUnique( filter, uri );
          uri.writeUTF8( outputStream );
          outputStream.write( '\n' );
        }
        pl.done();
        outputStream.close();
      }
View Full Code Here

TOP

Related Classes of it.unimi.dsi.lang.MutableString

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.