Examples of it.unimi.dsi.lang.MutableString

it.unimi.dsi.lang.MutableString

        it.unimi.dsi.mg4j.document.DocumentIterator documentIterator = documentSequence.iterator();
        Document document;
        Reader reader;
        WordReader wordReader;
        
        final MutableString word = new MutableString(), nonWord = new MutableString();
        
        int docCounter = 0;
        
        while( ( document = documentIterator.nextDocument() ) != null ) {
          currDoc = permutation != null ? permutation[ docCounter ] : docCounter;


          for( i = 0; i < index.length; i++ ) {
            Object content = document.content( stem || index[ i ].field == null ? indexedField[ i ] : factory.fieldIndex( index[ i ].field ) );
            if ( index[ i ].hasPayloads ) {
              // TODO: write tests for the other case
              if ( allBitStreamIndices ) {
                IndexIterator indexIterator = indexReader[ i ].documents( 0 );
                int pointer = indexIterator.skipTo( currDoc );
                if ( pointer == currDoc ) {
                  Payload payload = indexIterator.payload();
                  if ( ! payload.get().equals( content ) ) LOGGER.error( index[ i ] + ": Document " + currDoc + " has payload " + content + " but the index says " + payload );  
                }
                else LOGGER.error( index[ i ] + ": Document " + currDoc + " does not appear in the inverted list of term " + t );
              }
              else {
                IndexIterator indexIterator = indexReader[ i ].documents(  0  );
                if ( indexIterator.skipTo( currDoc ) == currDoc ) {
                  if ( ! indexIterator.payload().get().equals( content ) )
                    LOGGER.error( index[ i ] + ": Document " + currDoc + " has payload " + content + " but the index says " + indexIterator.payload().get() );
                } 
                else LOGGER.error( index[ i ] + ": Document " + currDoc + " does not appear in the inverted list of term " + t );
              }
            }
            else {
              // text index
              pos = 0;
              termsInDoc[ i ].clear();
              reader = (Reader)content;
              wordReader = document.wordReader( stem || index[ i ].field == null ? indexedField[ i ] : factory.fieldIndex( index[ i ].field ) );
              wordReader.setReader( reader );
              while( wordReader.next( word, nonWord ) ) {
                if ( word.length() == 0 || index[ i ].termProcessor != null && ! index[ i ].termProcessor.processTerm( word ) ) continue;
                if ( ( t = (int)( (BitStreamIndex)index[ i ] ).termMap.getLong( word ) ) == -1 ) LOGGER.error( index[ i ] + ": Could not find term " + word + " in term index" );
                else {
                  if ( index[ i ].hasCounts ) termsInDoc[ i ].put( t, termsInDoc[ i ].get( t ) + 1 );
                  if ( index[ i ].hasPositions ) wordInPos[ i ][ pos++ ] = t;
                }
              }


              if ( allBitStreamIndices ) {
                for( IntIterator x = termsInDoc[ i ].keySet().iterator(); x.hasNext(); ) {
                  t = x.nextInt();


                  IndexIterator indexIterator = indexReader[ i ].documents( t );


                  int pointer = indexIterator.skipTo( currDoc );
                  if ( pointer == currDoc ) {
                    if ( index[ i ].hasCounts ) {
                      int c = indexIterator.count();
                      if ( termsInDoc[ i ].get( t ) !=  c ) 
                        LOGGER.error( index[ i ] + ": The count for term " + t + " in document " + currDoc + " is " + c + " instead of " + termsInDoc[ i ].get( t ) );
                      else {
                        if ( index[ i ].hasPositions ) {
                          indexIterator.positions( occ[ i ] );


                          for( int j = 0; j < c; j++ ) 
                            if ( wordInPos[ i ][ occ[ i ][ j ] ] != t )  
                              LOGGER.error( index[ i ] + ": The occurrence of index " + i + " of term " + t + " (position " + occ[ i ] +") in document " + currDoc + " is occupied instead by term " + wordInPos[ i ][ occ[ i ][ j ] ] );
                        }
                      }
                    } 
                  }
                  else LOGGER.error( index[ i ] + ": Document " + currDoc + " does not appear in the inverted list of term " + t + "(skipTo returned " + pointer + ")" );
                }
              }
              else {
                for( IntIterator x = termsInDoc[ i ].keySet().iterator(); x.hasNext(); ) {
                  t = x.nextInt();
                  IndexIterator indexIterator = termLists ? indexReader[ i ].documents( terms[ i ].get( t ) ) : indexReader[ i ].documents( t );


                  if ( indexIterator.skipTo( currDoc ) == currDoc ) {
                    if ( index[ i ].hasCounts ) {
                      int c = indexIterator.count();
                      if ( termsInDoc[ i ].get( t ) !=  c ) 
                        LOGGER.error( index[ i ] + ": The count for term " + t + " in document " + currDoc + " is " + c + " instead of " + termsInDoc[ i ].get( t ) );
                      else {
                        if ( index[ i ].hasPositions ) {
                          indexIterator.positions( occ[ i ] );


                          for( int j = 0; j < c; j++ ) 
                            if ( wordInPos[ i ][ occ[ i ][ j ] ] != t )  
                              LOGGER.error( index[ i ] + ": The occurrence of index " + i + " of term " + t + " (position " + occ[ i ] +") in document " + currDoc + " is occupied instead by term " + wordInPos[ i ][ occ[ i ][ j ] ] );
                        }
                      }
                    }
                  } 
                  else LOGGER.error( index[ i ] + ": Document " + currDoc + " does not appear in the inverted list of term " + t );
                }
              }
            }
          }
          docCounter++;
          document.close();
          pl.update();
        }
      }
      else {
        LOGGER.warn( "Random access tests require very slow single-term scanning as not all indices are disk based" );


        it.unimi.dsi.mg4j.document.DocumentIterator documentIterator = documentSequence.iterator();
        Document document;
        Reader reader;
        WordReader wordReader;
        
        final MutableString word = new MutableString(), nonWord = new MutableString();
        
        int docCounter = 0;
        
        while( ( document = documentIterator.nextDocument() ) != null ) {
          currDoc = permutation != null ? permutation[ docCounter ] : docCounter;

View Full Code Here

   * @param maxBefore maximum number of words to be considered before the anchor.
   * @param maxAfter maximum number of words to be considered after the anchor.
   */
  public AnchorExtractor( int maxBefore, int maxAnchor, int maxAfter ) {
    preAnchor = new CircularCharArrayBuffer( maxBefore );
    anchor = new MutableString( maxAnchor );
    postAnchor = new MutableString( maxAfter );
    result = new MutableString( maxBefore + maxAnchor + maxAfter );
    this.maxAfter = maxAfter;
    this.maxAnchor = maxAnchor;
    state = State.BEFORE_ANCHOR;
  }

View Full Code Here

    result.setProperty( Index.PropertyKeys.POSTINGS, numberOfPostings );
    result.setProperty( Index.PropertyKeys.MAXCOUNT, maxCount );
    result.setProperty( Index.PropertyKeys.INDEXCLASS, FileIndex.class.getName() );
    // We save all flags, except for the PAYLOAD component, which is just used internally.
    for( Map.Entry<Component,Coding> e: flags.entrySet() )
      if ( e.getKey() != Component.PAYLOADS ) result.addProperty( Index.PropertyKeys.CODING, new MutableString().append( e.getKey() ).append( ':' ).append( e.getValue() ) );
    return result;
  }

View Full Code Here

  
  public void startDocument( final CharSequence title, final CharSequence uri ) throws IOException {
    final ZipEntry currEntry = new ZipEntry( Integer.toString( numberOfDocuments ) );
    currEntry.setComment( title.toString() );
    zipOut.putNextEntry( currEntry );
    new MutableString( uri != null ? uri : "" ).writeSelfDelimUTF8( zipOut );
    
  }

View Full Code Here


    final DocumentIterator docIt = inputSequence.iterator();
    if ( factory != inputSequence.factory() ) throw new IllegalStateException( "The factory provided by the constructor does not correspond to the factory of the input sequence" );
    final int numberOfFields = factory.numberOfFields();
    WordReader wordReader;
    MutableString word = new MutableString();
    MutableString nonWord = new MutableString();
    open( "" );
    for (;;) {
      Document document = docIt.nextDocument();
      if ( document == null ) break;
      startDocument( document.title(), document.uri() );

View Full Code Here


public class DoublingTermExpander extends AbstractTermExpander {


  @Override
  public Query expand( Term term ) {
    return new MultiTerm( term, new Term( new MutableString( term.term ).append( term.term ) ) );
  }

View Full Code Here

    return new MultiTerm( term, new Term( new MutableString( term.term ).append( term.term ) ) );
  }


  @Override
  public Query expand( Prefix prefix ) {
    return new MultiTerm( new Term( prefix.prefix ), new Term( new MutableString( prefix.prefix ).append( prefix.prefix ) ) );
  }

View Full Code Here

  public void reset() {
    // Sets curr back to -1.
    // NOTE(review): presumably -1 means "positioned before the first element";
    // confirm against the code that advances curr.
    curr = -1;
  }
  
  public String toString() {
    MutableString result = new MutableString();
    result.append( '[' );
    for( int i = 0; i < document.length; i++ ) {
      if ( i != 0 ) result.append( ", " );
      result.append( '<' ).append( document[ i ] ).append( ':' ).append( Arrays.toString(  position[ i ] ) ).append( '>' );
    }
    return result.append( ']' ).toString();
  }

View Full Code Here


  public int processDocument( WordReader wordReader, int documentIndex, int startPos, Object2ObjectOpenHashMap<MutableString, ObjectArrayList<int[]>> termMap, TermProcessor termProcessor )
      throws IOException {
    assertTrue( documentIndex >= 0 );
    Object2ObjectOpenHashMap<MutableString, IntArrayList> terms = new Object2ObjectOpenHashMap<MutableString, IntArrayList>();
    MutableString word = new MutableString(), nonWord = new MutableString();


    int pos = startPos;
    while ( wordReader.next( word, nonWord ) ) {
      if ( word.length() == 0 ) continue;
      if ( !termProcessor.processTerm( word ) ) {

View Full Code Here

public class PorterStemmerTest extends TestCase {


  public void testShort() {
    PorterStemmer stemmer = new PorterStemmer();
    
    MutableString s = new MutableString();
    s.append( 's' );
    stemmer.processTerm( s );
    assertEquals( "s", s.toString() );


    s.append( 's' );
    stemmer.processTerm( s );
    assertEquals( "ss", s.toString() );


  
    s.length( 0 );


    s.append( 'S' );
    stemmer.processTerm( s );
    assertEquals( "s", s.toString() );


    s.append( 's' );
    stemmer.processTerm( s );
    assertEquals( "ss", s.toString() );


  }

View Full Code Here

0 1 2 3 4 5 6 7

TOP

Related Classes of it.unimi.dsi.lang.MutableString

edu.uci.ics.crawler4j.crawler.LinkExtractor

it.unimi.dsi.mg4j.document.AbstractDocumentCollection

it.unimi.dsi.mg4j.document.AbstractDocumentFactory

it.unimi.dsi.mg4j.document.DocumentCollectionTest

it.unimi.dsi.mg4j.document.JavamailDocumentCollection

it.unimi.dsi.mg4j.document.JdbcDocumentCollection

it.unimi.dsi.mg4j.document.MapVirtualDocumentCollection

it.unimi.dsi.mg4j.document.SimpleCompressedDocumentCollection

it.unimi.dsi.mg4j.document.SimpleCompressedDocumentCollectionBuilder

it.unimi.dsi.mg4j.document.ZipDocumentCollection$ZipFactory

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.