Examples of org.apache.lucene.util.OfflineSorter.ByteSequencesReader

Package org.apache.lucene.util.OfflineSorter

Examples of org.apache.lucene.util.OfflineSorter.ByteSequencesReader

org.apache.lucene.util.OfflineSorter.ByteSequencesReader

      }
    });
    sorter.sort(unsorted, sorted);
    unsorted.delete();
    
    ByteSequencesReader reader = new ByteSequencesReader(sorted);
    BytesRef scratchLine = new BytesRef();
    
    // TODO: the flags themselves can be double-chars (long) or also numeric
    // either way the trick is to encode them as char... but they must be parsed differently
    
    String currentEntry = null;
    IntsRef currentOrds = new IntsRef();
    
    String line;
    while (reader.read(scratchLine)) {
      line = scratchLine.utf8ToString();
      String entry;
      char wordForm[];
      
      int flagSep = line.lastIndexOf(FLAG_SEPARATOR);
      if (flagSep == -1) {
        wordForm = NOFLAGS;
        entry = line;
      } else {
        // note, there can be comments (morph description) after a flag.
        // we should really look for any whitespace: currently just tab and space
        int end = line.indexOf('\t', flagSep);
        if (end == -1)
          end = line.length();
        int end2 = line.indexOf(' ', flagSep);
        if (end2 == -1)
          end2 = line.length();
        end = Math.min(end, end2);
        
        String flagPart = line.substring(flagSep + 1, end);
        if (aliasCount > 0) {
          flagPart = getAliasValue(Integer.parseInt(flagPart));
        } 
        
        wordForm = flagParsingStrategy.parseFlags(flagPart);
        Arrays.sort(wordForm);
        entry = line.substring(0, flagSep);
      }


      int cmp = currentEntry == null ? 1 : entry.compareTo(currentEntry);
      if (cmp < 0) {
        throw new IllegalArgumentException("out of order: " + entry + " < " + currentEntry);
      } else {
        encodeFlags(flagsScratch, wordForm);
        int ord = flagLookup.add(flagsScratch);
        if (ord < 0) {
          // already exists in our hash
          ord = (-ord)-1;
        }
        // finalize current entry, and switch "current" if necessary
        if (cmp > 0 && currentEntry != null) {
          Util.toUTF32(currentEntry, scratchInts);
          words.add(scratchInts, currentOrds);
        }
        // swap current
        if (cmp > 0 || currentEntry == null) {
          currentEntry = entry;
          currentOrds = new IntsRef(); // must be this way
        }
        currentOrds.grow(currentOrds.length+1);
        currentOrds.ints[currentOrds.length++] = ord;
      }
    }
    
    // finalize last entry
    Util.toUTF32(currentEntry, scratchInts);
    words.add(scratchInts, currentOrds);
    
    reader.close();
    sorted.delete();
  }

View Full Code Here

      }
    });
    sorter.sort(unsorted, sorted);
    unsorted.delete();
    
    ByteSequencesReader reader = new ByteSequencesReader(sorted);
    BytesRef scratchLine = new BytesRef();
    
    // TODO: the flags themselves can be double-chars (long) or also numeric
    // either way the trick is to encode them as char... but they must be parsed differently
    
    String currentEntry = null;
    IntsRef currentOrds = new IntsRef();
    
    String line;
    while (reader.read(scratchLine)) {
      line = scratchLine.utf8ToString();
      String entry;
      char wordForm[];
      
      int flagSep = line.lastIndexOf(FLAG_SEPARATOR);
      if (flagSep == -1) {
        wordForm = NOFLAGS;
        entry = line;
      } else {
        // note, there can be comments (morph description) after a flag.
        // we should really look for any whitespace: currently just tab and space
        int end = line.indexOf('\t', flagSep);
        if (end == -1)
          end = line.length();
        int end2 = line.indexOf(' ', flagSep);
        if (end2 == -1)
          end2 = line.length();
        end = Math.min(end, end2);
        
        String flagPart = line.substring(flagSep + 1, end);
        if (aliasCount > 0) {
          flagPart = getAliasValue(Integer.parseInt(flagPart));
        } 
        
        wordForm = flagParsingStrategy.parseFlags(flagPart);
        Arrays.sort(wordForm);
        entry = line.substring(0, flagSep);
      }


      int cmp = currentEntry == null ? 1 : entry.compareTo(currentEntry);
      if (cmp < 0) {
        throw new IllegalArgumentException("out of order: " + entry + " < " + currentEntry);
      } else {
        encodeFlags(flagsScratch, wordForm);
        int ord = flagLookup.add(flagsScratch);
        if (ord < 0) {
          // already exists in our hash
          ord = (-ord)-1;
        }
        // finalize current entry, and switch "current" if necessary
        if (cmp > 0 && currentEntry != null) {
          Util.toUTF32(currentEntry, scratchInts);
          words.add(scratchInts, currentOrds);
        }
        // swap current
        if (cmp > 0 || currentEntry == null) {
          currentEntry = entry;
          currentOrds = new IntsRef(); // must be this way
        }
        currentOrds.grow(currentOrds.length+1);
        currentOrds.ints[currentOrds.length++] = ord;
      }
    }
    
    // finalize last entry
    Util.toUTF32(currentEntry, scratchInts);
    words.add(scratchInts, currentOrds);
    
    reader.close();
    sorted.delete();
  }

View Full Code Here

      while ((spare = source.next()) != null) {
        encode(writer, output, buffer, spare, source.payload(), source.contexts(), source.weight());
      }
      writer.close();
      new OfflineSorter(tieBreakByCostComparator).sort(tempInput, tempSorted);
      ByteSequencesReader reader = new OfflineSorter.ByteSequencesReader(tempSorted);
      success = true;
      return reader;
      
    } finally {
      if (success) {

View Full Code Here

      while ((spare = source.next()) != null) {
        encode(writer, output, buffer, spare, source.weight());
      }
      writer.close();
      new OfflineSorter(tieBreakByCostComparator).sort(tempInput, tempSorted);
      ByteSequencesReader reader = new ByteSequencesReader(tempSorted);
      success = true;
      return reader;
      
    } finally {
      if (success) {

View Full Code Here

TOP

Related Classes of org.apache.lucene.util.OfflineSorter.ByteSequencesReader

org.apache.lucene.analysis.hunspell.Dictionary

org.apache.lucene.search.suggest.SortedInputIterator

org.apache.lucene.search.suggest.SortedTermFreqIteratorWrapper

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., now owned by Oracle Inc. Contact: coftware#gmail.com.