Package org.apache.lucene.util

Examples of org.apache.lucene.util.CharsRef$UTF16SortedAsUTF8Comparator


    } else if (output == NO_OUTPUT) {
      return prefix;
    } else {
      assert prefix.length > 0;
      assert output.length > 0;
      CharsRef result = new CharsRef(prefix.length + output.length);
      System.arraycopy(prefix.chars, prefix.offset, result.chars, 0, prefix.length);
      System.arraycopy(output.chars, output.offset, result.chars, prefix.length, output.length);
      result.length = prefix.length + output.length;
      return result;
    }
View Full Code Here


  public CharsRef read(DataInput in) throws IOException {
    final int len = in.readVInt();
    if (len == 0) {
      return NO_OUTPUT;
    } else {
      final CharsRef output = new CharsRef(len);
      for(int idx=0;idx<len;idx++) {
        output.chars[idx] = (char) in.readVInt();
      }
      output.length = len;
      return output;
View Full Code Here

  CharsRef utf16Result2;
  private final BytesRef scratchBytes = new BytesRef();

  // Currently used only by assert statements
  private boolean initUTF16Results() {
    utf16Result1 = new CharsRef(10);
    utf16Result2 = new CharsRef(10);
    return true;
  }
View Full Code Here

        });
      }
    } else if (query instanceof TermRangeQuery) {
      final TermRangeQuery tq = (TermRangeQuery) query;
      if (tq.getField().equals(field)) {
        final CharsRef lowerBound;
        if (tq.getLowerTerm() == null) {
          lowerBound = null;
        } else {
          lowerBound = new CharsRef(tq.getLowerTerm().utf8ToString());
        }
       
        final CharsRef upperBound;
        if (tq.getUpperTerm() == null) {
          upperBound = null;
        } else {
          upperBound = new CharsRef(tq.getUpperTerm().utf8ToString());
        }
       
        final boolean includeLower = tq.includesLower();
        final boolean includeUpper = tq.includesUpper();
        final CharsRef scratch = new CharsRef();
        final Comparator<CharsRef> comparator = CharsRef.getUTF16SortedAsUTF8Comparator();
       
        // this is *not* an automaton, but it's very simple
        list.add(new CharacterRunAutomaton(BasicAutomata.makeEmpty()) {
          @Override
View Full Code Here

   * @param termFreqMap a Map of terms and their frequencies
   * @param vector List of terms and their frequencies for a doc/field
   */
  private void addTermFrequencies(Map<String, Int> termFreqMap, Terms vector) throws IOException {
    final TermsEnum termsEnum = vector.iterator(null);
    final CharsRef spare = new CharsRef();
    BytesRef text;
    while((text = termsEnum.next()) != null) {
      UnicodeUtil.UTF8toUTF16(text, spare);
      final String term = spare.toString();
      if (isNoiseWord(term)) {
        continue;
      }
      final int freq = (int) termsEnum.totalTermFreq();

View Full Code Here

    }
    count = 0;
    trie = new JaspellTernarySearchTrie();
    trie.setMatchAlmostDiff(editDistance);
    BytesRef spare;
    final CharsRef charsSpare = new CharsRef();

    while ((spare = tfit.next()) != null) {
      final long weight = tfit.weight();
      if (spare.length == 0) {
        continue;
      }
      charsSpare.grow(spare.length);
      UnicodeUtil.UTF8toUTF16(spare.bytes, spare.offset, spare.length, charsSpare);
      trie.put(charsSpare.toString(), Long.valueOf(weight));
    }
  }
View Full Code Here

    int maxCnt = Math.min(num, list.size());
    if (onlyMorePopular) {
      LookupPriorityQueue queue = new LookupPriorityQueue(num);
      for (String s : list) {
        long freq = ((Number)trie.get(s)).longValue();
        queue.insertWithOverflow(new LookupResult(new CharsRef(s), freq));
      }
      for (LookupResult lr : queue.getResults()) {
        res.add(lr);
      }
    } else {
      for (int i = 0; i < maxCnt; i++) {
        String s = list.get(i);
        long freq = ((Number)trie.get(s)).longValue();
        res.add(new LookupResult(new CharsRef(s), freq));
      }     
    }
    return res;
  }
View Full Code Here

    }

    ArrayList<String> tokens = new ArrayList<>();
    ArrayList<Number> vals = new ArrayList<>();
    BytesRef spare;
    CharsRef charsSpare = new CharsRef();
    while ((spare = tfit.next()) != null) {
      charsSpare.grow(spare.length);
      UnicodeUtil.UTF8toUTF16(spare.bytes, spare.offset, spare.length, charsSpare);
      tokens.add(charsSpare.toString());
      vals.add(Long.valueOf(tfit.weight()));
    }
    autocomplete.balancedTree(tokens.toArray(), vals.toArray(), 0, tokens.size() - 1, root);
  }
View Full Code Here

    final BytesRef utf8Key = new BytesRef(key);
    try {

      Automaton lookupAutomaton = toLookupAutomaton(key);

      final CharsRef spare = new CharsRef();

      //System.out.println("  now intersect exactFirst=" + exactFirst);
   
      // Intersect automaton w/ suggest wFST and get all
      // prefix starting nodes & their outputs:
View Full Code Here

        } else {
          finalLastToken = BytesRef.deepCopyOf(lastTokenFragment);
        }
        assert finalLastToken.offset == 0;
       
        CharsRef spare = new CharsRef();
       
        // complete top-N
        TopResults<Long> completions = null;
        try {
         
          // Because we store multiple models in one FST
          // (1gram, 2gram, 3gram), we must restrict the
          // search so that it only considers the current
          // model.  For highest order model, this is not
          // necessary since all completions in the FST
          // must be from this model, but for lower order
          // models we have to filter out the higher order
          // ones:
         
          // Must do num+seen.size() for queue depth because we may
          // reject up to seen.size() paths in acceptResult():
          Util.TopNSearcher<Long> searcher = new Util.TopNSearcher<Long>(fst, num, num+seen.size(), weightComparator) {
           
            BytesRef scratchBytes = new BytesRef();
           
            @Override
            protected void addIfCompetitive(Util.FSTPath<Long> path) {
              if (path.arc.label != separator) {
                //System.out.println("    keep path: " + Util.toBytesRef(path.input, new BytesRef()).utf8ToString() + "; " + path + "; arc=" + path.arc);
                super.addIfCompetitive(path);
              } else {
                //System.out.println("    prevent path: " + Util.toBytesRef(path.input, new BytesRef()).utf8ToString() + "; " + path + "; arc=" + path.arc);
              }
            }
           
            @Override
            protected boolean acceptResult(IntsRef input, Long output) {
              Util.toBytesRef(input, scratchBytes);
              finalLastToken.grow(finalLastToken.length + scratchBytes.length);
              int lenSav = finalLastToken.length;
              finalLastToken.append(scratchBytes);
              //System.out.println("    accept? input='" + scratchBytes.utf8ToString() + "'; lastToken='" + finalLastToken.utf8ToString() + "'; return " + (seen.contains(finalLastToken) == false));
              boolean ret = seen.contains(finalLastToken) == false;
             
              finalLastToken.length = lenSav;
              return ret;
            }
          };
         
          // since this search is initialized with a single start node
          // it is okay to start with an empty input path here
          searcher.addStartPaths(arc, prefixOutput, true, new IntsRef());
         
          completions = searcher.search();
          assert completions.isComplete;
        } catch (IOException bogus) {
          throw new RuntimeException(bogus);
        }
       
        int prefixLength = token.length;
       
        BytesRef suffix = new BytesRef(8);
        //System.out.println("    " + completions.length + " completions");
       
        nextCompletion:
          for (Result<Long> completion : completions) {
            token.length = prefixLength;
            // append suffix
            Util.toBytesRef(completion.input, suffix);
            token.append(suffix);
           
            //System.out.println("    completion " + token.utf8ToString());
           
            // Skip this path if a higher-order model already
            // saw/predicted its last token:
            BytesRef lastToken = token;
            for(int i=token.length-1;i>=0;i--) {
              if (token.bytes[token.offset+i] == separator) {
                assert token.length-i-1 > 0;
                lastToken = new BytesRef(token.bytes, token.offset+i+1, token.length-i-1);
                break;
              }
            }
            if (seen.contains(lastToken)) {
              //System.out.println("      skip dup " + lastToken.utf8ToString());
              continue nextCompletion;
            }
            seen.add(BytesRef.deepCopyOf(lastToken));
            spare.grow(token.length);
            UnicodeUtil.UTF8toUTF16(token, spare);
            LookupResult result = new LookupResult(spare.toString(), (long) (Long.MAX_VALUE * backoff * ((double) decodeWeight(completion.output)) / contextCount));
            results.add(result);
            assert results.size() == seen.size();
            //System.out.println("  add result=" + result);
          }
        backoff *= ALPHA;
View Full Code Here

TOP

Related Classes of org.apache.lucene.util.CharsRef$UTF16SortedAsUTF8Comparator

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.