Package org.apache.lucene.util

Examples of org.apache.lucene.util.CharsRef$UTF16SortedAsUTF8Comparator


        final Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
        final org.apache.lucene.util.fst.Builder<CharsRef> builder = new org.apache.lucene.util.fst.Builder<CharsRef>(FST.INPUT_TYPE.BYTE2, outputs);
        final IntsRef scratch = new IntsRef();
        for(Map.Entry<String,String> ent : pendingPairs.entrySet()) {
          builder.add(Util.toUTF16(ent.getKey(), scratch),
                      new CharsRef(ent.getValue()));
        }
        map = builder.finish();
        pendingPairs.clear();
      } catch (IOException ioe) {
        // Bogus FST IOExceptions!!  (will never happen)
View Full Code Here


    } else {
      this.origStemdict = null;
      // we don't need to ignore case here since we lowercase in this analyzer anyway
      StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(false);
      CharArrayMap<String>.EntryIterator iter = stemOverrideDict.entrySet().iterator();
      CharsRef spare = new CharsRef();
      while (iter.hasNext()) {
        char[] nextKey = iter.nextKey();
        spare.copyChars(nextKey, 0, nextKey.length);
        builder.add(spare, iter.currentValue());
      }
      try {
        this.stemdict = builder.build();
      } catch (IOException ex) {
View Full Code Here

    this.delegate = delegate;
   
    for (String field : fields) {
      Set<String> stopWords = new HashSet<String>();
      Terms terms = MultiFields.getTerms(indexReader, field);
      CharsRef spare = new CharsRef();
      if (terms != null) {
        TermsEnum te = terms.iterator(null);
        BytesRef text;
        while ((text = te.next()) != null) {
          if (te.docFreq() > maxDocFreq) {
            UnicodeUtil.UTF8toUTF16(text, spare);
            stopWords.add(spare.toString());
          }
        }
      }
      stopWordsPerField.put(field, stopWords);
    }
View Full Code Here

        } else {
          finalLastToken = BytesRef.deepCopyOf(lastTokenFragment);
        }
        assert finalLastToken.offset == 0;
       
        CharsRef spare = new CharsRef();
       
        // complete top-N
        MinResult<Long> completions[] = null;
        try {
         
          // Because we store multiple models in one FST
          // (1gram, 2gram, 3gram), we must restrict the
          // search so that it only considers the current
          // model.  For highest order model, this is not
          // necessary since all completions in the FST
          // must be from this model, but for lower order
          // models we have to filter out the higher order
          // ones:
         
          // Must do num+seen.size() for queue depth because we may
          // reject up to seen.size() paths in acceptResult():
          Util.TopNSearcher<Long> searcher = new Util.TopNSearcher<Long>(fst, num, num+seen.size(), weightComparator) {
           
            BytesRef scratchBytes = new BytesRef();
           
            @Override
            protected void addIfCompetitive(Util.FSTPath<Long> path) {
              if (path.arc.label != separator) {
                //System.out.println("    keep path: " + Util.toBytesRef(path.input, new BytesRef()).utf8ToString() + "; " + path + "; arc=" + path.arc);
                super.addIfCompetitive(path);
              } else {
                //System.out.println("    prevent path: " + Util.toBytesRef(path.input, new BytesRef()).utf8ToString() + "; " + path + "; arc=" + path.arc);
              }
            }
           
            @Override
            protected boolean acceptResult(IntsRef input, Long output) {
              Util.toBytesRef(input, scratchBytes);
              finalLastToken.grow(finalLastToken.length + scratchBytes.length);
              int lenSav = finalLastToken.length;
              finalLastToken.append(scratchBytes);
              //System.out.println("    accept? input='" + scratchBytes.utf8ToString() + "'; lastToken='" + finalLastToken.utf8ToString() + "'; return " + (seen.contains(finalLastToken) == false));
              boolean ret = seen.contains(finalLastToken) == false;
             
              finalLastToken.length = lenSav;
              return ret;
            }
          };
         
          // since this search is initialized with a single start node
          // it is okay to start with an empty input path here
          searcher.addStartPaths(arc, prefixOutput, true, new IntsRef());
         
          completions = searcher.search();
        } catch (IOException bogus) {
          throw new RuntimeException(bogus);
        }
       
        int prefixLength = token.length;
       
        BytesRef suffix = new BytesRef(8);
        //System.out.println("    " + completions.length + " completions");
       
        nextCompletion:
          for (MinResult<Long> completion : completions) {
            token.length = prefixLength;
            // append suffix
            Util.toBytesRef(completion.input, suffix);
            token.append(suffix);
           
            //System.out.println("    completion " + token.utf8ToString());
           
            // Skip this path if a higher-order model already
            // saw/predicted its last token:
            BytesRef lastToken = token;
            for(int i=token.length-1;i>=0;i--) {
              if (token.bytes[token.offset+i] == separator) {
                assert token.length-i-1 > 0;
                lastToken = new BytesRef(token.bytes, token.offset+i+1, token.length-i-1);
                break;
              }
            }
            if (seen.contains(lastToken)) {
              //System.out.println("      skip dup " + lastToken.utf8ToString());
              continue nextCompletion;
            }
            seen.add(BytesRef.deepCopyOf(lastToken));
            spare.grow(token.length);
            UnicodeUtil.UTF8toUTF16(token, spare);
            LookupResult result = new LookupResult(spare.toString(), (long) (Long.MAX_VALUE * backoff * ((double) decodeWeight(completion.output)) / contextCount));
            results.add(result);
            assert results.size() == seen.size();
            //System.out.println("  add result=" + result);
          }
        backoff *= ALPHA;
View Full Code Here

   * @return sorted list of the suggested words according to the comparator
   * @throws IOException If there is a low-level I/O error.
   */
  public SuggestWord[] suggestSimilar(Term term, int numSug, IndexReader ir,
      SuggestMode suggestMode, float accuracy) throws IOException {
    final CharsRef spare = new CharsRef();
    String text = term.text();
    if (minQueryLength > 0 && text.codePointCount(0, text.length()) < minQueryLength)
      return new SuggestWord[0];
   
    if (lowerCaseTerms) {
      term = new Term(term.field(), text.toLowerCase(Locale.ROOT));
    }
   
    int docfreq = ir.docFreq(term);
   
    if (suggestMode==SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX && docfreq > 0) {
      return new SuggestWord[0];
    }
   
    int maxDoc = ir.maxDoc();
   
    if (maxQueryFrequency >= 1f && docfreq > maxQueryFrequency) {
      return new SuggestWord[0];
    } else if (docfreq > (int) Math.ceil(maxQueryFrequency * (float)maxDoc)) {
      return new SuggestWord[0];
    }
   
    if (suggestMode!=SuggestMode.SUGGEST_MORE_POPULAR) docfreq = 0;
   
    if (thresholdFrequency >= 1f) {
      docfreq = Math.max(docfreq, (int) thresholdFrequency);
    } else if (thresholdFrequency > 0f) {
      docfreq = Math.max(docfreq, (int)(thresholdFrequency * (float)maxDoc)-1);
    }
   
    Collection<ScoreTerm> terms = null;
    int inspections = numSug * maxInspections;
   
    // try ed=1 first, in case we get lucky
    terms = suggestSimilar(term, inspections, ir, docfreq, 1, accuracy, spare);
    if (maxEdits > 1 && terms.size() < inspections) {
      HashSet<ScoreTerm> moreTerms = new HashSet<ScoreTerm>();
      moreTerms.addAll(terms);
      moreTerms.addAll(suggestSimilar(term, inspections, ir, docfreq, maxEdits, accuracy, spare));
      terms = moreTerms;
    }
   
    // create the suggestword response, sort it, and trim it to size.
   
    SuggestWord suggestions[] = new SuggestWord[terms.size()];
    int index = suggestions.length - 1;
    for (ScoreTerm s : terms) {
      SuggestWord suggestion = new SuggestWord();
      if (s.termAsString == null) {
        UnicodeUtil.UTF8toUTF16(s.term, spare);
        s.termAsString = spare.toString();
      }
      suggestion.string = s.termAsString;
      suggestion.score = s.score;
      suggestion.freq = s.docfreq;
      suggestions[index--] = suggestion;
View Full Code Here

    if (vector == null) {
      // null snippet
      return;
    }

    final CharsRef spare = new CharsRef();
    final TermsEnum termsEnum = vector.iterator(null);
    DocsAndPositionsEnum dpEnum = null;
    BytesRef text;
   
    int numDocs = reader.maxDoc();
   
    while ((text = termsEnum.next()) != null) {
      UnicodeUtil.UTF8toUTF16(text, spare);
      final String term = spare.toString();
      if (!termSet.contains(term)) {
        continue;
      }
      dpEnum = termsEnum.docsAndPositions(null, dpEnum);
      if (dpEnum == null) {
View Full Code Here

  public void testMaxPosition3WithSynomyms() throws IOException {
    MockTokenizer tokenizer = new MockTokenizer(new StringReader("one two three four five"), MockTokenizer.WHITESPACE, false);
    tokenizer.setEnableChecks(false); // LimitTokenPositionFilter doesn't consume the entire stream that it wraps
   
    SynonymMap.Builder builder = new SynonymMap.Builder(true);
    builder.add(new CharsRef("one"), new CharsRef("first"), true);
    builder.add(new CharsRef("one"), new CharsRef("alpha"), true);
    builder.add(new CharsRef("one"), new CharsRef("beguine"), true);
    CharsRef multiWordCharsRef = new CharsRef();
    SynonymMap.Builder.join(new String[] { "and", "indubitably", "single", "only" }, multiWordCharsRef);
    builder.add(new CharsRef("one"), multiWordCharsRef, true);
    SynonymMap.Builder.join(new String[]{"dopple", "ganger"}, multiWordCharsRef);
    builder.add(new CharsRef("two"), multiWordCharsRef, true);
    SynonymMap synonymMap = builder.build();
    TokenStream stream = new SynonymFilter(tokenizer, synonymMap, true);
    stream = new LimitTokenPositionFilter(stream, 3); // consumeAllTokens defaults to false
   
    // "only", the 4th word of multi-word synonym "and indubitably single only" is not emitted, since its position is greater than 3.
View Full Code Here

    final BytesRef utf8Key = new BytesRef(key);
    try {

      Automaton lookupAutomaton = toLookupAutomaton(key);

      final CharsRef spare = new CharsRef();

      //System.out.println("  now intersect exactFirst=" + exactFirst);
   
      // Intersect automaton w/ suggest wFST and get all
      // prefix starting nodes & their outputs:
View Full Code Here

   * @param termFreqMap a Map of terms and their frequencies
   * @param vector List of terms and their frequencies for a doc/field
   */
  private void addTermFrequencies(Map<String, Int> termFreqMap, Terms vector) throws IOException {
    final TermsEnum termsEnum = vector.iterator(null);
    final CharsRef spare = new CharsRef();
    BytesRef text;
    while((text = termsEnum.next()) != null) {
      UnicodeUtil.UTF8toUTF16(text, spare);
      final String term = spare.toString();
      if (isNoiseWord(term)) {
        continue;
      }
      final int freq = (int) termsEnum.totalTermFreq();

View Full Code Here

        } else if (outputs.upto < outputs.count) {
          // Still have pending outputs to replay at this
          // position
          input.reset();
          final int posIncr = outputs.posIncr;
          final CharsRef output = outputs.pullNext();
          clearAttributes();
          termAtt.copyBuffer(output.chars, output.offset, output.length);
          typeAtt.setType(TYPE_SYNONYM);
          int endOffset = outputs.getLastEndOffset();
          if (endOffset == -1) {
            endOffset = input.endOffset;
          }
          offsetAtt.setOffset(input.startOffset, endOffset);
          posIncrAtt.setPositionIncrement(posIncr);
          posLenAtt.setPositionLength(outputs.getLastPosLength());
          if (outputs.count == 0) {
            // Done with the buffered input and all outputs at
            // this position
            nextRead = rollIncr(nextRead);
            inputSkipCount--;
          }
          //System.out.println("  return token=" + termAtt.toString());
          return true;
        } else {
          // Done with the buffered input and all outputs at
          // this position
          input.reset();
          nextRead = rollIncr(nextRead);
          inputSkipCount--;
        }
      }

      if (finished && nextRead == nextWrite) {
        // End case: if any output syns went beyond end of
        // input stream, enumerate them now:
        final PendingOutputs outputs = futureOutputs[nextRead];
        if (outputs.upto < outputs.count) {
          final int posIncr = outputs.posIncr;
          final CharsRef output = outputs.pullNext();
          futureInputs[nextRead].reset();
          if (outputs.count == 0) {
            nextWrite = nextRead = rollIncr(nextRead);
          }
          clearAttributes();
View Full Code Here

TOP

Related Classes of org.apache.lucene.util.CharsRef$UTF16SortedAsUTF8Comparator

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.