Package org.apache.lucene.util

Examples of org.apache.lucene.util.CharsRef$UTF16SortedAsUTF8Comparator


    while ((line = in.readLine()) != null) {
      if (line.length() == 0 || line.charAt(0) == '#') {
        continue; // ignore empty lines and comments
      }
     
      CharsRef inputs[];
      CharsRef outputs[];
     
      // TODO: we could process this more efficiently.
      String sides[] = split(line, "=>");
      if (sides.length > 1) { // explicit mapping
        if (sides.length != 2) {
          throw new IllegalArgumentException("more than one explicit mapping specified on the same line");
        }
        String inputStrings[] = split(sides[0], ",");
        inputs = new CharsRef[inputStrings.length];
        for (int i = 0; i < inputs.length; i++) {
          inputs[i] = analyze(analyzer, unescape(inputStrings[i]).trim(), new CharsRef());
        }
       
        String outputStrings[] = split(sides[1], ",");
        outputs = new CharsRef[outputStrings.length];
        for (int i = 0; i < outputs.length; i++) {
          outputs[i] = analyze(analyzer, unescape(outputStrings[i]).trim(), new CharsRef());
        }
      } else {
        String inputStrings[] = split(line, ",");
        inputs = new CharsRef[inputStrings.length];
        for (int i = 0; i < inputs.length; i++) {
          inputs[i] = analyze(analyzer, unescape(inputStrings[i]).trim(), new CharsRef());
        }
        if (expand) {
          outputs = inputs;
        } else {
          outputs = new CharsRef[1];
View Full Code Here


  public void add(Reader in) throws IOException, ParseException {
    LineNumberReader br = new LineNumberReader(in);
    try {
      String line = null;
      String lastSynSetID = "";
      CharsRef synset[] = new CharsRef[8];
      int synsetSize = 0;
     
      while ((line = br.readLine()) != null) {
        String synSetID = line.substring(2, 11);

        if (!synSetID.equals(lastSynSetID)) {
          addInternal(synset, synsetSize);
          synsetSize = 0;
        }

        if (synset.length <= synsetSize+1) {
          CharsRef larger[] = new CharsRef[synset.length * 2];
          System.arraycopy(synset, 0, larger, 0, synsetSize);
          synset = larger;
        }
       
        synset[synsetSize] = parseSynonym(line, synset[synsetSize]);
View Full Code Here

    }
  }
  private CharsRef parseSynonym(String line, CharsRef reuse) throws IOException {
    if (reuse == null) {
      reuse = new CharsRef(8);
    }
   
    int start = line.indexOf('\'')+1;
    int end = line.lastIndexOf('\'');
   
View Full Code Here

        } else if (outputs.upto < outputs.count) {
          // Still have pending outputs to replay at this
          // position
          input.reset();
          final int posIncr = outputs.posIncr;
          final CharsRef output = outputs.pullNext();
          clearAttributes();
          termAtt.copyBuffer(output.chars, output.offset, output.length);
          typeAtt.setType(TYPE_SYNONYM);
          offsetAtt.setOffset(input.startOffset, input.endOffset);
          posIncrAtt.setPositionIncrement(posIncr);
          if (outputs.count == 0) {
            // Done with the buffered input and all outputs at
            // this position
            nextRead = rollIncr(nextRead);
            inputSkipCount--;
          }
          //System.out.println("  return token=" + termAtt.toString());
          return true;
        } else {
          // Done with the buffered input and all outputs at
          // this position
          input.reset();
          nextRead = rollIncr(nextRead);
          inputSkipCount--;
        }
      }

      if (finished && nextRead == nextWrite) {
        // End case: if any output syns went beyond end of
        // input stream, enumerate them now:
        final PendingOutputs outputs = futureOutputs[nextRead];
        if (outputs.upto < outputs.count) {
          final int posIncr = outputs.posIncr;
          final CharsRef output = outputs.pullNext();
          futureInputs[nextRead].reset();
          if (outputs.count == 0) {
            nextWrite = nextRead = rollIncr(nextRead);
          }
          clearAttributes();
View Full Code Here

      posIncr = 1;
    }

    public CharsRef pullNext() {
      assert upto < count;
      final CharsRef result = outputs[upto++];
      posIncr = 0;
      if (upto == count) {
        reset();
      }
      return result;
View Full Code Here

        final CharsRef[] next = new CharsRef[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
        System.arraycopy(outputs, 0, next, 0, count);
        outputs = next;
      }
      if (outputs[count] == null) {
        outputs[count] = new CharsRef();
      }
      outputs[count].copy(output, offset, len);
      count++;
    }
View Full Code Here

      }
     
      MapEntry e = workingSet.get(input);
      if (e == null) {
        e = new MapEntry();
        workingSet.put(new CharsRef(input), e); // make a copy, since we will keep around in our map   
      }
     
      e.ords.add(ord);
      e.includeOrig |= includeOrig;
      maxHorizontalContext = Math.max(maxHorizontalContext, numInputWords);
View Full Code Here

      }

      final byte[] spare = new byte[5];
     
      Set<CharsRef> keys = workingSet.keySet();
      CharsRef sortedKeys[] = keys.toArray(new CharsRef[keys.size()]);
      Arrays.sort(sortedKeys, CharsRef.getUTF16SortedAsUTF8Comparator());
     
      //System.out.println("fmap.build");
      for (int keyIdx = 0; keyIdx < sortedKeys.length; keyIdx++) {
        CharsRef input = sortedKeys[keyIdx];
        MapEntry output = workingSet.get(input);

        int numEntries = output.ords.size();
        // output size, assume the worst case
        int estimatedSize = 5 + numEntries * 5; // numEntries + one ord for each entry
View Full Code Here

  private CharTermAttribute termAtt;
  private PositionIncrementAttribute posIncrAtt;
  private OffsetAttribute offsetAtt;

  private void add(String input, String output, boolean keepOrig) {
    b.add(new CharsRef(input.replaceAll(" +", "\u0000")),
          new CharsRef(output.replaceAll(" +", "\u0000")),
          keepOrig);
  }
View Full Code Here

    final BytesRef utf8Key = new BytesRef(key);
    try {

      Automaton lookupAutomaton = toLookupAutomaton(key);

      final CharsRef spare = new CharsRef();

      //System.out.println("  now intersect exactFirst=" + exactFirst);
   
      // Intersect automaton w/ suggest wFST and get all
      // prefix starting nodes & their outputs:
      //final PathIntersector intersector = getPathIntersector(lookupAutomaton, fst);

      //System.out.println("  prefixPaths: " + prefixPaths.size());

      BytesReader bytesReader = fst.getBytesReader();

      FST.Arc<Pair<Long,BytesRef>> scratchArc = new FST.Arc<Pair<Long,BytesRef>>();

      final List<LookupResult> results = new ArrayList<LookupResult>();

      List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths = FSTUtil.intersectPrefixPaths(lookupAutomaton, fst);

      if (exactFirst) {

        int count = 0;
        for (FSTUtil.Path<Pair<Long,BytesRef>> path : prefixPaths) {
          if (fst.findTargetArc(END_BYTE, path.fstNode, scratchArc, bytesReader) != null) {
            // This node has END_BYTE arc leaving, meaning it's an
            // "exact" match:
            count++;
          }
        }

        // Searcher just to find the single exact only
        // match, if present:
        Util.TopNSearcher<Pair<Long,BytesRef>> searcher;
        searcher = new Util.TopNSearcher<Pair<Long,BytesRef>>(fst, count * maxSurfaceFormsPerAnalyzedForm, count * maxSurfaceFormsPerAnalyzedForm, weightComparator);

        // NOTE: we could almost get away with only using
        // the first start node.  The only catch is if
        // maxSurfaceFormsPerAnalyzedForm had kicked in and
        // pruned our exact match from one of these nodes
        // ...:
        for (FSTUtil.Path<Pair<Long,BytesRef>> path : prefixPaths) {
          if (fst.findTargetArc(END_BYTE, path.fstNode, scratchArc, bytesReader) != null) {
            // This node has END_BYTE arc leaving, meaning it's an
            // "exact" match:
            searcher.addStartPaths(scratchArc, fst.outputs.add(path.output, scratchArc.output), false, path.input);
          }
        }

        MinResult<Pair<Long,BytesRef>> completions[] = searcher.search();

        // NOTE: this is rather inefficient: we enumerate
        // every matching "exactly the same analyzed form"
        // path, and then do linear scan to see if one of
        // these exactly matches the input.  It should be
        // possible (though hairy) to do something similar
        // to getByOutput, since the surface form is encoded
        // into the FST output, so we more efficiently hone
        // in on the exact surface-form match.  Still, I
        // suspect very little time is spent in this linear
        // seach: it's bounded by how many prefix start
        // nodes we have and the
        // maxSurfaceFormsPerAnalyzedForm:
        for(MinResult<Pair<Long,BytesRef>> completion : completions) {
          if (utf8Key.bytesEquals(completion.output.output2)) {
            spare.grow(completion.output.output2.length);
            UnicodeUtil.UTF8toUTF16(completion.output.output2, spare);
            results.add(new LookupResult(spare.toString(), decodeWeight(completion.output.output1)));
            break;
          }
        }

        if (results.size() == num) {
          // That was quick:
          return results;
        }
      }

      Util.TopNSearcher<Pair<Long,BytesRef>> searcher;
      searcher = new Util.TopNSearcher<Pair<Long,BytesRef>>(fst,
                                                            num - results.size(),
                                                            num * maxAnalyzedPathsForOneInput,
                                                            weightComparator) {
        private final Set<BytesRef> seen = new HashSet<BytesRef>();

        @Override
        protected boolean acceptResult(IntsRef input, Pair<Long,BytesRef> output) {

          // Dedup: when the input analyzes to a graph we
          // can get duplicate surface forms:
          if (seen.contains(output.output2)) {
            return false;
          }
          seen.add(output.output2);
         
          if (!exactFirst) {
            return true;
          } else {
            // In exactFirst mode, don't accept any paths
            // matching the surface form since that will
            // create duplicate results:
            if (utf8Key.bytesEquals(output.output2)) {
              // We found exact match, which means we should
              // have already found it in the first search:
              assert results.size() == 1;
              return false;
            } else {
              return true;
            }
          }
        }
      };

      prefixPaths = getFullPrefixPaths(prefixPaths, lookupAutomaton, fst);
     
      for (FSTUtil.Path<Pair<Long,BytesRef>> path : prefixPaths) {
        searcher.addStartPaths(path.fstNode, path.output, true, path.input);
      }

      MinResult<Pair<Long,BytesRef>> completions[] = searcher.search();

      for(MinResult<Pair<Long,BytesRef>> completion : completions) {
        spare.grow(completion.output.output2.length);
        UnicodeUtil.UTF8toUTF16(completion.output.output2, spare);
        LookupResult result = new LookupResult(spare.toString(), decodeWeight(completion.output.output1));

        // TODO: for fuzzy case would be nice to return
        // how many edits were required

        //System.out.println("    result=" + result);
View Full Code Here

TOP

Related Classes of org.apache.lucene.util.CharsRef$UTF16SortedAsUTF8Comparator

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.