Package org.apache.lucene.search.spell

Examples of org.apache.lucene.search.spell.TermFreqPayloadIterator


    String prefix = getClass().getSimpleName();
    File directory = Sort.defaultTempDir();
    File tempInput = File.createTempFile(prefix, ".input", directory);
    File tempSorted = File.createTempFile(prefix, ".sorted", directory);

    TermFreqPayloadIterator payloads;
    if (iterator instanceof TermFreqPayloadIterator) {
      payloads = (TermFreqPayloadIterator) iterator;
    } else {
      payloads = null;
    }
    hasPayloads = payloads != null;

    Sort.ByteSequencesWriter writer = new Sort.ByteSequencesWriter(tempInput);
    Sort.ByteSequencesReader reader = null;
    BytesRef scratch = new BytesRef();

    TokenStreamToAutomaton ts2a = getTokenStreamToAutomaton();

    boolean success = false;
    byte buffer[] = new byte[8];
    try {
      ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
      BytesRef surfaceForm;

      while ((surfaceForm = iterator.next()) != null) {
        Set<IntsRef> paths = toFiniteStrings(surfaceForm, ts2a);
       
        maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, paths.size());

        for (IntsRef path : paths) {

          Util.toBytesRef(path, scratch);
         
          // length of the analyzed text (FST input)
          if (scratch.length > Short.MAX_VALUE-2) {
            throw new IllegalArgumentException("cannot handle analyzed forms > " + (Short.MAX_VALUE-2) + " in length (got " + scratch.length + ")");
          }
          short analyzedLength = (short) scratch.length;

          // compute the required length:
          // analyzed sequence + weight (4) + surface + analyzedLength (short)
          int requiredLength = analyzedLength + 4 + surfaceForm.length + 2;

          BytesRef payload;

          if (hasPayloads) {
            if (surfaceForm.length > (Short.MAX_VALUE-2)) {
              throw new IllegalArgumentException("cannot handle surface form > " + (Short.MAX_VALUE-2) + " in length (got " + surfaceForm.length + ")");
            }
            payload = payloads.payload();
            // payload + surfaceLength (short)
            requiredLength += payload.length + 2;
          } else {
            payload = null;
          }
View Full Code Here


    if (searcher != null) {
      searcher.getIndexReader().close();
      searcher = null;
    }

    TermFreqPayloadIterator payloads;
    if (iter instanceof TermFreqPayloadIterator) {
      payloads = (TermFreqPayloadIterator) iter;
    } else {
      payloads = null;
    }
    Directory dirTmp = getDirectory(new File(indexPath.toString() + ".tmp"));

    IndexWriter w = null;
    IndexWriter w2 = null;
    AtomicReader r = null;
    boolean success = false;
    try {
      Analyzer gramAnalyzer = new AnalyzerWrapper() {
          @Override
          protected Analyzer getWrappedAnalyzer(String fieldName) {
            return indexAnalyzer;
          }

          @Override
          protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
            if (fieldName.equals("textgrams") && minPrefixChars > 0) {
              return new TokenStreamComponents(components.getTokenizer(),
                                               new EdgeNGramTokenFilter(matchVersion,
                                                                        components.getTokenStream(),
                                                                        1, minPrefixChars));
            } else {
              return components;
            }
          }
        };

      w = new IndexWriter(dirTmp,
                          getIndexWriterConfig(matchVersion, gramAnalyzer));
      BytesRef text;
      Document doc = new Document();
      FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
      ft.setIndexOptions(IndexOptions.DOCS_ONLY);
      ft.setOmitNorms(true);
      Field textField = new Field(TEXT_FIELD_NAME, "", ft);
      doc.add(textField);

      Field textGramField = new Field("textgrams", "", ft);
      doc.add(textGramField);

      Field textDVField = new BinaryDocValuesField(TEXT_FIELD_NAME, new BytesRef());
      doc.add(textDVField);

      // TODO: use threads...?
      Field weightField = new NumericDocValuesField("weight", 0);
      doc.add(weightField);

      Field payloadField;
      if (payloads != null) {
        payloadField = new BinaryDocValuesField("payloads", new BytesRef());
        doc.add(payloadField);
      } else {
        payloadField = null;
      }

      //long t0 = System.nanoTime();
      while ((text = iter.next()) != null) {
        String textString = text.utf8ToString();
        textField.setStringValue(textString);
        textGramField.setStringValue(textString);
        textDVField.setBytesValue(text);
        weightField.setLongValue(iter.weight());
        if (payloads != null) {
          payloadField.setBytesValue(payloads.payload());
        }
        w.addDocument(doc);
      }
      //System.out.println("initial indexing time: " + ((System.nanoTime()-t0)/1000000) + " msec");
View Full Code Here

    if (searcher != null) {
      searcher.getIndexReader().close();
      searcher = null;
    }

    TermFreqPayloadIterator payloads;
    if (iter instanceof TermFreqPayloadIterator) {
      payloads = (TermFreqPayloadIterator) iter;
    } else {
      payloads = null;
    }
    Directory dirTmp = getDirectory(new File(indexPath.toString() + ".tmp"));

    IndexWriter w = null;
    IndexWriter w2 = null;
    AtomicReader r = null;
    boolean success = false;
    try {
      Analyzer gramAnalyzer = new AnalyzerWrapper(Analyzer.PER_FIELD_REUSE_STRATEGY) {
          @Override
          protected Analyzer getWrappedAnalyzer(String fieldName) {
            return indexAnalyzer;
          }

          @Override
          protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
            if (fieldName.equals("textgrams") && minPrefixChars > 0) {
              return new TokenStreamComponents(components.getTokenizer(),
                                               new EdgeNGramTokenFilter(matchVersion,
                                                                        components.getTokenStream(),
                                                                        1, minPrefixChars));
            } else {
              return components;
            }
          }
        };

      w = new IndexWriter(dirTmp,
                          getIndexWriterConfig(matchVersion, gramAnalyzer));
      BytesRef text;
      Document doc = new Document();
      FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
      ft.setIndexOptions(IndexOptions.DOCS_ONLY);
      ft.setOmitNorms(true);
      Field textField = new Field(TEXT_FIELD_NAME, "", ft);
      doc.add(textField);

      Field textGramField = new Field("textgrams", "", ft);
      doc.add(textGramField);

      Field textDVField = new BinaryDocValuesField(TEXT_FIELD_NAME, new BytesRef());
      doc.add(textDVField);

      // TODO: use threads...?
      Field weightField = new NumericDocValuesField("weight", 0);
      doc.add(weightField);

      Field payloadField;
      if (payloads != null) {
        payloadField = new BinaryDocValuesField("payloads", new BytesRef());
        doc.add(payloadField);
      } else {
        payloadField = null;
      }

      //long t0 = System.nanoTime();
      while ((text = iter.next()) != null) {
        String textString = text.utf8ToString();
        textField.setStringValue(textString);
        textGramField.setStringValue(textString);
        textDVField.setBytesValue(text);
        weightField.setLongValue(iter.weight());
        if (payloads != null) {
          payloadField.setBytesValue(payloads.payload());
        }
        w.addDocument(doc);
      }
      //System.out.println("initial indexing time: " + ((System.nanoTime()-t0)/1000000) + " msec");
View Full Code Here

    String prefix = getClass().getSimpleName();
    File directory = Sort.defaultTempDir();
    File tempInput = File.createTempFile(prefix, ".input", directory);
    File tempSorted = File.createTempFile(prefix, ".sorted", directory);

    TermFreqPayloadIterator payloads;
    if (iterator instanceof TermFreqPayloadIterator) {
      payloads = (TermFreqPayloadIterator) iterator;
    } else {
      payloads = null;
    }
    hasPayloads = payloads != null;

    Sort.ByteSequencesWriter writer = new Sort.ByteSequencesWriter(tempInput);
    Sort.ByteSequencesReader reader = null;
    BytesRef scratch = new BytesRef();

    TokenStreamToAutomaton ts2a = getTokenStreamToAutomaton();

    boolean success = false;
    byte buffer[] = new byte[8];
    try {
      ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
      BytesRef surfaceForm;

      while ((surfaceForm = iterator.next()) != null) {
        Set<IntsRef> paths = toFiniteStrings(surfaceForm, ts2a);
       
        maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, paths.size());

        for (IntsRef path : paths) {

          Util.toBytesRef(path, scratch);
         
          // length of the analyzed text (FST input)
          if (scratch.length > Short.MAX_VALUE-2) {
            throw new IllegalArgumentException("cannot handle analyzed forms > " + (Short.MAX_VALUE-2) + " in length (got " + scratch.length + ")");
          }
          short analyzedLength = (short) scratch.length;

          // compute the required length:
          // analyzed sequence + weight (4) + surface + analyzedLength (short)
          int requiredLength = analyzedLength + 4 + surfaceForm.length + 2;

          BytesRef payload;

          if (hasPayloads) {
            if (surfaceForm.length > (Short.MAX_VALUE-2)) {
              throw new IllegalArgumentException("cannot handle surface form > " + (Short.MAX_VALUE-2) + " in length (got " + surfaceForm.length + ")");
            }
            payload = payloads.payload();
            // payload + surfaceLength (short)
            requiredLength += payload.length + 2;
          } else {
            payload = null;
          }
View Full Code Here

    String prefix = getClass().getSimpleName();
    File directory = Sort.defaultTempDir();
    File tempInput = File.createTempFile(prefix, ".input", directory);
    File tempSorted = File.createTempFile(prefix, ".sorted", directory);

    TermFreqPayloadIterator payloads;
    if (iterator instanceof TermFreqPayloadIterator) {
      payloads = (TermFreqPayloadIterator) iterator;
    } else {
      payloads = null;
    }
    hasPayloads = payloads != null;

    Sort.ByteSequencesWriter writer = new Sort.ByteSequencesWriter(tempInput);
    Sort.ByteSequencesReader reader = null;
    BytesRef scratch = new BytesRef();

    TokenStreamToAutomaton ts2a = getTokenStreamToAutomaton();

    boolean success = false;
    byte buffer[] = new byte[8];
    try {
      ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
      BytesRef surfaceForm;

      while ((surfaceForm = iterator.next()) != null) {
        Set<IntsRef> paths = toFiniteStrings(surfaceForm, ts2a);
       
        maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, paths.size());

        for (IntsRef path : paths) {

          Util.toBytesRef(path, scratch);
         
          // length of the analyzed text (FST input)
          if (scratch.length > Short.MAX_VALUE-2) {
            throw new IllegalArgumentException("cannot handle analyzed forms > " + (Short.MAX_VALUE-2) + " in length (got " + scratch.length + ")");
          }
          short analyzedLength = (short) scratch.length;

          // compute the required length:
          // analyzed sequence + weight (4) + surface + analyzedLength (short)
          int requiredLength = analyzedLength + 4 + surfaceForm.length + 2;

          BytesRef payload;

          if (hasPayloads) {
            if (surfaceForm.length > (Short.MAX_VALUE-2)) {
              throw new IllegalArgumentException("cannot handle surface form > " + (Short.MAX_VALUE-2) + " in length (got " + surfaceForm.length + ")");
            }
            payload = payloads.payload();
            // payload + surfaceLength (short)
            requiredLength += payload.length + 2;
          } else {
            payload = null;
          }
View Full Code Here

TOP

Related Classes of org.apache.lucene.search.spell.TermFreqPayloadIterator

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.