Package com.greplin.lucene.filter

Source Code of com.greplin.lucene.filter.PhraseFilterBenchmark

/*
* Copyright 2013 The greplin-lucene-utils Authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.greplin.lucene.filter;

import com.google.common.base.Joiner;
import com.greplin.lucene.query.BooleanQueryBuilder;
import com.greplin.lucene.util.FilterIntersectionProvider;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.FilteredQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

import java.io.IOException;
import java.util.Random;

/**
* Random benchmark for the phrase filter.
*/
public class PhraseFilterBenchmark {

  private static final Random RANDOM = new Random();

  private static final int TOTAL_DOCS = 10000;

  private static final int AVERAGE_WORDS_PER_DOC = 100;

  private static final int WORDS_PER_DOC_DEVIATION = 95;

  private static final int SECOND_FIELD_MATCH_PERCENTAGE = 50;


  private static final int ROUNDS = 4;

  private static final int TOTAL_QUERIES = 5000;

  private static final int[] WORDS_PER_QUERY = {2, 2, 2, 2, 3, 3, 4, 5};


  private static final String FIELD = "f";


  private static final String[] WORDS = (
      "the quick brown fox jumps over the lazy dog"
      + " a stitch in time saves nine"
      + " an apple a day keeps the doctor away"
      + " two wrongs do not make a right"
      + " the pen is mightier than the sword"
      + " the squeaky wheel gets the grease"
      + " no man is an island"
      + " fortune favors the bold"
      + " people who live in glass houses should not throw stones"
      + " better late than never"
      + " hope for the best but prepare for the worst"
      + " birds of a feather flock together"
      + " keep your friends close and your enemies closer"
      + " a picture is worth a thousand words"
      + " there is no such thing as a free lunch"
      + " there is no place like home"
      + " discretion is the greater part of valor"
      + " the early bird catches the worm"
      + " never look a gift horse in the mouth"
      + " you cannot make an omelet without breaking a few eggs"
      + " you cannot always get what you want"
      + " cleanliness is next to godliness"
      + " a watched pot never boils"
      + " beggars cannot be choosers"
      + " actions speak louder than words"
      + " if it is not broke, do not fix it"
      + " practice makes perfect"
      + " too many cooks spoil the broth"
      + " easy come easy go"
      + " do not bite the hand that feeds you"
      + " all good things must come to an end"
      + " if you cannot beat them, join them"
      + " there is no time like the present"
      + " beauty is in the eye of the beholder"
      + " necessity is the mother of invention"
      + " a penny saved is a penny earned"
      + " familiarity breeds contempt"
      + " you cannot judge a book by its cover"
      + " good things come to those who wait"
      + " do not put all your eggs in one basket"
      + " two heads are better than one"
      + " the grass is always greener on the other side of the hill"
      + " do unto others as you would have them do unto you"
      + " a chain is only as strong as its weakest link"
      + " honesty is the best policy"
      + " absence makes the heart grow fonder"
      + " you can lead a horse to water but you cannot make him drink"
      + " do not count your chickens before they hatch"
      + " if you want something done right you have to do it yourself"
  ).split(" ");

  private static final int NUMBER_OF_SEGMENTS = 4;

  private static String[] words(int count) {
    String[] words = new String[count];
    for (int w = 0; w < count; w++) {
      words[w] = WORDS[RANDOM.nextInt(WORDS.length)];
    }
    return words;
  }

  public static void main(String[] argv) {
    Directory directory = new RAMDirectory();
    try {
      IndexWriter writer = new IndexWriter(
          directory, new IndexWriterConfig(Version.LUCENE_32, new WhitespaceAnalyzer(Version.LUCENE_32)));
      int done = 0;
      for (int i = 0; i < NUMBER_OF_SEGMENTS; i++) {
        int remaining = NUMBER_OF_SEGMENTS - i;
        int numberOfDocs;
        if (remaining == 1) {
          numberOfDocs = TOTAL_DOCS - done;
        } else {
          numberOfDocs = RANDOM.nextInt(TOTAL_DOCS - done - remaining) + 1;
        }
        done += numberOfDocs;
        System.out.println("Segment #" + i + " has " + numberOfDocs + " docs");

        for (int d = 0; d < numberOfDocs; d++) {
          int wordCount = RANDOM.nextInt(WORDS_PER_DOC_DEVIATION * 2) + AVERAGE_WORDS_PER_DOC - WORDS_PER_DOC_DEVIATION;
          Document doc = new Document();
          doc.add(new Field("f", Joiner.on(' ').join(words(wordCount)), Field.Store.YES, Field.Index.ANALYZED));
          doc.add(new Field("second", RANDOM.nextInt(100) < SECOND_FIELD_MATCH_PERCENTAGE ? "yes" : "no",
              Field.Store.NO, Field.Index.ANALYZED));
          writer.addDocument(doc);
        }
        writer.commit();
      }
      writer.close();

      IndexReader reader = IndexReader.open(directory);
      IndexSearcher searcher = new IndexSearcher(reader);

      String[][] queries = new String[TOTAL_QUERIES][];
      Term[][] terms = new Term[TOTAL_QUERIES][];

      for (int q = 0; q < TOTAL_QUERIES; q++) {
        queries[q] = words(WORDS_PER_QUERY[RANDOM.nextInt(WORDS_PER_QUERY.length)]);
        terms[q] = new Term[queries[q].length];
        for (int qw = 0; qw < queries[q].length; qw++) {
          terms[q][qw] = new Term(FIELD, queries[q][qw]);
        }
      }

      // Warm up.
      new PhraseFilter(FIELD, queries[0]).getDocIdSet(reader);

      for (int round = 0; round < ROUNDS; round++) {
        System.out.println();
        String name1 = "filter";
        String name2 = "query";

        long ms1 = 0, ms2 = 0;
        for (int step = 0; step < 2; step++) {
          System.gc();
          System.gc();
          System.gc();

          if (step == (round & 1)) {
            long millis = System.currentTimeMillis();
            long hits = 0;
            for (String[] queryWords : queries) {
              PhraseFilter pf = new PhraseFilter(
                  new FilterIntersectionProvider(TermsFilter.from(new Term("second", "yes"))), FIELD, queryWords);
              hits += searcher.search(new FilteredQuery(new MatchAllDocsQuery(), pf), 1).totalHits;
            }
            ms1 = System.currentTimeMillis() - millis;
            System.out.println("Finished " + name1 + " in " + ms1 + "ms with " + hits + " hits");
          } else {
            long millis = System.currentTimeMillis();
            long hits = 0;
            for (Term[] queryTerms : terms) {
              PhraseQuery pq = new PhraseQuery();
              for (Term term : queryTerms) {
                pq.add(term);
              }
              Query query = BooleanQueryBuilder.builder()
                  .must(new TermQuery(new Term("second", "yes")))
                  .must(pq)
                  .build();
              hits += searcher.search(query, 1).totalHits;
            }
            ms2 = System.currentTimeMillis() - millis;
            System.out.println("Finished " + name2 + " in " + ms2 + "ms with " + hits + " hits");
          }
        }
        System.out.println(name1 + " took " + (int) ((100.0 * ms1) / ms2) + "% as much time as " + name2);
      }

    } catch (IOException e) {
      e.printStackTrace();
    }
  }


}
TOP

Related Classes of com.greplin.lucene.filter.PhraseFilterBenchmark

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.