Examples of org.apache.lucene.util.LineFileDocs

org.apache.lucene.util.LineFileDocs
Minimal port of benchmark's LneDocSource + DocMaker, so tests can enum docs from a line file created by benchmark's WriteLineDoc task

  @Test
  public void testNRTThreads() throws Exception {


    final long t0 = System.currentTimeMillis();


    final LineFileDocs docs = new LineFileDocs(random);
    final File tempDir = _TestUtil.getTempDir("nrtopenfiles");
    final MockDirectoryWrapper dir = newFSDirectory(tempDir);
    dir.setCheckIndexOnClose(false); // don't double-checkIndex, we do it ourselves.
    final IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random));
    conf.setMergedSegmentWarmer(new IndexWriter.IndexReaderWarmer() {
      @Override
      public void warm(IndexReader reader) throws IOException {
        if (VERBOSE) {
          System.out.println("TEST: now warm merged reader=" + reader);
        }
        final int maxDoc = reader.maxDoc();
        int sum = 0;
        final int inc = Math.max(1, maxDoc/50);
        for(int docID=0;docID<maxDoc;docID += inc) {
          if (reader.isDeleted(docID)) {
            final Document doc = reader.document(docID);
            sum += doc.getFields().size();
          }
        }


        IndexSearcher searcher = newSearcher(reader);
        sum += searcher.search(new TermQuery(new Term("body", "united")), 10).totalHits;
        searcher.close();


        if (VERBOSE) {
          System.out.println("TEST: warm visited " + sum + " fields");
        }
      }
      });
    
    final IndexWriter writer = new IndexWriter(dir, conf);
    if (VERBOSE) {
      writer.setInfoStream(System.out);
    }
    _TestUtil.reduceOpenFiles(writer);


    final int NUM_INDEX_THREADS = 2;
    final int NUM_SEARCH_THREADS = 3;


    final int RUN_TIME_SEC = LuceneTestCase.TEST_NIGHTLY ? 300 : RANDOM_MULTIPLIER;


    final AtomicBoolean failed = new AtomicBoolean();
    final AtomicInteger addCount = new AtomicInteger();
    final AtomicInteger delCount = new AtomicInteger();
    final AtomicInteger packCount = new AtomicInteger();


    final Set<String> delIDs = Collections.synchronizedSet(new HashSet<String>());
    final List<SubDocs> allSubDocs = Collections.synchronizedList(new ArrayList<SubDocs>());


    final long stopTime = System.currentTimeMillis() + RUN_TIME_SEC*1000;
    Thread[] threads = new Thread[NUM_INDEX_THREADS];
    for(int thread=0;thread<NUM_INDEX_THREADS;thread++) {
      threads[thread] = new Thread() {
          @Override
          public void run() {
            // TODO: would be better if this were cross thread, so that we make sure one thread deleting anothers added docs works:
            final List<String> toDeleteIDs = new ArrayList<String>();
            final List<SubDocs> toDeleteSubDocs = new ArrayList<SubDocs>();
            while(System.currentTimeMillis() < stopTime && !failed.get()) {
              try {
                Document doc = docs.nextDoc();
                if (doc == null) {
                  break;
                }
                final String addedField;
                if (random.nextBoolean()) {
                  addedField = "extra" + random.nextInt(10);
                  doc.add(new Field(addedField, "a random field", Field.Store.NO, Field.Index.ANALYZED));
                } else {
                  addedField = null;
                }
                if (random.nextBoolean()) {
                  if (VERBOSE) {
                    System.out.println(Thread.currentThread().getName() + ": add doc id:" + doc.get("docid"));
                  }


                  if (random.nextBoolean()) {
                    // Add a pack of adjacent sub-docs
                    final String packID;
                    final SubDocs delSubDocs;
                    if (toDeleteSubDocs.size() > 0 && random.nextBoolean()) {
                      delSubDocs = toDeleteSubDocs.get(random.nextInt(toDeleteSubDocs.size()));
                      assert !delSubDocs.deleted;
                      toDeleteSubDocs.remove(delSubDocs);
                      // reuse prior packID
                      packID = delSubDocs.packID;
                    } else {
                      delSubDocs = null;
                      // make new packID
                      packID = packCount.getAndIncrement() + "";
                    }


                    final Field packIDField = newField("packID", packID, Field.Store.YES, Field.Index.NOT_ANALYZED);
                    final List<String> docIDs = new ArrayList<String>();
                    final SubDocs subDocs = new SubDocs(packID, docIDs);
                    final List<Document> docsList = new ArrayList<Document>();


                    allSubDocs.add(subDocs);
                    doc.add(packIDField);
                    docsList.add(cloneDoc(doc));
                    docIDs.add(doc.get("docid"));


                    final int maxDocCount = _TestUtil.nextInt(random, 1, 10);
                    while(docsList.size() < maxDocCount) {
                      doc = docs.nextDoc();
                      if (doc == null) {
                        break;
                      }
                      docsList.add(cloneDoc(doc));
                      docIDs.add(doc.get("docid"));
                    }
                    addCount.addAndGet(docsList.size());


                    if (delSubDocs != null) {
                      delSubDocs.deleted = true;
                      delIDs.addAll(delSubDocs.subIDs);
                      delCount.addAndGet(delSubDocs.subIDs.size());
                      if (VERBOSE) {
                        System.out.println("TEST: update pack packID=" + delSubDocs.packID + " count=" + docsList.size() + " docs=" + docIDs);
                      }
                      writer.updateDocuments(new Term("packID", delSubDocs.packID), docsList);
                      /*
                      // non-atomic:
                      writer.deleteDocuments(new Term("packID", delSubDocs.packID));
                      for(Document subDoc : docsList) {
                        writer.addDocument(subDoc);
                      }
                      */
                    } else {
                      if (VERBOSE) {
                        System.out.println("TEST: add pack packID=" + packID + " count=" + docsList.size() + " docs=" + docIDs);
                      }
                      writer.addDocuments(docsList);
                      
                      /*
                      // non-atomic:
                      for(Document subDoc : docsList) {
                        writer.addDocument(subDoc);
                      }
                      */
                    }
                    doc.removeField("packID");


                    if (random.nextInt(5) == 2) {
                      if (VERBOSE) {
                        //System.out.println(Thread.currentThread().getName() + ": buffer del id:" + packID);
                      }
                      toDeleteSubDocs.add(subDocs);
                    }


                  } else {
                    writer.addDocument(doc);
                    addCount.getAndIncrement();


                    if (random.nextInt(5) == 3) {
                      if (VERBOSE) {
                        //System.out.println(Thread.currentThread().getName() + ": buffer del id:" + doc.get("docid"));
                      }
                      toDeleteIDs.add(doc.get("docid"));
                    }
                  }
                } else {
                  // we use update but it never replaces a
                  // prior doc
                  if (VERBOSE) {
                    System.out.println(Thread.currentThread().getName() + ": update doc id:" + doc.get("docid"));
                  }
                  writer.updateDocument(new Term("docid", doc.get("docid")), doc);
                  addCount.getAndIncrement();


                  if (random.nextInt(5) == 3) {
                    if (VERBOSE) {
                      //System.out.println(Thread.currentThread().getName() + ": buffer del id:" + doc.get("docid"));
                    }
                    toDeleteIDs.add(doc.get("docid"));
                  }
                }


                if (random.nextInt(30) == 17) {
                  if (VERBOSE) {
                    System.out.println(Thread.currentThread().getName() + ": apply " + toDeleteIDs.size() + " deletes");
                  }
                  for(String id : toDeleteIDs) {
                    if (VERBOSE) {
                      System.out.println(Thread.currentThread().getName() + ": del term=id:" + id);
                    }
                    writer.deleteDocuments(new Term("docid", id));
                  }
                  final int count = delCount.addAndGet(toDeleteIDs.size());
                  if (VERBOSE) {
                    System.out.println(Thread.currentThread().getName() + ": tot " + count + " deletes");
                  }
                  delIDs.addAll(toDeleteIDs);
                  toDeleteIDs.clear();


                  for(SubDocs subDocs : toDeleteSubDocs) {
                    assert !subDocs.deleted;
                    writer.deleteDocuments(new Term("packID", subDocs.packID));
                    subDocs.deleted = true;
                    if (VERBOSE) {
                      System.out.println("  del subs: " + subDocs.subIDs + " packID=" + subDocs.packID);
                    }
                    delIDs.addAll(subDocs.subIDs);
                    delCount.addAndGet(subDocs.subIDs.size());
                  }
                  toDeleteSubDocs.clear();
                }
                if (addedField != null) {
                  doc.removeField(addedField);
                }
              } catch (Throwable t) {
                System.out.println(Thread.currentThread().getName() + ": hit exc");
                t.printStackTrace();
                failed.set(true);
                throw new RuntimeException(t);
              }
            }
            if (VERBOSE) {
              System.out.println(Thread.currentThread().getName() + ": indexing done");
            }
          }
        };
      threads[thread].setDaemon(true);
      threads[thread].start();
    }


    if (VERBOSE) {
      System.out.println("TEST: DONE start indexing threads [" + (System.currentTimeMillis()-t0) + " ms]");
    }


    // let index build up a bit
    Thread.sleep(100);


    IndexReader r = IndexReader.open(writer, true);
    boolean any = false;


    // silly starting guess:
    final AtomicInteger totTermCount = new AtomicInteger(100);


    final ExecutorService es = Executors.newCachedThreadPool();


    while(System.currentTimeMillis() < stopTime && !failed.get()) {
      if (random.nextBoolean()) {
        if (VERBOSE) {
          System.out.println("TEST: now reopen r=" + r);
        }
        final IndexReader r2 = r.reopen();
        if (r != r2) {
          r.close();
          r = r2;
        }
      } else {
        if (VERBOSE) {
          System.out.println("TEST: now close reader=" + r);
        }
        r.close();
        writer.commit();
        final Set<String> openDeletedFiles = dir.getOpenDeletedFiles();
        if (openDeletedFiles.size() > 0) {
          System.out.println("OBD files: " + openDeletedFiles);
        }
        any |= openDeletedFiles.size() > 0;
        //assertEquals("open but deleted: " + openDeletedFiles, 0, openDeletedFiles.size());
        if (VERBOSE) {
          System.out.println("TEST: now open");
        }
        r = IndexReader.open(writer, true);
      }
      if (VERBOSE) {
        System.out.println("TEST: got new reader=" + r);
      }
      //System.out.println("numDocs=" + r.numDocs() + "
      //openDelFileCount=" + dir.openDeleteFileCount());


      smokeTestReader(r);


      if (r.numDocs() > 0) {


        final IndexSearcher s = new IndexSearcher(r, es);


        // run search threads
        final long searchStopTime = System.currentTimeMillis() + 500;
        final Thread[] searchThreads = new Thread[NUM_SEARCH_THREADS];
        final AtomicInteger totHits = new AtomicInteger();
        for(int thread=0;thread<NUM_SEARCH_THREADS;thread++) {
          searchThreads[thread] = new Thread() {
              @Override
              public void run() {
                try {
                  TermEnum termEnum = s.getIndexReader().terms(new Term("body", ""));
                  int seenTermCount = 0;
                  int shift;
                  int trigger;
                  if (totTermCount.get() < 10) {
                    shift = 0;
                    trigger = 1;
                  } else {
                    trigger = totTermCount.get()/10;
                    shift = random.nextInt(trigger);
                  }
                  while(System.currentTimeMillis() < searchStopTime) {
                    Term term = termEnum.term();
                    if (term == null) {
                      if (seenTermCount < 10) {
                        break;
                      }
                      totTermCount.set(seenTermCount);
                      seenTermCount = 0;
                      trigger = totTermCount.get()/10;
                      //System.out.println("trigger " + trigger);
                      shift = random.nextInt(trigger);
                      termEnum = s.getIndexReader().terms(new Term("body", ""));
                      continue;
                    }
                    seenTermCount++;
                    // search 10 terms
                    if (trigger == 0) {
                      trigger = 1;
                    }
                    if ((seenTermCount + shift) % trigger == 0) {
                      //if (VERBOSE) {
                      //System.out.println(Thread.currentThread().getName() + " now search body:" + term.utf8ToString());
                      //}
                      totHits.addAndGet(runQuery(s, new TermQuery(term)));
                    }
                    termEnum.next();
                  }
                  if (VERBOSE) {
                    System.out.println(Thread.currentThread().getName() + ": search done");
                  }
                } catch (Throwable t) {
                  System.out.println(Thread.currentThread().getName() + ": hit exc");
                  failed.set(true);
                  t.printStackTrace(System.out);
                  throw new RuntimeException(t);
                }
              }
            };
          searchThreads[thread].setDaemon(true);
          searchThreads[thread].start();
        }


        for(int thread=0;thread<NUM_SEARCH_THREADS;thread++) {
          searchThreads[thread].join();
        }


        if (VERBOSE) {
          System.out.println("TEST: DONE search: totHits=" + totHits);
        }
      } else {
        Thread.sleep(100);
      }
    }


    es.shutdown();
    es.awaitTermination(1, TimeUnit.SECONDS);


    if (VERBOSE) {
      System.out.println("TEST: all searching done [" + (System.currentTimeMillis()-t0) + " ms]");
    }


    //System.out.println("numDocs=" + r.numDocs() + " openDelFileCount=" + dir.openDeleteFileCount());
    r.close();
    final Set<String> openDeletedFiles = dir.getOpenDeletedFiles();
    if (openDeletedFiles.size() > 0) {
      System.out.println("OBD files: " + openDeletedFiles);
    }
    any |= openDeletedFiles.size() > 0;


    assertFalse("saw non-zero open-but-deleted count", any);
    if (VERBOSE) {
      System.out.println("TEST: now join");
    }
    for(int thread=0;thread<NUM_INDEX_THREADS;thread++) {
      threads[thread].join();
    }
    if (VERBOSE) {
      System.out.println("TEST: done join [" + (System.currentTimeMillis()-t0) + " ms]; addCount=" + addCount + " delCount=" + delCount);
    }


    final IndexReader r2 = writer.getReader();
    final IndexSearcher s = newSearcher(r2);
    boolean doFail = false;
    for(String id : delIDs) {
      final TopDocs hits = s.search(new TermQuery(new Term("docid", id)), 1);
      if (hits.totalHits != 0) {
        System.out.println("doc id=" + id + " is supposed to be deleted, but got docID=" + hits.scoreDocs[0].doc);
        doFail = true;
      }
    }


    // Make sure each group of sub-docs are still in docID order:
    for(SubDocs subDocs : allSubDocs) {
      if (!subDocs.deleted) {
        // We sort by relevance but the scores should be identical so sort falls back to by docID:
        TopDocs hits = s.search(new TermQuery(new Term("packID", subDocs.packID)), 20);
        assertEquals(subDocs.subIDs.size(), hits.totalHits);
        int lastDocID = -1;
        int startDocID = -1;
        for(ScoreDoc scoreDoc : hits.scoreDocs) {
          final int docID = scoreDoc.doc;
          if (lastDocID != -1) {
            assertEquals(1+lastDocID, docID);
          } else {
            startDocID = docID;
          }
          lastDocID = docID;
          final Document doc = s.doc(docID);
          assertEquals(subDocs.packID, doc.get("packID"));
        }


        lastDocID = startDocID - 1;
        for(String subID : subDocs.subIDs) {
          hits = s.search(new TermQuery(new Term("docid", subID)), 1);
          assertEquals(1, hits.totalHits);
          final int docID = hits.scoreDocs[0].doc;
          if (lastDocID != -1) {
            assertEquals(1+lastDocID, docID);
          }
          lastDocID = docID;
        }          
      } else {
        for(String subID : subDocs.subIDs) {
          assertEquals(0, s.search(new TermQuery(new Term("docid", subID)), 1).totalHits);
        }
      }
    }
    
    final int endID = Integer.parseInt(docs.nextDoc().get("docid"));
    for(int id=0;id<endID;id++) {
      String stringID = ""+id;
      if (!delIDs.contains(stringID)) {
        final TopDocs hits = s.search(new TermQuery(new Term("docid", stringID)), 1);
        if (hits.totalHits != 1) {
          System.out.println("doc id=" + stringID + " is not supposed to be deleted, but got hitCount=" + hits.totalHits);
          doFail = true;
        }
      }
    }
    assertFalse(doFail);
    
    assertEquals("index=" + writer.segString() + " addCount=" + addCount + " delCount=" + delCount, addCount.get() - delCount.get(), r2.numDocs());
    r2.close();


    writer.commit();
    assertEquals("index=" + writer.segString() + " addCount=" + addCount + " delCount=" + delCount, addCount.get() - delCount.get(), writer.numDocs());


    assertFalse(writer.anyNonBulkMerges);
    writer.close(false);
    _TestUtil.checkIndex(dir);
    s.close();
    dir.close();
    _TestUtil.rmDir(tempDir);
    docs.close();
    if (VERBOSE) {
      System.out.println("TEST: done [" + (System.currentTimeMillis()-t0) + " ms]");
    }
  }

View Full Code Here

   */
  public static void createRandomIndex(int numdocs, RandomIndexWriter writer,
      long seed) throws IOException {
    Random random = new Random(seed);
    // primary source for our data is from linefiledocs, its realistic.
    LineFileDocs lineFileDocs = new LineFileDocs(random, false); // no docvalues in 4x
    
    // TODO: we should add other fields that use things like docs&freqs but omit
    // positions,
    // because linefiledocs doesn't cover all the possibilities.
    for (int i = 0; i < numdocs; i++) {
      writer.addDocument(lineFileDocs.nextDoc());
    }
    
    lineFileDocs.close();
  }

View Full Code Here

    }
  }


  private static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter, boolean simple, boolean offsetsAreCorrect, RandomIndexWriter iw) throws IOException {


    final LineFileDocs docs = new LineFileDocs(random);
    Document doc = null;
    Field field = null, currentField = null;
    StringReader bogus = new StringReader("");
    if (iw != null) {
      doc = new Document();
      FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
      if (random.nextBoolean()) {
        ft.setStoreTermVectors(true);
        ft.setStoreTermVectorOffsets(random.nextBoolean());
        ft.setStoreTermVectorPositions(random.nextBoolean());
        if (ft.storeTermVectorPositions() && !PREFLEX_IMPERSONATION_IS_ACTIVE) {
          ft.setStoreTermVectorPayloads(random.nextBoolean());
        }
      }
      if (random.nextBoolean()) {
        ft.setOmitNorms(true);
      }
      String pf = _TestUtil.getPostingsFormat("dummy");
      boolean supportsOffsets = !doesntSupportOffsets.contains(pf);
      switch(random.nextInt(4)) {
        case 0: ft.setIndexOptions(IndexOptions.DOCS_ONLY); break;
        case 1: ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS); break;
        case 2: ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); break;
        default:
                if (supportsOffsets && offsetsAreCorrect) {
                  ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
                } else {
                  ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
                }
      }
      currentField = field = new Field("dummy", bogus, ft);
      doc.add(currentField);
    }
    
    try {
      for (int i = 0; i < iterations; i++) {
        String text;
        
        if (random.nextInt(10) == 7) {
          // real data from linedocs
          text = docs.nextDoc().get("body");
          if (text.length() > maxWordLength) {
            
            // Take a random slice from the text...:
            int startPos = random.nextInt(text.length() - maxWordLength);
            if (startPos > 0 && Character.isLowSurrogate(text.charAt(startPos))) {

View Full Code Here

    }
  }


  @Ignore
  public void testWiki() throws Exception {
    final LineFileDocs lfd = new LineFileDocs(null, "/lucenedata/enwiki/enwiki-20120502-lines-1k.txt", false);
    // Skip header:
    lfd.nextDoc();
    FreeTextSuggester sug = new FreeTextSuggester(new MockAnalyzer(random()));
    sug.build(new InputIterator() {


        private int count;


        @Override
        public long weight() {
          return 1;
        }


        @Override
        public Comparator<BytesRef> getComparator() {
          return null;
        }


        @Override
        public BytesRef next() {
          Document doc;
          try {
            doc = lfd.nextDoc();
          } catch (IOException ioe) {
            throw new RuntimeException(ioe);
          }
          if (doc == null) {
            return null;

View Full Code Here

   * populates a writer with random stuff. this must be fully reproducable with the seed!
   */
  public static void createRandomIndex(int numdocs, RandomIndexWriter writer, long seed) throws IOException {
    Random random = new Random(seed);
    // primary source for our data is from linefiledocs, its realistic.
    LineFileDocs lineFileDocs = new LineFileDocs(random);


    // TODO: we should add other fields that use things like docs&freqs but omit positions,
    // because linefiledocs doesn't cover all the possibilities.
    for (int i = 0; i < numdocs; i++) {
      Document document = lineFileDocs.nextDoc();
      // grab the title and add some SortedSet instances for fun
      String title = document.get("titleTokenized");
      String split[] = title.split("\\s+");
      for (String trash : split) {
        document.add(new SortedSetDocValuesField("sortedset", new BytesRef(trash)));
      }
      // add a numeric dv field sometimes
      document.removeFields("sparsenumeric");
      if (random.nextInt(4) == 2) {
        document.add(new NumericDocValuesField("sparsenumeric", random.nextInt()));
      }
      writer.addDocument(document);
    }
    
    lineFileDocs.close();
  }

View Full Code Here

    IndexWriterConfig config = newIndexWriterConfig(TEST_VERSION_CURRENT,
        new MockAnalyzer(random()));
    Similarity provider = new MySimProvider();
    config.setSimilarity(provider);
    RandomIndexWriter writer = new RandomIndexWriter(random(), dir, config);
    final LineFileDocs docs = new LineFileDocs(random());
    int num = atLeast(100);
    for (int i = 0; i < num; i++) {
      Document doc = docs.nextDoc();
      float nextFloat = random().nextFloat();
      Field f = new TextField(floatTestField, "" + nextFloat, Field.Store.YES);
      f.setBoost(nextFloat);


      doc.add(f);
      writer.addDocument(doc);
      doc.removeField(floatTestField);
      if (rarely()) {
        writer.commit();
      }
    }
    writer.commit();
    writer.close();
    AtomicReader open = SlowCompositeReaderWrapper.wrap(DirectoryReader.open(dir));
    NumericDocValues norms = open.getNormValues(floatTestField);
    assertNotNull(norms);
    for (int i = 0; i < open.maxDoc(); i++) {
      Document document = open.document(i);
      float expected = Float.parseFloat(document.get(floatTestField));
      assertEquals(expected, Float.intBitsToFloat((int)norms.get(i)), 0.0f);
    }
    open.close();
    dir.close();
    docs.close();
  }

View Full Code Here

    NRTCachingDirectory cachedDir = new NRTCachingDirectory(dir, 2.0, 25.0);
    IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random));
    conf.setMergeScheduler(cachedDir.getMergeScheduler());
    RandomIndexWriter w = new RandomIndexWriter(random, cachedDir, conf);
    w.w.setInfoStream(VERBOSE ? System.out : null);
    final LineFileDocs docs = new LineFileDocs(random);    
    final int numDocs = _TestUtil.nextInt(random, 100, 400);


    if (VERBOSE) {
      System.out.println("TEST: numDocs=" + numDocs);
    }


    final List<String> ids = new ArrayList<String>();
    IndexReader r = null;
    for(int docCount=0;docCount<numDocs;docCount++) {
      final Document doc = docs.nextDoc();
      ids.add(new String(doc.get("docid")));
      w.addDocument(doc);
      if (random.nextInt(20) == 17) {
        if (r == null) {
          r = IndexReader.open(w.w, false);

View Full Code Here

  @Test
  public void testNRTManager() throws Exception {


    final long t0 = System.currentTimeMillis();


    final LineFileDocs docs = new LineFileDocs(random);
    final File tempDir = _TestUtil.getTempDir("nrtopenfiles");
    final MockDirectoryWrapper _dir = newFSDirectory(tempDir);
    _dir.setCheckIndexOnClose(false);  // don't double-checkIndex, we do it ourselves
    Directory dir = _dir;
    final IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setOpenMode(IndexWriterConfig.OpenMode.CREATE);


    if (LuceneTestCase.TEST_NIGHTLY) {
      // newIWConfig makes smallish max seg size, which
      // results in tons and tons of segments for this test
      // when run nightly:
      MergePolicy mp = conf.getMergePolicy();
      if (mp instanceof TieredMergePolicy) {
        ((TieredMergePolicy) mp).setMaxMergedSegmentMB(5000.);
      } else if (mp instanceof LogByteSizeMergePolicy) {
        ((LogByteSizeMergePolicy) mp).setMaxMergeMB(1000.);
      } else if (mp instanceof LogMergePolicy) {
        ((LogMergePolicy) mp).setMaxMergeDocs(100000);
      }
    }


    conf.setMergedSegmentWarmer(new IndexWriter.IndexReaderWarmer() {
      @Override
      public void warm(IndexReader reader) throws IOException {
        if (VERBOSE) {
          System.out.println("TEST: now warm merged reader=" + reader);
        }
        final int maxDoc = reader.maxDoc();
        int sum = 0;
        final int inc = Math.max(1, maxDoc/50);
        for(int docID=0;docID<maxDoc;docID += inc) {
          if (!reader.isDeleted(docID)) {
            final Document doc = reader.document(docID);
            sum += doc.getFields().size();
          }
        }


        IndexSearcher searcher = newSearcher(reader);
        sum += searcher.search(new TermQuery(new Term("body", "united")), 10).totalHits;
        searcher.close();


        if (VERBOSE) {
          System.out.println("TEST: warm visited " + sum + " fields");
        }
      }
      });


    if (random.nextBoolean()) {
      if (VERBOSE) {
        System.out.println("TEST: wrap NRTCachingDir");
      }


      NRTCachingDirectory nrtDir = new NRTCachingDirectory(dir, 5.0, 60.0);
      conf.setMergeScheduler(nrtDir.getMergeScheduler());
      dir = nrtDir;
    }
    
    final IndexWriter writer = new IndexWriter(dir, conf);
    
    if (VERBOSE) {
      writer.setInfoStream(System.out);
    }
    _TestUtil.reduceOpenFiles(writer);
    //System.out.println("TEST: conf=" + writer.getConfig());


    final ExecutorService es = random.nextBoolean() ? null : Executors.newCachedThreadPool(new NamedThreadFactory("NRT search threads"));


    final double minReopenSec = 0.01 + 0.05 * random.nextDouble();
    final double maxReopenSec = minReopenSec * (1.0 + 10 * random.nextDouble());


    if (VERBOSE) {
      System.out.println("TEST: make NRTManager maxReopenSec=" + maxReopenSec + " minReopenSec=" + minReopenSec);
    }


    final NRTManager nrt = new NRTManager(writer, es);
    final NRTManagerReopenThread nrtThread = new NRTManagerReopenThread(nrt, maxReopenSec, minReopenSec);
    nrtThread.setName("NRT Reopen Thread");
    nrtThread.setPriority(Math.min(Thread.currentThread().getPriority()+2, Thread.MAX_PRIORITY));
    nrtThread.setDaemon(true);
    nrtThread.start();


    final int NUM_INDEX_THREADS = _TestUtil.nextInt(random, 1, 3);
    final int NUM_SEARCH_THREADS = _TestUtil.nextInt(random, 1, 3);
    //final int NUM_INDEX_THREADS = 1;
    //final int NUM_SEARCH_THREADS = 1;
    if (VERBOSE) {
      System.out.println("TEST: " + NUM_INDEX_THREADS + " index threads; " + NUM_SEARCH_THREADS + " search threads");
    }


    final int RUN_TIME_SEC = LuceneTestCase.TEST_NIGHTLY ? 300 : RANDOM_MULTIPLIER;


    final AtomicBoolean failed = new AtomicBoolean();
    final AtomicInteger addCount = new AtomicInteger();
    final AtomicInteger delCount = new AtomicInteger();
    final AtomicInteger packCount = new AtomicInteger();
    final List<Long> lastGens = new ArrayList<Long>();


    final Set<String> delIDs = Collections.synchronizedSet(new HashSet<String>());
    final List<SubDocs> allSubDocs = Collections.synchronizedList(new ArrayList<SubDocs>());


    final long stopTime = System.currentTimeMillis() + RUN_TIME_SEC*1000;
    Thread[] threads = new Thread[NUM_INDEX_THREADS];
    for(int thread=0;thread<NUM_INDEX_THREADS;thread++) {
      threads[thread] = new Thread() {
          @Override
          public void run() {
            // TODO: would be better if this were cross thread, so that we make sure one thread deleting anothers added docs works:
            final List<String> toDeleteIDs = new ArrayList<String>();
            final List<SubDocs> toDeleteSubDocs = new ArrayList<SubDocs>();


            long gen = 0;
            while(System.currentTimeMillis() < stopTime && !failed.get()) {


              //System.out.println(Thread.currentThread().getName() + ": cycle");
              try {
                // Occassional longish pause if running
                // nightly
                if (LuceneTestCase.TEST_NIGHTLY && random.nextInt(6) == 3) {
                  if (VERBOSE) {
                    System.out.println(Thread.currentThread().getName() + ": now long sleep");
                  }
                  Thread.sleep(_TestUtil.nextInt(random, 50, 500));
                }


                // Rate limit ingest rate:
                Thread.sleep(_TestUtil.nextInt(random, 1, 10));
                if (VERBOSE) {
                  System.out.println(Thread.currentThread() + ": done sleep");
                }


                Document doc = docs.nextDoc();
                if (doc == null) {
                  break;
                }
                final String addedField;
                if (random.nextBoolean()) {
                  addedField = "extra" + random.nextInt(10);
                  doc.add(new Field(addedField, "a random field", Field.Store.NO, Field.Index.ANALYZED));
                } else {
                  addedField = null;
                }
                if (random.nextBoolean()) {


                  if (random.nextBoolean()) {
                    // Add a pack of adjacent sub-docs
                    final String packID;
                    final SubDocs delSubDocs;
                    if (toDeleteSubDocs.size() > 0 && random.nextBoolean()) {
                      delSubDocs = toDeleteSubDocs.get(random.nextInt(toDeleteSubDocs.size()));
                      assert !delSubDocs.deleted;
                      toDeleteSubDocs.remove(delSubDocs);
                      // reuse prior packID
                      packID = delSubDocs.packID;
                    } else {
                      delSubDocs = null;
                      // make new packID
                      packID = packCount.getAndIncrement() + "";
                    }


                    final Field packIDField = newField("packID", packID, Field.Store.YES, Field.Index.NOT_ANALYZED);
                    final List<String> docIDs = new ArrayList<String>();
                    final SubDocs subDocs = new SubDocs(packID, docIDs);
                    final List<Document> docsList = new ArrayList<Document>();


                    allSubDocs.add(subDocs);
                    doc.add(packIDField);
                    docsList.add(cloneDoc(doc));
                    docIDs.add(doc.get("docid"));


                    final int maxDocCount = _TestUtil.nextInt(random, 1, 10);
                    while(docsList.size() < maxDocCount) {
                      doc = docs.nextDoc();
                      if (doc == null) {
                        break;
                      }
                      docsList.add(cloneDoc(doc));
                      docIDs.add(doc.get("docid"));
                    }
                    addCount.addAndGet(docsList.size());


                    if (delSubDocs != null) {
                      delSubDocs.deleted = true;
                      delIDs.addAll(delSubDocs.subIDs);
                      delCount.addAndGet(delSubDocs.subIDs.size());
                      if (VERBOSE) {
                        System.out.println("TEST: update pack packID=" + delSubDocs.packID + " count=" + docsList.size() + " docs=" + docIDs);
                      }
                      gen = nrt.updateDocuments(new Term("packID", delSubDocs.packID), docsList);
                      /*
                      // non-atomic:
                      nrt.deleteDocuments(new Term("packID", delSubDocs.packID));
                      for(Document subDoc : docsList) {
                        nrt.addDocument(subDoc);
                      }
                      */
                    } else {
                      if (VERBOSE) {
                        System.out.println("TEST: add pack packID=" + packID + " count=" + docsList.size() + " docs=" + docIDs);
                      }
                      gen = nrt.addDocuments(docsList);
                      
                      /*
                      // non-atomic:
                      for(Document subDoc : docsList) {
                        nrt.addDocument(subDoc);
                      }
                      */
                    }
                    doc.removeField("packID");


                    if (random.nextInt(5) == 2) {
                      if (VERBOSE) {
                        System.out.println(Thread.currentThread().getName() + ": buffer del id:" + packID);
                      }
                      toDeleteSubDocs.add(subDocs);
                    }


                    // randomly verify the add/update "took":
                    if (random.nextInt(20) == 2) {
                      final boolean applyDeletes = delSubDocs != null;
                      final IndexSearcher s = nrt.get(gen, applyDeletes);
                      try {
                        assertEquals(docsList.size(), s.search(new TermQuery(new Term("packID", packID)), 10).totalHits);
                      } finally {
                        nrt.release(s);
                      }
                    }


                  } else {
                    if (VERBOSE) {
                      System.out.println(Thread.currentThread().getName() + ": add doc docid:" + doc.get("docid"));
                    }


                    gen = nrt.addDocument(doc);
                    addCount.getAndIncrement();


                    // randomly verify the add "took":
                    if (random.nextInt(20) == 2) {
                      //System.out.println(Thread.currentThread().getName() + ": verify");
                      final IndexSearcher s = nrt.get(gen, false);
                      //System.out.println(Thread.currentThread().getName() + ": got s=" + s);
                      try {
                        assertEquals(1, s.search(new TermQuery(new Term("docid", doc.get("docid"))), 10).totalHits);
                      } finally {
                        nrt.release(s);
                      }
                      //System.out.println(Thread.currentThread().getName() + ": done verify");
                    }


                    if (random.nextInt(5) == 3) {
                      if (VERBOSE) {
                        System.out.println(Thread.currentThread().getName() + ": buffer del id:" + doc.get("docid"));
                      }
                      toDeleteIDs.add(doc.get("docid"));
                    }
                  }
                } else {
                  // we use update but it never replaces a
                  // prior doc
                  if (VERBOSE) {
                    System.out.println(Thread.currentThread().getName() + ": update doc id:" + doc.get("docid"));
                  }
                  gen = nrt.updateDocument(new Term("docid", doc.get("docid")), doc);
                  addCount.getAndIncrement();


                  // randomly verify the add "took":
                  if (random.nextInt(20) == 2) {
                    final IndexSearcher s = nrt.get(gen, true);
                    try {
                      assertEquals(1, s.search(new TermQuery(new Term("docid", doc.get("docid"))), 10).totalHits);
                    } finally {
                      nrt.release(s);
                    }
                  }


                  if (random.nextInt(5) == 3) {
                    if (VERBOSE) {
                      System.out.println(Thread.currentThread().getName() + ": buffer del id:" + doc.get("docid"));
                    }
                    toDeleteIDs.add(doc.get("docid"));
                  }
                }


                if (random.nextInt(30) == 17) {
                  if (VERBOSE) {
                    System.out.println(Thread.currentThread().getName() + ": apply " + toDeleteIDs.size() + " deletes");
                  }
                  for(String id : toDeleteIDs) {
                    if (VERBOSE) {
                      System.out.println(Thread.currentThread().getName() + ": del term=id:" + id);
                    }
                    gen = nrt.deleteDocuments(new Term("docid", id));


                    // randomly verify the delete "took":
                    if (random.nextInt(20) == 7) {
                      final IndexSearcher s = nrt.get(gen, true);
                      try {
                        assertEquals(0, s.search(new TermQuery(new Term("docid", id)), 10).totalHits);
                      } finally {
                        nrt.release(s);
                      }
                    }
                  }


                  final int count = delCount.addAndGet(toDeleteIDs.size());
                  if (VERBOSE) {
                    System.out.println(Thread.currentThread().getName() + ": tot " + count + " deletes");
                  }
                  delIDs.addAll(toDeleteIDs);
                  toDeleteIDs.clear();


                  for(SubDocs subDocs : toDeleteSubDocs) {
                    assertTrue(!subDocs.deleted);
                    gen = nrt.deleteDocuments(new Term("packID", subDocs.packID));
                    subDocs.deleted = true;
                    if (VERBOSE) {
                      System.out.println("  del subs: " + subDocs.subIDs + " packID=" + subDocs.packID);
                    }
                    delIDs.addAll(subDocs.subIDs);
                    delCount.addAndGet(subDocs.subIDs.size());


                    // randomly verify the delete "took":
                    if (random.nextInt(20) == 7) {
                      final IndexSearcher s = nrt.get(gen, true);
                      try {
                        assertEquals(0, s.search(new TermQuery(new Term("packID", subDocs.packID)), 1).totalHits);
                      } finally {
                        nrt.release(s);
                      }
                    }
                  }
                  toDeleteSubDocs.clear();
                }
                if (addedField != null) {
                  doc.removeField(addedField);
                }
              } catch (Throwable t) {
                System.out.println(Thread.currentThread().getName() + ": FAILED: hit exc");
                t.printStackTrace();
                failed.set(true);
                throw new RuntimeException(t);
              }
            }


            lastGens.add(gen);
            if (VERBOSE) {
              System.out.println(Thread.currentThread().getName() + ": indexing done");
            }
          }
        };
      threads[thread].setDaemon(true);
      threads[thread].start();
    }


    if (VERBOSE) {
      System.out.println("TEST: DONE start indexing threads [" + (System.currentTimeMillis()-t0) + " ms]");
    }


    // let index build up a bit
    Thread.sleep(100);


    // silly starting guess:
    final AtomicInteger totTermCount = new AtomicInteger(100);


    // run search threads
    final Thread[] searchThreads = new Thread[NUM_SEARCH_THREADS];
    final AtomicInteger totHits = new AtomicInteger();


    if (VERBOSE) {
      System.out.println("TEST: start search threads");
    }


    for(int thread=0;thread<NUM_SEARCH_THREADS;thread++) {
      searchThreads[thread] = new Thread() {
          @Override
          public void run() {
            while(System.currentTimeMillis() < stopTime && !failed.get()) {
              final IndexSearcher s = nrt.get(random.nextBoolean());
              try {
                try {
                  smokeTestSearcher(s);
                  if (s.getIndexReader().numDocs() > 0) {


                    TermEnum termEnum = s.getIndexReader().terms(new Term("body", ""));
                    int seenTermCount = 0;
                    int shift;
                    int trigger;
                    if (totTermCount.get() < 10) {
                      shift = 0;
                      trigger = 1;
                    } else {
                      trigger = totTermCount.get()/10;
                      shift = random.nextInt(trigger);
                    }


                    while(System.currentTimeMillis() < stopTime) {
                      Term term = termEnum.term();
                      if (term == null) {
                        if (seenTermCount == 0) {
                          break;
                        }
                        totTermCount.set(seenTermCount);
                        seenTermCount = 0;
                        if (totTermCount.get() < 10) {
                          shift = 0;
                          trigger = 1;
                        } else {
                          trigger = totTermCount.get()/10;
                          //System.out.println("trigger " + trigger);
                          shift = random.nextInt(trigger);
                        }
                        termEnum = s.getIndexReader().terms(new Term("body", ""));
                        continue;
                      }
                      seenTermCount++;
                      // search 10 terms
                      if (trigger == 0) {
                        trigger = 1;
                      }
                      if ((seenTermCount + shift) % trigger == 0) {
                        //if (VERBOSE) {
                        //System.out.println(Thread.currentThread().getName() + " now search body:" + term.utf8ToString());
                        //}
                        totHits.addAndGet(runQuery(s, new TermQuery(term)));
                      }
                      termEnum.next();
                    }
                    if (VERBOSE) {
                      System.out.println(Thread.currentThread().getName() + ": search done");
                    }
                  }
                } finally {
                  nrt.release(s);
                }
              } catch (Throwable t) {
                System.out.println(Thread.currentThread().getName() + ": FAILED: hit exc");
                failed.set(true);
                t.printStackTrace(System.out);
                throw new RuntimeException(t);
              }
            }
          }
        };
      searchThreads[thread].setDaemon(true);
      searchThreads[thread].start();
    }


    if (VERBOSE) {
      System.out.println("TEST: now join");
    }
    for(int thread=0;thread<NUM_INDEX_THREADS;thread++) {
      threads[thread].join();
    }
    for(int thread=0;thread<NUM_SEARCH_THREADS;thread++) {
      searchThreads[thread].join();
    }


    if (VERBOSE) {
      System.out.println("TEST: done join [" + (System.currentTimeMillis()-t0) + " ms]; addCount=" + addCount + " delCount=" + delCount);
      System.out.println("TEST: search totHits=" + totHits);
    }


    long maxGen = 0;
    for(long gen : lastGens) {
      maxGen = Math.max(maxGen, gen);
    }


    final IndexSearcher s = nrt.get(maxGen, true);


    boolean doFail = false;
    for(String id : delIDs) {
      final TopDocs hits = s.search(new TermQuery(new Term("docid", id)), 1);
      if (hits.totalHits != 0) {
        System.out.println("doc id=" + id + " is supposed to be deleted, but got docID=" + hits.scoreDocs[0].doc);
        doFail = true;
      }
    }


    // Make sure each group of sub-docs are still in docID order:
    for(SubDocs subDocs : allSubDocs) {
      if (!subDocs.deleted) {
        // We sort by relevance but the scores should be identical so sort falls back to by docID:
        TopDocs hits = s.search(new TermQuery(new Term("packID", subDocs.packID)), 20);
        assertEquals(subDocs.subIDs.size(), hits.totalHits);
        int lastDocID = -1;
        int startDocID = -1;
        for(ScoreDoc scoreDoc : hits.scoreDocs) {
          final int docID = scoreDoc.doc;
          if (lastDocID != -1) {
            assertEquals(1+lastDocID, docID);
          } else {
            startDocID = docID;
          }
          lastDocID = docID;
          final Document doc = s.doc(docID);
          assertEquals(subDocs.packID, doc.get("packID"));
        }


        lastDocID = startDocID - 1;
        for(String subID : subDocs.subIDs) {
          hits = s.search(new TermQuery(new Term("docid", subID)), 1);
          assertEquals(1, hits.totalHits);
          final int docID = hits.scoreDocs[0].doc;
          if (lastDocID != -1) {
            assertEquals(1+lastDocID, docID);
          }
          lastDocID = docID;
        }          
      } else {
        for(String subID : subDocs.subIDs) {
          assertEquals(0, s.search(new TermQuery(new Term("docid", subID)), 1).totalHits);
        }
      }
    }
    
    final int endID = Integer.parseInt(docs.nextDoc().get("docid"));
    for(int id=0;id<endID;id++) {
      String stringID = ""+id;
      if (!delIDs.contains(stringID)) {
        final TopDocs hits = s.search(new TermQuery(new Term("docid", stringID)), 1);
        if (hits.totalHits != 1) {
          System.out.println("doc id=" + stringID + " is not supposed to be deleted, but got hitCount=" + hits.totalHits);
          doFail = true;
        }
      }
    }
    assertFalse(doFail);


    assertEquals("index=" + writer.segString() + " addCount=" + addCount + " delCount=" + delCount, addCount.get() - delCount.get(), s.getIndexReader().numDocs());
    nrt.release(s);


    if (es != null) {
      es.shutdown();
      es.awaitTermination(1, TimeUnit.SECONDS);
    }


    writer.commit();
    assertEquals("index=" + writer.segString() + " addCount=" + addCount + " delCount=" + delCount, addCount.get() - delCount.get(), writer.numDocs());


    if (VERBOSE) {
      System.out.println("TEST: now close NRTManager");
    }
    nrtThread.close();
    nrt.close();
    assertFalse(writer.anyNonBulkMerges);
    writer.close(false);
    _TestUtil.checkIndex(dir);
    dir.close();
    _TestUtil.rmDir(tempDir);
    docs.close();


    if (VERBOSE) {
      System.out.println("TEST: done [" + (System.currentTimeMillis()-t0) + " ms]");
    }
  }

View Full Code Here

      // no
      CodecProvider.getDefault().setDefaultFieldCodec("Standard");
    }
    */


    final LineFileDocs docs = new LineFileDocs(random);
    final int RUN_TIME_MSEC = atLeast(500);
    final IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setMaxBufferedDocs(-1).setRAMBufferSizeMB(64);
    final File tempDir = _TestUtil.getTempDir("fstlines");
    final MockDirectoryWrapper dir = new MockDirectoryWrapper(random, FSDirectory.open(tempDir));
    final IndexWriter writer = new IndexWriter(dir, conf);
    writer.setInfoStream(VERBOSE ? System.out : null);
    final long stopTime = System.currentTimeMillis() + RUN_TIME_MSEC;
    Document doc;
    int docCount = 0;
    while((doc = docs.nextDoc()) != null && System.currentTimeMillis() < stopTime) {
      writer.addDocument(doc);
      docCount++;
    }
    IndexReader r = IndexReader.open(writer, true);
    writer.close();

View Full Code Here

      assertEquals(new BytesRef("for all the fish"), results.get(2).payload);
    }
  }
  
  public void testRandomRealisticKeys() throws IOException {
    LineFileDocs lineFile = new LineFileDocs(random());
    Map<String, Long> mapping = new HashMap<String, Long>();
    List<Input> keys = new ArrayList<Input>();
    
    int howMany = atLeast(100); // this might bring up duplicates
    for (int i = 0; i < howMany; i++) {
      Document nextDoc = lineFile.nextDoc();
      String title = nextDoc.getField("title").stringValue();
      int randomWeight = random().nextInt(100);
      keys.add(new Input(title, randomWeight));
      if (!mapping.containsKey(title) || mapping.get(title) < randomWeight) {
          mapping.put(title, Long.valueOf(randomWeight));
      }
    }
    AnalyzingSuggester analyzingSuggester = new AnalyzingSuggester(new MockAnalyzer(random()), new MockAnalyzer(random()),
        AnalyzingSuggester.EXACT_FIRST | AnalyzingSuggester.PRESERVE_SEP, 256, -1, random().nextBoolean());
    boolean doPayloads = random().nextBoolean();
    if (doPayloads) {
      List<Input> keysAndPayloads = new ArrayList<Input>();
      for (Input termFreq : keys) {
        keysAndPayloads.add(new Input(termFreq.term, termFreq.v, new BytesRef(Long.toString(termFreq.v))));
      }
      analyzingSuggester.build(new InputArrayIterator(keysAndPayloads));
    } else {
      analyzingSuggester.build(new InputArrayIterator(keys));  
    }
    
    for (Input termFreq : keys) {
      List<LookupResult> lookup = analyzingSuggester.lookup(termFreq.term.utf8ToString(), false, keys.size());
      for (LookupResult lookupResult : lookup) {
        assertEquals(mapping.get(lookupResult.key), Long.valueOf(lookupResult.value));
        if (doPayloads) {
          assertEquals(lookupResult.payload.utf8ToString(), Long.toString(lookupResult.value));
        } else {
          assertNull(lookupResult.payload);
        }
      }
    }
    
    lineFile.close();
  }

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of org.apache.lucene.util.LineFileDocs

org.apache.lucene.analysis.BaseTokenStreamTestCase

org.apache.lucene.codecs.lucene40.TestReuseDocsEnum

org.apache.lucene.document.Document

org.apache.lucene.document.Field

org.apache.lucene.document.FieldType

org.apache.lucene.document.SortedDocValuesField

org.apache.lucene.document.StringField

org.apache.lucene.index.memory.MemoryIndexTest

org.apache.lucene.index.TestCustomNorms

org.apache.lucene.index.TestDuelingCodecs

All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.