Package org.terrier.structures

Examples of org.terrier.structures.Index$UpdatingCollectionStatistics


      docProperties.put("filename", docnos[i]);
      sourceDocs[i] = new FileDocument(new ByteArrayInputStream(documents[i].getBytes()), docProperties, new EnglishTokeniser());
    }
    Collection col = new CollectionDocumentList(sourceDocs, "filename");
    indexer.index(new Collection[]{col});   
    Index index = Index.createIndex();
    assertEquals(sourceDocs.length, index.getCollectionStatistics().getNumberOfDocuments());
    return index;
  }
View Full Code Here
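The fragment above indexes a small in-memory collection and re-opens it with Index.createIndex(), checking the document count through CollectionStatistics. A minimal sketch of reading the other corpus-level counters exposed by the same structure (assuming an index already exists at the configured path and prefix; the printed labels are illustrative only):

    Index index = Index.createIndex();
    CollectionStatistics stats = index.getCollectionStatistics();
    // corpus-level counters maintained alongside the index
    System.out.println("documents:      " + stats.getNumberOfDocuments());
    System.out.println("tokens:         " + stats.getNumberOfTokens());
    System.out.println("unique terms:   " + stats.getNumberOfUniqueTerms());
    System.out.println("avg doc length: " + stats.getAverageDocumentLength());
    index.close();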


   
    Collection col = new CollectionDocumentList(sourceDocs, "filename");
    indexer.createDirectIndex(new Collection[]{col});
    indexer.createInvertedIndex();
   
    Index index = !fieldsExpected ?
        Index.createIndex(ApplicationSetup.TERRIER_INDEX_PATH, ApplicationSetup.TERRIER_INDEX_PREFIX)
        : Index.createIndex(ApplicationSetup.TERRIER_INDEX_PATH, "fields");
    assertNotNull(index);
   
    MetaIndex meta = index.getMetaIndex();
    assertNotNull(meta);
    assertEquals("doc1", index.getMetaIndex().getItem("filename", 0));
    assertEquals("doc2", index.getMetaIndex().getItem("filename", 1));
   
    IterablePosting ip = null;
    BitPostingIndexInputStream bpiis = null;
   
    /** INVERTED FILE */   
   
    Lexicon<String> lexicon = index.getLexicon();
   
    /**
     * Test {@link IterablePosting} entries from an {@link InvertedIndex}
     */
    InvertedIndex invertedIndex = index.getInvertedIndex();
    assertNotNull(invertedIndex);
    // for each term
    for (int t = 0; t < termStrings.length; t++) {
      LexiconEntry le = lexicon.getLexiconEntry(termStrings[t]);
      assertNotNull(le);
      ip = invertedIndex.getPostings((BitIndexPointer) le);
      // for each document
      int d = 0;
      while (ip.next() != IterablePosting.EOL) {
        assertEquals(invIds[t][d], ip.getId());
        assertEquals(invTfs[t][d], ip.getFrequency());
        assertEquals(doclens[invIds[t][d]], ip.getDocumentLength());
        if (fieldsExpected) {
          assertEquals(2, invFfs[t][d].length);
          for (int f = 0; f < 2; f++) {
            assertEquals(invFfs[t][d][f], ((FieldIterablePosting) ip).getFieldFrequencies()[f]);
          }
        }
        d++;
      }
      ip.close();
    }
    // post-check
    assertEquals(IterablePosting.EOL, ip.next());

    /**
     * Test {@link IterablePosting} entries from an {@link InvertedIndexInputStream}
     */
    bpiis = (BitPostingIndexInputStream) index.getIndexStructureInputStream("inverted");
    assertNotNull(bpiis);
    // for each term
    for (int t = 0; t < invIds.length; t++) {
      assertTrue(bpiis.hasNext());
      ip = bpiis.next();
      assertNotNull(ip);
      // for each document
      int d = 0;
      while (ip.next() != IterablePosting.EOL) {
        assertEquals(invIds[t][d], ip.getId());
        assertEquals(invTfs[t][d], ip.getFrequency());
        assertEquals(doclens[invIds[t][d]], ip.getDocumentLength());
        if (fieldsExpected) {
          assertEquals(2, invFfs[t][d].length);
          for (int f = 0; f < 2; f++) {
            assertEquals(invFfs[t][d][f], ((FieldIterablePosting) ip).getFieldFrequencies()[f]);
          }
        }
        d++;
      }
    }
    // post-check
    assertFalse(bpiis.hasNext());

    /**
     * Test posting array entries from an {@link InvertedIndex}
     */
    // for each term
    for (int t = 0; t < termStrings.length; t++) {
      LexiconEntry le = lexicon.getLexiconEntry(termStrings[t]);
      assertNotNull(le);
     
      int[][] documents = invertedIndex.getDocuments(le);
     
      if (!fieldsExpected) {
        assertTrue(documents.length >= 2);
      }
      else {
        // array should have length at least 4: 1 for the id, 1 for the
        // frequency, 2 for the fields (optionally more for the blocks)
        assertTrue(documents.length >= 4);
      }
     
      // check the number of postings (documents) recorded for this term
      assertEquals(invIds[t].length, documents[0].length);
      assertEquals(invTfs[t].length, documents[1].length);
     
      // for each document
      for (int d = 0; d < documents[0].length; d++) {
        // test document id
        assertEquals(invIds[t][d], documents[0][d]);
        // test document frequency
        assertEquals(invTfs[t][d], documents[1][d]);
        if (fieldsExpected) {
          // test number of indexed fields
          assertEquals(2, invFfs[t][d].length);
          // test field frequency
          for (int f = 0; f < 2; f++) {
            assertEquals(invFfs[t][d][f], documents[2+f][d]);
          }
        }
      }
    }   
           
    /** DIRECT FILE */
   
    if (directExpected) {
      DocumentIndex documentIndex = index.getDocumentIndex();

      /**
       * Test {@link IterablePosting} entries from a {@link DirectIndex}
       */
      DirectIndex directIndex = index.getDirectIndex();
      assertNotNull(directIndex);
      // for each document
      for (int d = 0; d < dirTfs.length; d++) {
        DocumentIndexEntry de = documentIndex.getDocumentEntry(d);
        assertNotNull(de);
        ip = directIndex.getPostings((BitIndexPointer) de);
        FieldPosting fp = fieldsExpected ? (FieldPosting)ip : null;
        // for each term
        int t = 0;
        int countFoundTerms = 0;
        while (ip.next() != IterablePosting.EOL) {
          int termid = ip.getId();
          assertTrue(termid >= 0);
          String term = lexicon.getLexiconEntry(termid).getKey();
          assertNotNull(term);
          countFoundTerms++;
          assertTrue(dirTfs[d].containsKey(term));
          assertEquals(dirTfs[d].get(term), ip.getFrequency());
          assertEquals(doclens[d], ip.getDocumentLength());         
         
          if (fieldsExpected) {
            assertEquals(2, fp.getFieldFrequencies().length);
            for (int f = 0; f < 2; f++) {
              assertEquals(dirFfs[d].get(term)[f], fp.getFieldFrequencies()[f]);
            }
          }
          t++;
        }
        assertEquals(dirTfs[d].size(), countFoundTerms);
        ip.close();
      }
      // post-check
      assertEquals(IterablePosting.EOL, ip.next());

      /**
       * Test {@link IterablePosting} entries from a {@link DirectIndexInputStream}
       */
      bpiis = (BitPostingIndexInputStream) index.getIndexStructureInputStream("direct");
      assertNotNull(bpiis);
      // for each document
      for (int d = 0; d < dirTfs.length; d++) {
        assertTrue(bpiis.hasNext());
        ip = bpiis.next();
View Full Code Here
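Condensed, the inverted-file checks above follow the usual traversal pattern: look the term up in the Lexicon, open an IterablePosting on the returned pointer, and iterate until EOL. A minimal sketch, assuming the index is open and that the term ("dog" here is only illustrative) occurs in it:

    Lexicon<String> lex = index.getLexicon();
    LexiconEntry entry = lex.getLexiconEntry("dog");
    assertNotNull(entry);
    IterablePosting postings = index.getInvertedIndex().getPostings((BitIndexPointer) entry);
    while (postings.next() != IterablePosting.EOL) {
      // docid, within-document frequency and document length of the current posting
      System.out.println(postings.getId() + " " + postings.getFrequency() + " " + postings.getDocumentLength());
    }
    postings.close();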

 
  protected abstract Matching makeMatching(Index i);
 
  @Test public void testSingleDocumentIndexMatching() throws Exception
  {
    Index index = IndexTestUtils.makeIndex(
        new String[]{"doc1"},
        new String[]{"The quick brown fox jumps over the lazy dog"});
    System.err.println("testSingleDocumentIndexMatching: " + index.toString());
    assertNotNull(index);
    assertEquals(1, index.getCollectionStatistics().getNumberOfDocuments());
    Matching matching = makeMatching(index);
    assertNotNull(matching);
    MatchingQueryTerms mqt = new MatchingQueryTerms();
    mqt.setTermProperty("quick", 1);
    mqt.setDefaultTermWeightingModel(new DLH13());
View Full Code Here
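The snippet is cut off before the query is actually executed; judging from the match(...) call and the score assertion that appear in the later fragments, the omitted remainder presumably resembles the following:

    ResultSet rs = matching.match("query1", mqt);  // query id "query1" as used in the later fragment
    assertNotNull(rs);
    assertTrue(rs.getScores()[0] > 0);             // the matched document should receive a positive score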

    assertTrue(rs.getScores()[0] > 0);
  }
 
  @Test public void testTwoDocumentsIndexMatching() throws Exception
  {
    Index index = IndexTestUtils.makeIndex(
        new String[]{"doc1", "doc2"},
        new String[]{
            "The quick brown fox jumps over the lazy dog",
            "how much is that dog in the window"});
    System.err.println("testTwoDocumentsIndexMatching: " + index.toString());
    assertNotNull(index);
    assertEquals(2, index.getCollectionStatistics().getNumberOfDocuments());
    Matching matching = makeMatching(index);
    assertNotNull(matching);
    MatchingQueryTerms mqt;
    ResultSet rs;
   
View Full Code Here

    assertTrue(rs.getScores()[1] > 0);
  }
 
  @Test public void testThreeDocumentsSynonymIndexMatching() throws Exception
  {
    Index index = IndexTestUtils.makeIndex(
        new String[]{"doc1", "doc2", "doc3"},
        new String[]{
            "The quick brown fox jumps over the lazy dog",
            "how much is that dog in the window",
            "the one with the waggily tail"});
    System.err.println("testThreeDocumentsSynonymIndexMatching: " + index.toString());
    assertNotNull(index);
    assertEquals(3, index.getCollectionStatistics().getNumberOfDocuments());
    Matching matching = makeMatching(index);
    assertNotNull(matching);
    MatchingQueryTerms mqt;
    ResultSet rs;
   
View Full Code Here

 
 
 
  @Test public void testMatchingNonStatisticsOverwrite() throws Exception
  {
    Index index = IndexTestUtils.makeIndex(
        new String[]{"doc1"},
        new String[]{"The quick brown fox jumps over the lazy dog"});
    assertNotNull(index);
    System.err.println("testMatchingNonStatisticsOverwrite: " + index.toString());
    assertEquals(1, index.getCollectionStatistics().getNumberOfDocuments());
    Matching matching = makeMatching(index);
    assertNotNull(matching);
   
    MatchingQueryTerms mqt = new MatchingQueryTerms();
    mqt.setDefaultTermWeightingModel(new DLH13());
    LexiconEntry le = index.getLexicon().getLexiconEntry("quick");
    assertNotNull(le);
    le.setStatistics(1, 40);
    mqt.setTermProperty("quick", le);
    ResultSet rs = matching.match("query1", mqt);
    assertNotNull(rs);
View Full Code Here

    assertEquals(path, ApplicationSetup.TERRIER_INDEX_PATH);
    assertEquals(prefix, ApplicationSetup.TERRIER_INDEX_PREFIX);
   
    //check that indexing actually created an index
    assertTrue("Index does not exist at ["+ApplicationSetup.TERRIER_INDEX_PATH+","+ApplicationSetup.TERRIER_INDEX_PREFIX+"]", Index.existsIndex(ApplicationSetup.TERRIER_INDEX_PATH, ApplicationSetup.TERRIER_INDEX_PREFIX));
    Index i = Index.createIndex();
    assertNotNull(Index.getLastIndexLoadError(), i);
    assertTrue("Index does not have an inverted structure", i.hasIndexStructure("inverted"));
    assertTrue("Index does not have an lexicon structure", i.hasIndexStructure("lexicon"));
    assertTrue("Index does not have an document structure", i.hasIndexStructure("document"));
    assertTrue("Index does not have an meta structure", i.hasIndexStructure("meta"));
    addDirectStructure(i);
    i.close();
    finishIndexing();
  }
View Full Code Here

    checkIndex();
  }
 
  protected void checkIndex() throws Exception
  {
    Index i = Index.createIndex();
    for(BatchEndToEndTestEventHooks hook : testHooks)
    {
      hook.checkIndex(this, i);
    }
  }
View Full Code Here

    Writer w = Files.writeFileWriter(tmpFile);
    for(String row : rows)
      w.append(row + "\n");
    w.close();
   
    Index index = Index.createNewIndex(folder.newFolder("index").toString(), "data");
    index.setIndexProperty("num.Documents", ""+docnos.length);
    IndexUtil.forceStructure(index, "meta", new ArrayMetaIndex(docnos));
    Logger.getRootLogger().setLevel(Level.ALL);
    Matching rtr = new TRECResultsMatching(index, tmpFile.toString());
    return rtr;
  }
View Full Code Here
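The rows written to tmpFile are not shown in this fragment. Since TRECResultsMatching is constructed from a result file rather than scoring postings itself, the input presumably follows the conventional TREC run layout (query id, iteration, docno, rank, score, run tag). An illustrative example, not taken from the test itself:

    String[] rows = new String[]{
      "1 Q0 doc1 0 4.5 run1",   // qid  iter  docno  rank  score  tag (hypothetical values)
      "1 Q0 doc2 1 3.2 run1"
    };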

 
  @SuppressWarnings("unchecked")
  @Override
  protected void checkIndex() throws Exception
  {
    Index index = Index.createIndex();
    assertNotNull("Failed to get an index", index);
    final String[] expectedStructures = new String[]{
      "inverted", "lexicon", "meta", "document", "document-factory", "lexicon-keyfactory", "lexicon-valuefactory", "direct"
    };
    final String[] expectedStructuresInputStream = new String[]{
        "inverted", "lexicon", "meta", "document", "direct"
    };
           
    for (String structureName : expectedStructures )
      assertTrue("Index has no "+ structureName + " structure", index.hasIndexStructure(structureName));
    for (String structureName : expectedStructuresInputStream )
      assertTrue("Index has no "+ structureName + " inputstream structure", index.hasIndexStructure(structureName));
   
    checkDocumentLengths(index, DOCUMENT_LENGTHS, DOCUMENT_UNIQUE_TERMS);
    checkMetaIndex(index, DOCUMENT_NAMES);
    checkLexicon(index);
    checkInvertedIndexStream(index, DOCUMENT_LENGTHS);
    checkDirectIndex(index,
        index.getCollectionStatistics().getNumberOfUniqueTerms(),
        index.getCollectionStatistics().getNumberOfUniqueTerms(),
        DOCUMENT_LENGTHS,
        DOCUMENT_UNIQUE_TERMS);
    checkCollectionStatistics(index);
    if (FieldScore.FIELDS_COUNT > 0)
    {
      assertTrue("LexiconEntry is not of type FieldLexiconEntry", ((FixedSizeWriteableFactory<LexiconEntry>)index.getIndexStructure("lexicon-valuefactory")).newInstance()
          instanceof FieldLexiconEntry);
      assertTrue("DocumentIndexEntry is not of type FieldDocumentIndexEntry", ((FixedSizeWriteableFactory<DocumentIndexEntry>)index.getIndexStructure("document-factory")).newInstance()
          instanceof FieldDocumentIndexEntry);
    }
    else
    {
      assertTrue("LexiconEntry is not of type BasicLexiconEntry", ((FixedSizeWriteableFactory<LexiconEntry>)index.getIndexStructure("lexicon-valuefactory")).newInstance()
          instanceof BasicLexiconEntry);
    }
    index.close();
    super.checkIndex();
  }
View Full Code Here
