Package org.terrier.structures

Examples of org.terrier.structures.InvertedIndex


    //create a map with the document identifiers
    //and their index in the postings, for each phrase term
    //(e.g. docidsMap[2] maps each docid containing the
    //3rd phrase term to its position in that term's postings)
    final TIntIntHashMap[] docidsMap = new TIntIntHashMap[phraseLength];
   
    final InvertedIndex invIndex = index.getInvertedIndex();
   
    int blockLengthIndex = -1;
    //BlockInvertedIndex invIndex = (BlockInvertedIndex)inv;
    int[][][] postings = new int[phraseLength][][];
    for (int i = 0; i < phraseLength; i++) {
      docidsMap[i] = new TIntIntHashMap();
      String t = ((SingleTermQuery) phraseTerms.get(i)).getTerm();
      if (terms.getStatistics(t) == null)
      {
        LexiconEntry le = index.getLexicon().getLexiconEntry(t);
        if (le == null)
          continue;
        terms.setTermProperty(t, le);
      }

     
      //for each phrase term, we store the identifiers of
      //documents that contain that term in a hashmap
      //we also convert the block frequencies into
      //indexes for the block ids array, so that we
      //can obtain easily the block ids of a phrase
      //term for each document.
      //
      //For j-th document in the postings lists postings[i]
      //the positions start at postings[i][4][postings[i][3][j-1]]
      //and end at postings[i][4][postings[i][3][j]-1]
      postings[i] = invIndex.getDocuments((LexiconEntry)terms.getStatistics(t));
      blockLengthIndex = postings[i].length - 2;
     
      for (int j = 0; j < postings[i][0].length; j++) {
        //note that the entries in the docidsMap hash sets have
        //been increased by one
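A minimal sketch of walking the layout described in the comment above, once the block frequencies in row 3 have been converted into cumulative offsets as the snippet does. The row positions (0 = docids, 3 = cumulative block offsets, 4 = block ids) are taken from that comment and may differ for indices without fields:

    /** Print the block ids of every posting in one term's posting arrays,
     *  assuming row 0 = docids, row 3 = cumulative block offsets,
     *  row 4 = concatenated block ids (layout per the comment above). */
    static void printBlockIds(int[][] posting) {
      int[] docids   = posting[0];
      int[] offsets  = posting[3]; // cumulative block count up to and including document j
      int[] blockids = posting[4]; // block ids of all documents, concatenated
      for (int j = 0; j < docids.length; j++) {
        int start = (j == 0) ? 0 : offsets[j - 1];
        int end = offsets[j]; // exclusive
        for (int p = start; p < end; p++)
          System.out.println("docid=" + docids[j] + " blockid=" + blockids[p]);
      }
    }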


  public boolean modifyScores(Index index, MatchingQueryTerms query, ResultSet resultSet) {
    // The rest of the method applies proximity weighting as outlined
    // by Yves Rasolofo for queries of 1 < length < 5.
    //TODO replace ApplicationSetup.BLOCK_QUERYING

    InvertedIndex invertedIndex = index.getInvertedIndex();
    if (invertedIndex instanceof BlockInvertedIndex &&
        query.length() > 1 && query.length() < 5) {
     
      Lexicon<String> lexicon = index.getLexicon();
     
      int[] docids = resultSet.getDocids();
      double[] scores = resultSet.getScores();
     
     
      //record when the application of proximity weighting started
      long proximityStart = System.currentTimeMillis();
     
     
      // the constants used by the algorithm
      double N = index.getCollectionStatistics().getNumberOfDocuments();
      int blockSize = ApplicationSetup.BLOCK_SIZE;
      //The okapi constants for use with the proximity algorithm
      double k = 2.0d;
      double k1 = 1.2d;
      double k3 = 1000d;
      double b = 0.9d;
      int topDocs = 100;
      double avdl =
        1.0D * index.getCollectionStatistics().getAverageDocumentLength();
      double K = k * ((1 - b) + (b * (1 / avdl)));
      // an array holding the proximity weight for each docid
      // corresponds to the scores array
      double[] TPRSV = new double[scores.length];
      //arrays to reference the first terms block information
      int[][] term1Pointers;
      int[] term1blockfreqs;
      int[] term1blockids;
      int[] term1docids;
      //int[] term1freqs;
      //term2Pointers holds the information for the second term of each pair
      //each of the other arrays is used to reduce the number of references
      int[][] term2Pointers;
      int[] term2docids;
      //int[] term2termfreqs;
      int[] term2blockfreqs;
      int[] term2blockids;
      // calculate all the possible combinations of query term pairs
      ArrayList<String[]> queryTermPairs = generateQueryTermPairs(query);
      //Iterator termPairIterator<ArrayList<String>> = queryTermPairs.iterator();
      // for all term pairs
      for (String[] queryTermPair : queryTermPairs)
      {
        final String term1 = queryTermPair[0];
        final String term2 = queryTermPair[1];
       
        //we seek the query term in the lexicon
        LexiconEntry tEntry1 = lexicon.getLexiconEntry(term1);
        if (tEntry1 == null)//and if it is not found, we continue with the next term pair
          continue;
        //double term1KeyFrequency = query.getTermWeight(term1);
       
        double term1DocumentFrequency = (double)tEntry1.getDocumentFrequency();
       
        //we seek the 2nd query term in the lexicon
        LexiconEntry tEntry2 = lexicon.getLexiconEntry(term2);
        //and if it is not found, we continue with the next term pair
        if (tEntry2 == null)
          continue;
        //double term2KeyFrequency = query.getTermWeight(term2);
        double term2DocumentFrequency = (double)tEntry2.getDocumentFrequency();
        term1Pointers = invertedIndex.getDocuments(tEntry1);
       
        term1docids = term1Pointers[0];
        term1blockfreqs = term1Pointers[2];
        term1blockids = term1Pointers[3];
        term2Pointers = invertedIndex.getDocuments(tEntry2);
        term2docids = term2Pointers[0];
        term2blockfreqs = term2Pointers[2];
        term2blockids = term2Pointers[3];
        int length1 = term1docids.length;
        int length2 = term2docids.length;
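The generateQueryTermPairs helper is not shown in the snippet above; given the comment "calculate all the possible combinations of query term pairs", a plausible sketch builds the n*(n-1)/2 unordered pairs. The String[] input is an assumption, since how the terms are obtained from MatchingQueryTerms depends on the Terrier version:

    import java.util.ArrayList;

    /** Hypothetical helper: return all unordered pairs of query terms. */
    static ArrayList<String[]> generateQueryTermPairs(String[] terms) {
      ArrayList<String[]> pairs = new ArrayList<String[]>();
      for (int i = 0; i < terms.length; i++)
        for (int j = i + 1; j < terms.length; j++)
          pairs.add(new String[]{ terms[i], terms[j] });
      return pairs;
    }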

    Lexicon<String> lexicon = index.getLexicon();
   
    /**
     * Test {@link IterablePosting} entries from a {@link InvertedIndex}
     */
    InvertedIndex invertedIndex = index.getInvertedIndex();
    assertNotNull(invertedIndex);
    // for each term
    for (int t = 0; t < termStrings.length; t++) {
      LexiconEntry le = lexicon.getLexiconEntry(termStrings[t]);
      assertNotNull(le);
      ip = invertedIndex.getPostings((BitIndexPointer) le);
      // for each document
      int d = 0;
      while (ip.next() != IterablePosting.EOL) {
        assertEquals(invIds[t][d], ip.getId());
        assertEquals(invTfs[t][d], ip.getFrequency());
        assertEquals(doclens[invIds[t][d]], ip.getDocumentLength());
        if (fieldsExpected) {
          assertEquals(2, invFfs[t][d].length);
          for (int f = 0; f < 2; f++) {
            assertEquals(invFfs[t][d][f], ((FieldIterablePosting) ip).getFieldFrequencies()[f]);
          }
        }
        d++;
      }
      ip.close();
    }
    // post-check
    assertEquals(IterablePosting.EOL, ip.next());

    /**
     * Test {@link IterablePosting} entries from a {@link InvertedIndexInputStream}
     */
    bpiis = (BitPostingIndexInputStream) index.getIndexStructureInputStream("inverted");
    assertNotNull(bpiis);
    // for each term
    for (int t = 0; t < invIds.length; t++) {
      assertTrue(bpiis.hasNext());
      ip = bpiis.next();
      assertNotNull(ip);
      // for each document
      int d = 0;
      while (ip.next() != IterablePosting.EOL) {
        assertEquals(invIds[t][d], ip.getId());
        assertEquals(invTfs[t][d], ip.getFrequency());
        assertEquals(doclens[invIds[t][d]], ip.getDocumentLength());
        if (fieldsExpected) {
          assertEquals(2, invFfs[t][d].length);
          for (int f = 0; f < 2; f++) {
            assertEquals(invFfs[t][d][f], ((FieldIterablePosting) ip).getFieldFrequencies()[f]);
          }
        }
        d++;
      }
    }
    // post-check
    assertFalse(bpiis.hasNext());

    /**
     * Test posting array entries from a {@link InvertedIndex}
     */
    // for each term
    for (int t = 0; t < termStrings.length; t++) {
      LexiconEntry le = lexicon.getLexiconEntry(termStrings[t]);
      assertNotNull(le);
     
      int[][] documents = invertedIndex.getDocuments(le);
     
      if (!fieldsExpected) {
        assertTrue(documents.length >= 2);
      }
      else {
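For reference, a small standalone program built from the same calls as the test above: it opens the default index, looks a term up in the lexicon, and iterates its IterablePosting. Index.createIndex() reads the index location from the usual Terrier properties, and the BitIndexPointer cast mirrors the test; exact signatures vary between Terrier versions, so treat this as a sketch rather than canonical usage:

    import org.terrier.structures.BitIndexPointer;
    import org.terrier.structures.Index;
    import org.terrier.structures.InvertedIndex;
    import org.terrier.structures.LexiconEntry;
    import org.terrier.structures.postings.IterablePosting;

    public class PrintPostings {
      public static void main(String[] args) throws Exception {
        //open the default index configured via terrier.properties
        Index index = Index.createIndex();
        LexiconEntry le = index.getLexicon().getLexiconEntry(args[0]);
        if (le == null) {
          System.err.println("term not found: " + args[0]);
          return;
        }
        InvertedIndex inv = index.getInvertedIndex();
        IterablePosting ip = inv.getPostings((BitIndexPointer) le);
        //walk the posting list for this term
        while (ip.next() != IterablePosting.EOL) {
          System.out.println("docid=" + ip.getId()
              + " tf=" + ip.getFrequency()
              + " doclen=" + ip.getDocumentLength());
        }
        ip.close();
        index.close();
      }
    }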


