// Obtain the factory that (de)serialises fixed-size LexiconEntry values for the
// first source index; the merged lexicon reuses the same entry format.
FixedSizeWriteableFactory<LexiconEntry> lvf =
(FixedSizeWriteableFactory<LexiconEntry>)srcIndex1.getIndexStructure("lexicon-valuefactory");
// Open the lexicon output stream for the destination index, keyed by term
// string, using the same value-factory class as source index 1.
// NOTE(review): the Class cast below is unchecked — lvf.getClass() erases the
// generic parameter; safe only because the factory class is passed by name.
LexiconOutputStream<String> lexOutStream =
new FSOMapFileLexiconOutputStream(destIndex, "lexicon", (Class <FixedSizeWriteableFactory<LexiconEntry>>) lvf.getClass());
// Term ids for terms that occur ONLY in the second index are allocated from
// here upwards, i.e. appended after index 1's existing term-id space
// (see newCodes++ in the merge loop and the tail loop below).
int newCodes = (int)srcIndex1.getCollectionStatistics().getNumberOfUniqueTerms();
PostingIndex inverted1 = srcIndex1.getInvertedIndex();
PostingIndex inverted2 = srcIndex2.getInvertedIndex();
// Reflectively construct the inverted-file writer: the field-aware variant when
// the indices carry field information, the basic one otherwise. Both are
// expected to expose a (String filename) constructor.
DirectInvertedOutputStream invOS =null;
try{
invOS = (fieldCount > 0 ? fieldInvertedFileOutputStreamClass : invertedFileOutputStreamClass)
.getConstructor(String.class)
.newInstance(destIndex.getPath() + ApplicationSetup.FILE_SEPARATOR +
destIndex.getPrefix() + ".inverted"+ BitIn.USUAL_EXTENSION);
} catch (Exception e) {
// Cannot proceed without an inverted-file writer; log and abort the merge.
logger.error("Couldn't create specified DirectInvertedOutputStream", e);
return;
}
// --- Three-way merge of the two sorted lexicon streams -------------------
// lee1/lee2 hold the current (term, LexiconEntry) from each stream; hasMore1/2
// track whether that entry is valid. Classic merge-sort style advance.
boolean hasMore1 = false;
boolean hasMore2 = false;
String term1;
String term2;
Map.Entry<String,LexiconEntry> lee1 = null;
Map.Entry<String,LexiconEntry> lee2 = null;
hasMore1 = lexInStream1.hasNext();
if (hasMore1)
lee1 = lexInStream1.next();
hasMore2 = lexInStream2.hasNext();
if (hasMore2)
lee2 = lexInStream2.next();
while (hasMore1 && hasMore2) {
term1 = lee1.getKey();
term2 = lee2.getKey();
// Lexicon streams are in lexicographical term order, so compareTo decides
// which side(s) to consume.
int lexicographicalCompare = term1.compareTo(term2);
if (lexicographicalCompare < 0) {
//write to inverted file postings for the term that only occurs in 1st index
// Postings and term id are copied unchanged; only the bit-file pointer
// is rewritten to the new inverted file.
BitIndexPointer newPointer = invOS.writePostings(inverted1.getPostings(lee1.getValue()));
lee1.getValue().setPointer(newPointer);
numberOfPointers+=newPointer.getNumberOfEntries();
lexOutStream.writeNextEntry(term1, lee1.getValue());
hasMore1 = lexInStream1.hasNext();
if (hasMore1)
lee1 = lexInStream1.next();
} else if (lexicographicalCompare > 0) {
//write to inverted file postings for the term that only occurs in 2nd index
//docids are transformed as we go.
// NOTE(review): the second writePostings argument appears to be the
// "previous id" seed for delta-gap encoding; -(numberOfDocs1+1) shifts
// index 2's docids up by numberOfDocs1 — confirm against
// DirectInvertedOutputStream.writePostings semantics.
BitIndexPointer newPointer =
invOS.writePostings(inverted2.getPostings(lee2.getValue()), -(numberOfDocs1+1));
lee2.getValue().setPointer(newPointer);
numberOfPointers+=newPointer.getNumberOfEntries();
// This term is new to the merged lexicon: allocate the next free term id,
// optionally remembering old->new for later direct-index remapping.
int newCode = newCodes++;
if (keepTermCodeMap)
termcodeHashmap.put(lee2.getValue().getTermId(), newCode);
lee2.getValue().setTermId(newCode);
lexOutStream.writeNextEntry(term2, lee2.getValue());
hasMore2 = lexInStream2.hasNext();
if (hasMore2)
lee2 = lexInStream2.next();
} else {
//write to postings for a term that occurs in both indices
//1. postings from the first index are unchanged
IterablePosting ip1 = inverted1.getPostings(lee1.getValue());
BitIndexPointer newPointer1 = invOS.writePostings(ip1);
//2. postings from the 2nd index have their docids transformed
// NOTE(review): ip1.getId() here is the last docid consumed from index 1's
// postings; using it (minus numberOfDocs1) as the seed continues the
// delta-gap chain so index 2's docids land after index 1's — TODO confirm.
IterablePosting ip2 = inverted2.getPostings(lee2.getValue());
BitIndexPointer newPointer2 = invOS.writePostings(ip2, ip1.getId() - numberOfDocs1);
numberOfPointers+= newPointer1.getNumberOfEntries() + newPointer2.getNumberOfEntries();
//don't set numberOfEntries, as LexiconEntry.add() will take care of this.
lee1.getValue().setPointer(newPointer1);
// Merged term keeps index 1's term id; map index 2's old id onto it.
if (keepTermCodeMap)
termcodeHashmap.put(lee2.getValue().getTermId(), lee1.getValue().getTermId());
// Fold index 2's statistics (TF, Nt, ...) into index 1's entry.
lee1.getValue().add(lee2.getValue());
lexOutStream.writeNextEntry(term1, lee1.getValue());
hasMore1 = lexInStream1.hasNext();
if (hasMore1)
lee1 = lexInStream1.next();
hasMore2 = lexInStream2.hasNext();
if (hasMore2)
lee2 = lexInStream2.next();
}
}
// --- Drain whichever stream still has entries (at most one does) ----------
if (hasMore1) {
lee2 = null;
while (hasMore1) {
//write to inverted file as well.
// Same handling as the "<" branch above: postings and term id unchanged.
BitIndexPointer newPointer = invOS.writePostings(
inverted1.getPostings(lee1.getValue()));
lee1.getValue().setPointer(newPointer);
numberOfPointers+=newPointer.getNumberOfEntries();
lexOutStream.writeNextEntry(lee1.getKey(), lee1.getValue());
hasMore1 = lexInStream1.hasNext();
if (hasMore1)
lee1 = lexInStream1.next();
}
} else if (hasMore2) {
lee1 = null;
while (hasMore2) {
//write to inverted file as well.
// Same handling as the ">" branch above: shift docids, assign a fresh
// term id from the tail of index 1's id space.
BitIndexPointer newPointer = invOS.writePostings(
inverted2.getPostings(lee2.getValue()), -(numberOfDocs1+1));
lee2.getValue().setPointer(newPointer);
numberOfPointers+=newPointer.getNumberOfEntries();
int newCode = newCodes++;
if (keepTermCodeMap)
termcodeHashmap.put(lee2.getValue().getTermId(), newCode);
lee2.getValue().setTermId(newCode);
lexOutStream.writeNextEntry(lee2.getKey(), lee2.getValue());
hasMore2 = lexInStream2.hasNext();
if (hasMore2)
lee2 = lexInStream2.next();
}
}
// Release source readers and the merged inverted-file writer.
IndexUtil.close(lexInStream1);
IndexUtil.close(lexInStream2);
inverted1.close();
inverted2.close();
invOS.close();
// Record the merged collection size and register the new inverted structure
// (random-access and input-stream forms), choosing the field-aware posting
// iterator when fields are present.
destIndex.setIndexProperty("num.Documents", ""+numberOfDocuments);
destIndex.addIndexStructure(
"inverted",
invertedFileInputClass,
"org.terrier.structures.Index,java.lang.String,org.terrier.structures.DocumentIndex,java.lang.Class",
"index,structureName,document,"+
(fieldCount > 0
? fieldInvertedIndexPostingIteratorClass
: basicInvertedIndexPostingIteratorClass ));
destIndex.addIndexStructureInputStream(
"inverted",
invertedFileInputStreamClass,
"org.terrier.structures.Index,java.lang.String,java.util.Iterator,java.lang.Class",
"index,structureName,lexicon-entry-inputstream,"+
(fieldCount > 0
? fieldInvertedIndexPostingIteratorClass
: basicInvertedIndexPostingIteratorClass ));
destIndex.setIndexProperty("index.inverted.fields.count", ""+fieldCount);
lexOutStream.close();
if (fieldCount > 0)
{
// Field indices need the field-aware lexicon entry factory; its constructor
// takes the field count, resolved lazily from the property set above.
destIndex.addIndexStructure("lexicon-valuefactory", FieldLexiconEntry.Factory.class.getName(), "java.lang.String", "${index.inverted.fields.count}");
}
// Persist all property/structure registrations to disk.
destIndex.flush();