Examples of SegmentReader


Examples of net.nutch.segment.SegmentReader

      writer.maxFieldLength = maxFieldLength;
      //writer.infoStream = LogFormatter.getLogStream(LOG, Level.INFO);
      writer.setUseCompoundFile(false);
      writer.setSimilarity(new NutchSimilarity());

      SegmentReader sr = null;

      long start = System.currentTimeMillis();
      long delta = start;
      long curTime, total = 0;
      long count = 0;
      try {
          LOG.info("* Opening segment " + srcDir.getName());
          sr = new SegmentReader(nfs, srcDir, false, true, true, true);

          total = sr.size;
         
          String segmentName = srcDir.getCanonicalFile().getName();
          FetcherOutput fetcherOutput = new FetcherOutput();
          ParseText parseText = new ParseText();
          ParseData parseData = new ParseData();
          LOG.info("* Indexing segment " + srcDir.getName());

          //
          // Iterate through all docs in the input
          //
          maxDocs = Math.min(sr.size, maxDocs);
          for (count = 0; count < maxDocs; count++) {
              if (!sr.get(count, fetcherOutput, null, parseText, parseData)) continue;

              // only index the page if it was fetched correctly
              if (fetcherOutput.getStatus() != FetcherOutput.SUCCESS) {
                  continue;                             
              }

              // reconstruct parse
              Parse parse = new ParseImpl(parseText.getText(), parseData);

              // build initial document w/ core fields
              Document doc = makeDocument(segmentName, count,
                                          fetcherOutput, parse);

              // run filters to add more fields to the document
              doc = IndexingFilters.filter(doc, parse, fetcherOutput);
   
              // add the document to the index
              writer.addDocument(doc);
              if (count > 0 && count % LOG_STEP == 0) {
                curTime = System.currentTimeMillis();
                LOG.info(" Processed " + count + " records (" +
                        ((float)LOG_STEP * 1000.0f / (float)(curTime - delta)) +
                        " rec/s)");
                delta = curTime;
              }
          }
      } catch (EOFException e) {
          LOG.warning("Unexpected EOF in: " + srcDir +
                      " at entry #" + count + ".  Ignoring.");
      } finally {
        sr.close();
      }
      LOG.info("* Optimizing index...");
      writer.optimize();
      writer.close();

View Full Code Here

Examples of net.nutch.segment.SegmentReader

    try {
      segdirs = new ArrayList();
      // open all segments
      for (int i = 0; i < allsegdirs.size(); i++) {
        File dir = (File) allsegdirs.get(i);
        SegmentReader sr = null;
        try {
          // try to autofix it if corrupted...
          sr = new SegmentReader(nfs, dir, true);
        } catch (Exception e) {
          // this segment is hosed beyond repair, don't use it
          continue;
        }
        segdirs.add(dir);
        totalRecords += sr.size;
        LOG.info(" - segment " + dir.getName() + ": " + sr.size + " records.");
        readers.put(dir.getName(), sr);
      }
      long total = totalRecords;
      LOG.info("* TOTAL " + total + " input records in " + segdirs.size() + " segments.");
      LOG.info("* Creating master index...");
      stage = SegmentMergeStatus.STAGE_MASTERIDX;
      // XXX Note that Lucene indexes don't work with NutchFileSystem for now.
      // XXX For now always assume LocalFileSystem here...
      Vector masters = new Vector();
      File fsmtIndexDir = new File(output, ".fastmerge_index");
      File masterDir = new File(fsmtIndexDir, "0");
      if (!masterDir.mkdirs()) {
        LOG.severe("Could not create a master index dir: " + masterDir);
        return;
      }
      masters.add(masterDir);
      IndexWriter iw = new IndexWriter(masterDir, new WhitespaceAnalyzer(), true);
      iw.setUseCompoundFile(false);
      iw.mergeFactor = INDEX_MERGE_FACTOR;
      iw.minMergeDocs = INDEX_MIN_MERGE_DOCS;
      long s1 = System.currentTimeMillis();
      Iterator it = readers.values().iterator();
      processedRecords = 0L;
      delta = System.currentTimeMillis();
      while (it.hasNext()) {
        SegmentReader sr = (SegmentReader) it.next();
        String name = sr.segmentDir.getName();
        FetcherOutput fo = new FetcherOutput();
        for (long i = 0; i < sr.size; i++) {
          try {
            if (!sr.get(i, fo, null, null, null)) break;

            Document doc = new Document();
            doc.add(new Field("sd", name + "|" + i, true, false, false));
            doc.add(new Field("uh", MD5Hash.digest(fo.getUrl().toString()).toString(), true, true, false));
            doc.add(new Field("ch", fo.getMD5Hash().toString(), true, true, false));
            doc.add(new Field("time", DateField.timeToString(fo.getFetchDate()), true, false, false));
            iw.addDocument(doc);
            processedRecords++;
            if (processedRecords > 0 && (processedRecords % LOG_STEP == 0)) {
              LOG.info(" Processed " + processedRecords + " records (" +
                      (float)(LOG_STEP * 1000)/(float)(System.currentTimeMillis() - delta) + " rec/s)");
              delta = System.currentTimeMillis();
            }
            if (processedRecords > 0 && (processedRecords % INDEX_SIZE == 0)) {
              iw.optimize();
              iw.close();
              LOG.info(" - creating next subindex...");
              masterDir = new File(fsmtIndexDir, "" + masters.size());
              if (!masterDir.mkdirs()) {
                LOG.severe("Could not create a master index dir: " + masterDir);
                return;
              }
              masters.add(masterDir);
              iw = new IndexWriter(masterDir, new WhitespaceAnalyzer(), true);
              iw.setUseCompoundFile(false);
              iw.mergeFactor = INDEX_MERGE_FACTOR;
              iw.minMergeDocs = INDEX_MIN_MERGE_DOCS;
            }
          } catch (Throwable t) {
            // we can assume the data is invalid from now on - break here
            LOG.info(" - segment " + name + " truncated to " + (i + 1) + " records");
            break;
          }
        }
      }
      iw.optimize();
      LOG.info("* Creating index took " + (System.currentTimeMillis() - s1) + " ms");
      s1 = System.currentTimeMillis();
      // merge all other indexes using the latest IndexWriter (still open):
      if (masters.size() > 1) {
        LOG.info(" - merging subindexes...");
        stage = SegmentMergeStatus.STAGE_MERGEIDX;
        IndexReader[] ireaders = new IndexReader[masters.size() - 1];
        for (int i = 0; i < masters.size() - 1; i++) ireaders[i] = IndexReader.open((File)masters.get(i));
        iw.addIndexes(ireaders);
        for (int i = 0; i < masters.size() - 1; i++) {
          ireaders[i].close();
          FileUtil.fullyDelete((File)masters.get(i));
        }
      }
      iw.close();
      LOG.info("* Optimizing index took " + (System.currentTimeMillis() - s1) + " ms");
      LOG.info("* Removing duplicate entries...");
      stage = SegmentMergeStatus.STAGE_DEDUP;
      IndexReader ir = IndexReader.open(masterDir);
      int i = 0;
      long cnt = 0L;
      processedRecords = 0L;
      s1 = System.currentTimeMillis();
      delta = s1;
      TermEnum te = ir.terms();
      while(te.next()) {
        Term t = te.term();
        if (t == null) continue;
        if (!(t.field().equals("ch") || t.field().equals("uh"))) continue;
        cnt++;
        processedRecords = cnt / 2;
        if (cnt > 0 && (cnt % (LOG_STEP  * 2) == 0)) {
          LOG.info(" Processed " + processedRecords + " records (" +
                  (float)(LOG_STEP * 1000)/(float)(System.currentTimeMillis() - delta) + " rec/s)");
          delta = System.currentTimeMillis();
        }
        // Enumerate all docs with the same URL hash or content hash
        TermDocs td = ir.termDocs(t);
        if (td == null) continue;
        int id = -1;
        String time = null;
        Document doc = null;
        // Keep only the latest version of the document with
        // the same hash (url or content). Note: even if the content
        // hash is identical, other metadata may be different, so even
        // in this case it makes sense to keep the latest version.
        while (td.next()) {
          int docid = td.doc();
          if (!ir.isDeleted(docid)) {
            doc = ir.document(docid);
            if (time == null) {
              time = doc.get("time");
              id = docid;
              continue;
            }
            String dtime = doc.get("time");
            // "time" is a DateField, and can be compared lexicographically
            if (dtime.compareTo(time) > 0) {
              if (id != -1) {
                ir.delete(id);
              }
              time = dtime;
              id = docid;
            } else {
              ir.delete(docid);
            }
          }
        }
      }
      //
      // keep the IndexReader open...
      //
     
      LOG.info("* Deduplicating took " + (System.currentTimeMillis() - s1) + " ms");
      stage = SegmentMergeStatus.STAGE_WRITING;
      processedRecords = 0L;
      Vector outDirs = new Vector();
      File outDir = new File(output, SegmentWriter.getNewSegmentName());
      outDirs.add(outDir);
      LOG.info("* Merging all segments into " + output.getName());
      s1 = System.currentTimeMillis();
      delta = s1;
      nfs.mkdirs(outDir);
      SegmentWriter sw = new SegmentWriter(nfs, outDir, true);
      LOG.fine(" - opening first output segment in " + outDir.getName());
      FetcherOutput fo = new FetcherOutput();
      Content co = new Content();
      ParseText pt = new ParseText();
      ParseData pd = new ParseData();
      int outputCnt = 0;
      for (int n = 0; n < ir.maxDoc(); n++) {
        if (ir.isDeleted(n)) {
          //System.out.println("-del");
          continue;
        }
        Document doc = ir.document(n);
        String segDoc = doc.get("sd");
        int idx = segDoc.indexOf('|');
        String segName = segDoc.substring(0, idx);
        String docName = segDoc.substring(idx + 1);
        SegmentReader sr = (SegmentReader) readers.get(segName);
        long docid;
        try {
          docid = Long.parseLong(docName);
        } catch (Exception e) {
          continue;
        }
        try {
          // get data from the reader
          sr.get(docid, fo, co, pt, pd);
        } catch (Throwable thr) {
          // don't break the loop, because only one of the segments
          // may be corrupted...
          LOG.fine(" - corrupt record no. " + docid + " in segment " + sr.segmentDir.getName() + " - skipping.");
          continue;
        }
        sw.append(fo, co, pt, pd);
        outputCnt++;
        processedRecords++;
        if (processedRecords > 0 && (processedRecords % LOG_STEP == 0)) {
          LOG.info(" Processed " + processedRecords + " records (" +
                  (float)(LOG_STEP * 1000)/(float)(System.currentTimeMillis() - delta) + " rec/s)");
          delta = System.currentTimeMillis();
        }
        if (processedRecords % maxCount == 0) {
          sw.close();
          outDir = new File(output, SegmentWriter.getNewSegmentName());
          LOG.fine(" - starting next output segment in " + outDir.getName());
          nfs.mkdirs(outDir);
          sw = new SegmentWriter(nfs, outDir, true);
          outDirs.add(outDir);
        }
      }
      LOG.info("* Merging took " + (System.currentTimeMillis() - s1) + " ms");
      ir.close();
      sw.close();
      FileUtil.fullyDelete(fsmtIndexDir);
      for (Iterator iter = readers.keySet().iterator(); iter.hasNext();) {
        SegmentReader sr = (SegmentReader) readers.get(iter.next());
        sr.close();
      }
      if (runIndexer) {
        stage = SegmentMergeStatus.STAGE_INDEXING;
        totalRecords = outDirs.size();
        processedRecords = 0L;
View Full Code Here

Examples of net.nutch.segment.SegmentReader

          File f = new File(dataDir, "seg" + i);
          nfs.mkdirs(f);
          createSegmentData(nfs, f, true);
        }
        runTool(dataDir, outSegment);
        SegmentReader sr = new SegmentReader(outSegment.listFiles()[0]);
        assertEquals(SEGMENT_CNT * PAGE_CNT, sr.size);
        sr.close();
      } catch (Throwable e) {
        e.printStackTrace();
        fail(e.getMessage() + ", " + e.getStackTrace());
      }
    } finally {
View Full Code Here

Examples of net.nutch.segment.SegmentReader

          File f = new File(dataDir, "seg" + i);
          nfs.mkdirs(f);
          createSegmentData(nfs, f, false);
        }
        runTool(dataDir, outSegment);
        SegmentReader sr = new SegmentReader(outSegment.listFiles()[0]);
        assertEquals(PAGE_CNT, sr.size);
        sr.close();
      } catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
      }
    } catch (Throwable ex) {
View Full Code Here

Examples of net.nutch.segment.SegmentReader

              // do nothing
              ;
          }
        }
        runTool(dataDir, outSegment);
        SegmentReader sr = new SegmentReader(outSegment.listFiles()[0]);
        // we arrive at this expression as follows:
        // 1. SEGMENT_CNT - 1 : because we trash one whole segment
        // 2. 2 * PAGE_CNT / 4: because for two segments
        // we truncate 1/4 of the data file
        // 3. + 2: because sometimes truncation falls on
        // the boundary of the last entry
        int maxCnt = PAGE_CNT * (SEGMENT_CNT - 1) - 2 * PAGE_CNT / 4 + 2 * (SEGMENT_CNT -1);
        //System.out.println("maxCnt=" + maxCnt + ", sr.size=" + sr.size);
        assertTrue(sr.size < maxCnt);
        sr.close();
      } catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
      }
    } catch (Throwable ex) {
View Full Code Here

Examples of org.apache.lucene.index.SegmentReader

    List<IndexReader> boboReaderList = new LinkedList<IndexReader>();
    ReaderUtil.gatherSubReaders((List<IndexReader>)boboReaderList, in);
    Map<String,BoboIndexReader> readerMap = new HashMap<String,BoboIndexReader>();
    for (IndexReader reader : boboReaderList){
      BoboIndexReader boboReader = (BoboIndexReader)reader;
      SegmentReader sreader = (SegmentReader)(boboReader.in);
      readerMap.put(sreader.getSegmentName(),boboReader);
    }
   
    ArrayList<BoboIndexReader> currentReaders = new ArrayList<BoboIndexReader>(size);
    boolean isNewReader = false;
    for (int i=0;i<size;++i){
      SegmentInfo sinfo = (SegmentInfo)sinfos.get(i);
      BoboIndexReader breader = readerMap.remove(sinfo.name);
      if (breader!=null){
        // should use SegmentReader.reopen
        // TODO: see LUCENE-2559
        BoboIndexReader newReader = (BoboIndexReader)breader.reopen(true);
        if (newReader!=breader){
          isNewReader = true;
        }
        if (newReader!=null){
          currentReaders.add(newReader);
        }
      }
      else{
        isNewReader = true;
        SegmentReader newSreader = SegmentReader.get(true, sinfo, 1);
        breader = BoboIndexReader.getInstanceAsSubReader(newSreader,this._facetHandlers,this._runtimeFacetHandlerFactories);
        breader._dir = _dir;
        currentReaders.add(breader);
      }
    }
    isNewReader = isNewReader || (readerMap.size() != 0);
    if (!isNewReader){
      return this;
    }
    else{
      MultiReader newMreader = new MultiReader(currentReaders.toArray(new BoboIndexReader[currentReaders.size()]),false);
      BoboIndexReader newReader = BoboIndexReader.getInstanceAsSubReader(newMreader,this._facetHandlers,this._runtimeFacetHandlerFactories);
      newReader._dir = _dir;
      return newReader;
    }
  }
  else if (in instanceof SegmentReader){
     // should use SegmentReader.reopen
    // TODO: see LUCENE-2559
   
    SegmentReader sreader = (SegmentReader)in;
    int numDels = sreader.numDeletedDocs();
   
    SegmentInfo sinfo = null;
    boolean sameSeg = false;
    //get SegmentInfo instance
    for (int i=0;i<size;++i){
    SegmentInfo sinfoTmp = (SegmentInfo)sinfos.get(i);
    if (sinfoTmp.name.equals(sreader.getSegmentName())){
      int numDels2 = sinfoTmp.getDelCount();
      sameSeg = numDels==numDels2;
      sinfo = sinfoTmp;
      break;
    }
    }
  
    if (sinfo == null){
      // segment no longer exists
      return null;
    }
    if (sameSeg){
      return this
    }
    else{
    SegmentReader newSreader = SegmentReader.get(true, sinfo, 1);
    return BoboIndexReader.getInstanceAsSubReader(newSreader,this._facetHandlers,this._runtimeFacetHandlerFactories);
    }
  }
  else{
    // should not reach here, a catch-all default case
View Full Code Here

Examples of org.apache.lucene.index.SegmentReader

  }

  /** Returns true if the given reader is sorted by the given sorter. */
  public static boolean isSorted(AtomicReader reader, Sorter sorter) {
    if (reader instanceof SegmentReader) {
      final SegmentReader segReader = (SegmentReader) reader;
      final Map<String, String> diagnostics = segReader.getSegmentInfo().info.getDiagnostics();
      if (diagnostics != null && sorter.getID().equals(diagnostics.get(SORTER_ID_PROP))) {
        return true;
      }
    }
    return false;
View Full Code Here

Examples of org.apache.lucene.index.SegmentReader

   
    populate(directory, config);

    DirectoryReader r0 = IndexReader.open(directory);
    SegmentReader r = LuceneTestCase.getOnlySegmentReader(r0);
    String segment = r.getSegmentName();
    r.close();

    FieldInfosReader infosReader = new PreFlexRWCodec().fieldInfosFormat().getFieldInfosReader();
    FieldInfos fieldInfos = infosReader.read(directory, segment, "", IOContext.READONCE);
    String segmentFileName = IndexFileNames.segmentFileName(segment, "", Lucene3xPostingsFormat.TERMS_INDEX_EXTENSION);
    long tiiFileLength = directory.fileLength(segmentFileName);
View Full Code Here

Examples of org.apache.lucene.index.SegmentReader

  public int merge(MergeState mergeState) throws IOException {
    int docCount = 0;
    int idx = 0;

    for (AtomicReader reader : mergeState.readers) {
      final SegmentReader matchingSegmentReader = mergeState.matchingSegmentReaders[idx++];
      CompressingTermVectorsReader matchingVectorsReader = null;
      if (matchingSegmentReader != null) {
        final TermVectorsReader vectorsReader = matchingSegmentReader.getTermVectorsReader();
        // we can only bulk-copy if the matching reader is also a CompressingTermVectorsReader
        if (vectorsReader != null && vectorsReader instanceof CompressingTermVectorsReader) {
          matchingVectorsReader = (CompressingTermVectorsReader) vectorsReader;
        }
      }

      final int maxDoc = reader.maxDoc();
      final Bits liveDocs = reader.getLiveDocs();

      if (matchingVectorsReader == null
          || matchingVectorsReader.getCompressionMode() != compressionMode
          || matchingVectorsReader.getChunkSize() != chunkSize
          || matchingVectorsReader.getPackedIntsVersion() != PackedInts.VERSION_CURRENT) {
        // naive merge...
        for (int i = nextLiveDoc(0, liveDocs, maxDoc); i < maxDoc; i = nextLiveDoc(i + 1, liveDocs, maxDoc)) {
          final Fields vectors = reader.getTermVectors(i);
          addAllDocVectors(vectors, mergeState);
          ++docCount;
          mergeState.checkAbort.work(300);
        }
      } else {
        final CompressingStoredFieldsIndexReader index = matchingVectorsReader.getIndex();
        final IndexInput vectorsStream = matchingVectorsReader.getVectorsStream();
        for (int i = nextLiveDoc(0, liveDocs, maxDoc); i < maxDoc; ) {
          if (pendingDocs.isEmpty()
              && (i == 0 || index.getStartPointer(i - 1) < index.getStartPointer(i))) { // start of a chunk
            final long startPointer = index.getStartPointer(i);
            vectorsStream.seek(startPointer);
            final int docBase = vectorsStream.readVInt();
            final int chunkDocs = vectorsStream.readVInt();
            assert docBase + chunkDocs <= matchingSegmentReader.maxDoc();
            if (docBase + chunkDocs < matchingSegmentReader.maxDoc()
                && nextDeletedDoc(docBase, liveDocs, docBase + chunkDocs) == docBase + chunkDocs) {
              final long chunkEnd = index.getStartPointer(docBase + chunkDocs);
              final long chunkLength = chunkEnd - vectorsStream.getFilePointer();
              indexWriter.writeIndex(chunkDocs, this.vectorsStream.getFilePointer());
              this.vectorsStream.writeVInt(docCount);
View Full Code Here

Examples of org.apache.lucene.index.SegmentReader

  public int merge(MergeState mergeState) throws IOException {
    int docCount = 0;
    int idx = 0;

    for (AtomicReader reader : mergeState.readers) {
      final SegmentReader matchingSegmentReader = mergeState.matchingSegmentReaders[idx++];
      CompressingStoredFieldsReader matchingFieldsReader = null;
      if (matchingSegmentReader != null) {
        final StoredFieldsReader fieldsReader = matchingSegmentReader.getFieldsReader();
        // we can only bulk-copy if the matching reader is also a CompressingStoredFieldsReader
        if (fieldsReader != null && fieldsReader instanceof CompressingStoredFieldsReader) {
          matchingFieldsReader = (CompressingStoredFieldsReader) fieldsReader;
        }
      }
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.