// Source code of org.apache.nutch.indexer.IndexSegment

/**
 * Copyright 2005 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


package org.apache.nutch.indexer;


import org.apache.nutch.pagedb.*;
import org.apache.nutch.linkdb.*;
import org.apache.nutch.fetcher.*;
import org.apache.nutch.parse.*;
import org.apache.nutch.analysis.NutchDocumentAnalyzer;
import org.apache.nutch.db.*;
import org.apache.nutch.io.*;
import org.apache.nutch.fs.*;
import org.apache.nutch.segment.SegmentReader;
import org.apache.nutch.util.*;


import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;


import java.util.logging.*;
import java.util.*;
import java.io.*;


/** Creates an index for the output corresponding to a single fetcher run. */
public class IndexSegment {
  public static final String DONE_NAME = "index.done";
  public static final Logger LOG =
    LogFormatter.getLogger("org.apache.nutch.index.IndexSegment");
  
  public static int LOG_STEP = 20000;


  private boolean boostByLinkCount =
    NutchConf.get().getBoolean("indexer.boost.by.link.count", false);


  private float scorePower = NutchConf.get().getFloat("indexer.score.power", 0.5f);
  private int maxFieldLength = NutchConf.get().getInt("indexer.max.tokens", 10000);
  private int MERGE_FACTOR = NutchConf.get().getInt("indexer.mergeFactor",
      IndexWriter.DEFAULT_MERGE_FACTOR);
  private int MIN_MERGE_DOCS = NutchConf.get().getInt("indexer.minMergeDocs",
      IndexWriter.DEFAULT_MIN_MERGE_DOCS);
  private int MAX_MERGE_DOCS = NutchConf.get().getInt("indexer.maxMergeDocs",
      IndexWriter.DEFAULT_MAX_MERGE_DOCS);
  private int TERM_INDEX_INTERVAL =
    NutchConf.get().getInt("indexer.termIndexInterval",
                           IndexWriter.DEFAULT_TERM_INDEX_INTERVAL);
  private NutchFileSystem nfs;
  private long maxDocs = Long.MAX_VALUE;
  private File srcDir;
  private File localWorkingDir;


  /**
   * Index a segment in the given NFS.
   */
  public IndexSegment(NutchFileSystem nfs, long maxDocs, File srcDir, File localWorkingDir) {
      this.nfs = nfs;
      this.maxDocs = maxDocs;
      this.srcDir = srcDir;
      this.localWorkingDir = localWorkingDir;
  }


  /** Determines the power of link analyis scores.  Each pages's boost is
   * set to <i>score<sup>scorePower</sup></i> where <i>score</i> is its link
   * analysis score and <i>scorePower</i> is the value passed to this method.
   */
  public void setScorePower(float power) { scorePower = power; }


  public void indexPages() throws Exception {
      //
      // First, see if it's ever been indexed before
      //
      File doneFile = new File(srcDir, DONE_NAME);
      if (nfs.exists(doneFile)) {
          throw new IOException("already indexed: " + doneFile + " exists");
      }


      //
      // OK, fine.  Build the writer to the local file, set params
      //
      File outputIndex = new File(srcDir, "index");
      File tmpOutputIndex = new File(localWorkingDir, "index");


      File localOutput = nfs.startLocalOutput(outputIndex, tmpOutputIndex);


      IndexWriter writer
          = new IndexWriter(localOutput, 
                            new NutchDocumentAnalyzer(), true);
      writer.mergeFactor = MERGE_FACTOR;
      writer.minMergeDocs = MIN_MERGE_DOCS;
      writer.maxMergeDocs = MAX_MERGE_DOCS;
      writer.setTermIndexInterval(TERM_INDEX_INTERVAL);
      writer.maxFieldLength = maxFieldLength;
      //writer.infoStream = LogFormatter.getLogStream(LOG, Level.FINE);
      writer.setUseCompoundFile(false);
      writer.setSimilarity(new NutchSimilarity());


      SegmentReader sr = null;


      long start = System.currentTimeMillis();
      long delta = start;
      long curTime, total = 0;
      long count = 0;
      try {
          LOG.info("* Opening segment " + srcDir.getName());
          sr = new SegmentReader(nfs, srcDir, false, true, true, true);


          total = sr.size;
          
          String segmentName = srcDir.getCanonicalFile().getName();
          FetcherOutput fetcherOutput = new FetcherOutput();
          ParseText parseText = new ParseText();
          ParseData parseData = new ParseData();
          LOG.info("* Indexing segment " + srcDir.getName());


          //
          // Iterate through all docs in the input
          //
          maxDocs = Math.min(sr.size, maxDocs);
          for (count = 0; count < maxDocs; count++) {
            if (!sr.next(fetcherOutput, null, parseText, parseData)) continue;


              // only index the page if it was fetched correctly
              if (!fetcherOutput.getProtocolStatus().isSuccess()) {
                  continue;                              
              }


              // reconstruct parse
              Parse parse = new ParseImpl(parseText.getText(), parseData);


              // build initial document w/ core fields
              Document doc = makeDocument(segmentName, count,
                                          fetcherOutput, parse);


              // run filters to add more fields to the document
              doc = IndexingFilters.filter(doc, parse, fetcherOutput);
    
              // add the document to the index
              writer.addDocument(doc);
              if (count > 0 && count % LOG_STEP == 0) {
                curTime = System.currentTimeMillis();
                LOG.info(" Processed " + count + " records (" +
                        ((float)LOG_STEP * 1000.0f / (float)(curTime - delta)) +
                        " rec/s)");
                delta = curTime;
              }
          }
      } catch (EOFException e) {
          LOG.warning("Unexpected EOF in: " + srcDir +
                      " at entry #" + count + ".  Ignoring.");
      } finally {
        sr.close();
      }
      LOG.info("* Optimizing index...");
      writer.optimize();
      writer.close();


      //
      // Put the local file in its place via NFS
      //
      //nfs.completeLocalOutput(new File(outputDir, "index"), new File(srcDir, "index"));
      LOG.info("* Moving index to NFS if needed...");
      nfs.completeLocalOutput(outputIndex, tmpOutputIndex);


      //
      // Emit "done" file
      //
      OutputStream out = nfs.create(doneFile);
      out.close();
      delta = System.currentTimeMillis() - start;
      float eps = (float) count / (float) (delta / 1000);
      LOG.info("DONE indexing segment " + srcDir.getName() + ": total " + total +
              " records in " + ((float) delta / 1000f) + " s (" + eps + " rec/s).");
  }


  /** 
   * Add core fields, required by other core components & features (i.e.,
   * merge, dedup, explain). 
   */
  private Document makeDocument(String segmentName, long docNo,
                                FetcherOutput fo, Parse parse) {


    Document doc = new Document();


    // add docno & segment, used to map from merged index back to segment files
    doc.add(Field.UnIndexed("docNo", Long.toString(docNo, 16)));
    doc.add(Field.UnIndexed("segment", segmentName));


    // add digest, used by dedup
    doc.add(Field.UnIndexed("digest", fo.getMD5Hash().toString()));


    float boost = calculateBoost(fo.getFetchListEntry().getPage().getScore(),
            scorePower, boostByLinkCount, fo.getAnchors().length);
    // 4. Apply boost to all indexed fields.
    doc.setBoost(boost);


    // store boost for use by explain and dedup
    doc.add(Field.UnIndexed("boost", Float.toString(boost)));


    return doc;
  }


  public static float calculateBoost(float pageScore, float scorePower,
          boolean boostByLinkCount, int linkCount) {
    // 1. Start with page's score from DB -- 1.0 if no link analysis.
    float res = pageScore;
    // 2. Apply scorePower to this.
    res = (float)Math.pow(pageScore, scorePower);
    // 3. Optionally boost by log of incoming anchor count.
    if (boostByLinkCount)
      res *= (float)Math.log(Math.E + linkCount);
    return res;
  }


  /** 
   * Create an index for the input files in the named directory. 
   */
  public static void main(String[] args) throws Exception {
      String usage = "IndexSegment (-local | -ndfs <namenode:port>) <segment_directory> [-dir <workingdir>]";
      if (args.length == 0) {
          System.err.println("Usage: " + usage);
          return;
      }


      NutchFileSystem nfs = NutchFileSystem.parseArgs(args, 0);
      try {
          int maxDocs = Integer.MAX_VALUE;
          File srcDir = null;
          File workingDir = new File(new File("").getCanonicalPath());
          for (int i = 0; i < args.length; i++) {
              if (args[i] != null) {
                  if (args[i].equals("-max")) {        // parse -max option
                      i++;
                      maxDocs = Integer.parseInt(args[i]);
                  } else if (args[i].equals("-dir")) {
                      i++;
                      workingDir = new File(new File(args[i]).getCanonicalPath());
                  } else {
                      srcDir = new File(args[i]);
                  }
              }
          }


          workingDir = new File(workingDir, "indexsegment-workingdir");
          if (workingDir.exists()) {
              FileUtil.fullyDelete(workingDir);
          }
          IndexSegment indexer = new IndexSegment(nfs, maxDocs, srcDir, workingDir);
          LOG.info("indexing segment: " + srcDir);
          indexer.indexPages();
          LOG.info("done indexing");
          FileUtil.fullyDelete(workingDir);
      } finally {
          nfs.close();
      }
  }
}
// Source code of org.apache.nutch.indexer.IndexSegment

// Related classes of org.apache.nutch.indexer.IndexSegment