/**
* Copyright 2005 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.indexer;

import java.io.*;
import java.util.*;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import org.apache.hadoop.io.*;
import org.apache.nutch.fetcher.Fetcher;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.Progressable;

import org.apache.nutch.parse.*;
import org.apache.nutch.analysis.*;
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.util.LogUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.crawl.LinkDb;

import org.apache.lucene.index.*;
import org.apache.lucene.document.*;

import org.apache.nutch.metadata.Metadata;
/**
 * Creates Lucene indexes for segments: merges each segment's fetch and parse
 * data with the crawldb and linkdb, runs indexing and scoring filters, and
 * writes the resulting documents to a Lucene index.
 */
public class Indexer extends Configured implements Reducer {

  public static final String DONE_NAME = "index.done";

  public static final Log LOG = LogFactory.getLog(Indexer.class);
  /** Wraps inputs in an {@link ObjectWritable}, to permit merging different
   * types in reduce. */
  public static class InputFormat extends SequenceFileInputFormat {
    public RecordReader getRecordReader(FileSystem fs, FileSplit split,
                                        JobConf job, Reporter reporter)
      throws IOException {

      reporter.setStatus(split.toString());

      return new SequenceFileRecordReader(job, split) {
          /** Reads the real value into a fresh instance of its own class and
           * hands it back inside the ObjectWritable wrapper. */
          public synchronized boolean next(Writable key, Writable value)
            throws IOException {
            ObjectWritable wrapper = (ObjectWritable)value;
            try {
              wrapper.set(getValueClass().newInstance());
            } catch (Exception e) {
              throw new IOException(e.toString());
            }
            return super.next(key, (Writable)wrapper.get());
          }
        };
    }
  }
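
  // Sketch of the wrapping performed above (illustration only, not executed
  // anywhere in this file): every value read from a segment's SequenceFiles is
  // carried through the job inside an ObjectWritable, so a single reduce can
  // receive CrawlDatum, ParseData, ParseText and Inlinks values for the same
  // key and unwrap them by type:
  //
  //   ObjectWritable wrapper = new ObjectWritable();
  //   wrapper.set(new CrawlDatum());                    // any Writable fits
  //   Writable unwrapped = (Writable) wrapper.get();    // unwrap in reduce
  //   if (unwrapped instanceof CrawlDatum) { /* handle crawl data */ }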
  /** Unwraps Lucene Documents created by reduce and adds them to an index. */
  public static class OutputFormat
    extends org.apache.hadoop.mapred.OutputFormatBase {
    public RecordWriter getRecordWriter(final FileSystem fs, JobConf job,
                                        String name, Progressable progress)
      throws IOException {
      final Path perm = new Path(job.getOutputPath(), name);
      final Path temp =
        job.getLocalPath("index/_"+Integer.toString(new Random().nextInt()));

      fs.delete(perm);                                  // delete old, if any

      final AnalyzerFactory factory = new AnalyzerFactory(job);
      final IndexWriter writer =                        // build locally first
        new IndexWriter(fs.startLocalOutput(perm, temp).toString(),
                        new NutchDocumentAnalyzer(job), true);

      writer.setMergeFactor(job.getInt("indexer.mergeFactor", 10));
      writer.setMaxBufferedDocs(job.getInt("indexer.minMergeDocs", 100));
      writer.setMaxMergeDocs(job.getInt("indexer.maxMergeDocs", Integer.MAX_VALUE));
      writer.setTermIndexInterval(job.getInt("indexer.termIndexInterval", 128));
      writer.setMaxFieldLength(job.getInt("indexer.max.tokens", 10000));
      writer.setInfoStream(LogUtil.getInfoStream(LOG));
      writer.setUseCompoundFile(false);
      writer.setSimilarity(new NutchSimilarity());

      return new RecordWriter() {
          // written by close(), read by the heartbeat thread below
          volatile boolean closed;

          public void write(WritableComparable key, Writable value)
            throws IOException {                        // unwrap & index doc
            Document doc = (Document)((ObjectWritable)value).get();
            NutchAnalyzer analyzer = factory.get(doc.get("lang"));
            if (LOG.isInfoEnabled()) {
              LOG.info(" Indexing [" + doc.getField("url").stringValue() + "]" +
                       " with analyzer " + analyzer +
                       " (" + doc.get("lang") + ")");
            }
            writer.addDocument(doc, analyzer);
          }

          public void close(final Reporter reporter) throws IOException {
            // spawn a thread to give progress heartbeats while the index is
            // optimized and copied, so the task is not killed as hung
            Thread prog = new Thread() {
                public void run() {
                  while (!closed) {
                    try {
                      reporter.setStatus("closing");
                      Thread.sleep(1000);
                    } catch (InterruptedException e) { continue; }
                      catch (Throwable e) { return; }
                  }
                }
              };

            try {
              prog.start();
              if (LOG.isInfoEnabled()) { LOG.info("Optimizing index."); }
              // optimize & close index
              writer.optimize();
              writer.close();
              fs.completeLocalOutput(perm, temp);       // copy to dfs
              fs.createNewFile(new Path(perm, DONE_NAME));
            } finally {
              closed = true;                            // stop the heartbeat thread
            }
          }
        };
    }
  }
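
  // The Lucene writer above is tuned entirely through job properties read in
  // getRecordWriter(); a JobConf or nutch-site.xml can override the defaults.
  // The values below are illustrative only, not recommendations:
  //
  //   job.setInt("indexer.mergeFactor", 50);
  //   job.setInt("indexer.minMergeDocs", 500);          // becomes setMaxBufferedDocs()
  //   job.setInt("indexer.maxMergeDocs", Integer.MAX_VALUE);
  //   job.setInt("indexer.termIndexInterval", 128);
  //   job.setInt("indexer.max.tokens", 10000);          // becomes setMaxFieldLength()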
  private IndexingFilters filters;
  private ScoringFilters scfilters;

  public Indexer() {
    super(null);
  }

  /** Construct an Indexer. */
  public Indexer(Configuration conf) {
    super(conf);
  }

  public void configure(JobConf job) {
    setConf(job);
    this.filters = new IndexingFilters(getConf());
    this.scfilters = new ScoringFilters(getConf());
  }

  public void close() {}
  /** Collates the crawldb, linkdb and segment entries for one URL and turns
   * them into a single Lucene document. */
  public void reduce(WritableComparable key, Iterator values,
                     OutputCollector output, Reporter reporter)
    throws IOException {
    Inlinks inlinks = null;
    CrawlDatum dbDatum = null;
    CrawlDatum fetchDatum = null;
    ParseData parseData = null;
    ParseText parseText = null;
    while (values.hasNext()) {
      Object value = ((ObjectWritable)values.next()).get(); // unwrap
      if (value instanceof Inlinks) {
        inlinks = (Inlinks)value;
      } else if (value instanceof CrawlDatum) {
        CrawlDatum datum = (CrawlDatum)value;
        switch (datum.getStatus()) {
        case CrawlDatum.STATUS_DB_UNFETCHED:
        case CrawlDatum.STATUS_DB_FETCHED:
        case CrawlDatum.STATUS_DB_GONE:
          dbDatum = datum;
          break;
        case CrawlDatum.STATUS_FETCH_SUCCESS:
        case CrawlDatum.STATUS_FETCH_RETRY:
        case CrawlDatum.STATUS_FETCH_GONE:
          fetchDatum = datum;
          break;
        default:
          throw new RuntimeException("Unexpected status: "+datum.getStatus());
        }
      } else if (value instanceof ParseData) {
        parseData = (ParseData)value;
      } else if (value instanceof ParseText) {
        parseText = (ParseText)value;
      } else if (LOG.isWarnEnabled()) {
        LOG.warn("Unrecognized type: "+value.getClass());
      }
    }

    if (fetchDatum == null || dbDatum == null
        || parseText == null || parseData == null) {
      return;                                     // only have inlinks
    }

    Document doc = new Document();
    Metadata metadata = parseData.getContentMeta();

    // add segment, used to map from merged index back to segment files
    doc.add(new Field("segment", metadata.get(Fetcher.SEGMENT_NAME_KEY),
                      Field.Store.YES, Field.Index.NO));

    // add digest, used by dedup
    doc.add(new Field("digest", metadata.get(Fetcher.SIGNATURE_KEY),
                      Field.Store.YES, Field.Index.NO));

    // if (LOG.isInfoEnabled()) {
    //   LOG.info("Url: "+key.toString());
    //   LOG.info("Title: "+parseData.getTitle());
    //   LOG.info(crawlDatum.toString());
    //   if (inlinks != null) {
    //     LOG.info(inlinks.toString());
    //   }
    // }

    Parse parse = new ParseImpl(parseText, parseData);
    try {
      // run indexing filters
      doc = this.filters.filter(doc, parse, (UTF8)key, fetchDatum, inlinks);
    } catch (IndexingException e) {
      if (LOG.isWarnEnabled()) { LOG.warn("Error indexing "+key+": "+e); }
      return;
    }

    float boost = 1.0f;
    // run scoring filters
    try {
      boost = this.scfilters.indexerScore((UTF8)key, doc, dbDatum,
                                          fetchDatum, parse, inlinks, boost);
    } catch (ScoringFilterException e) {
      if (LOG.isWarnEnabled()) {
        LOG.warn("Error calculating score " + key + ": " + e);
      }
      return;
    }

    // apply boost to all indexed fields.
    doc.setBoost(boost);
    // store boost for use by explain and dedup
    doc.add(new Field("boost", Float.toString(boost),
                      Field.Store.YES, Field.Index.NO));

    output.collect(key, new ObjectWritable(doc));
  }
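
  // A minimal sketch (assumption, not part of this file) of the kind of plugin
  // the filters.filter(...) call above dispatches to: an IndexingFilter gets the
  // document plus parse, crawl and link data and may add fields to it. The exact
  // interface details are assumed here from the call site; "example" is a
  // hypothetical field name.
  //
  //   public class ExampleFilter implements IndexingFilter {
  //     public Document filter(Document doc, Parse parse, UTF8 url,
  //                            CrawlDatum datum, Inlinks inlinks)
  //       throws IndexingException {
  //       // fields such as "url" and "lang", read back by OutputFormat above,
  //       // are added by filters like this one
  //       doc.add(new Field("example", parse.getData().getTitle(),
  //                         Field.Store.YES, Field.Index.TOKENIZED));
  //       return doc;
  //     }
  //   }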
  /** Runs the indexing job: one map/reduce pass over the given crawldb, linkdb
   * and segments, writing a Lucene index under indexDir. */
  public void index(Path indexDir, Path crawlDb, Path linkDb, Path[] segments)
    throws IOException {

    if (LOG.isInfoEnabled()) {
      LOG.info("Indexer: starting");
      LOG.info("Indexer: linkdb: " + linkDb);
    }

    JobConf job = new NutchJob(getConf());
    job.setJobName("index " + indexDir);

    for (int i = 0; i < segments.length; i++) {
      if (LOG.isInfoEnabled()) {
        LOG.info("Indexer: adding segment: " + segments[i]);
      }
      job.addInputPath(new Path(segments[i], CrawlDatum.FETCH_DIR_NAME));
      job.addInputPath(new Path(segments[i], ParseData.DIR_NAME));
      job.addInputPath(new Path(segments[i], ParseText.DIR_NAME));
    }

    job.addInputPath(new Path(crawlDb, CrawlDatum.DB_DIR_NAME));
    job.addInputPath(new Path(linkDb, LinkDb.CURRENT_NAME));

    job.setInputFormat(InputFormat.class);
    job.setInputKeyClass(UTF8.class);
    job.setInputValueClass(ObjectWritable.class);

    //job.setCombinerClass(Indexer.class);
    job.setReducerClass(Indexer.class);

    job.setOutputPath(indexDir);
    job.setOutputFormat(OutputFormat.class);
    job.setOutputKeyClass(UTF8.class);
    job.setOutputValueClass(ObjectWritable.class);

    JobClient.runJob(job);
    if (LOG.isInfoEnabled()) { LOG.info("Indexer: done"); }
  }
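
  // Input layout assembled by index() above, one subdirectory per constant used.
  // The literal directory names below are the values these constants are assumed
  // to carry, shown for orientation only:
  //
  //   <segment>/crawl_fetch    fetch status      (CrawlDatum.FETCH_DIR_NAME)
  //   <segment>/parse_data     parse metadata    (ParseData.DIR_NAME)
  //   <segment>/parse_text     extracted text    (ParseText.DIR_NAME)
  //   <crawldb>/current        db status         (CrawlDatum.DB_DIR_NAME)
  //   <linkdb>/current         inlinks           (LinkDb.CURRENT_NAME)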
  public static void main(String[] args) throws Exception {
    Indexer indexer = new Indexer(NutchConfiguration.create());

    if (args.length < 4) {
      System.err.println("Usage: Indexer <index> <crawldb> <linkdb> <segment> ...");
      return;
    }

    Path[] segments = new Path[args.length-3];
    for (int i = 3; i < args.length; i++) {
      segments[i-3] = new Path(args[i]);
    }

    indexer.index(new Path(args[0]), new Path(args[1]), new Path(args[2]),
                  segments);
  }
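
  // Example invocation (sketch; the paths below are hypothetical):
  //
  //   Indexer crawl/indexes crawl/crawldb crawl/linkdb crawl/segments/20060101000000
  //
  // or, programmatically, using the same arguments main() would pass:
  //
  //   Indexer indexer = new Indexer(NutchConfiguration.create());
  //   indexer.index(new Path("crawl/indexes"), new Path("crawl/crawldb"),
  //                 new Path("crawl/linkdb"),
  //                 new Path[] { new Path("crawl/segments/20060101000000") });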
}