Source Code of org.apache.nutch.indexer.Indexer (including the inner class Indexer$OutputFormat)

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.nutch.indexer;

import java.io.*;
import java.util.*;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import org.apache.hadoop.io.*;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.Progressable;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.ToolBase;
import org.apache.nutch.parse.*;
import org.apache.nutch.analysis.*;

import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.util.LogUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;

import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.CrawlDb;
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.crawl.LinkDb;

import org.apache.lucene.index.*;
import org.apache.lucene.document.*;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;

import org.apache.nutch.global.Global;


/** Create indexes for segments. */
public class Indexer extends ToolBase implements Reducer, Mapper {
 
  public static final String DONE_NAME = "index.done";
  public static final Log LOG = LogFactory.getLog(Indexer.class);
 
      
  /** Unwrap Lucene Documents created by reduce and add them to an index. */
  public static class OutputFormat
    extends org.apache.hadoop.mapred.OutputFormatBase {
    public RecordWriter getRecordWriter(final FileSystem fs, JobConf job,
                                        String name, Progressable progress) throws IOException {
      final Path perm = new Path(job.getOutputPath(), name);
      final Path temp =
        job.getLocalPath("index/_"+Integer.toString(new Random().nextInt()));

      fs.delete(perm);                            // delete old, if any

      final AnalyzerFactory factory = new AnalyzerFactory(job);
      final IndexWriter writer =                  // build locally first
        new IndexWriter(fs.startLocalOutput(perm, temp).toString(),
                        new NutchDocumentAnalyzer(job), true);

      writer.setMergeFactor(job.getInt("indexer.mergeFactor", 10));
      writer.setMaxBufferedDocs(job.getInt("indexer.minMergeDocs", 100));
      writer.setMaxMergeDocs(job.getInt("indexer.maxMergeDocs", Integer.MAX_VALUE));
      writer.setTermIndexInterval
        (job.getInt("indexer.termIndexInterval", 128));
      writer.setMaxFieldLength(job.getInt("indexer.max.tokens", 10000));
      writer.setInfoStream(LogUtil.getInfoStream(LOG));
      writer.setUseCompoundFile(false);
      writer.setSimilarity(new NutchSimilarity());
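
      // Note on the flow below: the index is built in a job-local directory
      // (temp) and only copied to the final output path (perm) when the
      // RecordWriter is closed, via fs.completeLocalOutput(); the DONE_NAME
      // marker file then signals a completed copy.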

      return new RecordWriter() {
          // volatile: written by close() and read by the progress thread below
          volatile boolean closed;

          public void write(WritableComparable key, Writable value)
            throws IOException {                  // unwrap & index doc
            Document doc = (Document)((ObjectWritable)value).get();
            NutchAnalyzer analyzer = factory.get(doc.get("lang"));
            if (LOG.isInfoEnabled()) {
              LOG.info(" Indexing [" + doc.getField("url").stringValue() + "]" +
                       " with analyzer " + analyzer +
                       " (" + doc.get("lang") + ")");
            }
            writer.addDocument(doc, analyzer);
          }
         
          public void close(final Reporter reporter) throws IOException {
            // spawn a thread to give progress heartbeats
            Thread prog = new Thread() {
                public void run() {
                  while (!closed) {
                    try {
                      reporter.setStatus("closing");
                      Thread.sleep(1000);
                    } catch (InterruptedException e) { continue; }
                    catch (Throwable e) { return; }
                  }
                }
              };

            try {
              prog.start();
              if (LOG.isInfoEnabled()) { LOG.info("Optimizing index."); }
              // optimize & close index
              writer.optimize();
              writer.close();
              fs.completeLocalOutput(perm, temp);   // copy to dfs
              fs.createNewFile(new Path(perm, DONE_NAME));
            } finally {
              closed = true;
            }
          }
        };
    }
  }
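
  // The OutputFormat above reads its Lucene tuning knobs from the job
  // configuration. A minimal nutch-site.xml sketch overriding two of them
  // (the values shown are the defaults hard-coded above; the property names
  // are the ones actually read in getRecordWriter()):
  //
  //   <property>
  //     <name>indexer.mergeFactor</name>
  //     <value>10</value>
  //   </property>
  //   <property>
  //     <name>indexer.max.tokens</name>
  //     <value>10000</value>
  //   </property>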

  private IndexingFilters filters;
  private ScoringFilters scfilters;
  private String collectionType;
 

  public Indexer() {
  }
 
  public Indexer(Configuration conf) {
    setConf(conf);
  }
 
  public void configure(JobConf job) {
    setConf(job);
    this.filters = new IndexingFilters(getConf());
    this.scfilters = new ScoringFilters(getConf());   
    this.collectionType = job.get(Global.COLLECTION_TYPE);  
  }

  public void close() {}

  public void reduce(WritableComparable key, Iterator values,
                     OutputCollector output, Reporter reporter)
    throws IOException {    
   
    Inlinks inlinks = null;
    CrawlDatum dbDatum = null;
    CrawlDatum fetchDatum = null;
    CrawlDatum redir = null;
    ParseData parseData = null;
    ParseText parseText = null;
    Float pagerank = null; // TODO MC
    while (values.hasNext()) {
      Object value = ((ObjectWritable)values.next()).get(); // unwrap
         
      if (value instanceof Inlinks) {
        inlinks = (Inlinks)value;
      }
      else if (value instanceof CrawlDatum) {
         
        CrawlDatum datum = (CrawlDatum)value;       
        if (CrawlDatum.hasDbStatus(datum))
          dbDatum = datum;
        else if (CrawlDatum.hasFetchStatus(datum))
          fetchDatum = datum;
        else if (CrawlDatum.STATUS_LINKED == datum.getStatus())
          // redirected page
          redir = datum;
        else
          throw new RuntimeException("Unexpected status: "+datum.getStatus());
      }
      else if (value instanceof ParseData) {
        parseData = (ParseData)value;
      }
      else if (value instanceof ParseText) {
        parseText = (ParseText)value;
      }      
      else if (value instanceof FloatWritable) {  // TODO MC
        pagerank = ((FloatWritable)value).get();
      }
      else if (LOG.isWarnEnabled()) {
        LOG.warn("Unrecognized type: "+value.getClass());
      }
    }     
   
    if (collectionType.equals(Global.COLLECTION_TYPE_TREC)) {
      LOG.info("index TREC: " + key.toString() + " " + (redir == null) + " " +
               (fetchDatum == null) + " " + (dbDatum == null) + " " +
               (parseText == null) + " " + (parseData == null) + " " +
               (inlinks == null) + " " + (pagerank == null));
    }
   
    if (redir != null) { // does not work - see http://www.mail-archive.com/nutch-commits@lucene.apache.org/msg01971.html
      // XXX page was redirected - what should we do?
      // XXX discard it for now
     
      LOG.info("index REDIR:"+redir); // sanity check
      return;
    }
       

    if (collectionType.equals(Global.COLLECTION_TYPE_TREC)) {
      if (fetchDatum == null /*|| dbDatum == null*/
          || parseText == null || parseData == null) {
        return;                                     // only have inlinks
      }
    }
    else {
      if (fetchDatum == null || dbDatum == null
          || parseText == null || parseData == null) {
        return;                                     // only have inlinks
      }
    }
      
    Document doc = new Document();
    Metadata metadata = parseData.getContentMeta();
  
    if (metadata.get(Nutch.SEGMENT_NAME_KEY) == null ||
        metadata.get(Nutch.SIGNATURE_KEY) == null) {
      LOG.error("Metadata empty: " + key + " " + parseData.toString());
      return;
    }
   
    // add segment, used to map from merged index back to segment files
    doc.add(new Field("segment", metadata.get(Nutch.SEGMENT_NAME_KEY),
            Field.Store.YES, Field.Index.NO));

    // add digest, used by dedup
    doc.add(new Field("digest", metadata.get(Nutch.SIGNATURE_KEY),
            Field.Store.YES, Field.Index.NO));
        
    Parse parse = new ParseImpl(parseText, parseData);
    try {
      // run indexing filters
      doc = this.filters.filter(doc, parse, (Text)key, fetchDatum, inlinks);
    } catch (IndexingException e) {
      if (LOG.isWarnEnabled()) { LOG.warn("Error indexing "+key+": "+e); }
      return;
    }
   
    float boost = 1.0f;
    // run scoring filters
    if (dbDatum!=null || !collectionType.equals(Global.COLLECTION_TYPE_TREC)) {
      try {
        boost = this.scfilters.indexerScore((Text)key, doc, dbDatum,
              fetchDatum, parse, inlinks, boost);
      } catch (ScoringFilterException e) {
        if (LOG.isWarnEnabled()) {
          LOG.warn("Error calculating score " + key + ": " + e);
        }
        return;
      }
    }            
   
    // doc.setBoost(boost) would apply the boost to all indexed fields; it is
    // deliberately left commented out, so the document keeps the default
    // boost of 1.0f.
    //    doc.setBoost(boost);
    // store boost for use by explain and dedup
    doc.add(new Field("boost", Float.toString(boost), Field.Store.YES, Field.Index.NO));       
    doc.add(new Field("inlinks", (inlinks==null) ? "0" : Integer.toString(inlinks.size()), Field.Store.YES, Field.Index.NO));
    doc.add(new Field("outlinks", (parseData.getOutlinks()==null) ? "0" : Integer.toString(parseData.getOutlinks().length), Field.Store.YES, Field.Index.NO));
    doc.add(new Field("pagerank", (pagerank==null) ? "0" : Float.toString(pagerank), Field.Store.YES, Field.Index.NO));
   
    output.collect(key, new ObjectWritable(doc));    
  }
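
  // reduce() above emits one Lucene Document per URL that was fetched and
  // parsed successfully, carrying at least the stored, unindexed fields
  // segment, digest, boost, inlinks, outlinks and pagerank, plus whatever
  // the configured IndexingFilters contribute.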

  public void index(Path indexDir, Path crawlDb, Path linkDb, Path[] segments)
    throws IOException {

    if (LOG.isInfoEnabled()) {
      LOG.info("Indexer: starting");
      LOG.info("Indexer: linkdb: " + linkDb);
    }

    JobConf job = new NutchJob(getConf());
    job.setJobName("index " + indexDir);

    for (int i = 0; i < segments.length; i++) {
      if (LOG.isInfoEnabled()) {
        LOG.info("Indexer: adding segment: " + segments[i]);
      }
      job.addInputPath(new Path(segments[i], CrawlDatum.FETCH_DIR_NAME));
      job.addInputPath(new Path(segments[i], ParseData.DIR_NAME));
      job.addInputPath(new Path(segments[i], ParseText.DIR_NAME));
    }

    job.addInputPath(new Path(crawlDb, CrawlDb.CURRENT_NAME));
    job.addInputPath(new Path(linkDb, LinkDb.CURRENT_NAME));
    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapperClass(Indexer.class);
    job.setReducerClass(Indexer.class);

    job.setOutputPath(indexDir);
    job.setOutputFormat(OutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(ObjectWritable.class);

    JobClient.runJob(job);
    if (LOG.isInfoEnabled()) { LOG.info("Indexer: done"); }
  }
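
  // A minimal programmatic sketch of driving this job (the paths are
  // hypothetical examples, not defaults):
  //
  //   Indexer indexer = new Indexer(NutchConfiguration.create());
  //   indexer.index(new Path("crawl/indexes"),
  //                 new Path("crawl/crawldb"),
  //                 new Path("crawl/linkdb"),
  //                 new Path[] { new Path("crawl/segments/20090214150000") });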

  public static void main(String[] args) throws Exception {
    int res = new Indexer().doMain(NutchConfiguration.create(), args);
    System.exit(res);
  }
 
  public int run(String[] args) throws Exception {
   
    if (args.length < 4) {
      System.err.println("Usage: <index> <crawldb> <linkdb> <segment> ...");
      return -1;
    }
   
    Path[] segments = new Path[args.length-3];
    for (int i = 3; i < args.length; i++) {
      segments[i-3] = new Path(args[i]);
    }

    try {
      index(new Path(args[0]), new Path(args[1]), new Path(args[2]),
                  segments);
      return 0;
    } catch (Exception e) {
      LOG.fatal("Indexer: " + StringUtils.stringifyException(e));
      return -1;
    }
  }
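
  // map() below is an identity pass: it wraps every input value in an
  // ObjectWritable so that the heterogeneous record types (CrawlDatum,
  // ParseData, ParseText, Inlinks, FloatWritable) can flow through a single
  // reduce call per URL key.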

  public void map(WritableComparable key, Writable value,
      OutputCollector output, Reporter reporter) throws IOException {
    output.collect(key, new ObjectWritable(value));
  }

}
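
Usage note: as the run() method's usage string indicates, the tool takes an
index directory, a crawldb, a linkdb, and one or more segments. A minimal
command-line sketch, assuming the standard bin/nutch launcher script maps the
"index" command to this class (all paths are hypothetical examples):

  bin/nutch index crawl/indexes crawl/crawldb crawl/linkdb crawl/segments/20090214150000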