Source Code of org.apache.nutch.indexer.Indexer (including the inner class Indexer$OutputFormat)

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.nutch.indexer;

import java.io.*;
import java.util.*;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import org.apache.hadoop.io.*;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.Progressable;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.ToolBase;
import org.apache.nutch.parse.*;
import org.apache.nutch.analysis.*;

import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.util.LogUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;

import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.CrawlDb;
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.crawl.LinkDb;

import org.apache.lucene.index.*;
import org.apache.lucene.document.*;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;

import org.apache.nutch.global.Global;


/** Create indexes for segments. */
public class Indexer extends ToolBase implements Reducer, Mapper {
 
  public static final String DONE_NAME = "index.done";
  public static final Log LOG = LogFactory.getLog(Indexer.class);
 
      
  /** Unwrap Lucene Documents created by reduce and add them to an index. */
  public static class OutputFormat
    extends org.apache.hadoop.mapred.OutputFormatBase {
    public RecordWriter getRecordWriter(final FileSystem fs, JobConf job,
                                        String name, Progressable progress) throws IOException {
      final Path perm = new Path(job.getOutputPath(), name);
      final Path temp =
        job.getLocalPath("index/_"+Integer.toString(new Random().nextInt()));

      fs.delete(perm);                            // delete old, if any

      final AnalyzerFactory factory = new AnalyzerFactory(job);
      final IndexWriter writer =                  // build locally first
        new IndexWriter(fs.startLocalOutput(perm, temp).toString(),
                        new NutchDocumentAnalyzer(job), true);

      writer.setMergeFactor(job.getInt("indexer.mergeFactor", 10));
      writer.setMaxBufferedDocs(job.getInt("indexer.minMergeDocs", 100));
      writer.setMaxMergeDocs(job.getInt("indexer.maxMergeDocs", Integer.MAX_VALUE));
      writer.setTermIndexInterval
        (job.getInt("indexer.termIndexInterval", 128));
      writer.setMaxFieldLength(job.getInt("indexer.max.tokens", 10000));
      writer.setInfoStream(LogUtil.getInfoStream(LOG));
      writer.setUseCompoundFile(false);
      writer.setSimilarity(new NutchSimilarity());
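
      // Note on the flow below: the index is built in a job-local directory
      // (temp) and only copied to the final output path (perm) when the
      // RecordWriter is closed, via fs.completeLocalOutput(); the DONE_NAME
      // marker file then signals a completed copy.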

      return new RecordWriter() {
          // volatile: written by close() and read by the progress thread below
          volatile boolean closed;

          public void write(WritableComparable key, Writable value)
            throws IOException {                  // unwrap & index doc
            Document doc = (Document)((ObjectWritable)value).get();
            NutchAnalyzer analyzer = factory.get(doc.get("lang"));
            if (LOG.isInfoEnabled()) {
              LOG.info(" Indexing [" + doc.getField("url").stringValue() + "]" +
                       " with analyzer " + analyzer +
                       " (" + doc.get("lang") + ")");
            }
            writer.addDocument(doc, analyzer);
          }
         
          public void close(final Reporter reporter) throws IOException {
            // spawn a thread to give progress heartbeats
            Thread prog = new Thread() {
                public void run() {
                  while (!closed) {
                    try {
                      reporter.setStatus("closing");
                      Thread.sleep(1000);
                    } catch (InterruptedException e) { continue; }
                    catch (Throwable e) { return; }
                  }
                }
              };

            try {
              prog.start();
              if (LOG.isInfoEnabled()) { LOG.info("Optimizing index."); }
              // optimize & close index
              writer.optimize();
              writer.close();
              fs.completeLocalOutput(perm, temp);   // copy to dfs
              fs.createNewFile(new Path(perm, DONE_NAME));
            } finally {
              closed = true;
            }
          }
        };
    }
  }
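
  // The OutputFormat above reads its Lucene tuning knobs from the job
  // configuration. A minimal nutch-site.xml sketch overriding two of them
  // (the values shown are the defaults hard-coded above; the property names
  // are the ones actually read in getRecordWriter()):
  //
  //   <property>
  //     <name>indexer.mergeFactor</name>
  //     <value>10</value>
  //   </property>
  //   <property>
  //     <name>indexer.max.tokens</name>
  //     <value>10000</value>
  //   </property>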

  private IndexingFilters filters;
  private ScoringFilters scfilters;
  private String collectionType;
 

  public Indexer() {
  }
 
  public Indexer(Configuration conf) {
    setConf(conf);
  }
 
  public void configure(JobConf job) {
    setConf(job);
    this.filters = new IndexingFilters(getConf());
    this.scfilters = new ScoringFilters(getConf());   
    this.collectionType = job.get(Global.COLLECTION_TYPE);  
  }

  public void close() {}

  public void reduce(WritableComparable key, Iterator values,
                     OutputCollector output, Reporter reporter)
    throws IOException {    
   
    Inlinks inlinks = null;
    CrawlDatum dbDatum = null;
    CrawlDatum fetchDatum = null;
    CrawlDatum redir = null;
    ParseData parseData = null;
    ParseText parseText = null;
    Float pagerank = null; // TODO MC
    while (values.hasNext()) {
      Object value = ((ObjectWritable)values.next()).get(); // unwrap
         
      if (value instanceof Inlinks) {
        inlinks = (Inlinks)value;
      }
      else if (value instanceof CrawlDatum) {
         
        CrawlDatum datum = (CrawlDatum)value;       
        if (CrawlDatum.hasDbStatus(datum))
          dbDatum = datum;
        else if (CrawlDatum.hasFetchStatus(datum))
          fetchDatum = datum;
        else if (CrawlDatum.STATUS_LINKED == datum.getStatus())
          // redirected page
          redir = datum;
        else
          throw new RuntimeException("Unexpected status: "+datum.getStatus());
      }
      else if (value instanceof ParseData) {
        parseData = (ParseData)value;
      }
      else if (value instanceof ParseText) {
        parseText = (ParseText)value;
      }      
      else if (value instanceof FloatWritable) {  // TODO MC
        pagerank = ((FloatWritable)value).get();
      }
      else if (LOG.isWarnEnabled()) {
        LOG.warn("Unrecognized type: "+value.getClass());
      }
    }     
   
    if (collectionType.equals(Global.COLLECTION_TYPE_TREC)) {
      LOG.info("index TREC: " + key.toString() + " " + (redir == null) + " " +
               (fetchDatum == null) + " " + (dbDatum == null) + " " +
               (parseText == null) + " " + (parseData == null) + " " +
               (inlinks == null) + " " + (pagerank == null));
    }
   
    if (redir != null) { // does not work - see http://www.mail-archive.com/nutch-commits@lucene.apache.org/msg01971.html
      // XXX page was redirected - what should we do?
      // XXX discard it for now
     
      LOG.info("index REDIR:"+redir); // sanity check
      return;
    }
       

    if (collectionType.equals(Global.COLLECTION_TYPE_TREC)) {
      if (fetchDatum == null /*|| dbDatum == null*/
          || parseText == null || parseData == null) {
        return;                                     // only have inlinks
      }
    }
    else {
      if (fetchDatum == null || dbDatum == null
          || parseText == null || parseData == null) {
        return;                                     // only have inlinks
      }
    }
      
    Document doc = new Document();
    Metadata metadata = parseData.getContentMeta();
  
    if (metadata.get(Nutch.SEGMENT_NAME_KEY) == null ||
        metadata.get(Nutch.SIGNATURE_KEY) == null) {
      LOG.error("Metadata empty: " + key + " " + parseData.toString());
      return;
    }
   
    // add segment, used to map from merged index back to segment files
    doc.add(new Field("segment", metadata.get(Nutch.SEGMENT_NAME_KEY),
            Field.Store.YES, Field.Index.NO));

    // add digest, used by dedup
    doc.add(new Field("digest", metadata.get(Nutch.SIGNATURE_KEY),
            Field.Store.YES, Field.Index.NO));
        
    Parse parse = new ParseImpl(parseText, parseData);
    try {
      // run indexing filters
      doc = this.filters.filter(doc, parse, (Text)key, fetchDatum, inlinks);
    } catch (IndexingException e) {
      if (LOG.isWarnEnabled()) { LOG.warn("Error indexing "+key+": "+e); }
      return;
    }
   
    float boost = 1.0f;
    // run scoring filters
    if (dbDatum!=null || !collectionType.equals(Global.COLLECTION_TYPE_TREC)) {
      try {
        boost = this.scfilters.indexerScore((Text)key, doc, dbDatum,
              fetchDatum, parse, inlinks, boost);
      } catch (ScoringFilterException e) {
        if (LOG.isWarnEnabled()) {
          LOG.warn("Error calculating score " + key + ": " + e);
        }
        return;
      }
    }            
   
    // doc.setBoost(boost) would apply the boost to all indexed fields; it is
    // deliberately left commented out, so the document keeps the default
    // boost of 1.0f.
    //    doc.setBoost(boost);
    // store boost for use by explain and dedup
    doc.add(new Field("boost", Float.toString(boost), Field.Store.YES, Field.Index.NO));       
    doc.add(new Field("inlinks", (inlinks==null) ? "0" : Integer.toString(inlinks.size()), Field.Store.YES, Field.Index.NO));
    doc.add(new Field("outlinks", (parseData.getOutlinks()==null) ? "0" : Integer.toString(parseData.getOutlinks().length), Field.Store.YES, Field.Index.NO));
    doc.add(new Field("pagerank", (pagerank==null) ? "0" : Float.toString(pagerank), Field.Store.YES, Field.Index.NO));
   
    output.collect(key, new ObjectWritable(doc));    
  }
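
  // reduce() above emits one Lucene Document per URL that was fetched and
  // parsed successfully, carrying at least the stored, unindexed fields
  // segment, digest, boost, inlinks, outlinks and pagerank, plus whatever
  // the configured IndexingFilters contribute.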

  public void index(Path indexDir, Path crawlDb, Path linkDb, Path[] segments)
    throws IOException {

    if (LOG.isInfoEnabled()) {
      LOG.info("Indexer: starting");
      LOG.info("Indexer: linkdb: " + linkDb);
    }

    JobConf job = new NutchJob(getConf());
    job.setJobName("index " + indexDir);

    for (int i = 0; i < segments.length; i++) {
      if (LOG.isInfoEnabled()) {
        LOG.info("Indexer: adding segment: " + segments[i]);
      }
      job.addInputPath(new Path(segments[i], CrawlDatum.FETCH_DIR_NAME));
      job.addInputPath(new Path(segments[i], ParseData.DIR_NAME));
      job.addInputPath(new Path(segments[i], ParseText.DIR_NAME));
    }

    job.addInputPath(new Path(crawlDb, CrawlDb.CURRENT_NAME));
    job.addInputPath(new Path(linkDb, LinkDb.CURRENT_NAME));
    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapperClass(Indexer.class);
    job.setReducerClass(Indexer.class);

    job.setOutputPath(indexDir);
    job.setOutputFormat(OutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(ObjectWritable.class);

    JobClient.runJob(job);
    if (LOG.isInfoEnabled()) { LOG.info("Indexer: done"); }
  }
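
  // A minimal programmatic sketch of driving this job (the paths are
  // hypothetical examples, not defaults):
  //
  //   Indexer indexer = new Indexer(NutchConfiguration.create());
  //   indexer.index(new Path("crawl/indexes"),
  //                 new Path("crawl/crawldb"),
  //                 new Path("crawl/linkdb"),
  //                 new Path[] { new Path("crawl/segments/20090214150000") });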

  public static void main(String[] args) throws Exception {
    int res = new Indexer().doMain(NutchConfiguration.create(), args);
    System.exit(res);
  }
 
  public int run(String[] args) throws Exception {
   
    if (args.length < 4) {
      System.err.println("Usage: <index> <crawldb> <linkdb> <segment> ...");
      return -1;
    }
   
    Path[] segments = new Path[args.length-3];
    for (int i = 3; i < args.length; i++) {
      segments[i-3] = new Path(args[i]);
    }

    try {
      index(new Path(args[0]), new Path(args[1]), new Path(args[2]),
                  segments);
      return 0;
    } catch (Exception e) {
      LOG.fatal("Indexer: " + StringUtils.stringifyException(e));
      return -1;
    }
  }
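
  // map() below is an identity pass: it wraps every input value in an
  // ObjectWritable so that the heterogeneous record types (CrawlDatum,
  // ParseData, ParseText, Inlinks, FloatWritable) can flow through a single
  // reduce call per URL key.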

  public void map(WritableComparable key, Writable value,
      OutputCollector output, Reporter reporter) throws IOException {
    output.collect(key, new ObjectWritable(value));
  }

}
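
Usage note: as the run() method's usage string indicates, the tool takes an
index directory, a crawldb, a linkdb, and one or more segments. A minimal
command-line sketch, assuming the standard bin/nutch launcher script maps the
"index" command to this class (all paths are hypothetical examples):

  bin/nutch index crawl/indexes crawl/crawldb crawl/linkdb crawl/segments/20090214150000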