/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.indexer.field;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Random;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.ObjectWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.lucene.document.DateTools;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.parse.ParseText;
import org.apache.nutch.scoring.webgraph.LinkDatum;
import org.apache.nutch.scoring.webgraph.Node;
import org.apache.nutch.scoring.webgraph.WebGraph;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.URLUtil;
/**
* Creates the basic FieldWritable objects. The basic fields are the main
* fields used in indexing segments. Many other field jobs rely on the urls
* present in the basic fields output when creating their own fields for
* indexing.
*
* Basic fields are extracted from segments. Only urls that were successfully
* fetched and parsed will be converted. This job also implements a portion of
* the redirect logic. If a url has an original (redirect) url, both the url
* and its orig are measured against their link analysis scores, and the
* highest scoring one is used as the display url in the index. This ensures
* that content is indexed under the best, most popular url, which is most
* often the one users expect.
*
* The BasicFields tool can accept one or more segments to convert to fields.
* If multiple segments have overlapping content, only the latest successfully
* fetched content will be converted.
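*
* <p>A typical command line invocation looks like the following (paths are
* illustrative only):</p>
*
* <pre>
* bin/nutch org.apache.nutch.indexer.field.BasicFields \
*   -webgraphdb crawl/webgraphdb \
*   -segment crawl/segments/20090215123456 \
*   -output crawl/fields/basic
* </pre>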
*/
public class BasicFields
extends Configured
implements Tool {
public static final Log LOG = LogFactory.getLog(BasicFields.class);
/**
* Runs the Extractor job. Extracts basic fields from segments.
*
* @param nodeDb The node database
* @param segment A single segment to process.
* @param outputDir The extractor output.
*
* @throws IOException If an error occurs while processing the segment.
*/
private void runExtractor(Path nodeDb, Path segment, Path outputDir)
throws IOException {
LOG.info("BasicFields: starting extractor");
JobConf job = new NutchJob(getConf());
job.setJobName("BasicFields " + outputDir);
LOG.info("BasicFields: extractor adding segment: " + segment);
FileInputFormat.addInputPath(job, new Path(segment,
CrawlDatum.FETCH_DIR_NAME));
FileInputFormat.addInputPath(job, new Path(segment, ParseData.DIR_NAME));
FileInputFormat.addInputPath(job, new Path(segment, ParseText.DIR_NAME));
FileInputFormat.addInputPath(job, nodeDb);
job.setInputFormat(SequenceFileInputFormat.class);
job.setMapperClass(Extractor.class);
job.setReducerClass(Extractor.class);
FileOutputFormat.setOutputPath(job, outputDir);
job.setOutputFormat(SequenceFileOutputFormat.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(ObjectWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(FieldsWritable.class);
JobClient.runJob(job);
if (LOG.isInfoEnabled()) {
LOG.info("BasicFields: finished extractor");
}
}
/**
* Runs the Flipper job. Flipper is the first part of a two-part job that
* implements the redirect logic.
*
* @param basicFields The basic fields temporary output.
* @param nodeDb The node database.
* @param outputDir The flipper output.
*
* @throws IOException If an error occurs while processing.
*/
private void runFlipper(Path basicFields, Path nodeDb, Path outputDir)
throws IOException {
LOG.info("BasicFields: starting flipper");
JobConf job = new NutchJob(getConf());
job.setJobName("BasicFields " + outputDir);
FileInputFormat.addInputPath(job, nodeDb);
FileInputFormat.addInputPath(job, basicFields);
job.setInputFormat(SequenceFileInputFormat.class);
job.setMapperClass(Flipper.class);
job.setReducerClass(Flipper.class);
FileOutputFormat.setOutputPath(job, outputDir);
job.setOutputFormat(SequenceFileOutputFormat.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(ObjectWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(LinkDatum.class);
JobClient.runJob(job);
if (LOG.isInfoEnabled()) {
LOG.info("BasicFields: finished flipper");
}
}
/**
* Runs the Scorer job. Scorer is the second part of a two-part job that
* implements the redirect logic.
*
* @param basicFields The basic fields temporary output.
* @param links The temporary output holding urls and any redirects.
* @param outputDir The scorer output.
*
* @throws IOException If an error occurs while processing.
*/
private void runScorer(Path basicFields, Path links, Path outputDir)
throws IOException {
LOG.info("BasicFields: starting scorer");
JobConf job = new NutchJob(getConf());
job.setJobName("BasicFields " + outputDir);
FileInputFormat.addInputPath(job, links);
FileInputFormat.addInputPath(job, basicFields);
job.setInputFormat(SequenceFileInputFormat.class);
job.setMapperClass(Scorer.class);
job.setReducerClass(Scorer.class);
FileOutputFormat.setOutputPath(job, outputDir);
job.setOutputFormat(SequenceFileOutputFormat.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(ObjectWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(FieldsWritable.class);
JobClient.runJob(job);
if (LOG.isInfoEnabled()) {
LOG.info("BasicFields: finished scorer");
}
}
/**
* Runs the Merger job. Merger ensures that the most recent set of fields for
* any given url is collected.
*
* @param basicFields The per-segment basic fields outputs to merge.
* @param outputDir The merger output.
*
* @throws IOException If an error occurs while processing.
*/
private void runMerger(Path[] basicFields, Path outputDir)
throws IOException {
LOG.info("BasicFields: starting merger");
JobConf job = new NutchJob(getConf());
job.setJobName("BasicFields " + outputDir);
for (Path basic : basicFields) {
FileInputFormat.addInputPath(job, basic);
}
job.setInputFormat(SequenceFileInputFormat.class);
job.setReducerClass(Merger.class);
FileOutputFormat.setOutputPath(job, outputDir);
job.setOutputFormat(SequenceFileOutputFormat.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(FieldsWritable.class);
JobClient.runJob(job);
if (LOG.isInfoEnabled()) {
LOG.info("BasicFields: finished merger");
}
}
/**
* Extracts basic fields from a single segment.
*/
private static class Extractor
extends Configured
implements Mapper<Text, Writable, Text, ObjectWritable>,
Reducer<Text, ObjectWritable, Text, FieldsWritable> {
private int MAX_TITLE_LENGTH;
private Configuration conf;
/**
* Default constructor.
*/
public Extractor() {
}
/**
* Configurable constructor.
*/
public Extractor(Configuration conf) {
setConf(conf);
}
/**
* Configures the job.
*/
public void configure(JobConf conf) {
this.conf = conf;
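// titles longer than this many characters are truncated before indexing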
this.MAX_TITLE_LENGTH = conf.getInt("indexer.max.title.length", 100);
}
public void close() {
}
/**
* Wraps values in ObjectWritable.
*/
public void map(Text key, Writable value,
OutputCollector<Text, ObjectWritable> output, Reporter reporter)
throws IOException {
ObjectWritable objWrite = new ObjectWritable();
objWrite.set(value);
output.collect(key, objWrite);
}
/**
* Creates basic fields from a single segment.
*/
public void reduce(Text key, Iterator<ObjectWritable> values,
OutputCollector<Text, FieldsWritable> output, Reporter reporter)
throws IOException {
Node nodeDb = null;
List<CrawlDatum> fetchDatums = new ArrayList<CrawlDatum>();
ParseData parseData = null;
ParseText parseText = null;
List<FieldWritable> fieldsList = new ArrayList<FieldWritable>();
// assign values; the url must be successfully fetched and parsed
while (values.hasNext()) {
ObjectWritable objWrite = values.next();
Object value = objWrite.get();
if (value instanceof CrawlDatum) {
CrawlDatum datum = (CrawlDatum)value;
if (datum.getStatus() == CrawlDatum.STATUS_FETCH_SUCCESS) {
fetchDatums.add(datum);
}
}
else if (value instanceof Node) {
nodeDb = (Node)value;
}
else if (value instanceof ParseData
&& ((ParseData)value).getStatus().isSuccess()) {
parseData = (ParseData)value;
}
else if (value instanceof ParseText) {
parseText = (ParseText)value;
}
}
// if not successfully fetched and parsed then stop processing
int numDatums = fetchDatums.size();
if (numDatums == 0 || nodeDb == null || parseText == null
|| parseData == null) {
return;
}
// get the most recent fetch time; there may be duplicates inside a single
// segment, usually due to redirects
CrawlDatum fetchDatum = null;
long mostRecent = 0L;
for (CrawlDatum cur : fetchDatums) {
long fetchTime = cur.getFetchTime();
if (fetchDatum == null || fetchTime > mostRecent) {
fetchDatum = cur;
mostRecent = fetchTime;
}
}
// get parse metadata
Metadata metadata = parseData.getContentMeta();
Parse parse = new ParseImpl(parseText, parseData);
// handle redirect urls
Text reprUrlText = (Text)fetchDatum.getMetaData().get(
Nutch.WRITABLE_REPR_URL_KEY);
String reprUrl = reprUrlText != null ? reprUrlText.toString() : null;
String url = key.toString();
String fieldUrl = (reprUrl != null) ? reprUrl : url;
String host = URLUtil.getHost(fieldUrl);
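// each FieldWritable below is constructed as (name, value, type, indexed,
// stored, tokenized); see the per-field comments for the combination used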
// add segment, used to map from merged index back to segment files
FieldWritable segField = new FieldWritable(Fields.SEGMENT,
metadata.get(Nutch.SEGMENT_NAME_KEY), FieldType.CONTENT, false, true,
false);
fieldsList.add(segField);
// add digest, used by dedup
FieldWritable digestField = new FieldWritable(Fields.DIGEST,
metadata.get(Nutch.SIGNATURE_KEY), FieldType.CONTENT, false, true,
false);
fieldsList.add(digestField);
// url is both stored and indexed, so it's both searchable and returned
fieldsList.add(new FieldWritable(Fields.URL, fieldUrl, FieldType.CONTENT,
true, true, true));
fieldsList.add(new FieldWritable(Fields.SEG_URL, url, FieldType.CONTENT,
false, true, false));
if (reprUrl != null) {
// also store the original url as both stored and indexed
fieldsList.add(new FieldWritable(Fields.ORIG_URL, url,
FieldType.CONTENT, true, true, true));
}
if (host != null) {
// add host as un-stored, indexed and tokenized
FieldWritable hostField = new FieldWritable(Fields.HOST, host,
FieldType.CONTENT, true, false, true);
fieldsList.add(hostField);
// add site as un-stored, indexed and un-tokenized
FieldWritable siteField = new FieldWritable(Fields.SITE, host,
FieldType.CONTENT, true, false, false);
fieldsList.add(siteField);
}
// content is indexed, so that it's searchable, but not stored in the index
fieldsList.add(new FieldWritable(Fields.CONTENT, parse.getText(),
FieldType.CONTENT, true, false, true));
// title
String title = parse.getData().getTitle();
if (title.length() > MAX_TITLE_LENGTH) { // truncate title if needed
title = title.substring(0, MAX_TITLE_LENGTH);
}
// add title indexed and stored so that it can be displayed
fieldsList.add(new FieldWritable(Fields.TITLE, title, FieldType.CONTENT,
true, true, true));
// add cached content/summary display policy, if available
String caching = parse.getData().getMeta(Nutch.CACHING_FORBIDDEN_KEY);
if (caching != null && !caching.equals(Nutch.CACHING_FORBIDDEN_NONE)) {
fieldsList.add(new FieldWritable(Fields.CACHE, caching,
FieldType.CONTENT, false, true, false));
}
// add timestamp when fetched, for deduplication
fieldsList.add(new FieldWritable(Fields.TSTAMP, DateTools.timeToString(
fetchDatum.getFetchTime(), DateTools.Resolution.MILLISECOND),
FieldType.CONTENT, false, true, false));
FieldsWritable fields = new FieldsWritable();
fields.setFieldsList(fieldsList);
output.collect(key, fields);
}
}
/**
* Runs the first part of redirect logic. Breaks out fields if a page
* contains a redirect.
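*
* <p>For a page keyed by its segment url, the mapper re-emits that key under
* both the page's url field and its orig (redirect) url, so the reducer can
* attach each url's link analysis score to a LinkDatum for the Scorer job.</p>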
*/
public static class Flipper
extends Configured
implements Mapper<Text, Writable, Text, ObjectWritable>,
Reducer<Text, ObjectWritable, Text, LinkDatum> {
private JobConf conf;
/**
* Configures the job.
*/
public void configure(JobConf conf) {
this.conf = conf;
}
public void close() {
}
/**
* Breaks out the collection of fields for the url, and for its redirect url
* if one exists.
*/
public void map(Text key, Writable value,
OutputCollector<Text, ObjectWritable> output, Reporter reporter)
throws IOException {
ObjectWritable objUrl = new ObjectWritable();
objUrl.set(key);
if (value instanceof FieldsWritable) {
// collect the fields for the url
FieldsWritable fields = (FieldsWritable)value;
FieldWritable url = fields.getField(Fields.URL);
FieldWritable orig = fields.getField(Fields.ORIG_URL);
output.collect(new Text(url.getValue()), objUrl);
// collect for the orig / redirect url if one exists
if (orig != null) {
output.collect(new Text(orig.getValue()), objUrl);
}
}
else {
// anything else passes through
ObjectWritable objWrite = new ObjectWritable();
objWrite.set(value);
output.collect(key, objWrite);
}
}
/**
* Collects redirect and original links for a given url key. This will be
* used in the Scorer to handle redirects.
*/
public void reduce(Text key, Iterator<ObjectWritable> values,
OutputCollector<Text, LinkDatum> output, Reporter reporter)
throws IOException {
Node node = null;
List<String> urls = new ArrayList<String>();
while (values.hasNext()) {
ObjectWritable objWrite = values.next();
Object obj = objWrite.get();
if (obj instanceof Node) {
node = (Node)obj;
}
else if (obj instanceof Text) {
urls.add(obj.toString());
}
}
if (urls.size() > 0) {
float score = (node != null) ? node.getInlinkScore() : 0.0f;
for (String url : urls) {
LinkDatum datum = new LinkDatum(key.toString());
datum.setScore(score);
output.collect(new Text(url), datum);
}
}
}
}
/**
* The Scorer job sets the boost field from the NodeDb score.
*
* It also runs the second part of the redirect logic, determining the highest
* scoring url for pages that contain redirects.
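*
* <p>The boost is emitted as a BOOST-type FieldWritable whose string value
* names its source, "linkrank", and whose float value carries the link
* analysis score.</p>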
*/
public static class Scorer
extends Configured
implements Mapper<Text, Writable, Text, ObjectWritable>,
Reducer<Text, ObjectWritable, Text, FieldsWritable> {
private JobConf conf;
/**
* Configures the job.
*/
public void configure(JobConf conf) {
this.conf = conf;
}
public void close() {
}
/**
* Wraps values in ObjectWritable.
*/
public void map(Text key, Writable value,
OutputCollector<Text, ObjectWritable> output, Reporter reporter)
throws IOException {
ObjectWritable objWrite = new ObjectWritable();
objWrite.set(value);
output.collect(key, objWrite);
}
/**
* Sets a document boost field from the NodeDb and determines the best
* scoring url for pages that have redirects. Uses the highest scoring url
* as the display url in the index.
*/
public void reduce(Text key, Iterator<ObjectWritable> values,
OutputCollector<Text, FieldsWritable> output, Reporter reporter)
throws IOException {
FieldsWritable fields = null;
List<LinkDatum> datums = new ArrayList<LinkDatum>();
while (values.hasNext()) {
ObjectWritable objWrite = values.next();
Object obj = objWrite.get();
if (obj instanceof FieldsWritable) {
fields = (FieldsWritable)obj;
}
else if (obj instanceof LinkDatum) {
datums.add((LinkDatum)obj);
}
}
int numDatums = datums.size();
if (fields != null && numDatums > 0) {
// if there is no redirect for the page, just assign the linkrank boost
List<FieldWritable> fieldsList = fields.getFieldsList();
if (numDatums == 1) {
float linkRank = datums.get(0).getScore();
fieldsList.add(new FieldWritable(Fields.BOOST, "linkrank",
FieldType.BOOST, linkRank));
output.collect(new Text(key), fields);
}
else {
// get both the url and any redirect url stored
FieldWritable url = fields.getField(Fields.URL);
FieldWritable orig = fields.getField(Fields.ORIG_URL);
float urlScore = 0.0f;
float origScore = 0.0f;
// get the scores for each
for (LinkDatum datum : datums) {
String curUrl = datum.getUrl();
if (curUrl.equals(url.getValue())) {
urlScore = datum.getScore();
}
else if (curUrl.equals(orig.getValue())) {
origScore = datum.getScore();
}
}
// if the highest scoring url is not the one currently displayed in
// the index under the current basic fields, then switch it
String urlKey = url.getValue();
float linkRank = urlScore;
if (origScore > urlScore) {
url.setName(Fields.ORIG_URL);
orig.setName(Fields.URL);
// We also need to fix the host because we are changing urls
String host = URLUtil.getHost(orig.getValue());
if (host != null) {
fieldsList.remove(fields.getField(Fields.SITE));
fieldsList.remove(fields.getField(Fields.HOST));
fieldsList.add(new FieldWritable(Fields.HOST, host,
FieldType.CONTENT, true, false, true));
fieldsList.add(new FieldWritable(Fields.SITE, host,
FieldType.CONTENT, true, false, false));
}
linkRank = origScore;
urlKey = orig.getValue();
}
// create the final document boost field
fieldsList.add(new FieldWritable(Fields.BOOST, "linkrank",
FieldType.BOOST, linkRank));
output.collect(new Text(urlKey), fields);
}
}
}
}
/**
* Merges the fields output of all segments, collecting only the most recent
* set of fields for any given url.
*/
public static class Merger
extends Configured
implements Reducer<Text, FieldsWritable, Text, FieldsWritable> {
private JobConf conf;
/**
* Configures the job.
*/
public void configure(JobConf conf) {
this.conf = conf;
}
public void close() {
}
/**
* Collects the most recent set of fields for any url.
*/
public void reduce(Text key, Iterator<FieldsWritable> values,
OutputCollector<Text, FieldsWritable> output, Reporter reporter)
throws IOException {
List<FieldsWritable> fields = new ArrayList<FieldsWritable>();
// collects the various sets of fields
while (values.hasNext()) {
fields.add((FieldsWritable)WritableUtils.clone(values.next(), conf));
}
// if there is only one set of fields for a given url, pass it through
FieldsWritable outFields = null;
int numFields = fields.size();
if (numFields == 1) {
outFields = fields.get(0);
}
else if (numFields > 1) {
// more than one set of fields means the url has been fetched more than
// once; collect only the most recent set of fields
FieldsWritable mostRecent = null;
long recentTime = 0L;
for (int i = 0; i < numFields; i++) {
FieldsWritable cur = fields.get(i);
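// TSTAMP values come from DateTools.timeToString at MILLISECOND resolution
// (yyyyMMddHHmmssSSS), so parsing them as longs preserves chronological order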
String tStampStr = cur.getField(Fields.TSTAMP).getValue();
long timestamp = Long.parseLong(tStampStr);
if (mostRecent == null || recentTime < timestamp) {
recentTime = timestamp;
mostRecent = cur;
}
}
outFields = mostRecent;
}
output.collect(key, outFields);
}
}
/**
* Runs the BasicFields jobs for every segment, then aggregates and filters
* the output to create a final database of FieldWritable objects.
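*
* <p>Illustrative programmatic use (paths are hypothetical):</p>
*
* <pre>
* BasicFields basicFields = new BasicFields();
* basicFields.setConf(NutchConfiguration.create());
* basicFields.createFields(new Path("crawl/webgraphdb", WebGraph.NODE_DIR),
*     new Path[] { new Path("crawl/segments/20090215123456") },
*     new Path("crawl/fields/basic"));
* </pre>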
*
* @param nodeDb The node database.
* @param segments The array of segments to process.
* @param output The BasicFields output.
*
* @throws IOException If an error occurs while processing the segments.
*/
public void createFields(Path nodeDb, Path[] segments, Path output)
throws IOException {
Configuration conf = getConf();
FileSystem fs = FileSystem.get(conf);
Path tempOutput = new Path(output.toString() + "-temp");
fs.mkdirs(tempOutput);
int numSegments = segments.length;
Path[] basicFields = new Path[numSegments];
// one pass per segment to extract and create the basic fields
for (int i = 0; i < numSegments; i++) {
Path segment = segments[i];
Path segOutput = new Path(tempOutput, String.valueOf(i));
Path tempBasic = new Path(tempOutput, "basic-"
+ Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
Path tempFlip = new Path(tempOutput, "flip-"
+ Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
runExtractor(nodeDb, segment, tempBasic);
runFlipper(tempBasic, nodeDb, tempFlip);
runScorer(tempBasic, tempFlip, segOutput);
fs.delete(tempBasic, true);
fs.delete(tempFlip, true);
basicFields[i] = segOutput;
}
// merge all of the segments and delete any temporary output
runMerger(basicFields, output);
fs.delete(tempOutput, true);
}
public static void main(String[] args)
throws Exception {
int res = ToolRunner.run(NutchConfiguration.create(), new BasicFields(),
args);
System.exit(res);
}
/**
* Runs the BasicFields tool.
*/
public int run(String[] args)
throws Exception {
Options options = new Options();
Option helpOpts = OptionBuilder.withArgName("help").withDescription(
"show this help message").create("help");
Option outputOpts = OptionBuilder.withArgName("output").hasArg().withDescription(
"the output index directory").create("output");
Option webGraphOpts = OptionBuilder.withArgName("webgraphdb").hasArg().withDescription(
"the webgraphdb to use").create("webgraphdb");
Option segOpts = OptionBuilder.withArgName("segment").hasArgs().withDescription(
"the segment(s) to use").create("segment");
options.addOption(helpOpts);
options.addOption(webGraphOpts);
options.addOption(segOpts);
options.addOption(outputOpts);
CommandLineParser parser = new GnuParser();
try {
CommandLine line = parser.parse(options, args);
if (line.hasOption("help") || !line.hasOption("webgraphdb")
|| !line.hasOption("output") || !line.hasOption("segment")) {
HelpFormatter formatter = new HelpFormatter();
formatter.printHelp("BasicFields", options);
return -1;
}
// get the command line options and all of the segments
String webGraphDb = line.getOptionValue("webgraphdb");
String output = line.getOptionValue("output");
String[] segments = line.getOptionValues("segment");
Path[] segPaths = new Path[segments.length];
for (int i = 0; i < segments.length; i++) {
segPaths[i] = new Path(segments[i]);
}
createFields(new Path(webGraphDb, WebGraph.NODE_DIR), segPaths, new Path(
output));
return 0;
}
catch (Exception e) {
LOG.fatal("BasicFields: " + StringUtils.stringifyException(e));
return -2;
}
}
}