/*
* Cloud9: A MapReduce Library for Hadoop
*
* Licensed under the Apache License, Version 2.0 (the "License"); you
* may not use this file except in compliance with the License. You may
* obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
package edu.umd.cloud9.webgraph.driver;
import java.io.File;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import edu.umd.cloud9.webgraph.BuildReverseWebGraph;
import edu.umd.cloud9.webgraph.BuildWebGraph;
import edu.umd.cloud9.webgraph.CollectionConfigurationManager;
import edu.umd.cloud9.webgraph.CollectHostnames;
import edu.umd.cloud9.webgraph.ComputeWeight;
import edu.umd.cloud9.webgraph.DriverUtil;
import edu.umd.cloud9.webgraph.TrecExtractLinks;
import edu.umd.cloud9.webgraph.normalizer.AnchorTextNormalizer;
/**
* <p>
* Main driver program for extracting the web graph, reverse web graph, and
* lines of anchor text. Command-line arguments are as follows:
* </p>
*
* <ul>
* <li>[-input collection_base_path] the base path to the collection</li>
* <li>[-output output-base-path]: the base path under which the output would be stored</li>
* <li>[-collection {trecweb|trec|wt10g|gov2}]: the collection used</li>
* <li>[-inputFormat inputFormatClass]: use user specified input format
* class</li>
* <li>[-docnoClass userSpecifiedDocnoMappingClass]: use user specified docno mapping
* class</li>
* <li>[-il]: use this for including the internal links (i.e., links within a
* domain) remove for not</li>
* <li>[-caw]: use this to compute the default weights for lines of external
* anchor text, remove for not</li>
* <li>[-normalizer normalizer] A normalizer class used to normalize the lines of anchor
* text, must extend *.anchor.normalize.AnchorTextNormalizer.</li>
* <li>[<key:value> ..]: key-value pairs to put in configuration files. It shall
* also be used as input method for user specified classes</li>
* </ul>
*
* <p>
* The default weight used in this program was originally proposed by Metzler
* et. al in the following paper: <br />
*
* D. Metzler, J. Novak, H. Cui, and S. Reddy. Building enriched document
* representations using aggregated anchor text. <i>In Proc. 32nd Annual
* International ACM SIGIR Conference on Research and Development in Information
* Retrieval</i>, pages 219{226, New York, NY, USA, 2009. ACM.
* </p>
*
* @author Nima Asadi , Modified by Fangyue Wang
*
*/
public class TrecDriver extends Configured implements Tool {
private String inputBase;
private String outputBase;
private boolean includeInternalLinks = false;
private boolean computeAnchorWeights = false;
private String normalizer = "edu.umd.cloud9.webgraph.normalizer.AnchorTextBasicNormalizer";
private String filtername = null;
private Configuration conf;
private CollectionConfigurationManager configer;
public int run(String[] args) throws Exception {
conf = getConf();
configer = new CollectionConfigurationManager();
if (!readInput(args)) {
printUsage();
return -1;
}
configer.applyConfig(conf);
conf.setInt("Cloud9.Mappers", 2000);
conf.setInt("Cloud9.Reducers", DriverUtil.DEFAULT_REDUCERS);
conf.setBoolean("Cloud9.IncludeInternalLinks", includeInternalLinks);
conf.set("Cloud9.AnchorTextNormalizer", normalizer);
// Job 1:
// Extract link information for each segment separately
String inputPath = inputBase;
String outputPath = outputBase + "/" + DriverUtil.OUTPUT_EXTRACT_LINKS;
conf.set("Cloud9.InputPath", inputPath);
conf.set("Cloud9.OutputPath", outputPath);
int r = new TrecExtractLinks(conf, configer).run();
if (r != 0) {
return -1;
}
// Job 2:
// Construct the reverse web graph (i.e., collect incoming link
// information)
inputPath = outputBase + "/" + DriverUtil.OUTPUT_EXTRACT_LINKS;
outputPath = outputBase + "/" + DriverUtil.OUTPUT_REVERSE_WEBGRAPH + "/";
conf.set("Cloud9.InputPath", inputPath);
conf.set("Cloud9.OutputPath", outputPath);
conf.setInt("Cloud9.Reducers", DriverUtil.DEFAULT_REDUCERS);
r = new BuildReverseWebGraph(conf).run();
if (r != 0) {
return -1;
}
// Job 3:
// Construct the web graph
inputPath = outputBase + "/" + DriverUtil.OUTPUT_REVERSE_WEBGRAPH + "/";
outputPath = outputBase + "/" + DriverUtil.OUTPUT_WEBGRAPH + "/";
conf.set("Cloud9.InputPath", inputPath);
conf.set("Cloud9.OutputPath", outputPath);
conf.setInt("Cloud9.Mappers", 1);
conf.setInt("Cloud9.Reducers", DriverUtil.DEFAULT_REDUCERS);
r = new BuildWebGraph(conf).run();
if (r != 0) {
return -1;
}
if (computeAnchorWeights) {
// Propagating domain names in order to compute anchor weights
inputPath = outputBase + "/" + DriverUtil.OUTPUT_WEBGRAPH + "/";
outputPath = outputBase + "/" + DriverUtil.OUTPUT_HOST_NAMES + "/";
conf.set("Cloud9.InputPath", inputPath);
conf.set("Cloud9.OutputPath", outputPath);
conf.setInt("Cloud9.Mappers", 1);
conf.setInt("Cloud9.Reducers", DriverUtil.DEFAULT_REDUCERS);
r = new CollectHostnames(conf).run();
if (r != 0) {
return -1;
}
// Compute the weights
inputPath = outputBase + "/" + DriverUtil.OUTPUT_REVERSE_WEBGRAPH + "/," +
outputBase + "/" + DriverUtil.OUTPUT_HOST_NAMES + "/";
outputPath = outputBase + "/" + DriverUtil.OUTPUT_WEGIHTED_REVERSE_WEBGRAPH + "/";
conf.set("Cloud9.InputPath", inputPath);
conf.set("Cloud9.OutputPath", outputPath);
conf.setInt("Cloud9.Mappers", 1);
conf.setInt("Cloud9.Reducers", DriverUtil.DEFAULT_REDUCERS);
r = new ComputeWeight(conf).run();
if (r != 0) {
return -1;
}
}
return 0;
}
public static void main(String[] args) throws Exception {
int res = ToolRunner
.run(new Configuration(), new TrecDriver(), args);
}
private static int printUsage() {
System.out.println("\nusage:" +
"[-input collection-path]" +
"[-output output-base" +
"[-collection {trecweb|gov2|wt10g}] " +
"[-inputFormat userSpecifiedInputFormatClass] " +
"[-docnoClass userSpecifiedDocnoMappingClass] " +
"-docno userSpecifiedDocnoMappingFile " +
"[-il] " +
"[-caw] " +
"[-normalizer normalizerClass] ");
System.out.println("Help:");
System.out.println("[" + DriverUtil.CL_INPUT + " collection-path]\n\tinput directory");
System.out.println("[" + DriverUtil.CL_OUTPUT + " output-base]\n\toutput directory");
System.out
.println(DriverUtil.CL_COLLECTION + " {trecweb|gov2|wt10g}\n\tname the collection name, if it is supported, automatic configuration will be applied");
System.out
.println(DriverUtil.CL_INPUT_FORMAT + " userSpecifiedInputFormatClass\n\tspecify the class work as FileInputFormat;" +
" Required when -collection is not specified");
System.out
.println(DriverUtil.CL_DOCNO_MAPPING_CLASS + " userSpecifiedDocnoMappingClass\n\tspecify the class work as DocnoMapping;" +
"Required when -collection is not specified. It should implement GenericDocnoMapping interface.");
System.out
.println(DriverUtil.CL_DOCNO_MAPPING + " userSpecifiedDocnoMappingFile\n\tspecify the File work as input to specified DocnoMapping class.");
System.out
.println(DriverUtil.CL_INCLUDE_INTERNAL_LINKS + "\n\tinclude internal links, without this option we will not include internal links");
System.out
.println(DriverUtil.CL_COMPUTE_WEIGHTS + "\n\tcompute default anchor weights, without this option we will not compute default anchor weights");
System.out
.println(DriverUtil.CL_NORMALIZER + " normalizerClass\n\ta normalizer class used to normalize the lines of anchor text," +
" must extend edu.umd.cloud9.webgraph.normalize.AnchorTextNormalizer.");
System.out.println();
ToolRunner.printGenericCommandUsage(System.out);
return -1;
}
private boolean readInput(String[] args) {
if (args.length < 6) {
System.out.println("More arguments needed.");
return false;
}
inputBase = new File(DriverUtil.argValue(args, DriverUtil.CL_INPUT)).getAbsolutePath();
outputBase = new File(DriverUtil.argValue(args, DriverUtil.CL_OUTPUT)).getAbsolutePath();
boolean knownCollection = DriverUtil.argExists(args, DriverUtil.CL_COLLECTION);
if(knownCollection) {
String collectionName = DriverUtil.argValue(args, DriverUtil.CL_COLLECTION);
if (!configer.setConfByCollection(collectionName)) {
System.out.println("Collection \"" + collectionName +
"\" not supported, please specify inputformat and docnomapping class, or contact developer.");
return false;
}
} else {
String ciName = DriverUtil.argValue(args, DriverUtil.CL_INPUT_FORMAT);
if (!configer.setUserSpecifiedInputFormat(ciName)) {
System.out.println("class \"" + ciName +
"\" doesn't exist or not sub-class of FileInputFormat");
return false;
}
String cmName = DriverUtil.argValue(args, DriverUtil.CL_DOCNO_MAPPING_CLASS);
if (!configer.setUserSpecifiedDocnoMappingClass(cmName)) {
System.out.println("class \"" + cmName +
"\" doesn't exist or not implemented DocnoMappingt");
return false;
}
}
conf.set("Cloud9.DocnoMappingFile", DriverUtil.argValue(args, DriverUtil.CL_DOCNO_MAPPING));
includeInternalLinks = DriverUtil.argExists(args, DriverUtil.CL_INCLUDE_INTERNAL_LINKS);
computeAnchorWeights = DriverUtil.argExists(args, DriverUtil.CL_COMPUTE_WEIGHTS);
String nm = DriverUtil.argValue(args, DriverUtil.CL_NORMALIZER);
try {
if (!AnchorTextNormalizer.class.isAssignableFrom(Class.forName(nm))) {
System.out
.println("Invalid arguments; Normalizer class must implement AnchorTextNormalizer interface.");
return false;
}
} catch (ClassNotFoundException e) {
System.out
.println("Invalid arguments; Specified Normalizer class doesn't exist");
return false;
}
normalizer = nm;
return true;
}
}