/**
* Ivory: A Hadoop toolkit for Web-scale information retrieval
*
* Licensed under the Apache License, Version 2.0 (the "License"); you
* may not use this file except in compliance with the License. You may
* obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
package ivory.ptc;
import ivory.ptc.data.AnchorTextTarget;
import ivory.ptc.judgments.weighting.WeightingScheme;
import java.io.IOException;
import java.net.URI;
import java.util.Collections;
import java.util.Iterator;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import edu.umd.cloud9.util.PowerTool;
import edu.umd.cloud9.webgraph.data.AnchorText;
import edu.umd.cloud9.io.array.ArrayListOfIntsWritable;
import edu.umd.cloud9.io.array.ArrayListWritable;
/**
 * Map-Reduce job that constructs an inverted index over anchor text.
 * The inverted index contains, for each unique anchor text,
 * a list of documents that are pointed to by that anchor text.
 *
 * <p>The job is driven by the configuration keys listed in
 * {@link #RequiredParameters}. The entries of
 * {@code Ivory.WeightingSchemeParameters} (separated by
 * {@link #PARAMETER_SEPARATER}) are registered as distributed-cache files and
 * handed to the configured weighting scheme inside every mapper.
 *
 * @author Nima Asadi
 */
@SuppressWarnings("deprecation")
public class AnchorTextInvertedIndex extends PowerTool {
  private static final Logger LOG = Logger.getLogger(AnchorTextInvertedIndex.class);

  /** Separator between the entries of {@code Ivory.WeightingSchemeParameters}. */
  public static final String PARAMETER_SEPARATER = ",";

  static {
    LOG.setLevel(Level.INFO);
  }

  /**
   * Emits one (anchor text, target) pair for every in-link anchor of a document.
   * The input key is used as the target document id; the weight of each pair is
   * computed by the configured {@link WeightingScheme}.
   */
  private static class MyMapper extends MapReduceBase implements
      Mapper<IntWritable, ArrayListWritable<AnchorText>, Text, AnchorTextTarget> {
    // Output objects are reused across map() calls. They are kept per instance
    // (not static) so that two mapper instances loaded in the same JVM cannot
    // overwrite each other's state.
    private final AnchorTextTarget anchorTextTarget = new AnchorTextTarget();
    private final Text keyOut = new Text();

    // Weighting scheme used to rank target documents
    private WeightingScheme weightingScheme;

    /**
     * Instantiates the weighting scheme named by {@code Ivory.WeightingScheme}
     * and initializes it with the local paths of the distributed-cache files.
     *
     * @param job the job configuration
     * @throws RuntimeException if the cache files cannot be listed, if fewer
     *         cache files are available than weighting scheme parameters, or if
     *         the weighting scheme cannot be created or initialized
     */
    @Override
    public void configure(JobConf job) {
      String schemeName = job.get("Ivory.WeightingScheme");
      String schemeParameters = job.get("Ivory.WeightingSchemeParameters");
      if (schemeParameters == null) {
        throw new RuntimeException("Ivory.WeightingSchemeParameters is not set.");
      }

      Path[] localFiles;
      try {
        localFiles = DistributedCache.getLocalCacheFiles(job);
      } catch (IOException e) {
        // Keep the cause so the underlying I/O failure shows up in the task log.
        throw new RuntimeException("Local cache files not read properly.", e);
      }

      // One local cache file is expected per weighting scheme parameter.
      // NOTE(review): this assumes the cache files are returned in the order in
      // which runTool() registered them -- confirm against DistributedCache.
      String[] params = new String[schemeParameters.split(PARAMETER_SEPARATER).length];
      if (localFiles == null || localFiles.length < params.length) {
        throw new RuntimeException("Expected " + params.length
            + " local cache file(s) for the weighting scheme parameters but found "
            + (localFiles == null ? 0 : localFiles.length) + ".");
      }
      for (int i = 0; i < params.length; i++) {
        params[i] = localFiles[i].toString();
      }

      try {
        weightingScheme = (WeightingScheme) Class.forName(schemeName).newInstance();
        weightingScheme.initialize(FileSystem.getLocal(job), params);
      } catch (Exception e) {
        throw new RuntimeException("Mapper failed to initialize the weighting scheme: "
            + schemeName + " with parameters: " + schemeParameters, e);
      }
    }

    /**
     * Emits, for every in-link anchor of the document, the anchor text as key
     * and an {@link AnchorTextTarget} (target id, source documents, weight) as value.
     *
     * @param key id of the document the anchors point to
     * @param anchors anchor entries recorded for that document
     * @param output collector receiving (anchor text, target) pairs
     * @param reporter unused
     * @throws IOException if the collector fails
     */
    @Override
    public void map(IntWritable key, ArrayListWritable<AnchorText> anchors,
        OutputCollector<Text, AnchorTextTarget> output, Reporter reporter) throws IOException {
      anchorTextTarget.setTarget(key.get());
      for (AnchorText anchor : anchors) {
        // Only in-links (external or internal) are indexed; every other anchor
        // entry is skipped.
        // NOTE(review): an earlier comment here stated that internal links are
        // not useful, yet the condition keeps internal in-links. Behavior is
        // left as-is -- confirm which of the two is intended.
        if (!anchor.isExternalInLink() && !anchor.isInternalInLink()) {
          continue;
        }
        keyOut.set(anchor.getText());
        anchorTextTarget.setSources(new ArrayListOfIntsWritable(anchor.getDocuments()));
        anchorTextTarget.setWeight(weightingScheme.getWeight(key.get(), anchor));
        output.collect(keyOut, anchorTextTarget);
      }
    }
  }

  /**
   * Gathers all targets emitted for one anchor text, sorts them by the natural
   * ordering of {@link AnchorTextTarget} and writes them out as a single list.
   */
  private static class MyReducer extends MapReduceBase implements
      Reducer<Text, AnchorTextTarget, Text, ArrayListWritable<AnchorTextTarget>> {
    // Reused across reduce() calls; per instance rather than static so that
    // reducer instances never share the buffer.
    private final ArrayListWritable<AnchorTextTarget> outList =
        new ArrayListWritable<AnchorTextTarget>();

    /**
     * @param anchorText the anchor text shared by all values
     * @param values targets emitted by the mappers for this anchor text
     * @param output collector receiving (anchor text, sorted target list)
     * @param reporter unused
     * @throws IOException if the collector fails
     */
    @Override
    public void reduce(Text anchorText, Iterator<AnchorTextTarget> values,
        OutputCollector<Text, ArrayListWritable<AnchorTextTarget>> output, Reporter reporter)
        throws IOException {
      outList.clear();
      while (values.hasNext()) {
        // Each value is copied before it is stored in the list.
        outList.add(new AnchorTextTarget(values.next()));
      }
      Collections.sort(outList);
      output.collect(anchorText, outList);
    }
  }

  /** Configuration keys this tool reads. */
  public static final String[] RequiredParameters = {
    "Ivory.NumMapTasks",
    "Ivory.NumReduceTasks",
    "Ivory.InputPath",
    "Ivory.OutputPath",
    "Ivory.WeightingScheme",
    "Ivory.WeightingSchemeParameters",
  };

  /**
   * @return the configuration keys listed in {@link #RequiredParameters}
   */
  @Override
  public String[] getRequiredParameters() {
    return RequiredParameters;
  }

  /**
   * @param conf configuration carrying the parameters listed in {@link #RequiredParameters}
   */
  public AnchorTextInvertedIndex(Configuration conf) {
    super(conf);
  }

  /**
   * Configures and runs the indexing job: registers the weighting scheme
   * parameters as distributed-cache files, deletes the output path and then
   * submits the job and waits for it via {@link JobClient#runJob(JobConf)}.
   *
   * @return 0 after {@code JobClient.runJob} has returned
   * @throws Exception if the weighting scheme parameters are missing, a
   *         parameter is not a valid URI, or the job cannot be run
   */
  @Override
  public int runTool() throws Exception {
    JobConf conf = new JobConf(getConf(), AnchorTextInvertedIndex.class);
    FileSystem fs = FileSystem.get(conf);

    String inPath = conf.get("Ivory.InputPath");
    String outPath = conf.get("Ivory.OutputPath");
    Path inputPath = new Path(inPath);
    Path outputPath = new Path(outPath);
    int mapTasks = conf.getInt("Ivory.NumMapTasks", 1);
    int reduceTasks = conf.getInt("Ivory.NumReduceTasks", 100);
    String weightingSchemeParameters = conf.get("Ivory.WeightingSchemeParameters");

    LOG.info("BuildAnchorTextInvertedIndex");
    LOG.info(" - input path: " + inPath);
    LOG.info(" - output path: " + outPath);
    LOG.info(" - number of reducers: " + reduceTasks);
    LOG.info(" - weighting scheme: " + conf.get("Ivory.WeightingScheme"));
    LOG.info(" - weighting scheme parameters: " + weightingSchemeParameters);

    if (weightingSchemeParameters == null) {
      throw new IllegalArgumentException("Ivory.WeightingSchemeParameters is not set.");
    }

    // Every parameter is shipped to the tasks through the distributed cache;
    // the mappers read them back in configure().
    String[] params = weightingSchemeParameters.split(PARAMETER_SEPARATER);
    for (String param : params) {
      DistributedCache.addCacheFile(new URI(param), conf);
    }

    conf.setJobName("BuildAnchorTextInvertedIndex");
    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(reduceTasks);
    conf.set("mapred.child.java.opts", "-Xmx4096m");
    conf.setInt("mapred.task.timeout", 60000000);

    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, outputPath);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(AnchorTextTarget.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(ArrayListWritable.class);

    conf.setMapperClass(MyMapper.class);
    conf.setReducerClass(MyReducer.class);

    // Remove any previous output (recursively) before the job starts.
    fs.delete(outputPath, true);
    JobClient.runJob(conf);
    return 0;
  }
}