/*
* Cloud9: A MapReduce Library for Hadoop
*
* Licensed under the Apache License, Version 2.0 (the "License"); you
* may not use this file except in compliance with the License. You may
* obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
package edu.umd.cloud9.collection;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.regex.Pattern;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import edu.umd.cloud9.collection.line.TextDocument;
import edu.umd.cloud9.util.PowerTool;
import edu.umd.cloud9.webgraph.TrecExtractLinks.Map.LinkCounter;
/**
* <p>
* Tool for generating per-field collections from HTML documents. The output is a new
* collection in TREC format (stored as a {@code SequenceFile<LongWritable, TextDocument>})
* that consists only of the text contained within the target tag. This is useful for
* document-structure and field-based retrieval tasks.
* </p>
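*
* <p>
* A minimal programmatic invocation sketch, mirroring {@link #main(String[])}; the paths,
* tag, and input format below are placeholders, and the chosen input format must supply
* {@code LongWritable}/{@code Indexable} pairs:
* </p>
* <pre>
* Configuration conf = new Configuration();
* conf.set("Cloud9.InputPath", "/path/to/html/collection");
* conf.set("Cloud9.InputFormat", "org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat");
* conf.set("Cloud9.OutputPath", "/path/to/field/collection");
* conf.set("Cloud9.TargetTag", "title");
* ToolRunner.run(conf, new ExtractHTMLFieldCollection(conf), new String[] {});
* </pre>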
*
* @author fangyue
* @author metzler
*/
public class ExtractHTMLFieldCollection extends PowerTool {
private static final Logger LOG = Logger.getLogger(ExtractHTMLFieldCollection.class);
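// Mapper: parses each document's HTML, keeps only the nodes matching the configured tag
// filter, and emits the result as a TREC-style <DOC> block under the incoming key.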
public static class MyMapper extends Mapper<LongWritable, Indexable, LongWritable, TextDocument> {
// TODO: allow this to support user-defined regular expressions, not just the "heading" one pre-defined here
public static class HeadingTagFilter implements NodeFilter {
private static final long serialVersionUID = 3848416345122090905L;
private final Pattern pattern = Pattern.compile("h[123456]", Pattern.CASE_INSENSITIVE);
public boolean accept(Node node) {
return (pattern.matcher(node.getText()).matches());
}
}
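// Objects below are reused across map() calls to avoid per-record allocation.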
private static String tag;
private static final Parser parser = new Parser();
private static NodeFilter filter;
private static final LongWritable myKey = new LongWritable();
private static final TextDocument myValue = new TextDocument();
private static final StringBuffer strBuf = new StringBuffer();
@Override
public void setup(Mapper<LongWritable, Indexable, LongWritable, TextDocument>.Context context) throws IOException {
Configuration conf = context.getConfiguration();
tag = conf.get("Cloud9.TargetTag");
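// "heading" is a special case matching any of h1-h6; any other value is matched as a literal tag name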
if (tag.equalsIgnoreCase("heading")) {
filter = new HeadingTagFilter();
} else {
filter = new TagNameFilter(tag);
}
}
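// Extracts all nodes matching the filter and wraps their HTML in a <DOC>...</DOC> block;
// documents that fail to parse are emitted as empty <DOC> blocks and counted as PARSER_FAILED.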
@Override
public void map(LongWritable key, Indexable doc, Mapper<LongWritable, Indexable, LongWritable, TextDocument>.Context context) throws IOException, InterruptedException {
context.getCounter(LinkCounter.INPUT_DOCS).increment(1);
if(doc.getDocid() == null || doc.getContent() == null) {
return;
}
myKey.set(key.get());
NodeList nl;
try {
// initialize HTML parser
parser.setInputHTML(doc.getContent());
// parse the document
nl = parser.parse(filter);
} catch (ParserException e) {
// parse failure: emit an empty TREC document so the docid is still accounted for downstream
context.getCounter(LinkCounter.PARSER_FAILED).increment(1);
myValue.setDocid(doc.getDocid());
myValue.setContent("<DOC>\n<DOCNO>" + doc.getDocid() + "</DOCNO>\n</DOC>");
context.write(myKey, myValue);
return;
} catch (StackOverflowError e) {
// deeply nested or malformed HTML can overflow the parser's stack; treat it as a parse failure
context.getCounter(LinkCounter.PARSER_FAILED).increment(1);
myValue.setDocid(doc.getDocid());
myValue.setContent("<DOC>\n<DOCNO>" + doc.getDocid() + "</DOCNO>\n</DOC>");
context.write(myKey, myValue);
return;
}
strBuf.setLength(0);
strBuf.append("<DOC>\n<DOCNO>");
strBuf.append(doc.getDocid());
strBuf.append("</DOCNO>\n");
for (int i = 0; i < nl.size(); i++) {
strBuf.append(nl.elementAt(i).toHtml()).append("\n");
}
strBuf.append("</DOC>\n");
// create output document
myValue.setDocid(doc.getDocid());
myValue.setContent(strBuf.toString());
// emit
context.write(myKey, myValue);
// bookkeeping
context.getCounter(LinkCounter.OUTPUT_DOCS).increment(1);
}
}
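// Configuration parameters that must be set before the tool is run.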
public static final String[] RequiredParameters = { "Cloud9.InputPath", "Cloud9.InputFormat", "Cloud9.OutputPath", "Cloud9.TargetTag" };
@Override
public String[] getRequiredParameters() {
return RequiredParameters;
}
public ExtractHTMLFieldCollection(Configuration conf) {
super(conf);
}
@SuppressWarnings({"unchecked", "rawtypes"})
@Override
public int runTool() throws Exception {
Configuration conf = getConf();
Job job = new Job(conf);
String inputPath = conf.get("Cloud9.InputPath");
String inputFormat = conf.get("Cloud9.InputFormat");
String outputPath = conf.get("Cloud9.OutputPath");
String tag = conf.get("Cloud9.TargetTag");
job.setJobName("ExtractFieldCollection");
job.setJarByClass(ExtractHTMLFieldCollection.class);
job.setMapperClass(MyMapper.class);
job.setReducerClass(Reducer.class);
job.setNumReduceTasks(200);
job.setInputFormatClass((Class<? extends InputFormat>) Class.forName(inputFormat));
recursivelyAddInputPaths(job, inputPath);
FileOutputFormat.setOutputPath(job, new Path(outputPath));
job.setOutputFormatClass(SequenceFileOutputFormat.class);
SequenceFileOutputFormat.setCompressOutput(job, true);
SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
job.setOutputKeyClass(LongWritable.class);
job.setOutputValueClass(TextDocument.class);
LOG.info("ExtractFieldCollection - " + tag);
LOG.info(" - Input path: " + inputPath);
LOG.info(" - Input format: " + inputFormat);
LOG.info(" - Output path: " + outputPath);
LOG.info(" - Target tag: " + tag);
return job.waitForCompletion(true) ? 0 : 1;
}
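// Recursively adds every file under the given path as job input, skipping paths whose
// names start with an underscore (e.g., _logs, _SUCCESS).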
public static void recursivelyAddInputPaths(Job job, String path) throws IOException {
FileSystem fs;
try {
fs = FileSystem.get(new URI(path), job.getConfiguration());
} catch (URISyntaxException e) {
throw new RuntimeException("Error recursively adding path -- " + path, e);
}
FileStatus[] ls = fs.listStatus(new Path(path));
for (FileStatus status : ls) {
// skip anything that starts with an underscore, as it often indicates
// a log directory or another special type of Hadoop file
if (status.getPath().getName().startsWith("_")) {
continue;
}
if (status.isDir()) {
recursivelyAddInputPaths(job, status.getPath().toString());
} else {
FileInputFormat.addInputPath(job, status.getPath());
}
}
}
public static void main(String [] args) throws Exception {
Configuration conf = new Configuration();
if(args.length != 4) {
System.err.println("Usage: ExtractFieldCollection [input-path] [input-format] [output-path] [target-tag]");
System.exit(-1);
}
conf.set("Cloud9.InputPath", args[0]);
conf.set("Cloud9.InputFormat", args[1]);
conf.set("Cloud9.OutputPath", args[2]);
conf.set("Cloud9.TargetTag", args[3]);
int res = ToolRunner.run(conf, new ExtractHTMLFieldCollection(conf), args);
System.exit(res);
}
}