/*
* Cloud9: A MapReduce Library for Hadoop
*
* Licensed under the Apache License, Version 2.0 (the "License"); you
* may not use this file except in compliance with the License. You may
* obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
package edu.umd.cloud9.webgraph;

import java.io.IOException;
import java.io.UTFDataFormatException;
import java.net.URI;
import java.net.URISyntaxException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.log4j.Logger;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.BaseHrefTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

import edu.umd.cloud9.collection.CollectionConfigurationManager; // assumed location of this class; adjust if it lives elsewhere
import edu.umd.cloud9.collection.DocnoMapping;
import edu.umd.cloud9.collection.WebDocument;
import edu.umd.cloud9.io.array.ArrayListWritable;
import edu.umd.cloud9.util.PowerTool;
import edu.umd.cloud9.webgraph.data.AnchorText;
import edu.umd.cloud9.webgraph.data.AnchorTextConstants;
import edu.umd.cloud9.webgraph.normalizer.AnchorTextNormalizer;
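
/*
 * Example usage (a minimal sketch, not part of the original source: it assumes
 * PowerTool#run() validates the required parameters before invoking runTool(),
 * and all paths and the normalizer class name below are hypothetical):
 *
 *   Configuration conf = new Configuration();
 *   conf.set("Cloud9.InputPath", "/collections/trec/wt10g");          // hypothetical path
 *   conf.set("Cloud9.OutputPath", "/user/foo/wt10g-links");           // hypothetical path
 *   conf.setInt("Cloud9.Mappers", 100);
 *   conf.setInt("Cloud9.Reducers", 100);
 *   conf.setBoolean("Cloud9.IncludeInternalLinks", false);
 *   conf.set("Cloud9.AnchorTextNormalizer",
 *       "edu.umd.cloud9.webgraph.normalizer.AnchorTextBasicNormalizer"); // hypothetical normalizer
 *   conf.set("Cloud9.DocnoMappingClass",
 *       "edu.umd.cloud9.collection.trec.TrecDocnoMapping");           // assumed mapping class
 *   conf.set("Cloud9.DocnoMappingFile", "/collections/trec/docno.dat"); // hypothetical path
 *   // configer: a CollectionConfigurationManager set up for the target collection
 *   new TrecExtractLinks(conf, configer).run();
 */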
/**
 * Extracts the web graph from a TREC web collection. For every document, the
 * mapper emits a docno record keyed on the document's own normalized URL,
 * plus one anchor-text record per outgoing hyperlink, keyed on the link's
 * normalized target URL; the reducers collect the anchors pointing at each
 * URL.
 *
 * @author Nima Asadi
 * @author Fangyue Wang
 * @author metzler
 */
public class TrecExtractLinks extends PowerTool
{
    private static final Logger LOG = Logger.getLogger(TrecExtractLinks.class);
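    /**
     * Mapper: emits a DOCNO_FIELD record keyed on the document's own
     * normalized URL, then one record per extracted hyperlink, keyed on the
     * normalized target URL and carrying the anchor text plus the source
     * docno.
     */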
    public static class Map extends
            Mapper<LongWritable, WebDocument, Text, ArrayListWritable<AnchorText>>
    {
        public static enum LinkCounter
        {
            INPUT_DOCS,    // number of input documents
            OUTPUT_DOCS,   // number of output documents
            INVALID_DOCNO, // number of malformed documents
            INVALID_URL,   // number of malformed URLs
            TEXT_TOO_LONG, // number of anchor text strings that are abnormally long
            PARSER_FAILED  // number of times the HTML parser fails
        };
private static String base; // base URL for current document
private static String baseHost;
private static int docno; // docno of current document
        private static final Text keyWord = new Text(); // output key for the mappers
        private static final ArrayListWritable<AnchorText> arrayList =
                new ArrayListWritable<AnchorText>(); // output value for the mappers
private static DocnoMapping docnoMapping = null;
private static final Parser parser = new Parser();
        private static final NodeFilter filter = new NodeClassFilter(LinkTag.class);
private static NodeList list;
private static boolean includeInternalLinks;
private static AnchorTextNormalizer normalizer;
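        /**
         * Reads the DocnoMapping implementation, the docno mapping file
         * (from the distributed cache when available), the internal-link
         * flag, and the anchor text normalizer from the job configuration.
         */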
@Override
        public void setup(Context context) throws IOException
        {
Configuration conf = context.getConfiguration();
String docnoMappingClass = conf.get("Cloud9.DocnoMappingClass");
try
{
docnoMapping = (DocnoMapping) Class.forName(docnoMappingClass)
.newInstance();
}
catch (Exception e)
{
                throw new RuntimeException(
                        "Error initializing DocnoMapping class!", e);
}
String docnoMappingFile = conf.get("Cloud9.DocnoMappingFile", null);
if (docnoMappingFile != null)
{
Path docnoMappingPath = null;
try
{
Path[] localFiles = DistributedCache
.getLocalCacheFiles(conf);
if (localFiles != null)
{
docnoMappingPath = localFiles[0];
}
else
{
docnoMappingPath = new Path(
conf.get("Cloud9.DocnoMappingFile"));
}
}
catch (IOException e)
{
                    throw new RuntimeException(
                            "Unable to find DocnoMappingFile!", e);
}
try
{
docnoMapping.loadMapping(docnoMappingPath,
FileSystem.getLocal(conf));
}
catch (Exception e)
{
                    throw new RuntimeException(
                            "Error initializing DocnoMapping!", e);
}
}
includeInternalLinks = conf.getBoolean(
"Cloud9.IncludeInternalLinks", false);
try
{
normalizer = (AnchorTextNormalizer) Class.forName(
conf.get("Cloud9.AnchorTextNormalizer")).newInstance();
}
catch (Exception e)
{
                throw new RuntimeException(
                        "Error initializing AnchorTextNormalizer", e);
}
}
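        /**
         * Parses one document: resolves its docno and base URL, emits the
         * docno record, then extracts all LinkTag nodes and emits one
         * anchor-text record per outgoing link (self links, and internal
         * links when disabled, are skipped).
         */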
@Override
        public void map(LongWritable key, WebDocument doc, Context context)
                throws IOException, InterruptedException
{
context.getCounter(LinkCounter.INPUT_DOCS).increment(1);
try
{
docno = docnoMapping.getDocno(doc.getDocid());
}
catch (NullPointerException e)
{
// Discard documents with an invalid document number
context.getCounter(LinkCounter.INVALID_DOCNO).increment(1);
return;
}
try
{
String url = doc.getURL().split("\n")[0];
LOG.info("URI: " + url);
base = normalizeURL(url);
}
catch (Exception e)
{
                // Discard documents that have no associated URL
context.getCounter(LinkCounter.INVALID_URL).increment(1);
return;
}
if (base == null)
{
context.getCounter(LinkCounter.INVALID_URL).increment(1);
return;
}
arrayList.clear();
arrayList.add(new AnchorText(
AnchorTextConstants.Type.DOCNO_FIELD.val,
AnchorTextConstants.EMPTY_STRING, docno));
keyWord.set(base);
context.write(keyWord, arrayList);
            // keep track of the number of documents actually processed
context.getCounter(LinkCounter.OUTPUT_DOCS).increment(1);
try
{
baseHost = new URI(base).getHost();
}
catch (Exception e)
{
context.getCounter(LinkCounter.INVALID_URL).increment(1);
return;
}
if (baseHost == null)
{
context.getCounter(LinkCounter.INVALID_URL).increment(1);
return;
}
try
{
                parser.setInputHTML(doc.getContent()); // initialize the parser with the new HTML content
                // set the base URL for the current document
NodeList nl = parser.parse(null);
BaseHrefTag baseTag = new BaseHrefTag();
baseTag.setBaseUrl(base);
nl.add(baseTag);
// re-initializing the parser with the fixed content
parser.setInputHTML(nl.toHtml());
// listing all LinkTag nodes
list = parser.extractAllNodesThatMatch(filter);
}
catch (ParserException e)
{
context.getCounter(LinkCounter.PARSER_FAILED).increment(1);
return;
}
catch (StackOverflowError e)
{
context.getCounter(LinkCounter.PARSER_FAILED).increment(1);
return;
}
for (int i = 0; i < list.size(); i++)
{
LinkTag link = (LinkTag) list.elementAt(i);
String anchor = link.getLinkText();
String url = normalizeURL(link.extractLink());
if (url == null)
{
continue;
}
                if (url.equals(base)) // discard self links
                {
continue;
}
String host = null;
try
{
host = new URI(url).getHost();
}
catch (Exception e)
{
continue;
}
if (host == null)
{
continue;
}
if (anchor == null)
{
anchor = "";
}
// normalizing the anchor text
anchor = normalizer.process(anchor);
arrayList.clear();
if (baseHost.equals(host))
{
                    if (!includeInternalLinks)
                    {
                        continue;
                    }
arrayList.add(new AnchorText(
AnchorTextConstants.Type.INTERNAL_IN_LINK.val,
anchor, docno));
}
else
{
arrayList.add(new AnchorText(
AnchorTextConstants.Type.EXTERNAL_IN_LINK.val,
anchor, docno));
}
try
{
keyWord.set(url);
context.write(keyWord, arrayList);
}
catch (UTFDataFormatException e)
{
context.getCounter(LinkCounter.TEXT_TOO_LONG).increment(1);
keyWord.set(url);
byte flag = arrayList.get(0).getType();
arrayList.clear();
arrayList.add(new AnchorText(flag,
AnchorTextConstants.EMPTY_STRING, docno));
context.write(keyWord, arrayList);
}
}
}
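        /**
         * Normalizes a URL: applies URI.normalize(), lowercases the scheme
         * and host, strips trailing slashes from the path, and drops the
         * query string and fragment. For example,
         * "HTTP://Example.COM/Path/?q=1#top" becomes
         * "http://example.com/Path". Returns null if the URL cannot be
         * parsed.
         */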
private static String normalizeURL(String url)
{
try
{
URI uri = new URI(url).normalize(); // first apply built-in normalizer
String scheme = uri.getScheme().toLowerCase(); // schemes are not case sensitive
String host = uri.getHost().toLowerCase(); // hosts are not case sensitive
String path = uri.getPath();
                // remove trailing forward slashes from the path
                while (path != null && path.length() > 0
                        && path.charAt(path.length() - 1) == '/')
                {
                    path = path.substring(0, path.length() - 1);
                }
return (new URI(scheme, host, path, null)).toString();
}
catch (Exception e)
{
return null;
}
}
}
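    /**
     * Reducer (also used as the combiner): merges all anchor text packets
     * that share a target URL, folding together entries that are equal up
     * to their source documents.
     */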
    public static class Reduce extends
            Reducer<Text, ArrayListWritable<AnchorText>, Text, ArrayListWritable<AnchorText>>
    {
private static final ArrayListWritable<AnchorText> arrayList = new ArrayListWritable<AnchorText>();
private static boolean pushed;
@Override
        public void reduce(Text key,
                Iterable<ArrayListWritable<AnchorText>> values, Context context)
                throws IOException, InterruptedException
        {
{
arrayList.clear();
for (ArrayListWritable<AnchorText> packet : values)
{
for (AnchorText data : packet)
{
pushed = false;
for (int i = 0; i < arrayList.size(); i++)
{
if (arrayList.get(i).equalsIgnoreSources(data))
{
arrayList.get(i).addDocumentsFrom(data);
pushed = true;
break;
}
}
                    if (!pushed)
                    {
                        arrayList.add(data.clone());
                    }
}
}
context.write(key, arrayList);
}
}
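    /** Configuration parameters this tool requires, exposed to the PowerTool framework via getRequiredParameters(). */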
public static final String[] RequiredParameters = { "Cloud9.InputPath",
"Cloud9.OutputPath", "Cloud9.Mappers", "Cloud9.Reducers",
"Cloud9.IncludeInternalLinks", "Cloud9.AnchorTextNormalizer",
"Cloud9.DocnoMappingClass", "Cloud9.DocnoMappingFile" };
    @Override
    public String[] getRequiredParameters()
{
return RequiredParameters;
}
    CollectionConfigurationManager configer; // optional collection-specific job configuration

    public TrecExtractLinks(Configuration conf)
    {
        super(conf);
    }

    public TrecExtractLinks(Configuration conf,
            CollectionConfigurationManager configer)
    {
        super(conf);
        this.configer = configer;
    }
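    /**
     * Configures and submits the link extraction job: verifies that the
     * docno mapping file exists, ships it via the distributed cache, wires
     * up the mapper, combiner, and reducer, enables block-compressed
     * SequenceFile output, and recursively adds all input paths.
     */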
@Override
public int runTool() throws Exception
{
Configuration conf = getConf();
conf.set("mapred.child.java.opts", "-Xmx3072m");
conf.setInt("mapred.task.timeout", 60000000);
Job job = new Job(conf);
int numReducers = conf.getInt("Cloud9.Reducers", 200);
String inputPath = conf.get("Cloud9.InputPath");
String outputPath = conf.get("Cloud9.OutputPath");
String mappingFile = conf.get("Cloud9.DocnoMappingFile");
FileSystem fs = FileSystem.get(conf);
if (!fs.exists(new Path(mappingFile)))
{
throw new RuntimeException("Error: Docno mapping data file "
+ mappingFile + " doesn't exist!");
}
DistributedCache.addCacheFile(new Path(mappingFile).toUri(), job.getConfiguration());
job.setJobName("ExtractLinks");
job.setNumReduceTasks(numReducers);
job.setJarByClass(TrecExtractLinks.class);
job.setMapperClass(TrecExtractLinks.Map.class);
job.setCombinerClass(TrecExtractLinks.Reduce.class);
job.setReducerClass(TrecExtractLinks.Reduce.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(ArrayListWritable.class);
        if (configer != null) // may be null when constructed without a configuration manager
        {
            configer.applyJobConfig(job);
        }
job.setOutputFormatClass(SequenceFileOutputFormat.class);
SequenceFileOutputFormat.setCompressOutput(job, true);
SequenceFileOutputFormat.setOutputCompressionType(job,
SequenceFile.CompressionType.BLOCK);
recursivelyAddInputPaths(job, inputPath);
FileOutputFormat.setOutputPath(job, new Path(outputPath));
LOG.info("ExtractLinks");
LOG.info(" - input path: " + inputPath);
LOG.info(" - output path: " + outputPath);
LOG.info(" - mapping file: " + mappingFile);
LOG.info(" - include internal links? "
+ conf.getBoolean("Cloud9.IncludeInternalLinks", false));
        return job.waitForCompletion(true) ? 0 : 1;
}
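    /**
     * Recursively adds every file under the given path as a job input,
     * skipping names that start with an underscore (e.g., _logs, _SUCCESS).
     */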
public static void recursivelyAddInputPaths(Job job, String path) throws IOException {
FileSystem fs;
try {
fs = FileSystem.get(new URI(path), job.getConfiguration());
} catch (URISyntaxException e) {
            throw new RuntimeException("Error recursively adding path -- " + path, e);
}
FileStatus [] ls = fs.listStatus(new Path(path));
for(FileStatus status : ls) {
// skip anything that starts with an underscore, as it often indicates
// a log directory or another special type of Hadoop file
if(status.getPath().getName().startsWith("_")) {
continue;
}
if(status.isDir()) {
recursivelyAddInputPaths(job, status.getPath().toString());
}
else {
FileInputFormat.addInputPath(job, status.getPath());
}
}
}
}