Package org.apache.nutch.parse

Source Code of org.apache.nutch.parse.ParseSegment

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.nutch.parse;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.SignatureFactory;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;
import org.apache.hadoop.conf.*;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.protocol.*;
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.util.*;
import org.apache.hadoop.fs.Path;

import java.io.*;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.Map.Entry;

/* Parse content in a segment. */
public class ParseSegment extends Configured implements Tool,
    Mapper<WritableComparable, Content, Text, ParseImpl>,
    Reducer<Text, Writable, Text, Writable> {

  public static final Logger LOG = LoggerFactory.getLogger(ParseSegment.class);
 
  private ScoringFilters scfilters;
 
  public ParseSegment() {
    this(null);
  }
 
  public ParseSegment(Configuration conf) {
    super(conf);
  }

  public void configure(JobConf job) {
    setConf(job);
    this.scfilters = new ScoringFilters(job);
  }

  public void close() {}
 
  private Text newKey = new Text();

  public void map(WritableComparable key, Content content,
                  OutputCollector<Text, ParseImpl> output, Reporter reporter)
    throws IOException {
    // convert on the fly from old UTF8 keys
    if (key instanceof UTF8) {
      newKey.set(key.toString());
      key = newKey;
    }
   
    int status =
      Integer.parseInt(content.getMetadata().get(Nutch.FETCH_STATUS_KEY));
    if (status != CrawlDatum.STATUS_FETCH_SUCCESS) {
      // content not fetched successfully, skip document
      LOG.debug("Skipping " + key + " as content is not fetched successfully");
      return;
    }

    ParseResult parseResult = null;
    try {
      parseResult = new ParseUtil(getConf()).parse(content);
    } catch (Exception e) {
      LOG.warn("Error parsing: " + key + ": " + StringUtils.stringifyException(e));
      return;
    }

    for (Entry<Text, Parse> entry : parseResult) {
      Text url = entry.getKey();
      Parse parse = entry.getValue();
      ParseStatus parseStatus = parse.getData().getStatus();

      LOG.info("Parsing: " + url);
      reporter.incrCounter("ParserStatus", ParseStatus.majorCodes[parseStatus.getMajorCode()], 1);

      if (!parseStatus.isSuccess()) {
        LOG.warn("Error parsing: " + key + ": " + parseStatus);
        parse = parseStatus.getEmptyParse(getConf());
      }

      // pass segment name to parse data
      parse.getData().getContentMeta().set(Nutch.SEGMENT_NAME_KEY,
                                           getConf().get(Nutch.SEGMENT_NAME_KEY));

      // compute the new signature
      byte[] signature =
        SignatureFactory.getSignature(getConf()).calculate(content, parse);
      parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY,
          StringUtil.toHexString(signature));
     
      try {
        scfilters.passScoreAfterParsing(url, content, parse);
      } catch (ScoringFilterException e) {
        if (LOG.isWarnEnabled()) {
          e.printStackTrace(LogUtil.getWarnStream(LOG));
          LOG.warn("Error passing score: "+ url +": "+e.getMessage());
        }
      }
      output.collect(url, new ParseImpl(new ParseText(parse.getText()),
                                        parse.getData(), parse.isCanonical()));
    }
  }

  public void reduce(Text key, Iterator<Writable> values,
                     OutputCollector<Text, Writable> output, Reporter reporter)
    throws IOException {
    output.collect(key, (Writable)values.next()); // collect first value
  }

  public void parse(Path segment) throws IOException {

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    if (LOG.isInfoEnabled()) {
      LOG.info("ParseSegment: starting at " + sdf.format(start));
      LOG.info("ParseSegment: segment: " + segment);
    }

    JobConf job = new NutchJob(getConf());
    job.setJobName("parse " + segment);

    FileInputFormat.addInputPath(job, new Path(segment, Content.DIR_NAME));
    job.set(Nutch.SEGMENT_NAME_KEY, segment.getName());
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(ParseSegment.class);
    job.setReducerClass(ParseSegment.class);
   
    FileOutputFormat.setOutputPath(job, segment);
    job.setOutputFormat(ParseOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(ParseImpl.class);

    JobClient.runJob(job);
    long end = System.currentTimeMillis();
    LOG.info("ParseSegment: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
  }


  public static void main(String[] args) throws Exception {
  int res = ToolRunner.run(NutchConfiguration.create(), new ParseSegment(), args);
  System.exit(res);
  }
   
  public int run(String[] args) throws Exception {
    Path segment;

    String usage = "Usage: ParseSegment segment";

    if (args.length == 0) {
      System.err.println(usage);
      System.exit(-1);
    }     
    segment = new Path(args[0]);
    parse(segment);
    return 0;
  }
}
TOP

Related Classes of org.apache.nutch.parse.ParseSegment

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by Oracle Inc. Contact coftware#gmail.com.