Package nise

Source Code of nise.ExtractNutch
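
ExtractNutch is a Hadoop MapReduce job (old org.apache.hadoop.mapred API) that scans the content/ directories of Nutch crawl segments and writes every fetched image/jpeg record into SequenceFile output, keyed by the page URL bytes with the raw image bytes as the value.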

package nise;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.lib.IdentityReducer;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;

import org.apache.nutch.protocol.Content;

import java.io.IOException;

import java.util.ArrayList;

public class ExtractNutch implements Mapper<Writable, Content, BytesWritable, BytesWritable>
{
    public void configure(JobConf conf) {
    }

    public void close() throws IOException {
    }

    public void map(Writable key, Content value,
        OutputCollector<BytesWritable, BytesWritable> output, Reporter reporter)
        throws IOException {
        Content content = value;
        String contentType = content.getContentType();

        // Emit only records whose MIME type identifies a JPEG image.
        if (contentType != null &&
                contentType.toLowerCase().startsWith("image/jpeg")) {
            try {
                // Key: the page URL bytes; value: the raw JPEG bytes.
                output.collect(new BytesWritable(content.getUrl().getBytes()),
                    new BytesWritable(content.getContent()));
            } catch (Exception e) {
                // Record the failure as a counter instead of silently discarding it.
                reporter.incrCounter("ExtractNutch", "collect-errors", 1);
            }
        }
    }

    public static void extract(Path out, Path[] segs) throws Exception {
        JobConf job = new JobConf(ExtractNutch.class);
        FileSystem fs = FileSystem.get(job);

        // Add each segment's content/ directory as an input path,
        // skipping entries that are not directories or lack content data.
        for (int i = 0; i < segs.length; i++) {
            Path cDir = new Path(segs[i], Content.DIR_NAME);

            FileStatus st = fs.getFileStatus(segs[i]);
            if (st.isDir() && fs.exists(cDir)) {
                FileInputFormat.addInputPath(job, cDir);
            }
        }

        FileOutputFormat.setOutputPath(job, out);
        job.setInputFormat(SequenceFileInputFormat.class);
        job.setOutputFormat(SequenceFileOutputFormat.class);
        job.setMapperClass(ExtractNutch.class);
        job.setReducerClass(IdentityReducer.class);
        job.setOutputKeyClass(BytesWritable.class);
        job.setOutputValueClass(BytesWritable.class);
        // The number of reduce tasks is hard-coded; adjust for the target cluster.
        job.setNumReduceTasks(462);
        JobClient.runJob(job);
    }

    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            System.err.println(
                "ExtractNutch output_dir (-dir segments | seg1 seg2 ...)");
            System.err.println(
                "\toutput_dir\tdirectory that receives the extracted image SequenceFiles");
            System.err.println(
                "\t-dir segments\tparent dir containing several segments");
            System.err.println(
                "\tseg1 seg2 ...\tlist of individual segment dirs");

            return;
        }

        Configuration conf = new Configuration();
        final FileSystem fs = FileSystem.get(conf);
        Path out = new Path(args[0]);
        ArrayList<Path> segs = new ArrayList<Path>();

        for (int i = 1; i < args.length; i++) {
            if (args[i].equals("-dir")) {
                FileStatus[] fstats = fs.listStatus(new Path(args[++i]));

                for (int j = 0; j < fstats.length; j++) {
                    segs.add(fstats[j].getPath());
                }
            } else {
                segs.add(new Path(args[i]));
            }
        }

        if (segs.size() == 0) {
            System.err.println("ERROR: No input segments.");

            return;
        }

        extract(out, segs.toArray(new Path[segs.size()]));
    }
}
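
A minimal driver sketch showing how the job might be invoked programmatically; the output directory and segment path below are placeholders, not taken from the original source:

    // Hypothetical paths: substitute the real output dir and segment dirs.
    Path out = new Path("extracted_images");
    Path[] segs = new Path[] { new Path("crawl/segments/20180101000000") };
    ExtractNutch.extract(out, segs);

Each output part file is a SequenceFile of BytesWritable pairs (URL bytes as key, raw JPEG bytes as value). A reader sketch, assuming a part file named part-00000:

    // Read back URL/JPEG pairs from one output part file.
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    SequenceFile.Reader reader = new SequenceFile.Reader(fs,
        new Path("extracted_images/part-00000"), conf);
    BytesWritable key = new BytesWritable();
    BytesWritable val = new BytesWritable();
    while (reader.next(key, val)) {
        String url = new String(key.getBytes(), 0, key.getLength());
        // val holds the raw JPEG bytes for this URL.
    }
    reader.close();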