Source Code of hipi.examples.downloader.Downloader$DownloaderMapper

package hipi.examples.downloader;

import hipi.image.ImageHeader.ImageType;
import hipi.imagebundle.HipiImageBundle;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
import java.net.URL;
import java.net.URLConnection;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BooleanWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
* A utility MapReduce program that takes a list of image URLs, downloads them, and creates
* a {@link hipi.imagebundle.HipiImageBundle} from them.
*
* When running this program, the user must specify 3 parameters. The first is the location
* of the list of URLs (one URL per line), the second is the output path for the HIB that will
* be generated, and the third is the number of nodes that should be used during the
* program's execution. This final parameter should be chosen with respect to the total
* bandwidth your particular cluster is able to handle. An example usage would be:
* <br /><br />
* downloader.jar /path/to/urls.txt /path/to/output.hib 10
* <br /><br />
* This program will automatically force 10 nodes to download the set of URLs contained in
* the input list, so if your list contains 100,000 images, each node in this example will
* be responsible for downloading 10,000 images.
*
*/
public class Downloader extends Configured implements Tool {

 
  public static class DownloaderMapper extends Mapper<IntWritable, Text, BooleanWritable, Text>
  {
    private static Configuration conf;
    // Called once at the start of each map task, before any records are processed
    public void setup(Context jc) throws IOException
    {
      conf = jc.getConfiguration();
    }

    public void map(IntWritable key, Text value, Context context)
    throws IOException, InterruptedException
    {
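      // Each map task downloads its share of URLs into its own temporary HIB,
      // named by the task's starting index; the reducer later merges these bundles.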
      String temp_path = conf.get("downloader.outpath") + key.get() + ".hib.tmp";
      System.out.println("Temp path: " + temp_path);
     
      HipiImageBundle hib = new HipiImageBundle(new Path(temp_path), conf);
      hib.open(HipiImageBundle.FILE_MODE_WRITE, true);

      String word = value.toString();

      BufferedReader reader = new BufferedReader(new StringReader(word));
      String uri;
      int i = key.get();
      int iprev = i;
      while((uri = reader.readLine()) != null)     
      {
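        // Every 100 images, close the current temporary HIB, emit its path to the
        // reducer, and start a fresh bundle.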
        if(i >= iprev+100) {
          hib.close();
          context.write(new BooleanWritable(true), new Text(hib.getPath().toString()));
          temp_path = conf.get("downloader.outpath") + i + ".hib.tmp";
          hib = new HipiImageBundle(new Path(temp_path), conf);
          hib.open(HipiImageBundle.FILE_MODE_WRITE, true);
          iprev = i;
        }
        long startT = System.currentTimeMillis();
        long stopT = 0;

        try {
          String type = "";
          URLConnection conn;
          // Attempt to download
          context.progress();

          try {
            URL link = new URL(uri);
            System.err.println("Downloading " + link.toString());
            conn = link.openConnection();
            conn.connect();
            type = conn.getContentType();
          } catch (Exception e)
          {
            System.err.println("Connection error to image : " + uri);
            continue;
          }

          // Skip URLs whose content type is missing or not JPEG; GIFs are ignored explicitly
          if (type == null)
            continue;

          if (type.compareTo("image/gif") == 0)
            continue;

          if (type.compareTo("image/jpeg") == 0)
            hib.addImage(conn.getInputStream(), ImageType.JPEG_IMAGE);
         
        } catch(Exception e)
        {
          e.printStackTrace();
          System.err.println("Error... probably cluster downtime");
          try
          {
            Thread.sleep(1000);         
          } catch (InterruptedException e1)
          {
            e1.printStackTrace();
          }
        }

        i++;
       
        // Log how long this URL took to process
        stopT = System.currentTimeMillis();
        float el = (float)(stopT - startT) / 1000.0f;
        System.err.println("> Took " + el + " seconds\n");
      }


      try
      {
        reader.close();
        hib.close();
        context.write(new BooleanWritable(true), new Text(hib.getPath().toString()));
      } catch (Exception e)
      {
        e.printStackTrace();
      }

    }
  }

  public static class DownloaderReducer extends Reducer<BooleanWritable, Text, BooleanWritable, Text> {

    private static Configuration conf;   
    public void setup(Context jc) throws IOException
    {
      conf = jc.getConfiguration();
    }

    public void reduce(BooleanWritable key, Iterable<Text> values, Context context)
    throws IOException, InterruptedException
    {
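      // Append each temporary HIB produced by the mappers to the final output bundle,
      // then delete the temporary index and .dat files.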
      if(key.get()){
        FileSystem fileSystem = FileSystem.get(conf);
        HipiImageBundle hib = new HipiImageBundle(new Path(conf.get("downloader.outfile")), conf);
        hib.open(HipiImageBundle.FILE_MODE_WRITE, true);
        for (Text temp_string : values) {
          Path temp_path = new Path(temp_string.toString());
          HipiImageBundle input_bundle = new HipiImageBundle(temp_path, conf);
          hib.append(input_bundle);
         
          Path index_path = input_bundle.getPath();
          Path data_path = new Path(index_path.toString() + ".dat");
          System.out.println("Deleting: " + data_path.toString());
          fileSystem.delete(index_path, false);
          fileSystem.delete(data_path, false);
         
          context.write(new BooleanWritable(true), new Text(input_bundle.getPath().toString()));
          context.progress();
        }
        hib.close();
      }
    }
  }


  public int run(String[] args) throws Exception
  {
    // Check command-line arguments
    if (args.length < 3)
    {
      System.out.println("Usage: downloader <input file> <output file> <nodes>");
      System.exit(0);
    }

    // Setup configuration
    Configuration conf = new Configuration();

    String inputFile = args[0];
    String outputFile = args[1];
    int nodes = Integer.parseInt(args[2]);

    String outputPath = outputFile.substring(0, outputFile.lastIndexOf('/')+1);
    System.out.println("Output HIB: " + outputPath);
   
   
    conf.setInt("downloader.nodes", nodes);
    conf.setStrings("downloader.outfile", outputFile);
    conf.setStrings("downloader.outpath", outputPath);

    Job job = new Job(conf, "downloader");
    job.setJarByClass(Downloader.class);
    job.setMapperClass(DownloaderMapper.class);
    job.setReducerClass(DownloaderReducer.class);

    // Set formats
    job.setOutputKeyClass(BooleanWritable.class);
    job.setOutputValueClass(Text.class);      
    job.setInputFormatClass(DownloaderInputFormat.class);

    //*************** IMPORTANT ****************\\
    job.setMapOutputKeyClass(BooleanWritable.class);
    job.setMapOutputValueClass(Text.class);
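    // The (BooleanWritable, Text) records emitted by the reducer are written to a
    // separate "<output>_output" directory; the HIB itself is written directly to
    // the path stored in downloader.outfile.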
    FileOutputFormat.setOutputPath(job, new Path(outputFile + "_output"));

    DownloaderInputFormat.setInputPaths(job, new Path(inputFile));

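    // A single reduce task ensures all temporary bundles are merged into one HIB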
    job.setNumReduceTasks(1);
    return job.waitForCompletion(true) ? 0 : 1;
  }

  public static void createDir(String path, Configuration conf) throws IOException {
    Path output_path = new Path(path);

    FileSystem fs = FileSystem.get(conf);

    if (!fs.exists(output_path)) {
      fs.mkdirs(output_path);
    }
  }

  public static void main(String[] args) throws Exception {
    int res = ToolRunner.run(new Downloader(), args);
    System.exit(res);
  }
}
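
Example usage (a minimal sketch): the javadoc above describes the command-line form; the snippet below drives the same job programmatically through ToolRunner, exactly as main() does. The driver class name, URL list path, output HIB path, and node count are placeholder values, not part of the original source.

package hipi.examples.downloader;

import org.apache.hadoop.util.ToolRunner;

public class RunDownloader {
  public static void main(String[] args) throws Exception {
    // Arguments: <url list> <output HIB> <number of download nodes>,
    // matching the usage described in the Downloader javadoc.
    // These paths and the node count are placeholders.
    String[] jobArgs = { "/path/to/urls.txt", "/path/to/output.hib", "10" };
    int exitCode = ToolRunner.run(new Downloader(), jobArgs);
    System.exit(exitCode);
  }
}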