Package com.intel.hadoop.graphbuilder.job

Source Code of com.intel.hadoop.graphbuilder.job.AbstractPreprocessJob

/* Copyright (C) 2012 Intel Corporation.
*     All rights reserved.
*          
*  Licensed under the Apache License, Version 2.0 (the "License");
*  you may not use this file except in compliance with the License.
*  You may obtain a copy of the License at
*
*       http://www.apache.org/licenses/LICENSE-2.0
*
*   Unless required by applicable law or agreed to in writing, software
*   distributed under the License is distributed on an "AS IS" BASIS,
*   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*   See the License for the specific language governing permissions and
*   limitations under the License.
*
* For more about this software visit:
*      http://www.01.org/GraphBuilder
*/
package com.intel.hadoop.graphbuilder.job;

import java.io.IOException;
import java.util.HashMap;

import javassist.CannotCompileException;
import javassist.NotFoundException;

import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.InputFormat;

import com.intel.hadoop.graphbuilder.preprocess.functional.Functional;
import com.intel.hadoop.graphbuilder.preprocess.inputformat.GraphTokenizer;
import com.intel.hadoop.graphbuilder.preprocess.inputformat.XMLInputFormat;
import com.intel.hadoop.graphbuilder.preprocess.mapreduce.CreateGraphMR;
import com.intel.hadoop.graphbuilder.preprocess.mapreduce.keyvalue.PreprocessJobValueFactory;
import com.intel.hadoop.graphbuilder.util.FsUtil;

/**
* An abstract wrapper class for running the Preprocessing Job, which creates a
* graph from the raw input data. See an example in {@code PreprocessJobTest}.
* <p>
* User first needs a {@code GraphTokenizer}, and an {@code InputFormat} specific
* to the input data. The {@code InputFormat} is used to generate a single
* input from the raw data, and the {@code GraphTokenizer} is used to extract a
* list of {@code Vertex}s and {@code Edge}s from each input given by the
* {@code InputFormat}. For example, to create a link graph from a Wikipedia xml
* dump, {@code WikiPageInputFormat} splits the file by the begin and close of
* the "page" tag, and outputs the string in between as a "page" to the
* {@code LinkGraphTokenizer}, which then extracts the title of the page as the
* vertex and the links as the edges.
* </p>
* <p>
* Next, user will need to override 3 functions. The first two,
* {@code vertexReducer()} and {@code edgeReducer()}, are applied to duplicate
* vertices and edges. They both can return {@code null}, in which case only the
* first instance of duplicate objects will remain. The third function to
* override is {@code cleanBidirectionalEdge()}, which is the option to keep or
* discard the bi-directional edges in the graph.
* </p>
* <p>
* Additional options can be added into the jobConf by calling
* {@code addUserOpt}. {@code Functional}s can get these options using
* {@code configure(JobConf)}.
* </p>
* <p>
* Input directories contain any raw input data. Output directories:
* <ul>
* <li>$outputdir/edata list of edges</li>
* <li>$outputdir/vdata list of vertices</li>
* </ul>
*
* @see CreateGraphMR
* @see PreprocessJobTest
* @see XMLInputFormat
* @see WikiPageInputFormat
* @see LinkGraphTokenizer
* @see Functional
*
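* @param <VidType>
*          type of the vertex id; must be a {@code WritableComparable}
* @param <VertexData>
*          type of the data carried by a vertex; must be a {@code Writable}
* @param <EdgeData>
*          type of the data carried by an edge; must be a {@code Writable}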
*/
public abstract class AbstractPreprocessJob<VidType extends WritableComparable<VidType>, VertexData extends Writable, EdgeData extends Writable> {

  public AbstractPreprocessJob() {
    this.userOpts = new HashMap<String, String>();
  }

  public abstract Functional<VertexData, VertexData> vertexReducer();

  public abstract Functional<EdgeData, EdgeData> edgeReducer();

  public abstract boolean cleanBidirectionalEdge();

  public void addUserOpt(String key, String value) {
    userOpts.put(key, value);
  }

  public boolean run(GraphTokenizer<VidType, VertexData, EdgeData> tokenizer,
      InputFormat inputformat, String[] inputs, String output)
      throws CannotCompileException, NotFoundException, IOException {
    // Required parameters;
    CreateGraphMR mr = new CreateGraphMR(tokenizer, inputformat);

    Class valueClass = PreprocessJobValueFactory.getValueClassByClassName(
        tokenizer.vidClass().getName(), tokenizer.vdataClass().getName(),
        tokenizer.edataClass().getName());
    // Set value class based on the types of tokenizer.
    mr.setValueClass(valueClass);
    // Distributed the new class file to cluster.
    FsUtil.distributedTempClassToClassPath(mr.getConf());

    // Optional parameters;
    Class vreducerClass = vertexReducer() == null ? null : vertexReducer()
        .getClass();
    Class ereducerClass = edgeReducer() == null ? null : edgeReducer()
        .getClass();
    mr.setFunctionClass(vreducerClass, ereducerClass);
    mr.cleanBidirectionalEdge(cleanBidirectionalEdge());

    // User defined parameters;
    if (userOpts != null) {
      mr.setUserOptions(userOpts);
    }

    try {
      mr.run(inputs, output);
    } catch (Exception e) {
      e.printStackTrace();
      return false;
    }
    return true;
  }

  protected HashMap<String, String> userOpts;
}
TOP

Related Classes of com.intel.hadoop.graphbuilder.job.AbstractPreprocessJob

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.