Source Code of com.scaleunlimited.cascading.scheme.hadoop.SolrScheme

package com.scaleunlimited.cascading.scheme.hadoop;

import java.io.File;
import java.io.IOException;

import javax.xml.parsers.ParserConfigurationException;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;
import org.xml.sax.SAXException;

import cascading.flow.FlowProcess;
import cascading.flow.hadoop.util.HadoopUtil;
import cascading.scheme.Scheme;
import cascading.scheme.SinkCall;
import cascading.scheme.SourceCall;
import cascading.tap.Tap;
import cascading.tap.TapException;
import cascading.tap.hadoop.Hfs;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import cascading.util.Util;

import com.scaleunlimited.cascading.scheme.core.SolrSchemeUtil;

/**
 * Cascading {@link Scheme} that writes Tuples into a Solr index via
 * {@link SolrOutputFormat}. Sink-only: the source-side methods throw
 * {@link TapException}.
 */
@SuppressWarnings("serial")
public class SolrScheme extends Scheme<JobConf, RecordReader<Tuple, Tuple>, OutputCollector<Tuple, Tuple>, Object[], Void> {

    private File _solrCoreDir;
    private int _maxSegments;
    private String _dataDirPropertyName;
   
    public SolrScheme(Fields schemeFields, String solrCoreDir) throws IOException, ParserConfigurationException, SAXException {
        this(schemeFields, solrCoreDir, SolrOutputFormat.DEFAULT_MAX_SEGMENTS);
    }
   
    public SolrScheme(Fields schemeFields, String solrCoreDir, int maxSegments) throws IOException, ParserConfigurationException, SAXException {
        this(schemeFields, solrCoreDir, maxSegments, SolrSchemeUtil.DEFAULT_DATA_DIR_PROPERTY_NAME);
    }
   
    public SolrScheme(Fields schemeFields, String solrCoreDir, int maxSegments, String dataDirPropertyName) throws IOException, ParserConfigurationException, SAXException {
        super(schemeFields, schemeFields);

        _solrCoreDir = new File(solrCoreDir);
        _maxSegments = maxSegments;
        _dataDirPropertyName = dataDirPropertyName;

        SolrSchemeUtil.validate(_solrCoreDir, _dataDirPropertyName, schemeFields);
    }
   
    @Override
    public boolean isSink() {
        return true;
    }
   
    @Override
    public boolean isSource() {
        return false;
    }
   
    @Override
    public void sourceConfInit(FlowProcess<JobConf> flowProcess, Tap<JobConf, RecordReader<Tuple, Tuple>, OutputCollector<Tuple, Tuple>> tap, JobConf conf) {
        throw new TapException("SolrScheme can only be used as a sink, not a source");
    }

    @Override
    public void sinkConfInit(FlowProcess<JobConf> flowProcess, Tap<JobConf, RecordReader<Tuple, Tuple>, OutputCollector<Tuple, Tuple>> tap, JobConf conf) {
        // Pick temp location in HDFS for conf files.
        // TODO KKr - should I get rid of this temp directory when we're done?
        String coreDirname = _solrCoreDir.getName();
        Path hdfsSolrCoreDir = new Path(Hfs.getTempPath(conf), "solr-core-" + Util.createUniqueID() + "/" + coreDirname);
       
        // Copy Solr core directory into HDFS.
        try {
            FileSystem fs = hdfsSolrCoreDir.getFileSystem(conf);
            fs.copyFromLocalFile(new Path(_solrCoreDir.getAbsolutePath()), hdfsSolrCoreDir);
        } catch (IOException e) {
            throw new TapException("Can't copy Solr core directory into HDFS", e);
        }

        // Tuples pass straight through: both the job's output key and value are Cascading Tuples.
        conf.setOutputKeyClass(Tuple.class);
        conf.setOutputValueClass(Tuple.class);
        conf.setOutputFormat(SolrOutputFormat.class);

        // Serialize the sink fields into the job conf so SolrOutputFormat
        // can map each outgoing tuple entry to its Solr field.
        try {
            conf.set(SolrOutputFormat.SINK_FIELDS_KEY, HadoopUtil.serializeBase64(getSinkFields(), conf));
        } catch (IOException e) {
            throw new TapException("Can't serialize sink fields", e);
        }

        conf.set(SolrOutputFormat.SOLR_CORE_PATH_KEY, hdfsSolrCoreDir.toString());
        conf.setInt(SolrOutputFormat.MAX_SEGMENTS_KEY, _maxSegments);
        conf.set(SolrOutputFormat.DATA_DIR_PROPERTY_NAME_KEY, _dataDirPropertyName);
    }

    @Override
    public boolean source(FlowProcess<JobConf> flowProcess, SourceCall<Object[], RecordReader<Tuple, Tuple>> sourceCall) throws IOException {
        throw new TapException("SolrScheme can only be used as a sink, not a source");
    }

    @Override
    public void sink(FlowProcess<JobConf> flowProcess, SinkCall<Void, OutputCollector<Tuple, Tuple>> sinkCall) throws IOException {
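        // The key carries no data (Tuple.NULL); the outgoing tuple is emitted as the value.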
        sinkCall.getOutput().collect(Tuple.NULL, sinkCall.getOutgoingEntry().getTuple());
    }
}
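
For context, here is a minimal sketch of wiring this scheme into a Cascading flow as a sink. The input path, Solr core directory, field names, and output path below are illustrative assumptions, not taken from the original source.

import cascading.flow.Flow;
import cascading.flow.hadoop.HadoopFlowConnector;
import cascading.pipe.Pipe;
import cascading.scheme.hadoop.TextDelimited;
import cascading.tap.SinkMode;
import cascading.tap.Tap;
import cascading.tap.hadoop.Hfs;
import cascading.tuple.Fields;

public class SolrSinkExample {
    public static void main(String[] args) throws Exception {
        // Field names must match the Solr core's schema.xml (assumed here).
        Fields solrFields = new Fields("id", "title", "content");

        // Source tap reading tab-separated records; path is illustrative.
        Tap source = new Hfs(new TextDelimited(solrFields, "\t"), "input/docs.tsv");

        // Sink tap that builds the Solr index; core directory and output path are illustrative.
        Tap sink = new Hfs(new SolrScheme(solrFields, "src/test/resources/solr/core1"), "output/solr-index", SinkMode.REPLACE);

        Pipe pipe = new Pipe("index");
        Flow flow = new HadoopFlowConnector().connect(source, sink, pipe);
        flow.complete();
    }
}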