Package com.datasalt.pangool.solr

Source Code of com.datasalt.pangool.solr.TupleSolrOutputFormatExample

/**
* Copyright [2012] [Datasalt Systems S.L.]
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*   http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.datasalt.pangool.solr;

import java.io.File;
import java.io.IOException;
import java.io.Serializable;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

import com.datasalt.pangool.PangoolRuntimeException;
import com.datasalt.pangool.io.Fields;
import com.datasalt.pangool.io.ITuple;
import com.datasalt.pangool.io.Schema;
import com.datasalt.pangool.io.Tuple;
import com.datasalt.pangool.tuplemr.IdentityTupleReducer;
import com.datasalt.pangool.tuplemr.TupleMRBuilder;
import com.datasalt.pangool.tuplemr.TupleMapper;
import com.datasalt.pangool.tuplemr.mapred.lib.input.HadoopInputFormat;

/**
* Example usage of {@TupleSolrOutputFormat} that is also used for unit testing.
* <p>
* This example creates three index: one in the main output which contains (user_id, message) pairs of english messages
* and two other auxiliar indexes with french & spanish messages.
*/
@SuppressWarnings("serial")
public class TupleSolrOutputFormatExample implements Serializable {

  public int run(String input, String output, Configuration conf) throws Exception {
    // Define the intermediate schema: It must match SOLR's schema.xml!
    final Schema schema = new Schema("iSchema", Fields.parse("user_id:string, message:string"));

    TupleMRBuilder job = new TupleMRBuilder(conf);
    job.addIntermediateSchema(schema);
    job.setGroupByFields("user_id");
    // Define the input and its associated mapper.
    // We'll just have a Mapper, reducer will be Identity
    job.addInput(new Path(input), new HadoopInputFormat(TextInputFormat.class),
        new TupleMapper<LongWritable, Text>() {

          Tuple tuple = new Tuple(schema);

          @Override
          public void map(LongWritable key, Text value, TupleMRContext context, Collector collector)
              throws IOException, InterruptedException {
            String[] fields = value.toString().split("\t");
            String language = fields[1];
            tuple.set("user_id", fields[0]);
            tuple.set("message", fields[2]);
            if(language.equals("en")) {
              // English -> write to main output
              collector.write(tuple);
            } else if(language.equals("fr")) {
              // French -> write to french index
              collector.getNamedOutput("FR").write(tuple, NullWritable.get());
            } else if(language.equals("es")) {
              // Spanish -> write to spanish index
              collector.getNamedOutput("ES").write(tuple, NullWritable.get());
            }
          }
        });
    // Add multi-output: French index
    job.addNamedOutput("FR", new TupleSolrOutputFormat(new File("src/test/resources/solr-fr"), conf),
        ITuple.class, NullWritable.class);
    // Add multi-output: Spanish index
    job.addNamedOutput("ES", new TupleSolrOutputFormat(new File("src/test/resources/solr-es"), conf),
        ITuple.class, NullWritable.class);
    job.setTupleReducer(new IdentityTupleReducer());
    // Add multi-output: English index
    job.setOutput(new Path(output), new TupleSolrOutputFormat(new File("src/test/resources/solr-en"),
        conf), ITuple.class, NullWritable.class);
    Job hadoopJob = job.createJob();
    try {
      hadoopJob.waitForCompletion(true);
      if(!hadoopJob.isSuccessful()) {
        throw new PangoolRuntimeException("Job was not sucessfull");
      }
    } finally {
      job.cleanUpInstanceFiles();
    }
    return 0;
  }
}
TOP

Related Classes of com.datasalt.pangool.solr.TupleSolrOutputFormatExample

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.