Source Code of com.datasalt.pangool.solr.TupleSolrOutputFormatExample

/**
 * Copyright [2012] [Datasalt Systems S.L.]
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.datasalt.pangool.solr;


import java.io.File;
import java.io.IOException;
import java.io.Serializable;


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;


import com.datasalt.pangool.PangoolRuntimeException;
import com.datasalt.pangool.io.Fields;
import com.datasalt.pangool.io.ITuple;
import com.datasalt.pangool.io.Schema;
import com.datasalt.pangool.io.Tuple;
import com.datasalt.pangool.tuplemr.IdentityTupleReducer;
import com.datasalt.pangool.tuplemr.TupleMRBuilder;
import com.datasalt.pangool.tuplemr.TupleMapper;
import com.datasalt.pangool.tuplemr.mapred.lib.input.HadoopInputFormat;


/**
 * Example usage of {@TupleSolrOutputFormat} that is also used for unit testing.
 * <p>
 * This example creates three index: one in the main output which contains (user_id, message) pairs of english messages
 * and two other auxiliar indexes with french & spanish messages.
 */
@SuppressWarnings("serial")
public class TupleSolrOutputFormatExample implements Serializable {


  public int run(String input, String output, Configuration conf) throws Exception {
    // Define the intermediate schema: It must match SOLR's schema.xml!
    final Schema schema = new Schema("iSchema", Fields.parse("user_id:string, message:string"));


    TupleMRBuilder job = new TupleMRBuilder(conf);
    job.addIntermediateSchema(schema);
    job.setGroupByFields("user_id");
    // Define the input and its associated mapper.
    // We'll just have a Mapper, reducer will be Identity
    job.addInput(new Path(input), new HadoopInputFormat(TextInputFormat.class),
        new TupleMapper<LongWritable, Text>() {


          Tuple tuple = new Tuple(schema);


          @Override
          public void map(LongWritable key, Text value, TupleMRContext context, Collector collector)
              throws IOException, InterruptedException {
            String[] fields = value.toString().split("\t");
            String language = fields[1];
            tuple.set("user_id", fields[0]);
            tuple.set("message", fields[2]);
            if(language.equals("en")) {
              // English -> write to main output
              collector.write(tuple);
            } else if(language.equals("fr")) {
              // French -> write to french index
              collector.getNamedOutput("FR").write(tuple, NullWritable.get());
            } else if(language.equals("es")) {
              // Spanish -> write to spanish index
              collector.getNamedOutput("ES").write(tuple, NullWritable.get());
            }
          }
        });
    // Add multi-output: French index
    job.addNamedOutput("FR", new TupleSolrOutputFormat(new File("src/test/resources/solr-fr"), conf),
        ITuple.class, NullWritable.class);
    // Add multi-output: Spanish index
    job.addNamedOutput("ES", new TupleSolrOutputFormat(new File("src/test/resources/solr-es"), conf),
        ITuple.class, NullWritable.class);
    job.setTupleReducer(new IdentityTupleReducer());
    // Add multi-output: English index
    job.setOutput(new Path(output), new TupleSolrOutputFormat(new File("src/test/resources/solr-en"),
        conf), ITuple.class, NullWritable.class);
    Job hadoopJob = job.createJob();
    try {
      hadoopJob.waitForCompletion(true);
      if(!hadoopJob.isSuccessful()) {
        throw new PangoolRuntimeException("Job was not sucessfull");
      }
    } finally {
      job.cleanUpInstanceFiles();
    }
    return 0;
  }
}
Source Code of com.datasalt.pangool.solr.TupleSolrOutputFormatExample

Related Classes of com.datasalt.pangool.solr.TupleSolrOutputFormatExample