/**
* Copyright [2012] [Datasalt Systems S.L.]
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.datasalt.pangool.solr;
import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import com.datasalt.pangool.PangoolRuntimeException;
import com.datasalt.pangool.io.Fields;
import com.datasalt.pangool.io.ITuple;
import com.datasalt.pangool.io.Schema;
import com.datasalt.pangool.io.Tuple;
import com.datasalt.pangool.tuplemr.IdentityTupleReducer;
import com.datasalt.pangool.tuplemr.TupleMRBuilder;
import com.datasalt.pangool.tuplemr.TupleMapper;
import com.datasalt.pangool.tuplemr.mapred.lib.input.HadoopInputFormat;
/**
* Example usage of {@link TupleSolrOutputFormat} that is also used for unit testing.
* <p>
* This example creates three indexes: one in the main output, which contains (user_id, message) pairs of English
* messages, and two auxiliary indexes with French and Spanish messages.
*/
@SuppressWarnings("serial")
public class TupleSolrOutputFormatExample implements Serializable {

  /** Minimum number of tab-separated columns the mapper reads from a line: user id, language, message. */
  private static final int EXPECTED_FIELDS = 3;

  /**
   * Builds and runs a Pangool job that reads tab-separated text lines of the form
   * {@code user_id <TAB> language <TAB> message} and routes each (user_id, message) tuple by language:
   * "en" goes to the main output, "fr" to the named output "FR" and "es" to the named output "ES". Lines with any
   * other language code are ignored.
   *
   * @param input path of the text input to read
   * @param output path of the main (English) output
   * @param conf Hadoop configuration used to build the job and the SOLR output formats
   * @return 0 when the job finishes successfully
   * @throws PangoolRuntimeException if the job finishes unsuccessfully
   * @throws Exception if building or running the job fails for any other reason
   */
  public int run(String input, String output, Configuration conf) throws Exception {
    // Define the intermediate schema: It must match SOLR's schema.xml!
    final Schema schema = new Schema("iSchema", Fields.parse("user_id:string, message:string"));

    TupleMRBuilder job = new TupleMRBuilder(conf);
    job.addIntermediateSchema(schema);
    job.setGroupByFields("user_id");

    // Define the input and its associated mapper.
    // We'll just have a Mapper, reducer will be Identity
    job.addInput(new Path(input), new HadoopInputFormat(TextInputFormat.class),
        new TupleMapper<LongWritable, Text>() {

          // Single tuple instance reused for every input record.
          Tuple tuple = new Tuple(schema);

          @Override
          public void map(LongWritable key, Text value, TupleMRContext context, Collector collector)
              throws IOException, InterruptedException {
            String line = value.toString();
            String[] fields = line.split("\t");
            // Fail with a message that identifies the bad record instead of a bare
            // ArrayIndexOutOfBoundsException. String.split drops trailing empty strings, so a line whose
            // message column is empty is also reported here.
            if(fields.length < EXPECTED_FIELDS) {
              throw new IOException("Malformed input line: expected at least " + EXPECTED_FIELDS
                  + " tab-separated fields (user_id, language, message) but found " + fields.length
                  + ": '" + line + "'");
            }

            String language = fields[1];
            tuple.set("user_id", fields[0]);
            tuple.set("message", fields[2]);

            if(language.equals("en")) {
              // English -> write to main output
              collector.write(tuple);
            } else if(language.equals("fr")) {
              // French -> write to french index
              collector.getNamedOutput("FR").write(tuple, NullWritable.get());
            } else if(language.equals("es")) {
              // Spanish -> write to spanish index
              collector.getNamedOutput("ES").write(tuple, NullWritable.get());
            }
            // Any other language code: the record is not written anywhere.
          }
        });

    // Add multi-output: French index
    job.addNamedOutput("FR", new TupleSolrOutputFormat(new File("src/test/resources/solr-fr"), conf),
        ITuple.class, NullWritable.class);
    // Add multi-output: Spanish index
    job.addNamedOutput("ES", new TupleSolrOutputFormat(new File("src/test/resources/solr-es"), conf),
        ITuple.class, NullWritable.class);

    job.setTupleReducer(new IdentityTupleReducer());

    // Main output: English index
    job.setOutput(new Path(output), new TupleSolrOutputFormat(new File("src/test/resources/solr-en"),
        conf), ITuple.class, NullWritable.class);

    Job hadoopJob = job.createJob();
    try {
      hadoopJob.waitForCompletion(true);
      if(!hadoopJob.isSuccessful()) {
        throw new PangoolRuntimeException("Job was not sucessfull");
      }
    } finally {
      // Always clean up the builder's instance files, whether the job succeeded or not.
      job.cleanUpInstanceFiles();
    }
    return 0;
  }
}