package cc.mrlda;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import com.google.common.base.Preconditions;
import edu.umd.cloud9.io.map.HMapIDW;
import edu.umd.cloud9.io.pair.PairOfIntFloat;
public class DisplayTopic extends Configured implements Tool {
public static final String TOP_DISPLAY_OPTION = "topdisplay";
public static final int TOP_DISPLAY = 10;
@SuppressWarnings("unchecked")
public int run(String[] args) throws Exception {
Options options = new Options();
options.addOption(Settings.HELP_OPTION, false, "print the help message");
options.addOption(OptionBuilder.withArgName(Settings.PATH_INDICATOR).hasArg()
.withDescription("input beta file").create(Settings.INPUT_OPTION));
options.addOption(OptionBuilder.withArgName(Settings.PATH_INDICATOR).hasArg()
.withDescription("term index file").create(ParseCorpusOptions.INDEX));
options.addOption(OptionBuilder.withArgName(Settings.INTEGER_INDICATOR).hasArg()
.withDescription("display top terms only (default - 10)").create(TOP_DISPLAY_OPTION));
String betaString = null;
String indexString = null;
int topDisplay = TOP_DISPLAY;
CommandLineParser parser = new GnuParser();
HelpFormatter formatter = new HelpFormatter();
try {
CommandLine line = parser.parse(options, args);
if (line.hasOption(Settings.HELP_OPTION)) {
formatter.printHelp(ParseCorpus.class.getName(), options);
System.exit(0);
}
if (line.hasOption(Settings.INPUT_OPTION)) {
betaString = line.getOptionValue(Settings.INPUT_OPTION);
} else {
throw new ParseException("Parsing failed due to " + Settings.INPUT_OPTION
+ " not initialized...");
}
if (line.hasOption(ParseCorpusOptions.INDEX)) {
indexString = line.getOptionValue(ParseCorpusOptions.INDEX);
} else {
throw new ParseException("Parsing failed due to " + ParseCorpusOptions.INDEX
+ " not initialized...");
}
if (line.hasOption(TOP_DISPLAY_OPTION)) {
topDisplay = Integer.parseInt(line.getOptionValue(TOP_DISPLAY_OPTION));
}
} catch (ParseException pe) {
System.err.println(pe.getMessage());
formatter.printHelp(ParseCorpus.class.getName(), options);
System.exit(0);
} catch (NumberFormatException nfe) {
System.err.println(nfe.getMessage());
System.exit(0);
}
JobConf conf = new JobConf(DisplayTopic.class);
FileSystem fs = FileSystem.get(conf);
Path indexPath = null;
indexPath = new Path(indexString);
Preconditions.checkArgument(fs.exists(indexPath) && fs.isFile(indexPath),
"Invalid index path...");
Path betaPath = new Path(betaString);
Preconditions.checkArgument(fs.exists(betaPath) && fs.isFile(betaPath), "Invalid beta path...");
SequenceFile.Reader sequenceFileReader = null;
try {
IntWritable intWritable = new IntWritable();
Text text = new Text();
Map<Integer, String> termIndex = new HashMap<Integer, String>();
sequenceFileReader = new SequenceFile.Reader(fs, indexPath, conf);
while (sequenceFileReader.next(intWritable, text)) {
termIndex.put(intWritable.get(), text.toString());
}
PairOfIntFloat pairOfIntFloat = new PairOfIntFloat();
// HMapIFW hmap = new HMapIFW();
HMapIDW hmap = new HMapIDW();
TreeMap<Double, Integer> treeMap = new TreeMap<Double, Integer>();
sequenceFileReader = new SequenceFile.Reader(fs, betaPath, conf);
while (sequenceFileReader.next(pairOfIntFloat, hmap)) {
treeMap.clear();
System.out.println("==============================");
System.out.println("Top ranked " + topDisplay + " terms for Topic "
+ pairOfIntFloat.getLeftElement());
System.out.println("==============================");
Iterator<Integer> itr1 = hmap.keySet().iterator();
int temp1 = 0;
while (itr1.hasNext()) {
temp1 = itr1.next();
treeMap.put(-hmap.get(temp1), temp1);
if (treeMap.size() > topDisplay) {
treeMap.remove(treeMap.lastKey());
}
}
Iterator<Double> itr2 = treeMap.keySet().iterator();
double temp2 = 0;
while (itr2.hasNext()) {
temp2 = itr2.next();
if (termIndex.containsKey(treeMap.get(temp2))) {
System.out.println(termIndex.get(treeMap.get(temp2)) + "\t\t" + -temp2);
} else {
System.out.println("How embarrassing! Term index not found...");
}
}
}
} finally {
IOUtils.closeStream(sequenceFileReader);
}
return 0;
}
public static void main(String[] args) throws Exception {
int res = ToolRunner.run(new Configuration(), new DisplayTopic(), args);
System.exit(res);
}
}