/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.mahout.fpm.pfpgrowth;
import org.apache.commons.cli2.CommandLine;
import org.apache.commons.cli2.Group;
import org.apache.commons.cli2.Option;
import org.apache.commons.cli2.OptionException;
import org.apache.commons.cli2.builder.ArgumentBuilder;
import org.apache.commons.cli2.builder.DefaultOptionBuilder;
import org.apache.commons.cli2.builder.GroupBuilder;
import org.apache.commons.cli2.commandline.Parser;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.mahout.common.CommandLineUtil;
import org.apache.mahout.common.FileLineIterable;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.Parameters;
import org.apache.mahout.common.StringRecordIterator;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.apache.mahout.fpm.pfpgrowth.convertors.SequenceFileOutputCollector;
import org.apache.mahout.fpm.pfpgrowth.convertors.string.StringOutputConvertor;
import org.apache.mahout.fpm.pfpgrowth.convertors.string.TopKStringPatterns;
import org.apache.mahout.fpm.pfpgrowth.fpgrowth.FPGrowth;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
/**
 * Command-line driver for Top-K frequent pattern mining with FPGrowth.
 *
 * <p>Parses the command-line options into a {@link Parameters} object and then
 * dispatches either to the in-process sequential implementation
 * ({@link #runFPGrowth(Parameters)}) or to {@link PFPGrowth#runPFPGrowth(Parameters)},
 * depending on the value of the required {@code --method} option.
 */
public final class FPGrowthJob {

  private static final Logger log = LoggerFactory.getLogger(FPGrowthJob.class);

  /** Not instantiable: this class only exposes the static {@link #main(String[])} entry point. */
  private FPGrowthJob() {
  }

  /**
   * Runs Top-K FPGrowth on the given input using the method selected on the
   * command line.
   *
   * <p>If {@code --help} is present the usage text is printed and nothing is
   * run. If the arguments cannot be parsed, the error is logged, the usage
   * text is printed, and the parser's exception is rethrown.
   *
   * @param args command-line arguments; {@code --input} and {@code --method}
   *        are declared as required, the remaining options are optional
   * @throws IOException if reading the input or writing the output fails
   * @throws OptionException if the command line cannot be parsed
   * @throws NumberFormatException if {@code minSupport} or {@code maxHeapSize}
   *         is not a valid integer (sequential method)
   * @throws IllegalArgumentException if {@code --method} is neither
   *         {@code sequential} nor {@code mapreduce}
   * @throws IllegalStateException declared for the delegated job runners
   * @throws InterruptedException declared for the delegated job runners
   *         (presumably the map-reduce path; confirm against PFPGrowth)
   * @throws ClassNotFoundException declared for the delegated job runners
   *         (presumably the map-reduce path; confirm against PFPGrowth)
   */
  public static void main(String[] args) throws IOException, OptionException,
      NumberFormatException, IllegalStateException, InterruptedException,
      ClassNotFoundException {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option inputDirOpt = obuilder.withLongName("input").withRequired(true)
        .withArgument(
            abuilder.withName("input").withMinimum(1).withMaximum(1).create())
        .withDescription(
            "The Directory on HDFS containing the transaction files")
        .withShortName("i").create();

    Option outputOpt = DefaultOptionCreator.outputOption(obuilder, abuilder).create();

    Option helpOpt = DefaultOptionCreator.helpOption(obuilder);

    // Defaults applied downstream: minSupport(3), maxHeapSize(50), numGroups(1000)
    Option minSupportOpt = obuilder.withLongName("minSupport")
        .withArgument(
            abuilder.withName("minSupport").withMinimum(1).withMaximum(1).create())
        .withDescription("(Optional) Minimum Support. Default Value: 3")
        .withShortName("s").create();

    Option maxHeapSizeOpt = obuilder.withLongName("maxHeapSize")
        .withArgument(
            abuilder.withName("maxHeapSize").withMinimum(1).withMaximum(1).create())
        .withDescription(
            "(Optional) Maximum Heap Size k, to denote the requirement to mine top K items. Default value: 50")
        .withShortName("k").create();

    Option numGroupsOpt = obuilder.withLongName("numGroups")
        .withArgument(
            abuilder.withName("numGroups").withMinimum(1).withMaximum(1).create())
        .withDescription(
            "(Optional) Number of groups the features should be divided in the map-reduce version. Doesn't work in sequential version Default Value:1000")
        .withShortName("g").create();

    Option recordSplitterOpt = obuilder.withLongName("splitterPattern")
        .withArgument(
            abuilder.withName("splitterPattern").withMinimum(1).withMaximum(1).create())
        .withDescription(
            "Regular Expression pattern used to split given string transaction into itemsets. Default value splits comma separated itemsets. Default Value: \"[ ,\\t]*[,|\\t][ ,\\t]*\" ")
        .withShortName("regex").create();

    Option treeCacheOpt = obuilder.withLongName("numTreeCacheEntries")
        .withArgument(
            abuilder.withName("numTreeCacheEntries").withMinimum(1).withMaximum(1).create())
        .withDescription(
            "(Optional) Number of entries in the tree cache to prevent duplicate tree building. "
                + "(Warning) a first level conditional FP-Tree might consume a lot of memory, "
                + "so keep this value small, but big enough to prevent duplicate tree building. Default Value:5 Recommended Values: [5-10]")
        .withShortName("tc").create();

    Option methodOpt = obuilder.withLongName("method").withRequired(true)
        .withArgument(
            abuilder.withName("method").withMinimum(1).withMaximum(1).create())
        .withDescription("Method of processing: sequential|mapreduce")
        .withShortName("method").create();

    Option encodingOpt = obuilder.withLongName("encoding")
        .withArgument(
            abuilder.withName("encoding").withMinimum(1).withMaximum(1).create())
        .withDescription("(Optional) The file encoding. Default value: UTF-8")
        .withShortName("e").create();

    Group group = gbuilder.withName("Options")
        .withOption(minSupportOpt)
        .withOption(inputDirOpt)
        .withOption(outputOpt)
        .withOption(maxHeapSizeOpt)
        .withOption(numGroupsOpt)
        .withOption(methodOpt)
        .withOption(encodingOpt)
        .withOption(helpOpt)
        .withOption(treeCacheOpt)
        .withOption(recordSplitterOpt)
        .create();

    Parser parser = new Parser();
    parser.setGroup(group);

    CommandLine cmdLine;
    try {
      cmdLine = parser.parse(args);
    } catch (OptionException e) {
      // Show the usage text so the user can see what was wrong, then let the
      // original exception propagate exactly as it did before.
      log.error("Unable to parse command line", e);
      CommandLineUtil.printHelp(group);
      throw e;
    }

    if (cmdLine.hasOption(helpOpt)) {
      CommandLineUtil.printHelp(group);
      return;
    }

    Parameters params = new Parameters();
    // Optional settings are only forwarded when given, so that the consumers
    // of params can apply their own defaults.
    copyOptionIfPresent(cmdLine, minSupportOpt, params, "minSupport");
    copyOptionIfPresent(cmdLine, maxHeapSizeOpt, params, "maxHeapSize");
    copyOptionIfPresent(cmdLine, numGroupsOpt, params, "numGroups");
    copyOptionIfPresent(cmdLine, treeCacheOpt, params, "treeCacheSize");
    copyOptionIfPresent(cmdLine, recordSplitterOpt, params, "splitPattern");

    // The encoding is always set, falling back to UTF-8.
    String encoding = "UTF-8";
    if (cmdLine.hasOption(encodingOpt)) {
      encoding = (String) cmdLine.getValue(encodingOpt);
    }
    params.set("encoding", encoding);

    String inputDir = (String) cmdLine.getValue(inputDirOpt);
    String outputDir = (String) cmdLine.getValue(outputOpt);
    params.set("input", inputDir);
    params.set("output", outputDir);

    String classificationMethod = (String) cmdLine.getValue(methodOpt);
    if ("sequential".equalsIgnoreCase(classificationMethod)) {
      runFPGrowth(params);
    } else if ("mapreduce".equalsIgnoreCase(classificationMethod)) {
      PFPGrowth.runPFPGrowth(params);
    } else {
      // Previously an unrecognised method fell through and the job exited
      // without doing any work or reporting a problem.
      throw new IllegalArgumentException("Unknown method '" + classificationMethod
          + "'; expected sequential or mapreduce");
    }
  }

  /**
   * Copies the value of {@code option} into {@code params} under {@code key},
   * but only if the option was supplied on the command line.
   *
   * @param cmdLine the parsed command line
   * @param option the option to look up
   * @param params the parameter set to update
   * @param key the parameter name under which the value is stored
   */
  private static void copyOptionIfPresent(CommandLine cmdLine, Option option,
      Parameters params, String key) {
    if (cmdLine.hasOption(option)) {
      params.set(key, (String) cmdLine.getValue(option));
    }
  }

  /**
   * Runs FPGrowth in-process: reads transactions from the local file named by
   * the {@code input} parameter, writes the Top-K patterns to a sequence file
   * at the {@code output} parameter, then reads that file back and logs every
   * feature together with its patterns.
   *
   * <p>Parameters read: {@code maxHeapSize} (default 50), {@code minSupport}
   * (default 3), {@code output} (default {@code output.txt}), {@code encoding},
   * {@code input} and {@code splitPattern} (default {@code PFPGrowth.SPLITTER}).
   *
   * @param params the job parameters
   * @throws IOException if the input cannot be read or the output cannot be
   *         written or read back
   */
  private static void runFPGrowth(Parameters params) throws IOException {
    log.info("Starting Sequential FPGrowth");
    int maxHeapSize = Integer.parseInt(params.get("maxHeapSize", "50"));
    int minSupport = Integer.parseInt(params.get("minSupport", "3"));

    String output = params.get("output", "output.txt");
    Path path = new Path(output);
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    Charset encoding = Charset.forName(params.get("encoding"));
    String input = params.get("input");
    String pattern = params.get("splitPattern", PFPGrowth.SPLITTER.toString());

    SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, path,
        Text.class, TopKStringPatterns.class);
    try {
      FPGrowth<String> fp = new FPGrowth<String>();
      Set<String> features = new HashSet<String>();

      // The input is iterated twice: once (inner call) to build the frequency
      // list and once to mine the patterns, hence two independent iterators.
      fp.generateTopKFrequentPatterns(
          new StringRecordIterator(
              new FileLineIterable(new File(input), encoding, false), pattern),
          fp.generateFList(
              new StringRecordIterator(
                  new FileLineIterable(new File(input), encoding, false), pattern),
              minSupport),
          minSupport,
          maxHeapSize,
          features,
          new StringOutputConvertor(
              new SequenceFileOutputCollector<Text, TopKStringPatterns>(writer)));
    } finally {
      // Always release the writer, even when mining fails part-way through.
      writer.close();
    }

    // The writer is closed at this point, so the file can be read back.
    List<Pair<String, TopKStringPatterns>> frequentPatterns =
        FPGrowth.readFrequentPattern(fs, conf, path);
    for (Pair<String, TopKStringPatterns> entry : frequentPatterns) {
      log.info("Dumping Patterns for Feature: {} \n{}", entry.getFirst(), entry.getSecond());
    }
  }
}