// Schema of the raw input files. Each record carries four fields:
// projectcode, pagename, pageviews and bytes.
String pageCountsFields = "projectcode:string, pagename:string, pageviews:int, bytes:long";
Schema fileSchema = new Schema("pagecountsfile", Fields.parse(pageCountsFields));
// Instantiate a TableBuilder for the target table (tableSchema is defined earlier in this method).
TableBuilder tableBuilder = new TableBuilder(tableSchema);
// For every input file: derive its date/hour from the file name and register it as a CSV input.
for(FileStatus fileStatus : fileStatuses) {
  // Path.getName() already returns a String — no toString() needed.
  String fileName = fileStatus.getPath().getName();
  // File names are expected to look like "pagecounts-YYYYMMDD-HHMMSS" — TODO confirm with the
  // upstream naming convention. Split once and reuse the parts instead of splitting twice.
  String[] nameParts = fileName.split("-");
  // Strip the date and the hour from the file name.
  String fileDate = nameParts[1];
  String fileHour = nameParts[2].substring(0, 2);
  // Instantiate a custom RecordProcessor that stamps each record of this file with its date/hour.
  PageCountsRecordProcessor recordProcessor = new PageCountsRecordProcessor(tableSchema, fileDate,
      fileHour);
  // Register this file with the TableBuilder: space-separated CSV, no quoting, no escaping,
  // no header skipping, non-strict parsing, no null-string token.
  tableBuilder.addCSVTextFile(fileStatus.getPath(), ' ', TupleTextInputFormat.NO_QUOTE_CHARACTER,
      TupleTextInputFormat.NO_ESCAPE_CHARACTER, false, false, TupleTextInputFormat.NO_NULL_STRING,
      fileSchema, recordProcessor);
}
// Partition the dataset by pagename - which should give a fairly even distribution.
tableBuilder.partitionBy("pagename");
// Create a compound index on (pagename, date) so that typical queries for the dataset will be fast.
tableBuilder.createIndex("pagename", "date");
// Derive a SQLite page size from the memory available for indexing, assuming ~32000 pages.
long nonExactPageSize = memoryForIndexing / 32000; // number of pages
// Round to the nearest power of two — SQLite requires page_size to be a power of two.
int pageSize = (int) Math.pow(2, (int) Math.round(Math.log(nonExactPageSize) / Math.log(2)));
// Clamp to SQLite's valid page_size range [512, 65536]. Without this, a small memoryForIndexing
// (< 32000) yields nonExactPageSize == 0, log(0) == -Infinity and pageSize == 0, producing an
// invalid "pragma page_size=0"; a very large value could exceed SQLite's maximum.
pageSize = Math.max(512, Math.min(65536, pageSize));
Log.info("Pagesize = " + pageSize + " as memory for indexing was [" + memoryForIndexing
    + "] and there are 32000 pages.");
tableBuilder.initialSQL("pragma page_size=" + pageSize);
// Insertion order is very important for optimizing query speed because it makes data be co-located
// on disk in the same order it is typically queried.
tableBuilder.insertionSortOrder(OrderBy.parse("pagename:asc, date:asc"));
// instantiate a TablespaceBuilder that will group the table(s) into a deployable tablespace
TablespaceBuilder tablespaceBuilder = new TablespaceBuilder();
// we will partition this dataset in as many partitions as:
tablespaceBuilder.setNPartitions(nPartitions);
// register the fully-configured table with the tablespace
tablespaceBuilder.add(tableBuilder.build());
// we turn a specific SQLite pragma on for making autocomplete queries fast
// (case_sensitive_like lets LIKE 'prefix%' queries use the index — NOTE(review): confirm this is
// the intended effect here)
tablespaceBuilder.initStatements("pragma case_sensitive_like=true;");
// remove any previous output so the generation job starts from a clean path
HadoopUtils.deleteIfExists(outFs, outPath);