/*******************************************************************************
* Copyright 2012, The Infinit.e Open Source Project.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License, version 3,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
******************************************************************************/
package com.ikanow.infinit.e.processing.custom.utils;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashSet;
import java.util.Iterator;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Writable;
import org.apache.mahout.clustering.Cluster;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.iterator.sequencefile.PathFilters;
import org.apache.mahout.common.iterator.sequencefile.PathType;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
import org.apache.mahout.math.NamedVector;
import org.apache.mahout.math.Vector;
import org.bson.types.ObjectId;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
import com.ikanow.infinit.e.data_model.store.MongoDbUtil;
import com.ikanow.infinit.e.data_model.store.custom.mapreduce.CustomMapReduceJobPojo;
import com.mongodb.BasicDBList;
import com.mongodb.BasicDBObject;
import com.mongodb.hadoop.io.BSONWritable;
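/**
 * Utility methods for handling the HDFS output of custom map/reduce jobs: reading text
 * and sequence file results back as BSON, and resolving job output paths and Hadoop
 * configuration.
 * <p>
 * Typical usage (a sketch only, assuming a populated CustomMapReduceJobPojo "cmr"):
 * <pre>
 * BasicDBList results = HadoopUtils.getBsonFromSequenceFile(cmr, 100, null);
 * HadoopUtils.deleteHadoopDir(cmr); // clean up the job's output directory
 * </pre>
 */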
public class HadoopUtils {
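	/** Deletes the job's completed output directory in HDFS (recursively), if it exists. */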
	public static void deleteHadoopDir(CustomMapReduceJobPojo cmr) throws SAXException, IOException, ParserConfigurationException {
		PropertiesManager props = new PropertiesManager();
		Configuration conf = getConfiguration(props);
		Path pathDir = HadoopUtils.getPathForJob(cmr, conf, false);
		FileSystem fs = FileSystem.get(conf);
		if (fs.exists(pathDir)) {
			fs.delete(pathDir, true);
		}
	}
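	/**
	 * Reads the job's text output ("part-*" files, one "key<TAB>value" line per record)
	 * into a BasicDBList, stopping after nLimit records if nLimit > 0.
	 * (The "fields" parameter is not currently applied to text output.)
	 */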
	public static BasicDBList getBsonFromTextFiles(CustomMapReduceJobPojo cmr, int nLimit, String fields) throws IOException, SAXException, ParserConfigurationException {
		BasicDBList dbl = new BasicDBList();
		PropertiesManager props = new PropertiesManager();
		Configuration conf = getConfiguration(props);
		Path pathDir = HadoopUtils.getPathForJob(cmr, conf, false);
		FileSystem fs = FileSystem.get(conf);
		FileStatus[] files = fs.globStatus(new Path(pathDir.toString() + "/part-*"));
		if (null == files) { // (no output exists for this job)
			return dbl;
		}
		int nRecords = 0;
		for (FileStatus file: files) {
			if (file.getLen() > 0) {
				FSDataInputStream in = fs.open(file.getPath());
				BufferedReader bin = new BufferedReader(new InputStreamReader(in));
				try {
					for (;;) {
						String s = bin.readLine();
						if (null == s) break;
						// Lines with no tab separator are treated as value-only records
						String[] keyValue = s.split("\t", 2);
						BasicDBObject dbo = new BasicDBObject();
						if (keyValue.length > 1) {
							dbo.put("key", keyValue[0]);
							dbo.put("value", keyValue[1]);
						}
						else {
							dbo.put("value", keyValue[0]);
						}
						dbl.add(dbo);
						nRecords++;
						if ((nLimit > 0) && (nRecords >= nLimit)) {
							return dbl;
						}
					}
				}
				finally {
					bin.close(); // (also closes the underlying input stream)
				}
			}
		}
		return dbl;
	}//TESTED
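	/**
	 * Reads the job's sequence file output into a BasicDBList of {key, value} objects,
	 * converting the common Hadoop writable types plus the Mahout vector/cluster types.
	 * Stops after nLimit records if nLimit > 0; "fields" optionally lists fields to
	 * remove from each record (see below).
	 */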
	public static BasicDBList getBsonFromSequenceFile(CustomMapReduceJobPojo cmr, int nLimit, String fields) throws SAXException, IOException, ParserConfigurationException {
		BasicDBList dbl = new BasicDBList();
		PropertiesManager props = new PropertiesManager();
		Configuration conf = getConfiguration(props);
		Path pathDir = HadoopUtils.getPathForJob(cmr, conf, false);
		@SuppressWarnings({ "unchecked", "rawtypes" })
		SequenceFileDirIterable<? extends Writable, ? extends Writable> seqFileDir =
			new SequenceFileDirIterable(pathDir, PathType.LIST, PathFilters.logsCRCFilter(), conf);
		// Very basic field handling: "fields" is a comma-separated list of fields to remove,
		// either top level ("fieldName") or one level of nesting ("value.nestedField");
		// any ":" suffix on a field specification is ignored
		HashSet<String> fieldLookup = null;
		if (null != fields) {
			fieldLookup = new HashSet<String>();
			String[] fieldArray = fields.split(",");
			for (String field: fieldArray) {
				String[] fieldDecomp = field.split(":");
				fieldLookup.add(fieldDecomp[0]);
			}
		}//TOTEST
		int nRecords = 0;
		for (Pair<? extends Writable, ? extends Writable> record: seqFileDir) {
			BasicDBObject element = new BasicDBObject();
			// KEY: the supported writable types are converted to strings or BSON
			Writable key = record.getFirst();
			if (key instanceof org.apache.hadoop.io.Text) {
				org.apache.hadoop.io.Text writable = (org.apache.hadoop.io.Text)key;
				element.put("key", writable.toString());
			}
			else if (key instanceof org.apache.hadoop.io.DoubleWritable) {
				org.apache.hadoop.io.DoubleWritable writable = (org.apache.hadoop.io.DoubleWritable)key;
				element.put("key", Double.toString(writable.get()));
			}
			else if (key instanceof org.apache.hadoop.io.IntWritable) {
				org.apache.hadoop.io.IntWritable writable = (org.apache.hadoop.io.IntWritable)key;
				element.put("key", Integer.toString(writable.get()));
			}
			else if (key instanceof org.apache.hadoop.io.LongWritable) {
				org.apache.hadoop.io.LongWritable writable = (org.apache.hadoop.io.LongWritable)key;
				element.put("key", Long.toString(writable.get()));
			}
			else if (key instanceof BSONWritable) {
				element.put("key", MongoDbUtil.convert((BSONWritable)key));
			}
			// VALUE: as above, plus the Mahout vector/cluster types
			Writable value = record.getSecond();
			if (value instanceof org.apache.hadoop.io.Text) {
				org.apache.hadoop.io.Text writable = (org.apache.hadoop.io.Text)value;
				element.put("value", writable.toString());
			}
			else if (value instanceof org.apache.hadoop.io.DoubleWritable) {
				org.apache.hadoop.io.DoubleWritable writable = (org.apache.hadoop.io.DoubleWritable)value;
				element.put("value", Double.toString(writable.get()));
			}
			else if (value instanceof org.apache.hadoop.io.IntWritable) {
				org.apache.hadoop.io.IntWritable writable = (org.apache.hadoop.io.IntWritable)value;
				element.put("value", Integer.toString(writable.get()));
			}
			else if (value instanceof org.apache.hadoop.io.LongWritable) {
				org.apache.hadoop.io.LongWritable writable = (org.apache.hadoop.io.LongWritable)value;
				element.put("value", Long.toString(writable.get()));
			}
			else if (value instanceof BSONWritable) {
				element.put("value", MongoDbUtil.convert((BSONWritable)value));
			}
			else if (value instanceof org.apache.mahout.math.VectorWritable) {
				Vector vec = ((org.apache.mahout.math.VectorWritable)value).get();
				BasicDBList dbl2 = listFromMahoutVector(vec, "value", element);
				element.put("value", dbl2);
			}
			else if (value instanceof org.apache.mahout.clustering.classify.WeightedVectorWritable) {
				org.apache.mahout.clustering.classify.WeightedVectorWritable vecW = (org.apache.mahout.clustering.classify.WeightedVectorWritable)value;
				element.put("valueWeight", vecW.getWeight());
				BasicDBList dbl2 = listFromMahoutVector(vecW.getVector(), "value", element);
				element.put("value", dbl2);
			}
			else if (value instanceof org.apache.mahout.clustering.iterator.ClusterWritable) {
				Cluster cluster = ((org.apache.mahout.clustering.iterator.ClusterWritable)value).getValue();
				BasicDBObject clusterVal = new BasicDBObject();
				clusterVal.put("center", listFromMahoutVector(cluster.getCenter(), "center", clusterVal));
				clusterVal.put("radius", listFromMahoutVector(cluster.getRadius(), "radius", clusterVal));
				element.put("value", clusterVal);
			}
			else {
				element.put("unknownValue", value.getClass().toString());
			}
			// Apply the fields settings (removal only, at most one level of nesting):
			if (null != fieldLookup) {
				for (String fieldToRemove: fieldLookup) {
					if (fieldToRemove.startsWith("value.")) {
						fieldToRemove = fieldToRemove.substring(6);
						Object nested = element.get("value");
						if (nested instanceof BasicDBObject) {
							((BasicDBObject)nested).remove(fieldToRemove);
						}
					}
					else {
						element.remove(fieldToRemove);
					}
				}//TOTEST
			}
			dbl.add(element);
			nRecords++;
			if ((nLimit > 0) && (nRecords >= nLimit)) {
				break;
			}
		}
		return dbl;
	}//TOTEST
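	/**
	 * Converts a Mahout vector to a BasicDBList: dense vectors become a flat list of
	 * doubles; sparse vectors become a list of {"k": index, "v": value} objects holding
	 * the non-zero elements only. If the vector is a NamedVector, its name is also
	 * stored in "element" under prefix + "Name".
	 */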
	private static BasicDBList listFromMahoutVector(Vector vec, String prefix, BasicDBObject element) {
		if (vec instanceof NamedVector) {
			element.put(prefix + "Name", ((NamedVector)vec).getName());
		}
		BasicDBList dbl2 = new BasicDBList();
		if (vec.isDense()) {
			int nSize = vec.size();
			dbl2.ensureCapacity(nSize);
			for (int i = 0; i < nSize; ++i) {
				dbl2.add(vec.getQuick(i));
			}
		}
		else { // sparse, write as a set in the format [{int:double}]
			Iterator<org.apache.mahout.math.Vector.Element> elIt = vec.iterateNonZero();
			while (elIt.hasNext()) {
				BasicDBObject el2 = new BasicDBObject();
				org.apache.mahout.math.Vector.Element el = elIt.next();
				el2.put("k", el.index());
				el2.put("v", el.get());
				dbl2.add(el2);
			}
		}
		return dbl2;
	}
	/**
	 * Returns the HDFS path for the custom task's output: under "in_progress/" while the
	 * job is still running (bTemp == true), under "completed/" once it has finished. The
	 * path has the format "<prefix>/<communityId1>_<communityId2>_/<jobtitle>/".
	 * @throws SAXException
	 * @throws IOException
	 * @throws ParserConfigurationException
	 */
	public static Path getPathForJob(CustomMapReduceJobPojo cmr, Configuration config, boolean bTemp) throws SAXException, IOException, ParserConfigurationException {
		// Get the name:
		StringBuilder sb = null;
		if (bTemp) {
			sb = new StringBuilder("in_progress/"); // (will move this after it's complete)
		}
		else {
			sb = new StringBuilder("completed/"); // (final location)
		}
		for (ObjectId commId: cmr.communityIds) {
			sb.append(commId.toString()).append('_');
		}
		sb.append('/');
		sb.append(cmr.jobtitle).append('/');
		String pathName = sb.toString();
		return new Path(pathName);
	}//TOTEST
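	/**
	 * Builds the Hadoop Configuration for custom jobs: local mode if so configured,
	 * otherwise with "fs.default.name" read from the cluster's core-site.xml via
	 * getXMLProperty(hadoopConfigPath + "/hadoop/core-site.xml", "fs.default.name").
	 */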
	public static Configuration getConfiguration(PropertiesManager prop_custom) throws SAXException, IOException, ParserConfigurationException
	{
		Configuration conf = new Configuration();
		if (prop_custom.getHadoopLocalMode()) {
			conf.set("fs.default.name", "local");
		}
		else {
			String fsUrl = getXMLProperty(prop_custom.getHadoopConfigPath() + "/hadoop/core-site.xml", "fs.default.name");
			conf.set("fs.default.name", fsUrl);
		}
		return conf;
	}//TOTEST
	/**
	 * Parses the given XML file and returns the value of the named property.
	 * The XML is expected to be in the standard Hadoop configuration format:
	 * {@code <configuration><property><name>some.prop.name</name><value>some.value</value></property></configuration>}
	 *
	 * @param xmlFileLocation path of the XML configuration file
	 * @param propertyName name of the property to look up
	 * @return the property's value, or null if the property is not present
	 * @throws SAXException
	 * @throws IOException
	 * @throws ParserConfigurationException
	 */
	public static String getXMLProperty(String xmlFileLocation, String propertyName) throws SAXException, IOException, ParserConfigurationException
	{
		File configFile = new File(xmlFileLocation);
		DocumentBuilderFactory docBuilderFactory = DocumentBuilderFactory.newInstance();
		DocumentBuilder docBuilder = docBuilderFactory.newDocumentBuilder();
		Document doc = docBuilder.parse(configFile);
		doc.getDocumentElement().normalize();
		NodeList listOfProps = doc.getElementsByTagName("property");
		for (int i = 0; i < listOfProps.getLength(); i++)
		{
			Node prop = listOfProps.item(i);
			if (prop.getNodeType() == Node.ELEMENT_NODE)
			{
				Element propElement = (Element)prop;
				NodeList name = propElement.getElementsByTagName("name").item(0).getChildNodes();
				Node nameValue = name.item(0);
				String nameString = nameValue.getNodeValue().trim();
				// Found the correct property
				if (nameString.equals(propertyName))
				{
					// Return the value
					NodeList value = propElement.getElementsByTagName("value").item(0).getChildNodes();
					Node valueValue = value.item(0);
					String valueString = valueValue.getNodeValue().trim();
					return valueString;
				}
			}
		}
		return null;
	}//TESTED
}