package edu.cmu.graphchi.preprocessing;
import edu.cmu.graphchi.ChiFilenames;
import edu.cmu.graphchi.ChiLogger;
import edu.cmu.graphchi.ChiVertex;
import edu.cmu.graphchi.datablocks.BytesToValueConverter;
import edu.cmu.graphchi.datablocks.ChiPointer;
import edu.cmu.graphchi.datablocks.DataBlockManager;
import edu.cmu.graphchi.datablocks.IntConverter;
import edu.cmu.graphchi.engine.auxdata.VertexData;
import edu.cmu.graphchi.io.CompressedIO;
import edu.cmu.graphchi.shards.MemoryShard;
import edu.cmu.graphchi.shards.SlidingShard;
import nom.tam.util.BufferedDataInputStream;
import java.io.*;
import java.util.Iterator;
import java.util.Random;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.logging.Logger;
import java.util.zip.DeflaterOutputStream;
/**
* New version of the sharder that requires a predefined number of shards
* and translates the vertex ids in order to randomize their order, thus
* requiring no additional step to divide the edges equally among the
* shards (it is assumed that, probabilistically, the number of edges
* per shard is roughly even).
*
* Since the vertex ids are translated to internal-ids, you need to use
* VertexIdTranslate class to obtain the original id-numbers.
*
* Usage:
* <code>
* FastSharder sharder = new FastSharder(graphName, numShards, ....)
* sharder.shard(new FileInputStream())
* </code>
*
* To use a pipe to feed a graph, use
* <code>
* sharder.shard(System.in, "edgelist");
* </code>
*
* <b>Note:</b> <a href="http://code.google.com/p/graphchi/wiki/EdgeListFormat">Edge list</a>
* and <a href="http://code.google.com/p/graphchi/wiki/AdjacencyListFormat">adjacency list</a>
* formats are supported.
*
* <b>Note:</b> If the from and to vertex ids are equal (applies only to the edge list format), the line is assumed to contain a vertex value.
*
* @author Aapo Kyrola
*/
public class FastSharder <VertexValueType, EdgeValueType> {
/** Input formats accepted by {@link #shard(InputStream, GraphInputFormat)}. */
public enum GraphInputFormat {EDGELIST, ADJACENCY, MATRIXMARKET};
/** Base name from which the names of the shovel files, shards and auxiliary files are derived. */
private String baseFilename;
/** Number of shards (and of shovel files) to create. */
private int numShards;
/** Interval length used by the preliminary id translation: Integer.MAX_VALUE / numShards. */
private int initialIntervalLength;
/** Preliminary id translation applied while edges are added, before the number of vertices is known. */
private VertexIdTranslate preIdTranslate;
/** Final id translation; created in process() once maxVertexId is known. */
private VertexIdTranslate finalIdTranslate;
/** One edge shovel stream per shard; closed and set to null by process(). */
private DataOutputStream[] shovelStreams;
/** One vertex-value shovel stream per shard; entries are only created when a vertexProcessor was given. */
private DataOutputStream[] vertexShovelStreams;
/** Largest vertex id seen by addEdge(); writeIntervals() later raises it to the end of the last interval. */
private int maxVertexId = 0;
/** In-degree per (final) vertex id; only allocated when degrees are counted in memory. */
private int[] inDegrees;
/** Out-degree per (final) vertex id; only allocated when degrees are counted in memory. */
private int[] outDegrees;
/** Decided in process(): true when the degree arrays are considered too large to keep in memory. */
private boolean memoryEfficientDegreeCount = false;
/** Total number of edges read back from the shovel files (accumulated by processShovel()). */
private long numEdges = 0;
/** Decided in process(): whether the degree and vertex-data files use the sparse layout. */
private boolean useSparseDegrees = false;
/** User switch that permits the sparse layout; see setAllowSparseDegreesAndVertexData(). */
private boolean allowSparseDegreesAndVertexData = false;
/** Serializer for edge values; may be null, in which case edges carry no value bytes. */
private BytesToValueConverter<EdgeValueType> edgeValueTypeBytesToValueConverter;
/** Serializer for vertex values. */
private BytesToValueConverter<VertexValueType> vertexValueTypeBytesToValueConverter;
/** User callback turning an edge token into an edge value; may be null. */
private EdgeProcessor<EdgeValueType> edgeProcessor;
/** User callback turning a vertex token into a vertex value; may be null. */
private VertexProcessor<VertexValueType> vertexProcessor;
private static final Logger logger = ChiLogger.getLogger("fast-sharder");
/**
 * Creates a sharder and opens one interim "shovel" file per shard for writing.
 *
 * @param baseFilename input-file; also the prefix of every file the sharder creates
 * @param numShards the number of shards to be created (must be at least 1)
 * @param vertexProcessor user-provided function for translating strings to vertex value type,
 *                        or null if vertex values are not collected
 * @param edgeProcessor user-provided function for translating strings to edge value type (may be null)
 * @param vertexValConterter translator byte-arrays to/from vertex-value; required whenever a
 *                           vertexProcessor is given
 * @param edgeValConverter translator byte-arrays to/from edge-value, or null if edges carry no value
 * @throws IOException if the shovel files cannot be created
 * @throws IllegalArgumentException if numShards is less than 1, or if a vertexProcessor is given
 *                                  without a vertex-value converter
 */
public FastSharder(String baseFilename, int numShards,
                   VertexProcessor<VertexValueType> vertexProcessor,
                   EdgeProcessor<EdgeValueType> edgeProcessor,
                   BytesToValueConverter<VertexValueType> vertexValConterter,
                   BytesToValueConverter<EdgeValueType> edgeValConverter) throws IOException {
    /* Fail fast: otherwise these surface as a division by zero / negative array size here,
       or as a NullPointerException much later in process(). */
    if (numShards < 1) {
        throw new IllegalArgumentException("Number of shards must be at least 1, was: " + numShards);
    }
    if (vertexProcessor != null && vertexValConterter == null) {
        throw new IllegalArgumentException(
                "A vertex-value converter is required when a vertex processor is given");
    }

    this.baseFilename = baseFilename;
    this.numShards = numShards;
    this.initialIntervalLength = Integer.MAX_VALUE / numShards;
    this.preIdTranslate = new VertexIdTranslate(this.initialIntervalLength, numShards);
    this.edgeProcessor = edgeProcessor;
    this.vertexProcessor = vertexProcessor;
    this.edgeValueTypeBytesToValueConverter = edgeValConverter;
    this.vertexValueTypeBytesToValueConverter = vertexValConterter;

    /*
     * In the first phase of processing, the edges are "shoveled" to
     * the corresponding shards. The interim shards are called "shovel-files",
     * and the final shards are created by sorting the edges in the shovel-files.
     * See processShovel()
     */
    shovelStreams = new DataOutputStream[numShards];
    vertexShovelStreams = new DataOutputStream[numShards];
    for (int i = 0; i < numShards; i++) {
        shovelStreams[i] = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(shovelFilename(i))));
        if (vertexProcessor != null) {
            vertexShovelStreams[i] = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(vertexShovelFileName(i))));
        }
    }

    /* Byte-array templates used as temporary values for performance (instead of
     * always reallocating them). Without an edge converter the edge template is empty,
     * so no value bytes are written per edge. */
    if (edgeValueTypeBytesToValueConverter != null) {
        valueTemplate = new byte[edgeValueTypeBytesToValueConverter.sizeOf()];
    } else {
        valueTemplate = new byte[0];
    }
    if (vertexValueTypeBytesToValueConverter != null) {
        vertexValueTemplate = new byte[vertexValueTypeBytesToValueConverter.sizeOf()];
    }
}
/** Name of the interim edge shovel file of shard {@code i}. */
private String shovelFilename(int i) {
    final String suffix = ".shovel." + i;
    return baseFilename + suffix;
}
/** Name of the interim vertex-value shovel file of shard {@code i}. */
private String vertexShovelFileName(int i) {
    final String suffix = ".vertexshovel." + i;
    return baseFilename + suffix;
}
/**
 * Feeds one edge (or, when both ids are equal, one vertex value) into the preprocessing.
 * @param from source vertex id (original numbering)
 * @param to destination vertex id (original numbering)
 * @param edgeValueToken textual value attached to the entry; may be null
 * @throws IOException if writing to a shovel file fails
 */
public void addEdge(int from, int to, String edgeValueToken) throws IOException {
    maxVertexId = Math.max(maxVertexId, Math.max(from, to));

    if (from != to) {
        /* A real edge: it goes to the shovel of the shard owning the destination. */
        int sourceId = preIdTranslate.forward(from);
        int targetId = preIdTranslate.forward(to);
        EdgeValueType edgeValue = null;
        if (edgeProcessor != null) {
            edgeValue = edgeProcessor.receiveEdge(from, to, edgeValueToken);
        }
        addToShovel(to % numShards, sourceId, targetId, edgeValue);
        return;
    }

    /* Same id on both ends: the entry is taken to carry a value for that vertex
       and is handed to the vertexProcessor (if there is one and a token exists). */
    if (vertexProcessor == null || edgeValueToken == null) {
        return;
    }
    VertexValueType vertexValue = vertexProcessor.receiveVertexValue(from, edgeValueToken);
    if (vertexValue != null) {
        addVertexValue(from % numShards, preIdTranslate.forward(from), vertexValue);
    }
}
/** Reusable buffer for the bytes of one edge value; allocated in the constructor (empty if there is no edge converter). */
private byte[] valueTemplate;
/** Reusable buffer for the bytes of one vertex value; allocated in the constructor only if a vertex converter was given. */
private byte[] vertexValueTemplate;
/**
 * Appends an edge to the shovel file of a shard. At this stage the vertex ids are
 * "pretranslated" to temporary internal ids; the final ids are assigned in the last
 * phase. The pretranslation is required because the total number of vertices is not
 * yet known at this point.
 * @param shard index of the shovel file to append to
 * @param preTranslatedIdFrom internal from-id
 * @param preTranslatedTo internal to-id
 * @param value edge value; serialized only when an edge converter is configured
 * @throws IOException if writing to the shovel file fails
 */
private void addToShovel(int shard, int preTranslatedIdFrom, int preTranslatedTo,
                         EdgeValueType value) throws IOException {
    final DataOutputStream out = shovelStreams[shard];
    final long packedEdge = packEdges(preTranslatedIdFrom, preTranslatedTo);
    out.writeLong(packedEdge);
    if (edgeValueTypeBytesToValueConverter != null) {
        edgeValueTypeBytesToValueConverter.setValue(valueTemplate, value);
    }
    /* The template is empty when there is no converter, so nothing is written then. */
    out.write(valueTemplate);
}
/**
 * @return whether sparse degree and vertex-data files may be used
 */
public boolean isAllowSparseDegreesAndVertexData() {
    return this.allowSparseDegreesAndVertexData;
}
/**
 * Controls whether GraphChi may use a sparse file for the vertices and the degree data
 * when the number of edges is smaller than the number of vertices. Defaults to false.
 * Note: if you enable this, you probably want to set engine.setSkipZeroDegreeVertices(true)
 * @param allowSparseDegreesAndVertexData true to permit the sparse representation
 */
public void setAllowSparseDegreesAndVertexData(boolean allowSparseDegreesAndVertexData) {
    final boolean permitted = allowSparseDegreesAndVertexData;
    this.allowSparseDegreesAndVertexData = permitted;
}
/**
 * Appends a vertex value to the vertex shovel file of a shard (vertex values are
 * kept in shovel files separate from the edges).
 * @param shard index of the vertex shovel file to append to
 * @param pretranslatedVertexId internal (pretranslated) id of the vertex
 * @param value the value to serialize
 * @throws IOException if writing to the shovel file fails
 */
private void addVertexValue(int shard, int pretranslatedVertexId, VertexValueType value) throws IOException {
    final DataOutputStream out = vertexShovelStreams[shard];
    out.writeInt(pretranslatedVertexId);

    /* Serialize into the shared buffer, then append the buffer after the id. */
    final byte[] buffer = vertexValueTemplate;
    vertexValueTypeBytesToValueConverter.setValue(buffer, value);
    out.write(buffer);
}
/**
 * Bit arithmetic for packing two 32-bit vertex-ids into one 64-bit long:
 * {@code a} occupies the upper 32 bits and {@code b} the lower 32 bits.
 * @param a value for the upper half
 * @param b value for the lower half
 * @return the packed pair
 */
static long packEdges(int a, int b) {
    /* Mask b before combining: adding a sign-extended negative b would borrow from
       the upper half and corrupt a. */
    return ((long) a << 32) | (b & 0xffffffffL);
}
/** Extracts the upper 32 bits of a packed pair (the first id given to packEdges). */
static int getFirst(long l) {
    final long upperHalf = l >>> 32;
    return (int) upperHalf;
}
/** Extracts the lower 32 bits of a packed pair (the second id given to packEdges). */
static int getSecond(long l) {
    /* Narrowing to int keeps exactly the low 32 bits. */
    return (int) l;
}
/**
 * Final processing after all edges have been received: fixes the final vertex-id
 * translation, turns every shovel file into a shard and writes the degree and
 * (optionally) the vertex-data files.
 * @throws IOException if reading or writing any of the files fails
 */
public void process() throws IOException {
    /* Decide whether the vertex degrees can be tracked in memory. Ad-hoc rule: the
       degree arrays must not take more than a fifth of the maximum heap. Otherwise a
       special graphchi-program builds the degree-file afterwards. */
    final long degreeBytesNeeded = ((long) maxVertexId) * 8;
    memoryEfficientDegreeCount = Runtime.getRuntime().maxMemory() / 5 < degreeBytesNeeded;
    if (memoryEfficientDegreeCount) {
        logger.info("Going to use memory-efficient, but slower, method to compute vertex degrees.");
    } else {
        final int degreeSlots = maxVertexId + numShards;
        inDegrees = new int[degreeSlots];
        outDegrees = new int[degreeSlots];
    }

    /* The total number of vertices is known now, so the final translator can be
       constructed and the mapping back to the original ids stored. */
    finalIdTranslate = new VertexIdTranslate((1 + maxVertexId) / numShards + 1, numShards);
    saveVertexTranslate();

    /* Close / flush each shovel-file. */
    for (DataOutputStream shovel : shovelStreams) {
        shovel.close();
    }
    shovelStreams = null;

    /* Store the vertex intervals, then turn each shovel into a final shard. */
    writeIntervals();
    int shard = 0;
    while (shard < numShards) {
        processShovel(shard);
        shard++;
    }

    /* With more vertices than edges a sparse representation of the auxiliary
       degree-data and vertex-data files makes sense, if the user permits it. */
    useSparseDegrees = allowSparseDegreesAndVertexData
            && (maxVertexId > numEdges || "1".equals(System.getProperty("sparsedeg")));
    logger.info("Use sparse output: " + useSparseDegrees);

    /* Degree-data file holding the in- and out-degree of each vertex.
       See edu.cmu.graphchi.engine.auxdata.DegreeData */
    if (memoryEfficientDegreeCount) {
        computeVertexDegrees();
    } else {
        writeDegrees();
    }

    /* Vertex-data file. */
    if (vertexProcessor != null) {
        processVertexValues(useSparseDegrees);
    }
}
/**
 * Constructs the degree-file from the degrees that were counted in memory.
 * In the dense layout every vertex gets an (in-degree, out-degree) pair; in the sparse
 * layout only vertices with at least one edge are written, each prefixed by its id.
 * All ints are written with their bytes reversed.
 * @throws IOException if the degree-file cannot be written
 */
private void writeDegrees() throws IOException {
    DataOutputStream degreeOut = new DataOutputStream(new BufferedOutputStream(
            new FileOutputStream(ChiFilenames.getFilenameOfDegreeData(baseFilename, useSparseDegrees))));
    try {
        for (int i = 0; i < inDegrees.length; i++) {
            if (useSparseDegrees) {
                /* Test the two counters separately: their sum could overflow. */
                if (inDegrees[i] <= 0 && outDegrees[i] <= 0) {
                    continue;
                }
                degreeOut.writeInt(Integer.reverseBytes(i));
            }
            degreeOut.writeInt(Integer.reverseBytes(inDegrees[i]));
            degreeOut.writeInt(Integer.reverseBytes(outDegrees[i]));
        }
    } finally {
        /* Release the file handle even if writing fails. */
        degreeOut.close();
    }
}
/**
 * Writes the last vertex id of every shard's interval, one per line, to the
 * intervals file. As a side effect maxVertexId is raised to the largest interval end.
 * @throws IOException if the intervals file cannot be written
 */
private void writeIntervals() throws IOException {
    FileWriter wr = new FileWriter(ChiFilenames.getFilenameIntervals(baseFilename, numShards));
    try {
        for (int j = 1; j <= numShards; j++) {
            int a = (j * finalIdTranslate.getVertexIntervalLength() - 1);
            wr.write(a + "\n");
            if (a > maxVertexId) {
                maxVertexId = a;
            }
        }
    } finally {
        /* Release the file handle even if writing fails. */
        wr.close();
    }
}
/**
 * Stores the string representation of the final id translation, so that internal
 * vertex ids can later be mapped back to the original ids.
 * @throws IOException if the definition file cannot be written
 */
private void saveVertexTranslate() throws IOException {
    FileWriter wr = new FileWriter(ChiFilenames.getVertexTranslateDefFile(baseFilename, numShards));
    try {
        wr.write(finalIdTranslate.stringRepresentation());
    } finally {
        /* Release the file handle even if writing fails. */
        wr.close();
    }
}
/**
 * Initializes the vertex-data file. Similar process as sharding for edges: for each
 * shard the vertex shovel is read, the ids are translated to their final values and
 * sorted, and the values are then written into the vertex-data one sub-interval at a time.
 * @param sparse whether the vertex-data file uses the sparse layout
 * @throws IOException if a shovel file cannot be read or the vertex data cannot be written
 */
private void processVertexValues(boolean sparse) throws IOException {
    DataBlockManager dataBlockManager = new DataBlockManager();
    VertexData<VertexValueType> vertexData = new VertexData<VertexValueType>(maxVertexId + 1, baseFilename,
            vertexValueTypeBytesToValueConverter, sparse);
    vertexData.setBlockManager(dataBlockManager);
    for (int p = 0; p < numShards; p++) {
        int intervalSt = p * finalIdTranslate.getVertexIntervalLength();
        int intervalEn = (p + 1) * finalIdTranslate.getVertexIntervalLength() - 1;
        if (intervalEn > maxVertexId) intervalEn = maxVertexId;
        vertexShovelStreams[p].close();

        /* Read shovel and sort */
        File shovelFile = new File(vertexShovelFileName(p));
        int sizeOf = vertexValueTypeBytesToValueConverter.sizeOf();
        /* Each record is a 4-byte id followed by sizeOf value bytes. */
        long[] vertexIds = new long[(int) (shovelFile.length() / (4 + sizeOf))];
        if (vertexIds.length == 0) continue;
        byte[] vertexValues = new byte[vertexIds.length * sizeOf];

        BufferedDataInputStream in = new BufferedDataInputStream(new FileInputStream(shovelFile));
        try {
            for (int i = 0; i < vertexIds.length; i++) {
                int vid = in.readInt();
                int transVid = finalIdTranslate.forward(preIdTranslate.backward(vid));
                vertexIds[i] = transVid;
                in.readFully(vertexValueTemplate);
                int valueIdx = i * sizeOf;
                System.arraycopy(vertexValueTemplate, 0, vertexValues, valueIdx, sizeOf);
            }
        } finally {
            /* The stream was previously never closed. */
            in.close();
        }

        /* Sort the ids, carrying the value bytes along with them. */
        sortWithValues(vertexIds, vertexValues, sizeOf);

        int SUBINTERVAL = 2000000;
        int iterIdx = 0;

        /* Insert into data. The bound is inclusive: intervalEn is itself a valid vertex id,
           so a sub-interval that starts exactly at intervalEn must still be processed. */
        for (int subIntervalSt = intervalSt; subIntervalSt <= intervalEn; subIntervalSt += SUBINTERVAL) {
            int subIntervalEn = subIntervalSt + SUBINTERVAL - 1;
            if (subIntervalEn > intervalEn) subIntervalEn = intervalEn;
            int blockId = vertexData.load(subIntervalSt, subIntervalEn);

            /* Merge-style walk: both the iterator and vertexIds are visited in increasing order. */
            Iterator<Integer> iterator = vertexData.currentIterator();
            while (iterator.hasNext()) {
                int curId = iterator.next();
                while (iterIdx < vertexIds.length && vertexIds[iterIdx] < curId) {
                    iterIdx++;
                }
                if (iterIdx >= vertexIds.length) break;
                if (curId == (int) vertexIds[iterIdx]) {
                    ChiPointer pointer = vertexData.getVertexValuePtr(curId, blockId);
                    System.arraycopy(vertexValues, iterIdx * sizeOf, vertexValueTemplate, 0, sizeOf);
                    dataBlockManager.writeValue(pointer, vertexValueTemplate);
                }
                /* Otherwise there is no vertex data for that vertex. */
            }
            vertexData.releaseAndCommit(subIntervalSt, blockId);
        }
    }
}
/**
 * Converts a shovel-file into a shard: reads the shoveled edges into memory,
 * translates the ids to their final values, sorts the edges and writes the adjacency
 * file, its index file and (if edges carry values) the edge-data blocks.
 * Also counts the in/out-degrees when they are tracked in memory, adds the number of
 * edges to numEdges and deletes the shovel-file.
 * @param shardNum index of the shovel / shard to process
 * @throws IOException if reading the shovel or writing the shard files fails
 * @throws RuntimeException if the shovel holds more edges (or edge-value bytes) than supported
 */
private void processShovel(int shardNum) throws IOException {
    File shovelFile = new File(shovelFilename(shardNum));
    int sizeOf = (edgeValueTypeBytesToValueConverter != null ? edgeValueTypeBytesToValueConverter.sizeOf() : 0);

    /* Each record is an 8-byte packed edge followed by sizeOf value bytes. The sizes are
       validated as longs BEFORE allocating, so that an oversized shovel is reported instead
       of wrapping around in an int cast or triggering a huge allocation first. */
    long numShoveled = shovelFile.length() / (8 + sizeOf);
    // TODO: improve
    if (numShoveled > 500000000) {
        throw new RuntimeException("Too big shard size, shovel length was: " + numShoveled + " max: " + 500000000);
    }
    long edgeValueBytes = numShoveled * sizeOf;
    if (edgeValueBytes > Integer.MAX_VALUE) {
        throw new RuntimeException("Too big shard size, edge values need " + edgeValueBytes
                + " bytes, max: " + Integer.MAX_VALUE);
    }
    long[] shoveled = new long[(int) numShoveled];
    byte[] edgeValues = new byte[(int) edgeValueBytes];

    logger.info("Processing shovel " + shardNum);

    /*
     * Read the edges into memory.
     */
    BufferedDataInputStream in = new BufferedDataInputStream(new FileInputStream(shovelFile));
    try {
        for (int i = 0; i < shoveled.length; i++) {
            long l = in.readLong();
            int from = getFirst(l);
            int to = getSecond(l);
            in.readFully(valueTemplate);

            int newFrom = finalIdTranslate.forward(preIdTranslate.backward(from));
            int newTo = finalIdTranslate.forward(preIdTranslate.backward(to));
            shoveled[i] = packEdges(newFrom, newTo);

            /* Edge value */
            int valueIdx = i * sizeOf;
            System.arraycopy(valueTemplate, 0, edgeValues, valueIdx, sizeOf);
            if (!memoryEfficientDegreeCount) {
                inDegrees[newTo]++;
                outDegrees[newFrom]++;
            }
        }
    } finally {
        in.close();
    }
    numEdges += shoveled.length;

    /* Delete the shovel-file */
    shovelFile.delete();

    logger.info("Processing shovel " + shardNum + " ... sorting");

    /* Sort the edges */
    sortWithValues(shoveled, edgeValues, sizeOf);  // The source id is higher order, so sorting the longs will produce right result

    logger.info("Processing shovel " + shardNum + " ... writing shard");

    /*
     Now write the final shard in a compact form. Note that there is separate shard
     for adjacency and the edge-data. The edge-data is split into blocks of
     ChiFilenames.getBlocksize(sizeOf) bytes, compressed if compression is enabled.
     */

    /*
     * Step 1: ADJACENCY SHARD
     */
    File adjFile = new File(ChiFilenames.getFilenameShardsAdj(baseFilename, shardNum, numShards));
    DataOutputStream adjOut = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(adjFile)));
    try {
        File indexFile = new File(adjFile.getAbsolutePath() + ".index");
        DataOutputStream indexOut = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(indexFile)));
        try {
            int curvid = 0;
            int istart = 0;
            int edgeCounter = 0;
            int lastIndexFlush = 0;
            int edgesPerIndexEntry = 4096; // Tuned for fast shard queries

            /* One extra iteration (from == -1) flushes the edges of the last vertex. */
            for (int i = 0; i <= shoveled.length; i++) {
                int from = (i < shoveled.length ? getFirst(shoveled[i]) : -1);

                if (from != curvid) {
                    /* Write index: (vertex id, byte offset in the adjacency file, edge offset),
                       emitted once at least edgesPerIndexEntry edges were written since the last entry. */
                    if (edgeCounter - lastIndexFlush >= edgesPerIndexEntry) {
                        indexOut.writeInt(curvid);
                        indexOut.writeInt(adjOut.size());
                        indexOut.writeInt(edgeCounter);
                        lastIndexFlush = edgeCounter;
                    }

                    /* Edge count of the finished vertex: a single byte if below 255,
                       otherwise the marker 0xff followed by the count as an int. */
                    int count = i - istart;
                    if (count > 0) {
                        if (count < 255) {
                            adjOut.writeByte(count);
                        } else {
                            adjOut.writeByte(0xff);
                            adjOut.writeInt(Integer.reverseBytes(count));
                        }
                    }
                    for (int j = istart; j < i; j++) {
                        adjOut.writeInt(Integer.reverseBytes(getSecond(shoveled[j])));
                        edgeCounter++;
                    }

                    istart = i;

                    // Handle zeros: vertices without edges between the finished vertex and the
                    // next one are encoded as a 0 byte followed by the number (at most 254) of
                    // further edge-less vertices.
                    if (from != (-1)) {
                        if (from - curvid > 1 || (i == 0 && from > 0)) {
                            int nz = from - curvid - 1;
                            if (i == 0 && from > 0) nz = from;
                            do {
                                adjOut.writeByte(0);
                                nz--;
                                int tnz = Math.min(254, nz);
                                adjOut.writeByte(tnz);
                                nz -= tnz;
                            } while (nz > 0);
                        }
                    }
                    curvid = from;
                }
            }
        } finally {
            indexOut.close();
        }
    } finally {
        adjOut.close();
    }

    /*
     * Step 2: EDGE DATA
     */

    /* Create compressed edge data directories */
    if (sizeOf > 0) {
        int blockSize = ChiFilenames.getBlocksize(sizeOf);
        /* Only sizeOf() of this stand-in converter returns something meaningful. */
        String edataFileName = ChiFilenames.getFilenameShardEdata(baseFilename, new BytesToValueConverter() {
            @Override
            public int sizeOf() {
                return edgeValueTypeBytesToValueConverter.sizeOf();
            }

            @Override
            public Object getValue(byte[] array) {
                return null;
            }

            @Override
            public void setValue(byte[] array, Object val) {
            }
        }, shardNum, numShards);
        File edgeDataSizeFile = new File(edataFileName + ".size");
        File edgeDataDir = new File(ChiFilenames.getDirnameShardEdataBlock(edataFileName, blockSize));
        if (!edgeDataDir.exists()) edgeDataDir.mkdir();

        /* Widen before multiplying so the product cannot overflow an int. */
        long edatasize = (long) shoveled.length * sizeOf;
        FileWriter sizeWr = new FileWriter(edgeDataSizeFile);
        try {
            sizeWr.write(edatasize + "");
        } finally {
            sizeWr.close();
        }

        /* Create compressed blocks */
        int blockIdx = 0;
        int edgeIdx = 0;
        for (long idx = 0; idx < edatasize; idx += blockSize) {
            File blockFile = new File(ChiFilenames.getFilenameShardEdataBlock(edataFileName, blockIdx, blockSize));
            OutputStream blockOs = (CompressedIO.isCompressionEnabled() ?
                    new DeflaterOutputStream(new BufferedOutputStream(new FileOutputStream(blockFile))) :
                    new FileOutputStream(blockFile));
            try {
                long len = Math.min(blockSize, edatasize - idx);
                byte[] block = new byte[(int) len];

                System.arraycopy(edgeValues, edgeIdx * sizeOf, block, 0, block.length);
                edgeIdx += len / sizeOf;

                blockOs.write(block);
            } finally {
                blockOs.close();
            }
            blockIdx++;
        }

        /* edgeIdx counts edges, not bytes, so compare it with the number of edges. */
        assert (edgeIdx == shoveled.length);
    }
}
/** Source of the random pivots used by the quicksort below. */
private static final Random random = new Random();

// http://www.algolist.net/Algorithms/Sorting/Quicksort
// TODO: implement faster

/**
 * Partitions {@code arr[left..right]} around a randomly chosen pivot, moving the
 * {@code sizeOf}-byte value of every key together with the key.
 * @param scratch buffer of at least sizeOf bytes used for swapping values
 * @return index at which the right-hand partition starts
 */
private static int partition(long arr[], byte[] values, int sizeOf, int left, int right, byte[] scratch)
{
    int i = left, j = right;
    long pivot = arr[left + random.nextInt(right - left + 1)];
    while (i <= j) {
        while (arr[i] < pivot)
            i++;
        while (arr[j] > pivot)
            j--;
        if (i <= j) {
            if (i != j) {
                /* Swap the keys ... */
                long tmp = arr[i];
                arr[i] = arr[j];
                arr[j] = tmp;
                /* ... and the value bytes that belong to them. */
                System.arraycopy(values, j * sizeOf, scratch, 0, sizeOf);
                System.arraycopy(values, i * sizeOf, values, j * sizeOf, sizeOf);
                System.arraycopy(scratch, 0, values, i * sizeOf, sizeOf);
            }
            i++;
            j--;
        }
    }
    return i;
}

/**
 * Quicksort over {@code arr[left..right]} with a single shared swap buffer. Only the
 * smaller partition is handled recursively and the larger one by looping, which keeps
 * the recursion depth logarithmic in the number of elements.
 */
private static void quickSort(long arr[], byte[] values, int sizeOf, int left, int right, byte[] scratch) {
    while (left < right) {
        int index = partition(arr, values, sizeOf, left, right, scratch);
        /* The two partitions are [left, index - 1] and [index, right]. */
        if (index - left < right - index + 1) {
            quickSort(arr, values, sizeOf, left, index - 1, scratch);
            left = index;
        } else {
            quickSort(arr, values, sizeOf, index, right, scratch);
            right = index - 1;
        }
    }
}

/**
 * Sorts {@code arr[left..right]} (both bounds inclusive) in ascending order and keeps
 * {@code values} aligned with it: the value of {@code arr[k]} is the {@code sizeOf}
 * bytes starting at {@code values[k * sizeOf]}. The order of equal keys is unspecified.
 */
static void quickSort(long arr[], byte[] values, int sizeOf, int left, int right) {
    if (left < right) {
        /* One swap buffer for the whole sort instead of one per partition step. */
        quickSort(arr, values, sizeOf, left, right, new byte[sizeOf]);
    }
}

/**
 * Sorts the whole key array in ascending order together with the fixed-size values.
 * @param shoveled keys to sort
 * @param edgeValues value bytes, sizeOf bytes per key, in key order
 * @param sizeOf number of value bytes per key (may be 0)
 */
public static void sortWithValues(long[] shoveled, byte[] edgeValues, int sizeOf) {
    quickSort(shoveled, edgeValues, sizeOf, 0, shoveled.length - 1);
}
/**
 * Execute sharding by reading edges from a inputstream and then running process().
 * @param inputStream source of the graph data
 * @param format graph input format
 * @throws IOException if reading the input or writing the output files fails
 * @throws IllegalArgumentException if format is null, or if one of the first lines of an
 *                                  adjacency list is malformed
 */
public void shard(InputStream inputStream, GraphInputFormat format) throws IOException {
    if (format == null) {
        throw new IllegalArgumentException("Please specify graph input format");
    }
    BufferedReader ins = new BufferedReader(new InputStreamReader(inputStream));
    if (format == GraphInputFormat.MATRIXMARKET) {
        readMatrixMarket(ins);
    } else {
        readEdgeOrAdjacencyList(ins, format);
    }
    this.process();
}

/** Reads an edge-list or adjacency-list input; lines of at most 2 characters and lines starting with '#' are skipped. */
private void readEdgeOrAdjacencyList(BufferedReader ins, GraphInputFormat format) throws IOException {
    String ln;
    long lineNum = 0;
    while ((ln = ins.readLine()) != null) {
        if (ln.length() <= 2 || ln.startsWith("#")) {
            continue;
        }
        lineNum++;
        if (lineNum % 2000000 == 0) logger.info("Reading line: " + lineNum);

        /* Tokens are tab-separated; fall back to spaces if the line contains no tab. */
        String[] tok = ln.split("\t");
        if (tok.length == 1) tok = ln.split(" ");
        if (tok.length <= 1) {
            continue;
        }

        if (format == GraphInputFormat.EDGELIST) {
            addEdgeListLine(tok);
        } else if (format == GraphInputFormat.ADJACENCY) {
            if (!addAdjacencyListLine(tok, lineNum)) {
                break;
            }
        } else {
            throw new IllegalArgumentException("Please specify graph input format");
        }
    }
}

/** Handles one edge-list line: {@code <src> <dst> [<value>]}; lines with more than three tokens are ignored. */
private void addEdgeListLine(String[] tok) throws IOException {
    if (tok.length == 2) {
        this.addEdge(Integer.parseInt(tok[0]), Integer.parseInt(tok[1]), null);
    } else if (tok.length == 3) {
        this.addEdge(Integer.parseInt(tok[0]), Integer.parseInt(tok[1]), tok[2]);
    }
}

/**
 * Handles one adjacency-list line: {@code <vertex-id> <count> <neighbor-1> <neighbor-2> ...}.
 * A count mismatch is fatal on the first lines (it suggests a wrong format); later it is
 * logged and reading stops.
 * @return false if reading the input should stop
 */
private boolean addAdjacencyListLine(String[] tok, long lineNum) throws IOException {
    int vertexId = Integer.parseInt(tok[0]);
    int len = Integer.parseInt(tok[1]);
    if (len != tok.length - 2) {
        /* Report the number of neighbor tokens actually present, i.e. what len is compared with. */
        String message = "Error on line " + lineNum + "; number of edges does not match number of tokens:" +
                len + " != " + (tok.length - 2);
        if (lineNum < 10) {
            throw new IllegalArgumentException(message);
        }
        logger.warning(message);
        return false;
    }
    for (int j = 2; j < 2 + len; j++) {
        int dest = Integer.parseInt(tok[j]);
        this.addEdge(vertexId, dest, null);
    }
    return true;
}

/**
 * Processes matrix-market format to create a bipartite graph and stores the matrix
 * dimensions in the ".matrixinfo" file.
 */
private void readMatrixMarket(BufferedReader ins) throws IOException {
    boolean parsedMatrixSize = false;
    int numLeft = 0;
    int numRight = 0;
    long totalEdges = 0;
    long lineNum = 0;
    String ln;
    while ((ln = ins.readLine()) != null) {
        lineNum++;
        if (ln.length() <= 2 || ln.startsWith("#")) {
            continue;
        }
        if (ln.startsWith("%%")) {
            if (!ln.contains(("matrix coordinate real general"))) {
                throw new RuntimeException("Unknown matrix market format!");
            }
            continue;
        }
        if (ln.startsWith("%")) {
            // Comment - skip
            continue;
        }

        /* Split on runs of whitespace so that repeated or leading blanks do not
           produce empty tokens. */
        String[] tok = ln.trim().split("\\s+");
        if (lineNum % 2000000 == 0) logger.info("Reading line: " + lineNum + " / " + totalEdges);
        if (!parsedMatrixSize) {
            /* The first data line holds: <rows> <columns> <number of entries> */
            numLeft = Integer.parseInt(tok[0]);
            numRight = Integer.parseInt(tok[1]);
            totalEdges = Long.parseLong(tok[2]);
            logger.info("Matrix-market: going to load total of " + totalEdges + " edges.");
            parsedMatrixSize = true;
        } else {
            /* The ids start from 1, so we take 1 off. */
            /* Vertex - ids on the right side of the bipartite graph have id numLeft + originalId */
            try {
                String lastTok = tok[tok.length - 1];
                this.addEdge(Integer.parseInt(tok[0]) - 1, numLeft + Integer.parseInt(tok[1]) - 1, lastTok);
            } catch (NumberFormatException nfe) {
                logger.severe("Could not parse line: " + ln);
                throw nfe;
            }
        }
    }

    /* Store matrix dimensions */
    String matrixMarketInfoFile = baseFilename + ".matrixinfo";
    FileOutputStream fos = new FileOutputStream(new File(matrixMarketInfoFile));
    try {
        fos.write((numLeft + "\t" + numRight + "\t" + totalEdges + "\n").getBytes());
    } finally {
        fos.close();
    }
}
/**
 * Shard a graph
 * @param inputStream source of the graph data
 * @param format "edgelist" (also the default when null), "adjlist" / "adjacency", or "matrixmarket"
 * @throws IOException if reading the input or writing the output files fails
 * @throws IllegalArgumentException if the format name is not recognized
 */
public void shard(InputStream inputStream, String format) throws IOException {
    if (format == null || format.equals("edgelist")) {
        shard(inputStream, GraphInputFormat.EDGELIST);
    } else if (format.equals("adjlist") || format.startsWith("adjacency")) {
        shard(inputStream, GraphInputFormat.ADJACENCY);
    } else if (format.equals("matrixmarket")) {
        shard(inputStream, GraphInputFormat.MATRIXMARKET);
    } else {
        /* Previously an unknown name was ignored and nothing was sharded at all. */
        throw new IllegalArgumentException("Unknown graph input format: " + format
                + " (expected edgelist, adjlist, adjacency or matrixmarket)");
    }
}
/**
 * Shards an input graph given in the edge list format.
 * @param inputStream source of the edge list
 * @throws IOException if reading the input or writing the output files fails
 */
public void shard(InputStream inputStream) throws IOException {
    final GraphInputFormat defaultFormat = GraphInputFormat.EDGELIST;
    this.shard(inputStream, defaultFormat);
}
/**
 * Compute vertex degrees by running a special graphchi program over the shards
 * that were just written. This is done only if we do not have enough memory to keep
 * track of vertex degrees in-memory.
 * @throws IOException if the degree-file cannot be written or the degrees cannot be
 *                     computed; other checked failures are wrapped as the cause
 */
private void computeVertexDegrees() throws IOException {
    logger.info("Use sparse degrees: " + useSparseDegrees);
    DataOutputStream degreeOut = new DataOutputStream(new BufferedOutputStream(
            new FileOutputStream(ChiFilenames.getFilenameOfDegreeData(baseFilename, useSparseDegrees))));
    ExecutorService parallelExecutor = Executors.newFixedThreadPool(4);
    try {
        SlidingShard[] slidingShards = new SlidingShard[numShards];
        for (int p = 0; p < numShards; p++) {
            int intervalSt = p * finalIdTranslate.getVertexIntervalLength();
            int intervalEn = (p + 1) * finalIdTranslate.getVertexIntervalLength() - 1;

            slidingShards[p] = new SlidingShard(null, ChiFilenames.getFilenameShardsAdj(baseFilename, p, numShards),
                    intervalSt, intervalEn);
            slidingShards[p].setOnlyAdjacency(true);
        }

        int SUBINTERVAL = 2000000;

        for (int p = 0; p < numShards; p++) {
            logger.info("Degree computation round " + p + " / " + numShards);
            int intervalSt = p * finalIdTranslate.getVertexIntervalLength();
            int intervalEn = (p + 1) * finalIdTranslate.getVertexIntervalLength() - 1;

            MemoryShard<Float> memoryShard = new MemoryShard<Float>(null, ChiFilenames.getFilenameShardsAdj(baseFilename, p, numShards),
                    intervalSt, intervalEn);
            memoryShard.setOnlyAdjacency(true);

            /* The bound is inclusive: intervalEn is itself a valid vertex id. With '<' the last
               vertex of an interval was skipped whenever a sub-interval started exactly at it,
               which leaves a gap in the dense degree-file. */
            for (int subIntervalSt = intervalSt; subIntervalSt <= intervalEn; subIntervalSt += SUBINTERVAL) {
                int subIntervalEn = subIntervalSt + SUBINTERVAL - 1;
                if (subIntervalEn > intervalEn) subIntervalEn = intervalEn;
                ChiVertex[] verts = new ChiVertex[subIntervalEn - subIntervalSt + 1];
                for (int i = 0; i < verts.length; i++) {
                    verts[i] = new ChiVertex(i + subIntervalSt, null);
                }

                /* Load the sub-interval from its own shard, then read the matching
                   vertices from the sliding shards of all the other shards. */
                memoryShard.loadVertices(subIntervalSt, subIntervalEn, verts, false, parallelExecutor);
                for (int i = 0; i < numShards; i++) {
                    if (i != p) {
                        slidingShards[i].readNextVertices(verts, subIntervalSt, true);
                    }
                }

                for (int i = 0; i < verts.length; i++) {
                    if (!useSparseDegrees) {
                        degreeOut.writeInt(Integer.reverseBytes(verts[i].numInEdges()));
                        degreeOut.writeInt(Integer.reverseBytes(verts[i].numOutEdges()));
                    } else {
                        if (verts[i].numEdges() > 0) {
                            degreeOut.writeInt(Integer.reverseBytes(subIntervalSt + i));
                            degreeOut.writeInt(Integer.reverseBytes(verts[i].numInEdges()));
                            degreeOut.writeInt(Integer.reverseBytes(verts[i].numOutEdges()));
                        }
                    }
                }
            }
        }
    } catch (IOException ioe) {
        throw ioe;
    } catch (RuntimeException re) {
        throw re;
    } catch (Exception err) {
        /* Do not swallow the failure: a truncated degree-file would otherwise go unnoticed. */
        throw new IOException("Failed to compute vertex degrees", err);
    } finally {
        parallelExecutor.shutdown();
        degreeOut.close();
    }
}
/**
 * Command-line entry point: shards a graph whose edge values are integers.
 * Arguments: {@code <graph-file> <num-shards> [<format>]}, where the format is
 * "edgelist" (the default when omitted) or "adjlist".
 * @param args command-line arguments
 * @throws Exception if the arguments are invalid or sharding fails
 */
public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        throw new IllegalArgumentException("Usage: FastSharder <graph-file> <num-shards> [edgelist|adjlist]");
    }
    String fileName = args[0];
    int numShards = Integer.parseInt(args[1]);
    /* A null format name selects the edge list format. */
    String conversion = (args.length > 2 ? args[2] : null);

    /* Open the input first, so that no shovel files are created for a missing graph file. */
    InputStream input = new FileInputStream(fileName);
    try {
        FastSharder<Integer, Integer> sharder = new FastSharder<Integer, Integer>(fileName, numShards, null, new EdgeProcessor<Integer>() {
            @Override
            public Integer receiveEdge(int from, int to, String token) {
                if (token == null) return 0;
                return Integer.parseInt(token);
            }
        },
                new IntConverter(), new IntConverter());
        sharder.shard(input, conversion);
    } finally {
        input.close();
    }
}
}