/***********************************************************************************************************************
* Copyright (C) 2010-2013 by the Stratosphere project (http://stratosphere.eu)
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
* an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
* specific language governing permissions and limitations under the License.
**********************************************************************************************************************/
package eu.stratosphere.test.iterative;
import java.io.BufferedReader;
import java.io.Serializable;
import eu.stratosphere.api.common.Plan;
import eu.stratosphere.api.java.record.operators.DeltaIteration;
import eu.stratosphere.api.java.record.operators.FileDataSink;
import eu.stratosphere.api.java.record.operators.FileDataSource;
import eu.stratosphere.api.java.record.functions.FunctionAnnotation.ConstantFieldsSecondExcept;
import eu.stratosphere.api.java.record.functions.JoinFunction;
import eu.stratosphere.api.java.record.io.CsvInputFormat;
import eu.stratosphere.api.java.record.io.CsvOutputFormat;
import eu.stratosphere.api.java.record.operators.JoinOperator;
import eu.stratosphere.api.java.record.operators.MapOperator;
import eu.stratosphere.api.java.record.operators.ReduceOperator;
import eu.stratosphere.test.recordJobs.graph.WorksetConnectedComponents.DuplicateLongMap;
import eu.stratosphere.test.recordJobs.graph.WorksetConnectedComponents.MinimumComponentIDReduce;
import eu.stratosphere.test.recordJobs.graph.WorksetConnectedComponents.NeighborWithComponentIDJoin;
import eu.stratosphere.test.testdata.ConnectedComponentsData;
import eu.stratosphere.test.util.RecordAPITestBase;
import eu.stratosphere.types.LongValue;
import eu.stratosphere.types.Record;
import eu.stratosphere.util.Collector;
/**
* Regression test for a bug that prevented the solution set from being usable on either side of the
* match/cogroup function. In this test the solution set is the first input of the join.
*/
public class ConnectedComponentsWithSolutionSetFirstITCase extends RecordAPITestBase {

    /** Seed handed to the random edge generator. */
    private static final long SEED = 0xBADC0FFEEBEEFL;

    /** Number of vertices requested from the test data generator. */
    private static final int NUM_VERTICES = 1000;

    /** Number of edges requested from the test data generator. */
    private static final int NUM_EDGES = 10000;

    /** Path of the temp file holding the generated vertices; set in {@link #preSubmit()}. */
    protected String verticesPath;

    /** Path of the temp file holding the generated edges; set in {@link #preSubmit()}. */
    protected String edgesPath;

    /** Path the job writes its result to; set in {@link #preSubmit()}. */
    protected String resultPath;

    /**
     * Writes the generated vertex and edge data to temp files and picks the result location.
     */
    @Override
    protected void preSubmit() throws Exception {
        this.verticesPath = createTempFile(
                "vertices.txt",
                ConnectedComponentsData.getEnumeratingVertices(NUM_VERTICES));

        this.edgesPath = createTempFile(
                "edges.txt",
                ConnectedComponentsData.getRandomOddEvenEdges(NUM_EDGES, NUM_VERTICES, SEED));

        this.resultPath = getTempFilePath("results");
    }

    /**
     * Builds the plan under test: 4 sub-tasks and at most 100 iterations.
     */
    @Override
    protected Plan getTestJob() {
        final int numSubTasks = 4;
        final int maxIterations = 100;

        return getPlanForWorksetConnectedComponentsWithSolutionSetAsFirstInput(
                numSubTasks, this.verticesPath, this.edgesPath, this.resultPath, maxIterations);
    }

    /**
     * Runs the odd/even result check on every reader obtained for the result path.
     */
    @Override
    protected void postSubmit() throws Exception {
        for (BufferedReader resultReader : getResultReader(this.resultPath)) {
            ConnectedComponentsData.checkOddEvenResult(resultReader);
        }
    }

    // --------------------------------------------------------------------------------------------
    //  Classes and methods for the test program
    // --------------------------------------------------------------------------------------------

    /**
     * Join function whose first argument is the current (vertex, component) record and whose second
     * argument is the candidate record. The candidate is emitted only when its component id
     * (field 1) is strictly smaller than the current one; otherwise nothing is emitted.
     */
    @ConstantFieldsSecondExcept({})
    public static final class UpdateComponentIdMatchMirrored extends JoinFunction implements Serializable {

        private static final long serialVersionUID = 1L;

        @Override
        public void join(Record currentVertexWithComponent, Record newVertexWithComponent, Collector<Record> out) {
            final long candidateId = newVertexWithComponent.getField(1, LongValue.class).getValue();
            final long currentId = currentVertexWithComponent.getField(1, LongValue.class).getValue();

            // Not an improvement: leave the current assignment untouched.
            if (candidateId >= currentId) {
                return;
            }

            // The candidate record (second input) is forwarded unmodified.
            out.collect(newVertexWithComponent);
        }
    }

    /**
     * Assembles the workset connected components plan in which the solution set is the FIRST
     * input of the "Update Component Id" join.
     *
     * @param numSubTasks default parallelism set on the plan
     * @param verticesInput path of the vertices file (one long per line)
     * @param edgeInput path of the edges file (two space-separated longs per line)
     * @param output path the result sink writes to
     * @param maxIterations maximum number of iterations of the delta iteration
     * @return the assembled plan
     */
    @SuppressWarnings("unchecked")
    private static Plan getPlanForWorksetConnectedComponentsWithSolutionSetAsFirstInput(
            int numSubTasks, String verticesInput, String edgeInput, String output, int maxIterations)
    {
        // Field 0 is used as the key everywhere in this plan (presumably the vertex id).
        final int keyField = 0;

        // Source reading the initial vertices.
        FileDataSource vertexSource =
                new FileDataSource(new CsvInputFormat(' ', LongValue.class), verticesInput, "Vertices");

        // Mapper applied to the raw vertices; its output seeds both the solution set and the workset.
        MapOperator seededVertices = MapOperator.builder(DuplicateLongMap.class)
                .input(vertexSource)
                .name("Assign Vertex Ids")
                .build();

        DeltaIteration deltaIteration = new DeltaIteration(keyField, "Connected Components Iteration");
        deltaIteration.setInitialSolutionSet(seededVertices);
        deltaIteration.setInitialWorkset(seededVertices);
        deltaIteration.setMaximumNumberOfIterations(maxIterations);

        // Source reading the edges.
        FileDataSource edgeSource =
                new FileDataSource(new CsvInputFormat(' ', LongValue.class, LongValue.class), edgeInput, "Edges");

        // Join the workset with the edges on the key field.
        JoinOperator candidatesPerNeighbor = JoinOperator
                .builder(new NeighborWithComponentIDJoin(), LongValue.class, keyField, keyField)
                .input1(deltaIteration.getWorkset())
                .input2(edgeSource)
                .name("Join Candidate Id With Neighbor")
                .build();

        // Reduce the candidates, grouped on the key field.
        ReduceOperator smallestCandidate = ReduceOperator
                .builder(new MinimumComponentIDReduce(), LongValue.class, keyField)
                .input(candidatesPerNeighbor)
                .name("Find Minimum Candidate Id")
                .build();

        // Join with the solution set as FIRST input -- the configuration this test exists for.
        JoinOperator componentUpdate = JoinOperator
                .builder(new UpdateComponentIdMatchMirrored(), LongValue.class, keyField, keyField)
                .input1(deltaIteration.getSolutionSet())
                .input2(smallestCandidate)
                .name("Update Component Id")
                .build();

        // The same join output feeds the next workset and the solution set delta.
        deltaIteration.setNextWorkset(componentUpdate);
        deltaIteration.setSolutionSetDelta(componentUpdate);

        // Sink writing fields 0 and 1 of the iteration result, space separated, one record per line.
        FileDataSink sink = new FileDataSink(new CsvOutputFormat(), output, deltaIteration, "Result");
        CsvOutputFormat.configureRecordFormat(sink)
                .recordDelimiter('\n')
                .fieldDelimiter(' ')
                .field(LongValue.class, 0)
                .field(LongValue.class, 1);

        Plan jobPlan = new Plan(sink, "Workset Connected Components");
        jobPlan.setDefaultParallelism(numSubTasks);
        return jobPlan;
    }
}