Package org.apache.flink.api.java.record.io

Examples of org.apache.flink.api.java.record.io.CsvInputFormat


    /*
     * Output Format:
     * 0: URL
     * 1: DOCUMENT_TEXT
     */
    // Create DataSourceContract for documents relation
    @SuppressWarnings("unchecked")
    CsvInputFormat docsFormat = new CsvInputFormat('|', StringValue.class, StringValue.class);
    FileDataSource docs = new FileDataSource(docsFormat, docsInput, "Docs Input");
   
    /*
     * Output Format:
     * 0: URL
     * 1: RANK
     * 2: AVG_DURATION
     */
    // Create DataSourceContract for ranks relation
    FileDataSource ranks = new FileDataSource(new CsvInputFormat(), ranksInput, "Ranks input");
    CsvInputFormat.configureRecordFormat(ranks)
      .recordDelimiter('\n')
      .fieldDelimiter('|')
      .field(StringValue.class, 1)
      .field(IntValue.class, 0)
      .field(IntValue.class, 2);

    /*
     * Output Format:
     * 0: URL
     * 1: DATE
     */
    // Create DataSourceContract for visits relation
    @SuppressWarnings("unchecked")
    CsvInputFormat visitsFormat = new CsvInputFormat('|', null, StringValue.class, StringValue.class);
    FileDataSource visits = new FileDataSource(visitsFormat, visitsInput, "Visits input");

    // Create MapOperator for filtering the entries from the documents
    // relation
    MapOperator filterDocs = MapOperator.builder(new FilterDocs())
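        .input(docs)          // plausible completion of the truncated builder
        .name("Filter Docs")  // (hedged; the operator name is an assumption)
        .build();

Note the two configuration styles above: the docs and visits formats pass the field delimiter and the field types to the CsvInputFormat constructor (judging by the Output Format comment, the null entry in the visits format leaves input column 0 out of the record), while the ranks source uses CsvInputFormat.configureRecordFormat, where each .field(type, column) call selects an input column and appends it at the next record position, so input column 1, the URL, becomes record field 0. The FilterDocs function itself is not shown; a minimal sketch of such a record-API map, with the class body and keyword assumed for illustration (MapFunction, Record, and the value types come from org.apache.flink.api.java.record.functions and org.apache.flink.types):

    // Keep only documents whose text (record field 1) contains a required keyword.
    public static class FilterDocs extends MapFunction {
      private static final String KEYWORD = "flink"; // assumed filter term

      @Override
      public void map(Record record, Collector<Record> out) {
        String text = record.getField(1, StringValue.class).getValue();
        if (text.contains(KEYWORD)) {
          out.collect(record);
        }
      }
    }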


    String clusterInput = (args.length > 2 ? args[2] : "");
    String output = (args.length > 3 ? args[3] : "");

    // create DataSourceContract for data point input
    @SuppressWarnings("unchecked")
    FileDataSource pointsSource = new FileDataSource(new CsvInputFormat('|', IntValue.class, DoubleValue.class, DoubleValue.class, DoubleValue.class), dataPointInput, "Data Points");

    // create DataSourceContract for cluster center input
    @SuppressWarnings("unchecked")
    FileDataSource clustersSource = new FileDataSource(new CsvInputFormat('|', IntValue.class, DoubleValue.class, DoubleValue.class, DoubleValue.class), clusterInput, "Centers");
   
    MapOperator dataPoints = MapOperator.builder(new PointBuilder()).name("Build data points").input(pointsSource).build();
   
    MapOperator clusterPoints = MapOperator.builder(new PointBuilder()).name("Build cluster points").input(clustersSource).build();
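Both sources above read records shaped as (IntValue id, DoubleValue x, DoubleValue y, DoubleValue z), as declared in their CsvInputFormat constructors. The PointBuilder function is not shown; a plausible sketch of such a record-API map, with the class body assumed for illustration:

    // Hypothetical PointBuilder: read the id and the three coordinates parsed by
    // the CsvInputFormat. A real implementation would typically pack (x, y, z)
    // into a dedicated point value type; this sketch just forwards the record.
    public static class PointBuilder extends MapFunction {
      @Override
      public void map(Record record, Collector<Record> out) {
        int id = record.getField(0, IntValue.class).getValue();
        double x = record.getField(1, DoubleValue.class).getValue();
        double y = record.getField(2, DoubleValue.class).getValue();
        double z = record.getField(3, DoubleValue.class).getValue();
        out.collect(record);
      }
    }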

    final String edgeInput = (args.length > 2 ? args[2] : "");
    final String output = (args.length > 3 ? args[3] : "");
    final int maxIterations = (args.length > 4 ? Integer.parseInt(args[4]) : 1);

    // data source for initial vertices
    FileDataSource initialVertices = new FileDataSource(new CsvInputFormat(' ', LongValue.class), verticesInput, "Vertices");
   
    MapOperator verticesWithId = MapOperator.builder(DuplicateLongMap.class).input(initialVertices).name("Assign Vertex Ids").build();
   
    DeltaIteration iteration = new DeltaIteration(0, "Connected Components Iteration");
    iteration.setInitialSolutionSet(verticesWithId);
    iteration.setInitialWorkset(verticesWithId);
    iteration.setMaximumNumberOfIterations(maxIterations);
   
    // create DataSourceContract for the edges
    FileDataSource edges = new FileDataSource(new CsvInputFormat(' ', LongValue.class, LongValue.class), edgeInput, "Edges");

    // create CrossOperator for distance computation
    JoinOperator joinWithNeighbors = JoinOperator.builder(new NeighborWithComponentIDJoin(), LongValue.class, 0, 0)
        .input1(iteration.getWorkset())
        .input2(edges)
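        .name("Join Candidate Id With Neighbor") // plausible completion of the
        .build();                                // truncated chain (hedged); the
                                                 // full pipeline appears in the
                                                 // getPlan example further down

DuplicateLongMap turns each single-field vertex record into the (vertex id, component id) pair that seeds both the solution set and the workset, so that every vertex initially forms its own component. A sketch of what such a record-API map does:

    // Emit (vertexId, vertexId): each vertex starts as its own component.
    public static final class DuplicateLongMap extends MapFunction {
      @Override
      public void map(Record record, Collector<Record> out) {
        record.setField(1, record.getField(0, LongValue.class));
        out.collect(record);
      }
    }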

      final int numSubtasks     = (args.length > 0 ? Integer.parseInt(args[0]) : 1);
      final String recordsPath = (args.length > 1 ? args[1] : "");
      final String output      = (args.length > 2 ? args[2] : "");

      @SuppressWarnings("unchecked")
      FileDataSource source = new FileDataSource(new CsvInputFormat(',', IntValue.class, IntValue.class, IntValue.class), recordsPath);

      FileDataSink sink = new FileDataSink(CsvOutputFormat.class, output);
      CsvOutputFormat.configureRecordFormat(sink)
        .recordDelimiter('\n')
        .fieldDelimiter(',')
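        .field(IntValue.class, 0)   // plausible completion (hedged): write the
        .field(IntValue.class, 1)   // three integer fields back out in order
        .field(IntValue.class, 2);

      // Wire the source directly into the sink and wrap both in a Plan; setInput
      // and the plan name are assumptions in the style of the record-API examples.
      sink.setInput(source);
      Plan plan = new Plan(sink, "CSV pass-through");
      plan.setDefaultParallelism(numSubtasks);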

    /*
     * Output Schema:
     * 0: CUSTOMER_ID
     */
    // create DataSourceContract for Orders input
    FileDataSource orders = new FileDataSource(new CsvInputFormat(), ordersPath, "Orders");
    orders.setDegreeOfParallelism(numSubtasks);
    CsvInputFormat.configureRecordFormat(orders)
      .recordDelimiter('\n')
      .fieldDelimiter('|')
      .field(IntValue.class, 1);
   
    /*
     * Output Schema:
     * 0: CUSTOMER_ID
     * 1: MKT_SEGMENT
     */
    // create DataSourceContract for Customer input
    FileDataSource customers = new FileDataSource(new CsvInputFormat(), customerPath, "Customers");
    customers.setDegreeOfParallelism(numSubtasks);
    CsvInputFormat.configureRecordFormat(customers)
      .recordDelimiter('\n')
      .fieldDelimiter('|')
      .field(IntValue.class, 0)
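      .field(StringValue.class, 6); // plausible completion (hedged): in the
                                    // TPC-H customer file the market segment
                                    // is column 6

With both relations carrying CUSTOMER_ID at record position 0, joining them follows the same builder pattern as the other examples on this page; the join function class below is hypothetical:

    // Hypothetical join of Customers and Orders on CUSTOMER_ID (field 0 on both
    // sides); JoinCustomerOrder would merge each matching pair into one record.
    JoinOperator joinCustOrders = JoinOperator.builder(new JoinCustomerOrder(), IntValue.class, 0, 0)
      .input1(customers)
      .input2(orders)
      .name("Join Customers with Orders")
      .build();
    joinCustOrders.setDegreeOfParallelism(numSubtasks);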

 
  @SuppressWarnings("unchecked")
  public static Plan getPlan(int numSubTasks, String verticesInput, String edgeInput, String output, int maxIterations, boolean extraMap) {

    // data source for initial vertices
    FileDataSource initialVertices = new FileDataSource(new CsvInputFormat(' ', LongValue.class), verticesInput, "Vertices");
   
    MapOperator verticesWithId = MapOperator.builder(DuplicateLongMap.class).input(initialVertices).name("Assign Vertex Ids").build();
   
    // the loop takes the vertices as the solution set and changed vertices as the workset
    // initially, all vertices are changed
    DeltaIteration iteration = new DeltaIteration(0, "Connected Components Iteration");
    iteration.setInitialSolutionSet(verticesWithId);
    iteration.setInitialWorkset(verticesWithId);
    iteration.setMaximumNumberOfIterations(maxIterations);
   
    // data source for the edges
    FileDataSource edges = new FileDataSource(new CsvInputFormat(' ', LongValue.class, LongValue.class), edgeInput, "Edges");

    // join workset (changed vertices) with the edges to propagate changes to neighbors
    JoinOperator joinWithNeighbors = JoinOperator.builder(new NeighborWithComponentIDJoin(), LongValue.class, 0, 0)
        .input1(iteration.getWorkset())
        .input2(edges)
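        .name("Join Candidate Id With Neighbor")
        .build();

    // What follows is a plausible completion (hedged): the reduce/join function
    // names and the sink wiring are assumptions in the example's naming style,
    // and the extraMap flag is ignored here.

    // Find the minimum candidate component id per vertex.
    ReduceOperator minCandidateId = ReduceOperator.builder(new MinimumComponentIDReduce(), LongValue.class, 0)
        .input(joinWithNeighbors)
        .name("Find Minimum Candidate Id")
        .build();

    // Keep a candidate only if it improves on the id stored in the solution set.
    JoinOperator updateComponentId = JoinOperator.builder(new UpdateComponentIdMatch(), LongValue.class, 0, 0)
        .input1(minCandidateId)
        .input2(iteration.getSolutionSet())
        .name("Update Component Id")
        .build();

    // Changed vertices are both the solution-set delta and the next workset.
    iteration.setNextWorkset(updateComponentId);
    iteration.setSolutionSetDelta(updateComponentId);

    FileDataSink result = new FileDataSink(new CsvOutputFormat(), output, iteration, "Result");
    CsvOutputFormat.configureRecordFormat(result)
        .recordDelimiter('\n')
        .fieldDelimiter(' ')
        .field(LongValue.class, 0)
        .field(LongValue.class, 1);

    Plan plan = new Plan(result, "Connected Components");
    plan.setDefaultParallelism(numSubTasks);
    return plan;
  }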

  // -----------------------------------------------------------------------------------------------------------------

  private static InputFormatVertex createVerticesInput(JobGraph jobGraph, String verticesPath, int numSubTasks,
      TypeSerializerFactory<?> serializer, TypeComparatorFactory<?> comparator)
  {
    @SuppressWarnings("unchecked")
    CsvInputFormat verticesInFormat = new CsvInputFormat(' ', LongValue.class);
    InputFormatVertex verticesInput = JobGraphUtils.createInput(verticesInFormat, verticesPath, "VerticesInput",
      jobGraph, numSubTasks);
    TaskConfig verticesInputConfig = new TaskConfig(verticesInput.getConfiguration());
    {
      verticesInputConfig.addOutputShipStrategy(ShipStrategyType.FORWARD);
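      // plausible continuation (hedged): the vertices channel still needs its
      // serializer, mirroring the edges helper below, before the vertex is returned.
      verticesInputConfig.setOutputSerializer(serializer);
    }
    return verticesInput;
  }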

  private static InputFormatVertex createEdgesInput(JobGraph jobGraph, String edgesPath, int numSubTasks,
      TypeSerializerFactory<?> serializer, TypeComparatorFactory<?> comparator)
  {
    // edges
    @SuppressWarnings("unchecked")
    CsvInputFormat edgesInFormat = new CsvInputFormat(' ', LongValue.class, LongValue.class);
    InputFormatVertex edgesInput = JobGraphUtils.createInput(edgesInFormat, edgesPath, "EdgesInput", jobGraph,
      numSubTasks);
    TaskConfig edgesInputConfig = new TaskConfig(edgesInput.getConfiguration());
    {
      edgesInputConfig.setOutputSerializer(serializer);
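      // plausible continuation (hedged; the comparator call is an assumption
      // based on the helper's TypeComparatorFactory parameter): edges are
      // hash-partitioned on the source-vertex key, field 0, so that they meet
      // the matching workset entries.
      edgesInputConfig.addOutputShipStrategy(ShipStrategyType.PARTITION_HASH);
      edgesInputConfig.setOutputComparator(comparator, 0);
    }
    return edgesInput;
  }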

  @Override
  protected Plan getTestJob() {
   
    int dop = this.config.getInteger("GroupOrderTest#NumSubtasks", 1);
   
    @SuppressWarnings("unchecked")
    CsvInputFormat format = new CsvInputFormat(',', IntValue.class, IntValue.class);
    FileDataSource source = new FileDataSource(format, this.textPath, "Source");
   
    ReduceOperator reducer = ReduceOperator.builder(CheckingReducer.class)
      .keyField(IntValue.class, 0)
      .input(source)
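      // plausible completion (hedged): name the reducer and, for a group-order
      // test, declare a secondary sort inside each group; Ordering and Order come
      // from org.apache.flink.api.common.operators, and the field choice, result
      // path, and sink fields below are assumptions.
      .name("Checking Reducer")
      .build();
    reducer.setGroupOrder(new Ordering(1, IntValue.class, Order.ASCENDING));

    FileDataSink sink = new FileDataSink(new CsvOutputFormat(), this.resultPath, reducer, "Sink");
    CsvOutputFormat.configureRecordFormat(sink)
      .recordDelimiter('\n')
      .fieldDelimiter(',')
      .field(IntValue.class, 0)
      .field(IntValue.class, 1);

    Plan plan = new Plan(sink, "Group Order Test");
    plan.setDefaultParallelism(dop);
    return plan;
  }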
