Package eu.stratosphere.test.recordJobs.relational

Source Code of eu.stratosphere.test.recordJobs.relational.WebLogAnalysis$FilterRanks

/***********************************************************************************************************************
* Copyright (C) 2010-2013 by the Stratosphere project (http://stratosphere.eu)
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
* an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
* specific language governing permissions and limitations under the License.
**********************************************************************************************************************/

package eu.stratosphere.test.recordJobs.relational;

import java.io.Serializable;
import java.util.Iterator;

import eu.stratosphere.api.common.Plan;
import eu.stratosphere.api.common.Program;
import eu.stratosphere.api.common.ProgramDescription;
import eu.stratosphere.api.java.record.operators.FileDataSink;
import eu.stratosphere.api.java.record.operators.FileDataSource;
import eu.stratosphere.api.java.record.functions.CoGroupFunction;
import eu.stratosphere.api.java.record.functions.FunctionAnnotation.ConstantFieldsExcept;
import eu.stratosphere.api.java.record.functions.FunctionAnnotation.ConstantFieldsFirstExcept;
import eu.stratosphere.api.java.record.functions.FunctionAnnotation.ConstantFieldsSecondExcept;
import eu.stratosphere.api.java.record.functions.JoinFunction;
import eu.stratosphere.api.java.record.functions.MapFunction;
import eu.stratosphere.api.java.record.io.CsvInputFormat;
import eu.stratosphere.api.java.record.io.CsvOutputFormat;
import eu.stratosphere.api.java.record.operators.CoGroupOperator;
import eu.stratosphere.api.java.record.operators.JoinOperator;
import eu.stratosphere.api.java.record.operators.MapOperator;
import eu.stratosphere.types.IntValue;
import eu.stratosphere.types.Record;
import eu.stratosphere.types.StringValue;
import eu.stratosphere.util.Collector;

/**
* Implements the following relational OLAP query as PACT program:
*
* <code><pre>
* SELECT r.pageURL, r.pageRank, r.avgDuration
* FROM Documents d JOIN Rankings r
*   ON d.url = r.url
* WHERE CONTAINS(d.text, [keywords])
*   AND r.rank > [rank]
*   AND NOT EXISTS (
*     SELECT * FROM Visits v
*     WHERE v.destUrl = d.url
*       AND v.visitDate < [date]);
* </pre></code>
*
* Table Schemas: <code><pre>
* CREATE TABLE Documents (
*           url VARCHAR(100) PRIMARY KEY,
*           contents TEXT );
*
* CREATE TABLE Rankings (
*           pageRank INT,
*           pageURL VARCHAR(100) PRIMARY KEY,    
*           avgDuration INT );      
*
* CREATE TABLE Visits (
*           sourceIP VARCHAR(16),
*           destURL VARCHAR(100),
*           visitDate DATE,
*           adRevenue FLOAT,
*           userAgent VARCHAR(64),
*           countryCode VARCHAR(3),
*           languageCode VARCHAR(6),
*           searchWord VARCHAR(32),
*           duration INT );
* </pre></code>
*
*/
public class WebLogAnalysis implements Program, ProgramDescription {
 
  private static final long serialVersionUID = 1L;


  /**
   * MapFunction that filters for documents that contain a certain set of
   * keywords.
   */
  @ConstantFieldsExcept(1)
  public static class FilterDocs extends MapFunction implements Serializable {
    private static final long serialVersionUID = 1L;
   
    private static final String[] KEYWORDS = { " editors ", " oscillations ", " convection " };
   
    /**
     * Filters for documents that contain all of the given keywords and projects the records on the URL field.
     *
     * Output Format:
     * 0: URL
     */
    @Override
    public void map(Record record, Collector<Record> out) throws Exception {
      // FILTER
      // Only collect the document if all keywords are contained
      String docText = record.getField(1, StringValue.class).toString();
      boolean allContained = true;
      for (String kw : KEYWORDS) {
        if (!docText.contains(kw)) {
          allContained = false;
          break;
        }
      }

      if (allContained) {
        record.setNull(1);
        out.collect(record);
      }
    }
  }

  /**
   * MapFunction that filters for records where the rank exceeds a certain threshold.
   */
  @ConstantFieldsExcept({})
  public static class FilterRanks extends MapFunction implements Serializable {
    private static final long serialVersionUID = 1L;
   
    private static final int RANKFILTER = 50;
   
    /**
     * Filters for records of the rank relation where the rank is greater
     * than the given threshold.
     *
     * Output Format:
     * 0: URL
     * 1: RANK
     * 2: AVG_DURATION
     */
    @Override
    public void map(Record record, Collector<Record> out) throws Exception {
     
      if (record.getField(1, IntValue.class).getValue() > RANKFILTER) {
        out.collect(record);
      }
    }
  }

  /**
   * MapFunction that filters for records of the visits relation where the year
   * (from the date string) is equal to a certain value.
   */
  @ConstantFieldsExcept(1)
  public static class FilterVisits extends MapFunction implements Serializable {
    private static final long serialVersionUID = 1L;

    private static final int YEARFILTER = 2010;
   
    /**
     * Filters for records of the visits relation where the year of visit is equal to a
     * specified value. The URL of all visit records passing the filter is emitted.
     *
     * Output Format:
     * 0: URL
     */
    @Override
    public void map(Record record, Collector<Record> out) throws Exception {
      // Parse date string with the format YYYY-MM-DD and extract the year
      String dateString = record.getField(1, StringValue.class).getValue();
      int year = Integer.parseInt(dateString.substring(0,4));
     
      if (year == YEARFILTER) {
        record.setNull(1);
        out.collect(record);
       
      }
    }
  }

  /**
   * JoinFunction that joins the filtered entries from the documents and the
   * ranks relation.
   */
  @ConstantFieldsSecondExcept({})
  public static class JoinDocRanks extends JoinFunction implements Serializable {
    private static final long serialVersionUID = 1L;

    /**
     * Joins entries from the documents and ranks relation on their URL.
     *
     * Output Format:
     * 0: URL
     * 1: RANK
     * 2: AVG_DURATION
     */
    @Override
    public void join(Record document, Record rank, Collector<Record> out) throws Exception {
      out.collect(rank)
    }
  }

  /**
   * CoGroupFunction that realizes an anti-join.
   * If the first input does not provide any pairs, all pairs of the second input are emitted.
   * Otherwise, no pair is emitted.
   */
  @ConstantFieldsFirstExcept({})
  public static class AntiJoinVisits extends CoGroupFunction implements Serializable {
    private static final long serialVersionUID = 1L;

    /**
     * If the visit iterator is empty, all pairs of the rank iterator are emitted.
     * Otherwise, no pair is emitted.
     *
     * Output Format:
     * 0: URL
     * 1: RANK
     * 2: AVG_DURATION
     */
    @Override
    public void coGroup(Iterator<Record> ranks, Iterator<Record> visits, Collector<Record> out) {
      // Check if there is a entry in the visits relation
      if (!visits.hasNext()) {
        while (ranks.hasNext()) {
          // Emit all rank pairs
          out.collect(ranks.next());
        }
      }
    }
  }


  @Override
  public Plan getPlan(String... args) {

    // parse job parameters
    int numSubTasks     = (args.length > 0 ? Integer.parseInt(args[0]) : 1);
    String docsInput   = (args.length > 1 ? args[1] : "");
    String ranksInput  = (args.length > 2 ? args[2] : "");
    String visitsInput = (args.length > 3 ? args[3] : "");
    String output      = (args.length > 4 ? args[4] : "");

    /*
     * Output Format:
     * 0: URL
     * 1: DOCUMENT_TEXT
     */
    // Create DataSourceContract for documents relation
    @SuppressWarnings("unchecked")
    CsvInputFormat docsFormat = new CsvInputFormat('|', StringValue.class, StringValue.class);
    FileDataSource docs = new FileDataSource(docsFormat, docsInput, "Docs Input");
   
    /*
     * Output Format:
     * 0: URL
     * 1: RANK
     * 2: AVG_DURATION
     */
    // Create DataSourceContract for ranks relation
    FileDataSource ranks = new FileDataSource(new CsvInputFormat(), ranksInput, "Ranks input");
    CsvInputFormat.configureRecordFormat(ranks)
      .recordDelimiter('\n')
      .fieldDelimiter('|')
      .field(StringValue.class, 1)
      .field(IntValue.class, 0)
      .field(IntValue.class, 2);

    /*
     * Output Format:
     * 0: URL
     * 1: DATE
     */
    // Create DataSourceContract for visits relation
    @SuppressWarnings("unchecked")
    CsvInputFormat visitsFormat = new CsvInputFormat('|', null, StringValue.class, StringValue.class);
    FileDataSource visits = new FileDataSource(visitsFormat, visitsInput, "Visits input:q");

    // Create MapOperator for filtering the entries from the documents
    // relation
    MapOperator filterDocs = MapOperator.builder(new FilterDocs())
      .input(docs)
      .name("Filter Docs")
      .build();
    filterDocs.getCompilerHints().setFilterFactor(0.15f);

    // Create MapOperator for filtering the entries from the ranks relation
    MapOperator filterRanks = MapOperator.builder(new FilterRanks())
      .input(ranks)
      .name("Filter Ranks")
      .build();
    filterRanks.getCompilerHints().setFilterFactor(0.25f);

    // Create MapOperator for filtering the entries from the visits relation
    MapOperator filterVisits = MapOperator.builder(new FilterVisits())
      .input(visits)
      .name("Filter Visits")
      .build();
    filterVisits.getCompilerHints().setFilterFactor(0.2f);

    // Create JoinOperator to join the filtered documents and ranks
    // relation
    JoinOperator joinDocsRanks = JoinOperator.builder(new JoinDocRanks(), StringValue.class, 0, 0)
      .input1(filterDocs)
      .input2(filterRanks)
      .name("Join Docs Ranks")
      .build();

    // Create CoGroupOperator to realize a anti join between the joined
    // documents and ranks relation and the filtered visits relation
    CoGroupOperator antiJoinVisits = CoGroupOperator.builder(new AntiJoinVisits(), StringValue.class, 0, 0)
      .input1(joinDocsRanks)
      .input2(filterVisits)
      .name("Antijoin DocsVisits")
      .build();

    // Create DataSinkContract for writing the result of the OLAP query
    FileDataSink result = new FileDataSink(new CsvOutputFormat(), output, antiJoinVisits, "Result");
    result.setDegreeOfParallelism(numSubTasks);
    CsvOutputFormat.configureRecordFormat(result)
      .recordDelimiter('\n')
      .fieldDelimiter('|')
      .lenient(true)
      .field(IntValue.class, 1)
      .field(StringValue.class, 0)
      .field(IntValue.class, 2);

    // Return the PACT plan
    Plan p = new Plan(result, "Weblog Analysis");
    p.setDefaultParallelism(numSubTasks);
    return p;
  }


  @Override
  public String getDescription() {
    return "Parameters: [numSubTasks], [docs], [ranks], [visits], [output]";
  }
}
TOP

Related Classes of eu.stratosphere.test.recordJobs.relational.WebLogAnalysis$FilterRanks

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.