Package com.atlantbh.nutch.index.alternativedataflow.flow

Source Code of com.atlantbh.nutch.index.alternativedataflow.flow.CsvDataFlow

package com.atlantbh.nutch.index.alternativedataflow.flow;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import javax.xml.namespace.QName;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.log4j.Logger;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.indexer.NutchDocument;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.parse.Parse;

import com.Ostermiller.util.CSVPrint;
import com.Ostermiller.util.CSVPrinter;
import com.Ostermiller.util.ExcelCSVPrinter;
import com.atlantbh.nutch.index.alternativedataflow.FilterUtils;
import com.atlantbh.nutch.index.alternativedataflow.conf.Entry;
import com.atlantbh.nutch.index.alternativedataflow.conf.Field;

public class CsvDataFlow implements DataFlow {
 
  // Constants
  private static final Logger log = Logger.getLogger(CsvDataFlow.class);
 
  // Configuration constants
  public static final QName DESTINATION = new QName("destination");
  public static final QName NAME = new QName("name");
  public static final QName SEPERATOR = new QName("seperator");
  public static final QName QUOTE_CHARACTER = new QName("quoteCharacter");
  public static final QName APPEND_TIMESTAMP = new QName("appendTimestamp");
  public static final QName ADD_FIELD_NAMES_TO_FIRST_LINE = new QName("addFieldNamesToFirstLine");
  public static final QName STYLE = new QName("style");
 
  // Init data
  private Configuration configuration;
  private List<Entry> entryList;

  // Internal data
  private Map<String, CSVPrint> nameCsvPrintMap = new HashMap<String, CSVPrint>();
  private boolean initialized = false;
 
  public CsvDataFlow() {
  }

  private void initWriters() throws IOException {

    // Check if it already initialized
    // because this function is called everytime a NutchDocument is being
    // processed
    if (!initialized) {

      // Initialize timestamp
      long timestamp = new Date().getTime();
     
      // Iterate trough the entries
      for (Entry entry : entryList) {
        char seperator = FilterUtils.getNullSafe(entry.getParameterMap().get(SEPERATOR), ",").charAt(0);
        char quoteCharacter = FilterUtils.getNullSafe(entry.getParameterMap().get(QUOTE_CHARACTER), "\"").charAt(0);
        String style = FilterUtils.getNullSafe(entry.getParameterMap().get(STYLE), "UNIX");

        // Check if the file needs a timestamp
        File csvFile;
        if (Boolean.valueOf(FilterUtils.getNullSafe(entry.getParameterMap().get(APPEND_TIMESTAMP), "false"))) {
          csvFile = new File(entry.getParameterMap().get(DESTINATION) + File.separator + entry.getParameterMap().get(NAME) + timestamp + ".csv");
        } else {
          csvFile = new File(entry.getParameterMap().get(DESTINATION) + File.separator + entry.getParameterMap().get(NAME) + ".csv");
        }

        // If the file already exists delete it
        if (csvFile.exists()) {
          csvFile.delete();
        }

        // Create the new cvs file
        csvFile.createNewFile();

        // Create the printer
        CSVPrint csvPrint = null;
        if("UNIX".equals(style)) {
          csvPrint = new CSVPrinter(new OutputStreamWriter(new FileOutputStream(csvFile), "UTF-8"), '#', quoteCharacter, seperator, false, true);
        } else if("EXCEL".equals(style)) {
          csvPrint = new ExcelCSVPrinter(new OutputStreamWriter(new FileOutputStream(csvFile), "UTF-8"), quoteCharacter, seperator, false, true);
        }
        nameCsvPrintMap.put(entry.getParameterMap().get(NAME), csvPrint);

        // If this flag is set add the first
        if (Boolean.valueOf(FilterUtils.getNullSafe(entry.getParameterMap().get(ADD_FIELD_NAMES_TO_FIRST_LINE), "false"))) {

          String[] fieldNames = new String[entry.getFieldList().size()];
          List<Field> fieldList = entry.getFieldList();
          for (int i = 0; i < fieldList.size(); i++) {
            if(fieldList.get(i).getAlias() != null) {
              fieldNames[i] = fieldList.get(i).getAlias();
            } else {
              fieldNames[i] = fieldList.get(i).getName();
            }
           
          }

          // Write field names to CSV file
          csvPrint.println(fieldNames);
        }

      }

      // Set the flag to initialized
      initialized = true;
    }
  }

  @Override
  public void init(Configuration configuration, List<Entry> entryList) {
    this.configuration = configuration;
    this.entryList = entryList;
  }
 
  @Override
  public void destroy() {
    for (String name : nameCsvPrintMap.keySet()) {
      try {
       
        CSVPrint csvPrint = nameCsvPrintMap.get(name);
        csvPrint.close();
      } catch (IOException e) {
        // DOESN'T MATTER IF IT THROWS AN EXCEPTION
      }
    }
  }

  @Override
  public void processData(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) {
   
    // Get metadata
    Metadata metadata = parse.getData().getParseMeta();
   
    try {
     
      // Initialize the writers
      // Only on the first execution
      initWriters();
     
      for(Entry entry : entryList) {
       
        CSVPrint csvPrint = nameCsvPrintMap.get(entry.getParameterMap().get(NAME));
       
        String[] fieldValues = new String[entry.getFieldList().size()];
        List<Field> fieldList = entry.getFieldList();
        for (int i = 0; i < fieldList.size(); i++) {
          fieldValues[i] = FilterUtils.getNullSafe(metadata.get(fieldList.get(i).getName()), "");
        }
       
        // Write field values to CSV file
        csvPrint.println(fieldValues);
      }
    } catch (IOException e) {
      // TODO Auto-generated catch block
      e.printStackTrace();
    }

  }

 
}
TOP

Related Classes of com.atlantbh.nutch.index.alternativedataflow.flow.CsvDataFlow

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.