Package com.ikanow.infinit.e.harvest.extraction.document.file

Source Code of com.ikanow.infinit.e.harvest.extraction.document.file.CsvToMetadataParser

/*******************************************************************************
* Copyright 2012, The Infinit.e Open Source Project.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License, version 3,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
******************************************************************************/
package com.ikanow.infinit.e.harvest.extraction.document.file;

import java.io.BufferedReader;
import java.io.IOException;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;

import au.com.bytecode.opencsv.CSVParser;

import com.google.gson.JsonObject;
import com.ikanow.infinit.e.data_model.store.config.source.SourcePojo;
import com.ikanow.infinit.e.data_model.store.document.DocumentPojo;
import com.ikanow.infinit.e.harvest.extraction.document.file.JsonToMetadataParser.ObjectLength;

public class CsvToMetadataParser {

  private int _debugMaxDocs = Integer.MAX_VALUE;
 
  CsvToMetadataParser(int debugMaxDocs) {
    _debugMaxDocs = debugMaxDocs;   
  }
 
  private char _quoteChar = '"';
 
  // Track approximate memory usage
  private ObjectLength _memUsage = new ObjectLength();     
  public long getMemUsage() {
    return _memUsage.memory*12; // 6x for overhead, 2x for string->byte
  }
 
  public List<DocumentPojo> parseDocument(BufferedReader lineReader, SourcePojo source) throws IOException {
    String line;
    List<DocumentPojo> partials = new LinkedList<DocumentPojo>();
    int docs = 0;
    _memUsage.memory = 0;
   
    CSVParser parser = null;
    Object[] indexToField = null;
    // *Automated* parser, else will just grab the line and let subsequent pipeline elements extract the fields
    if ((null != source.getFileConfig()) && (
         ((null != source.getFileConfig().XmlIgnoreValues) && (!source.getFileConfig().XmlIgnoreValues.isEmpty())) ||
         ((null != source.getFileConfig().XmlRootLevelValues) && (!source.getFileConfig().XmlRootLevelValues.isEmpty()))
      ))
    {
      if (null != source.getFileConfig().XmlAttributePrefix) {
        String chars = source.getFileConfig().XmlAttributePrefix;
        if (1 == chars.length()) {
          parser = new CSVParser(chars.charAt(0));
        }
        else if (2 == chars.length()) {
          parser = new CSVParser(chars.charAt(0), chars.charAt(1));
        }
        else if (chars.length() > 2) {
          parser = new CSVParser(chars.charAt(0), chars.charAt(1), chars.charAt(2));
        }
        if (chars.length() > 1) {
          _quoteChar = chars.charAt(1);
        }
      }
      if (null == parser) {
        parser = new CSVParser();
      }
      if ((null != source.getFileConfig().XmlRootLevelValues) && (source.getFileConfig().XmlRootLevelValues.size() > 0)) {
        indexToField = source.getFileConfig().XmlRootLevelValues.toArray();
      }
    }//TESTED
   
    boolean foundHeaderLine = (indexToField != null);
    while ((line = lineReader.readLine()) != null) {
      // Ignore header lines:
      if ((null != source.getFileConfig()) && (null != source.getFileConfig().XmlIgnoreValues)) {
        boolean bMatched = false;
        boolean firstIgnoreField = true; // (first ignore field in list can generate the headers)
        for (String ignore: source.getFileConfig().XmlIgnoreValues) {     
          boolean lineMatches = false;
          if (ignore.charAt(0) == _quoteChar) {
            if (line.charAt(0) == _quoteChar) {
              lineMatches = line.startsWith(ignore);
            }//TESTED (["a","b","c"] and XmlIgnoreFields: [ "\"a" ])             
            else {
              lineMatches = line.startsWith(ignore.substring(1));
            }//TESTED ([a,b,c] vs XmlIgnoreFields: [ "a" ] and [ "\"a" ])             
          }
          else {
            lineMatches = line.startsWith(ignore);
          }//TESTED
         
          if (lineMatches) {
            if (!foundHeaderLine && firstIgnoreField && (null != parser)) {
              if (ignore.charAt(0) != _quoteChar) { // if using quotes then don't pull the char
                line = line.substring(ignore.length());
              }//TESTED (["a","b","c"] and [a,b,c] vs XmlIgnoreFields: [ "a" ] and [ "\"a" ])             
              String[] fields = parser.parseLine(line);
              // Now override the manual fields:
              indexToField = Arrays.asList(fields).toArray();
             
              if ((indexToField.length > 1) || (0 != ((String)indexToField[0]).length())) {
                foundHeaderLine = true;
              }//TESTED
            }//TESTED
            bMatched = true;
          }
          firstIgnoreField = false;
        }
        if (bMatched) continue;
      }//TESTED
     
      DocumentPojo newDoc = new DocumentPojo();
      String primaryKey = null;
      if (null != parser) {
        JsonObject json = new JsonObject();
        try {
          String[] records = parser.parseLine(line);
          for (int i = 0; i < records.length; ++i) {
            String record = records[i];
            if ((record.length() > 0) && (i < indexToField.length)) {
              String fieldName = (String) indexToField[i];
              if ((null != fieldName) && (fieldName.length() > 0)) {
                json.addProperty(fieldName, record);
                if (fieldName.equals(source.getFileConfig().XmlPrimaryKey)) {
                  primaryKey = record;
                }
              }
            }
          }
          if ((null != primaryKey) && (null != source.getFileConfig().XmlSourceName)) {
            newDoc.setUrl(source.getFileConfig().XmlSourceName + primaryKey);
          }//TESTED
          newDoc.addToMetadata("csv", JsonToMetadataParser.convertJsonObjectToLinkedHashMap(json, _memUsage));         
        }
        catch (Exception e) {} // can just skip over the line and carry on
       
      }//TESTED
     
      newDoc.setFullText(line);
      if (line.length() > 128) {
        newDoc.setDescription(line.substring(0, 128));
      }
      else {
        newDoc.setDescription(line);
      }
      partials.add(newDoc);
      docs++;
      if (docs >= _debugMaxDocs) { // debug mode only, otherwise commit to all docs in this file
        break;
      }
    }
    return partials;
  }
}
TOP

Related Classes of com.ikanow.infinit.e.harvest.extraction.document.file.CsvToMetadataParser

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.