// Source listing: org.renjin.maven.namespace.DatasetsBuilder
// (Text below was scraped page residue preceding the package declaration;
// commented out so the file compiles as Java.)

package org.renjin.maven.namespace;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.PushbackInputStream;
import java.util.Properties;
import java.util.zip.GZIPInputStream;

import com.google.common.io.Closer;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
import org.renjin.eval.EvalException;
import org.renjin.eval.Session;
import org.renjin.eval.SessionBuilder;
import org.renjin.parser.RParser;
import org.renjin.primitives.io.connections.GzFileConnection;
import org.renjin.primitives.io.serialization.RDataReader;
import org.renjin.primitives.io.serialization.RDataWriter;
import org.renjin.sexp.ExpressionVector;
import org.renjin.sexp.FunctionCall;
import org.renjin.sexp.LogicalVector;
import org.renjin.sexp.PairList;
import org.renjin.sexp.SEXP;
import org.renjin.sexp.StringVector;
import org.renjin.sexp.Symbol;
import org.tukaani.xz.XZInputStream;

import com.google.common.base.Joiner;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;
import com.google.common.io.Closeables;

/**
* Prepares datasets, writes an index, and copies them into target/classes
* as a resource.
*
* <p>GNU R supports several types of data formats and compression; we want to
* simplify everything at compile time into an uncompressed, serialized objects
* we don't have to muck around with it at runtime. The data files will be compressed
* in a jar in any case.</p>
*
* <p>To complicate things, a single "dataset" can contain multiple R objects. Again,
* to simplify things at runtime, we'll write out each element to a seperate resource
* file, and then write a "datasets" index file that maps logical datasets to the
* named R objects.
*/
public class DatasetsBuilder {

  private File packageRoot;
  private File dataObjectDirectory;
  private File dataDirectory;
 
  /**
   * Maps logical datasets to R object names
   */
  private Multimap<String, String> indexMap = HashMultimap.create();

  public DatasetsBuilder(File packageRoot, File dataDirectory) {
    this.packageRoot = packageRoot;
   
    this.dataObjectDirectory = new File(packageRoot, "data");
    this.dataObjectDirectory.mkdirs();
   
    this.dataDirectory = dataDirectory;
  }

  public void build() throws FileNotFoundException  {
    if(dataDirectory.exists()) {
      File[] files = dataDirectory.listFiles();
      if(files != null) {
        for(File dataFile : files) {
          try {
            processDataset(dataFile);
          } catch(EvalException e) {
            System.err.println("ERROR processing data file " + dataFile.getName() + ": " + e.getMessage());
            e.printRStackTrace(System.err);
          } catch(Exception e) {
            System.err.println("Exception processing data file " + dataFile);
            e.printStackTrace();
          }
        }
      }
    }

    if(!indexMap.isEmpty())  {
      writeIndex();
    }
  }

  private void writeIndex() throws FileNotFoundException  {
   
    Properties index = new Properties();
    for(String logicalDatasetName : indexMap.keySet()) {
      index.put(logicalDatasetName, Joiner.on(",").join(indexMap.get(logicalDatasetName)));
    }
   
    File indexFile = new File(packageRoot, "datasets");
    FileOutputStream out = new FileOutputStream(indexFile);
    try {
      index.store(out, "Datasets index");
      out.close();
    } catch (IOException e) {
      throw new RuntimeException("Failed to write dataset index to " + indexFile.getAbsolutePath(), e);
    }
  }

  private void processDataset(File dataFile) throws IOException {
    if(dataFile.getName().endsWith("datalist")) {
      return;
    } else if(dataFile.getName().endsWith(".rda") || dataFile.getName().endsWith(".RData")) {
      processRDataFile(dataFile);

    } else if(dataFile.getName().endsWith(".txt.gz")) {
      processTextFile(dataFile, stripExtension(dataFile, ".txt.gz"), "");

    } else if(dataFile.getName().endsWith(".txt")) {
      processTextFile(dataFile, stripExtension(dataFile), "");
     
    } else if(dataFile.getName().endsWith(".tab")) {
      processTextFile(dataFile, stripExtension(dataFile), "");
     
    } else if(dataFile.getName().toLowerCase().endsWith(".csv")) {
      processTextFile(dataFile, stripExtension(dataFile), ";");
     
    } else if(dataFile.getName().endsWith(".R")) {
      processRScript(dataFile, stripExtension(dataFile));
   
    } else {
      throw new RuntimeException("Don't know how to process datafile " + dataFile.getName());
    }
  }

  /**
   * Copy and decompress the saved PairList in rda format.
   * @param dataFile the source data format
   * @throws IOException
   */
  private void processRDataFile(File dataFile) throws IOException {
    Closer closer = Closer.create();
    InputStream in = closer.register(DatasetsBuilder.decompress(dataFile));
    SEXP exp;
    try {
      RDataReader reader = new RDataReader(in);
      exp = reader.readFile();
    } catch(Throwable e) {
      throw closer.rethrow(e);
    } finally {
      in.close();
    }
   
    if(!(exp instanceof PairList)) {
      throw new UnsupportedOperationException("Expected to find a pairlist in " + dataFile + ", found a " + exp.getTypeName());
    }
   
    String logicalDatasetName = stripExtension(dataFile.getName());
    Session session = new SessionBuilder().withoutBasePackage().build();
    writePairList(logicalDatasetName, session, (PairList)exp);
  }

  private String stripExtension(String name) {
    int lastDot = name.lastIndexOf('.');
    return name.substring(0, lastDot);
  }
 

  private String stripExtension(File dataFile) {
    return stripExtension(dataFile.getName());
  }


  private static String stripExtension(File file, String ext) {
    return stripExtension(file.getName(), ext);
  }

  private static String stripExtension(String name, String ext) {
    return name.substring(0, name.length() - ext.length());
  }

  /**
   * Text files (*.tab, *.csv, *.txt) are processed with utils::read.table() and the
   * resulting data.frame is stored as the single object of the logical dataset.
   */
  private void processTextFile(File dataFile, String logicalDatasetName, String sep) throws IOException {
    // Read into a data frame using read.table()
    PairList.Builder args = new PairList.Builder();
    args.add(StringVector.valueOf(dataFile.getAbsolutePath()));
    args.add("header", LogicalVector.TRUE);
    args.add("sep", StringVector.valueOf(sep));

    FunctionCall readTable = FunctionCall.newCall(Symbol.get("::"), Symbol.get("utils"), Symbol.get("read.table"));
    FunctionCall call = new FunctionCall(readTable, args.build());

    Session session = new SessionBuilder().build();
    SEXP dataFrame = session.getTopLevelContext().evaluate(call);

    PairList.Builder pairList = new PairList.Builder();
    pairList.add(logicalDatasetName, dataFrame);

    writePairList(logicalDatasetName, session, pairList.build());
  }
 
  /**
   * R Scripts are evaluated, and any resulting objects in the global
   * namespace are considered part of the dataset.
   *
   */
  private void processRScript(File scriptFile, String logicalDatasetName) throws IOException {

    Session session = new SessionBuilder().build();
    FileReader reader = new FileReader(scriptFile);
    ExpressionVector source = RParser.parseAllSource(reader);
    reader.close();
   
    session.getTopLevelContext().evaluate(source);
   
    PairList.Builder pairList = new PairList.Builder();
    for(Symbol symbol : session.getGlobalEnvironment().getSymbolNames()) {
      if(!symbol.getPrintName().startsWith(".")) {
        pairList.add(symbol, session.getGlobalEnvironment().getVariable(symbol));
      }
    }  
    writePairList(logicalDatasetName, session, pairList.build());
  }
 
  /**
   * Write each element of the pairlist out to a separate resource
   * file so that it can be loaded on demand, rather than en mass
   * when a package is loaded.
   */
  private void writePairList(String logicalDatasetName, Session session,
      PairList pairList) throws FileNotFoundException, IOException {
       
    for(PairList.Node node : pairList.nodes()) {
     
      if(indexMap.values().contains(node.getName())) {
        throw new UnsupportedOperationException(String.format("Duplicate R object '%s' name in dataset '%s' ",
            node.getName(), logicalDatasetName));
      }
      indexMap.put(logicalDatasetName, node.getName());
     
      File targetFile = new File(dataObjectDirectory, node.getName());
      FileOutputStream out = new FileOutputStream(targetFile);
      RDataWriter writer = new RDataWriter(session.getTopLevelContext(), out);
      writer.save(node.getValue());
      out.close();   
    }
  }


  /**
   * Check the input stream for a compression header and wrap in a decompressing
   * stream (gzip or xz) if necessary
   */
  public static InputStream decompress(File file) throws IOException {

    FileInputStream in = new FileInputStream(file);
    int b1 = in.read();
    int b2 = in.read();
    int b3 = in.read();
    in.close();
   
    if(b1 == GzFileConnection.GZIP_MAGIC_BYTE1 && b2 == GzFileConnection.GZIP_MAGIC_BYTE2) {
      return new GZIPInputStream(new FileInputStream(file));

    } else if(b1 == 0xFD && b2 == '7') {
      // See http://tukaani.org/xz/xz-javadoc/org/tukaani/xz/XZInputStream.html
      // Set a memory limit of 64mb, if this is not sufficient, it will throw
      // an exception rather than an OutOfMemoryError, which will terminate the JVM
      return new XZInputStream(new FileInputStream(file), 64 * 1024 * 1024);
     
    } else if (b1 == 'B' && b2 == 'Z' && b3 == 'h' ) {
      return new BZip2CompressorInputStream(new FileInputStream(file));
   
    } else {
      return new FileInputStream(file);
    }
  }
}
// (Scraped page footer removed from code path; commented out so the file compiles.)
// Related Classes of org.renjin.maven.namespace.DatasetsBuilder
// Copyright © 2018 www.massapi.com. All source code is property of its
// respective owners. Java is a trademark of Sun Microsystems, Inc., owned by Oracle Inc.