Package com.socrata.datasync.publishers

Source Code of com.socrata.datasync.publishers.Soda2Publisher

package com.socrata.datasync.publishers;

import com.socrata.api.Soda2Producer;
import com.socrata.api.SodaDdl;
import com.socrata.datasync.PublishMethod;
import com.socrata.datasync.Utils;
import com.socrata.exceptions.SodaError;
import com.socrata.model.UpsertError;
import com.socrata.model.UpsertResult;
import com.socrata.model.importer.Column;
import com.socrata.model.importer.Dataset;

import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

public class Soda2Publisher {

    private Soda2Publisher() {
        throw new AssertionError("Never instantiate utility classes!");

     * Deletes the rows corresponding to the IDs within the given file
     * @param csvOrTsvFile a file specifiying the ids of the rows to be deleted
     * @return an upsertResult with the number of rows deleted and the number of errors encountered=
    public static UpsertResult deleteRows(Soda2Producer producer, SodaDdl ddl,
                                          final String id, final File csvOrTsvFile, final int numRowsPerChunk, final boolean containsHeaderRow)
            throws IOException, SodaError, InterruptedException
        List<Map<String, Object>> upsertObjectsChunk = new ArrayList<Map<String, Object>>();
        int totalRowsDeleted = 0;
        List<UpsertError> deleteErrors = new ArrayList<UpsertError>();

        String rowIdentifierName = getDatasetRowId(ddl, id);

        FileReader fileReader = new FileReader(csvOrTsvFile);
        CSVReader reader = new CSVReader(fileReader);
        String[] currLine;

        // skip first row if there is a header row
        if(containsHeaderRow) {

        do {
            currLine = reader.readNext();
            if(currLine != null) {
                upsertObjectsChunk.add(ImmutableMap.of(rowIdentifierName, (Object) currLine[0], ":deleted", Boolean.TRUE));
            if(upsertObjectsChunk.size() == numRowsPerChunk || currLine == null) {
                UpsertResult chunkResult = producer.upsert(id, upsertObjectsChunk);
                totalRowsDeleted += chunkResult.getRowsDeleted();

                if(chunkResult.errorCount() > 0) {
                    // TODO find a better way to suppress these errors (which are really not errors anyway)
                    for(UpsertError err : chunkResult.getErrors()) {
                        if(!err.getError().contains("no record is found")) {
        } while(currLine != null);

        return new UpsertResult(
                0, 0, totalRowsDeleted, deleteErrors);

     * Publishes the given csvOrTsvFile via SODA 2 replace or upsert/append. Publishes in chunks if using
     * upsert/append and numRowsPerChunk > 0 (all data will be upserted in one chunk if numRowsPerChunk == 0)
     * where each chunk contains numRowsPerChunk rows. Chunking is useful when uploading very large CSV files.
     * @param method to use to publish (upsert, append, or replace; delete not allowed)
     * @param id dataset ID to publish to
     * @param csvOrTsvFile file to publish containing data in comma- or tab- separated values (CSV or TSV) format
     * @param numRowsPerChunk number of rows within CSV to publish in each chunk
     *                        (if numRowsPerChunk == 0 do not use chunking)
     * @param containsHeaderRow if true assume the first row in CSV/TSV file is a list of the dataset columns,
     *                          otherwise upload all rows as new rows (column order must exactly match that of
     *                          Socrata dataset)
     * @return UpsertResult containing success or error information
     * @throws
     * @throws com.socrata.exceptions.SodaError
     * @throws InterruptedException
    public static UpsertResult publishViaSoda2(Soda2Producer producer, SodaDdl ddl,
                                               final PublishMethod method, final String id, final File csvOrTsvFile,
                                               int numRowsPerChunk, final boolean containsHeaderRow)
            throws IOException, SodaError, InterruptedException
        // If doing a replace force it to upload all data as a single chunk
        if(method.equals(PublishMethod.replace)) {
            System.out.println("WARNING: replace does not support chunking.");
            numRowsPerChunk = 0;

        List<Map<String, Object>> upsertObjectsChunk = new ArrayList<Map<String, Object>>();
        int totalRowsCreated = 0;
        int totalRowsUpdated = 0;
        int totalRowsDeleted = 0;
        List<UpsertError> upsertErrors = new ArrayList<UpsertError>();

        char columnDelimiter = ',';
        if(Utils.getFileExtension(csvOrTsvFile.toString()).equals("tsv")) {
            columnDelimiter = '\t';

        int numUploadedChunks = 0;
        FileReader  fileReader = new FileReader(csvOrTsvFile);
        CSVReader reader = new CSVReader(fileReader, columnDelimiter);

        String[] headers;
        if(containsHeaderRow) {
            headers = reader.readNext();
            // trim whitespace from header names
            if (headers != null) {
                for (int i=0; i<headers.length; i++) {
                    headers[i] = headers[i].trim();
        } else {
            // get API field names for each column in dataset
            Dataset info = (Dataset) ddl.loadDatasetInfo(id);
            List<Column> columns = info.getColumns();
            headers = new String[columns.size()];
            for(int i = 0; i < columns.size(); i++) {
                headers[i] = columns.get(i).getFieldName();

        if (headers != null) {
            String[] currLine;
            do {
                currLine = reader.readNext();
                ImmutableMap.Builder<String, Object> builder = ImmutableMap.builder();
                if(currLine != null) {
                    for (int i=0; i<currLine.length; i++) {
                        if (i < headers.length) {
                            builder.put(headers[i], currLine[i]);
                if(upsertObjectsChunk.size() == numRowsPerChunk || currLine == null) {
                    if(numRowsPerChunk == 0) {
                        System.out.println("Publishing entire file via HTTP...");
                    } else {
                        System.out.println("Publishing file in chunks via HTTP (" + numUploadedChunks * numRowsPerChunk + " rows uploaded so far)...");

                    // upsert or replace current chunk
                    UpsertResult chunkResult;
                    if(method.equals(PublishMethod.upsert) || method.equals(PublishMethod.append)) {
                        chunkResult = producer.upsert(id, upsertObjectsChunk);
                    } else if(method.equals(PublishMethod.replace)) {
                        chunkResult = producer.replace(id, upsertObjectsChunk);
                    } else {
                        throw new IllegalArgumentException("Error performing publish: "
                                + method + " is not a valid publishing method");
                    totalRowsCreated += chunkResult.getRowsCreated();
                    totalRowsUpdated += chunkResult.getRowsUpdated();
                    totalRowsDeleted += chunkResult.getRowsDeleted();
                    numUploadedChunks += 1;

                    if(chunkResult.errorCount() > 0) {
                        if(numRowsPerChunk != 0) {
                            for (UpsertError upsertErr : chunkResult.getErrors()) {
                                int lineIndexOffset = (containsHeaderRow) ? 2 : 1;
                                System.err.println("Error uploading chunk " + numUploadedChunks + ": " +
                                        upsertErr.getError() + " (line " +
                                        (upsertErr.getIndex() + lineIndexOffset + ((numUploadedChunks-1) * numRowsPerChunk)) + " of file)");

                    if(numRowsPerChunk != 0) {
                        System.out.println("Chunk " + numUploadedChunks + " uploaded: " + chunkResult.getRowsCreated() + " rows created; " +
                                chunkResult.getRowsUpdated() + " rows updated; " + chunkResult.getRowsDeleted() +
                                " rows deleted; " + chunkResult.errorCount() + " rows omitted");

            } while(currLine != null);
        return new UpsertResult(
                totalRowsCreated, totalRowsUpdated, totalRowsDeleted, upsertErrors);

     * This operation will do an append/upsert.
     * IMPORTANT: If you have a row identifier set on the dataset, and this is appending a row that
     * has an identifier of a row that has already been added, this will overwrite that row rather than
     * failing. If you have no row identifier set, this will be a straight append every time.
    public static UpsertResult appendUpsert(Soda2Producer producer, SodaDdl ddl,
                                            final String id, final File file,
                                            int numRowsPerChunk, boolean containsHeaderRow)
            throws SodaError, InterruptedException, IOException
        return publishViaSoda2(producer, ddl, PublishMethod.upsert, id, file, numRowsPerChunk, containsHeaderRow);

     * This is a new replace function that does not need a working copy.
    public static UpsertResult replaceNew(Soda2Producer producer, SodaDdl ddl,
                                          final String id, final File file,
                                          boolean containsHeaderRow)
            throws SodaError, InterruptedException, IOException {
        return publishViaSoda2(producer, ddl, PublishMethod.replace, id, file, 0, containsHeaderRow);

    private static String getDatasetRowId(SodaDdl ddl, String id) throws SodaError, InterruptedException {
        Dataset info = (Dataset) ddl.loadDatasetInfo(id);
        Column rowIdentifier = info.lookupRowIdentifierColumn();
        String rowIdentifierName;
        if (rowIdentifier == null) {
            rowIdentifierName = ":id";
        } else {
            rowIdentifierName = rowIdentifier.getFieldName();
        return rowIdentifierName;

Related Classes of com.socrata.datasync.publishers.Soda2Publisher

Copyright © 2018. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact