// Source Code of com.socrata.api.SodaImporter

package com.socrata.api;


import com.socrata.builders.BlueprintBuilder;
import com.socrata.exceptions.LongRunningQueryException;
import com.socrata.exceptions.SodaError;
import com.socrata.model.importer.*;
import com.socrata.model.requests.SodaRequest;
import com.sun.jersey.api.client.ClientResponse;
import com.sun.jersey.core.header.InBoundHeaders;
import org.apache.commons.lang3.StringUtils;


import javax.annotation.Nullable;
import javax.ws.rs.core.MediaType;
import javax.ws.rs.core.UriBuilder;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.net.URLEncoder;


/**
 * This class contains all the APIs for using the full file import/update APIs.
 *
 * The update and append APIs in this class require that the dataset be in a working copy.  Since creating
 * and publishing working copies can be expensive, when operating on large datasets or doing frequent updates
 * you should use the Soda2Producer class.  Soda2Producer does not require creating working copies; however,
 * when doing very large changes or replacing a dataset, working copies can be useful.
 *
 * Look at http://dev.socrata.com/publishers/workflow for information about the workflow process.
 *
 */
public class SodaImporter extends SodaDdl
{


    /** Path segment appended after {@code API_BASE_PATH} when the constructor builds the import endpoint URI. */
    public static final String SCAN_BASE_PATH = "imports2";




    /** Import endpoint URI; built once in the constructor and used by the scan, import and blob-upload requests. */
    private final URI           importUri;




    /**
     * Create a new SodaImporter object, using the supplied credentials for authentication.
     *
     * @param url the base URL for the SODA2 domain to access.
     * @param userName user name to log in as
     * @param password password to log in with
     * @param token the App Token to use for authorization and usage tracking.  If this is {@code null}, no value will be sent.
     *
     * @return fully configured SodaImporter
     */
    public static final SodaImporter newImporter(final String url, String userName, String password, String token)
    {
        return new SodaImporter(HttpLowLevel.instantiateBasic(url, userName, password, token, null));
    }


    /**
     * Constructor.
     *
     * @param httpLowLevel the HttpLowLevel this uses to contact the server
     */
    public SodaImporter(HttpLowLevel httpLowLevel)
    {
        super(httpLowLevel);


        importUri = httpLowLevel.uriBuilder()
                              .path(API_BASE_PATH)
                              .path(SCAN_BASE_PATH)
                              .build();




    }


    /**
     * Creates a dataset from a CSV, using all the default column types.  This will also
     * assume the CSV has a single header row at the top.
     *
     * @param name name of the dataset to create
     * @param description description of the new dataset
     * @param file the file to upload
     * @param rowIdentifierColumnName row identifie
     * @return return the view that was just created.
     *
     * @throws InterruptedException
     * @throws SodaError
     * @throws IOException
     */
    public DatasetInfo createViewFromCsv(final String name, final String description, final File file, @Nullable final String rowIdentifierColumnName) throws InterruptedException, SodaError, IOException
    {
        return importScanResults(name, description, file, scan(file), rowIdentifierColumnName);
    }




    /**
     * Creates a dataset from a CSV, using all the default column types.  This will also
     * assume the CSV has a single header row at the top.
     *
     * @param name name of the dataset to create
     * @param description description of the new dataset
     * @param file the file to upload
     * @return return the view that was just created.
     *
     * @throws InterruptedException
     * @throws SodaError
     * @throws IOException
     */
    public DatasetInfo createViewFromCsv(final String name, final String description, final File file) throws InterruptedException, SodaError, IOException
    {
        return importScanResults(name, description, file, scan(file));
    }


    /**
     * Creates a new view based on a shapefile, these can be KML, KMZ or the ESRI format.
     *
     * This method will use default names and import all the layers.  To customize the import, call
     * scanShapeFile and importShapeScanResults separately.
     *
     * @param file the file to upload
     * @return the
     * @throws SodaError
     * @throws InterruptedException
     * @throws IOException
     */
    public DatasetInfo createViewFromShapefile(final File file) throws SodaError, InterruptedException, IOException
    {
        final ShapeScanResults  shapeScanResults = scanShapeFile(file);
        return importShapeScanResults(ShapeBlueprint.fromScanResults(shapeScanResults), file, shapeScanResults);
    }


    /**
     * Replaces a view with a new shapefile, these can be KML, KMZ or the ESRI format.
     *
     * @param viewId the id of the view to update
     * @param file the file to upload
     * @return
     */
    public DatasetInfo replaceViewFromShapefile(final String viewId, final File file) throws SodaError, InterruptedException, IOException
    {
        final ShapeScanResults  shapeScanResults = scanShapeFile(file);
        return replaceShapeScanResults(viewId, ShapeBlueprint.fromScanResults(shapeScanResults), file, shapeScanResults);
    }


    /**
     * Scans a file, then sends it up to the Socrata service to be analyzed and have things
     * like column types guessed.
     *
     * @param file File to upload
     * @return the results of the scan.
     *
     * @throws SodaError
     * @throws InterruptedException
     */
    public ScanResults scan(final File file) throws SodaError, InterruptedException
    {
        return scanFile("scan", HttpLowLevel.CSV_TYPE, file, ScanResults.class);
    }




    /**
     * Scans a file, then sends it up to the Socrata service to be analyzed and have things
     * like column types guessed.
     *
     * @param file File to upload
     * @return the results of the scan.
     *
     * @throws SodaError
     * @throws InterruptedException
     */
    public ShapeScanResults scanShapeFile(final File file) throws SodaError, InterruptedException
    {
        return scanFile("scanShape", MediaType.APPLICATION_OCTET_STREAM_TYPE, file, ShapeScanResults.class);
    }






    protected <T> T scanFile(final String method, final MediaType mediaType, final File file, Class<T> retType) throws SodaError, InterruptedException
    {


        SodaRequest requester = new SodaRequest<File>(null, file)
        {
            public ClientResponse issueRequest() throws LongRunningQueryException, SodaError
            {
                final URI scanUri = UriBuilder.fromUri(importUri)
                                              .queryParam("method", method)
                                              .build();


                return httpLowLevel.postFileRaw(scanUri, mediaType, payload);
            }
        };


        try {
            final ClientResponse response = requester.issueRequest();
            return response.getEntity(retType);
        } catch (LongRunningQueryException e) {
            return getHttpLowLevel().getAsyncResults(e.location, e.timeToRetry, getHttpLowLevel().getMaxRetries(), retType, requester);
        }
    }






    /**
     * Imports the results of scanning a file.  This will build  a default blueprint from it, assuming the first rows are
     * column names.
     *
     * @param name name of the dataset to create
     * @param description description of the datset
     * @param file file that was scanned
     * @param scanResults results of the scan
     * @return The default View object for the dataset that was just created.
     */
    public DatasetInfo importScanResults(final String name, final String description, final File file, final ScanResults scanResults) throws SodaError, InterruptedException, IOException
    {
       return importScanResults(name, description, file, scanResults, null);
    }




    /**
     * Imports the results of scanning a file.  This will build  a default blueprint from it, assuming the first rows are
     * column names.
     *
     * @param name name of the dataset to create
     * @param description description of the datset
     * @param file file that was scanned
     * @param scanResults results of the scan
     * @return The default View object for the dataset that was just created.
     */
    public DatasetInfo importScanResults(final String name, final String description, final File file, final ScanResults scanResults, @Nullable final String rowIdentifierColumnName) throws SodaError, InterruptedException, IOException
    {
        final Blueprint blueprint = new BlueprintBuilder(scanResults)
                                        .setSkip(1)
                                        .setName(name)
                                        .setDescription(description)
                                        .build();


        DatasetInfo createdDatasetInfo = importScanResults(blueprint, null, file, scanResults);


        if (rowIdentifierColumnName != null) {


            final Dataset createdDataset = (Dataset) loadDatasetInfo(createdDatasetInfo.getId());
            try {
                createdDataset.setupRowIdentifierColumnByName(rowIdentifierColumnName);
                createdDatasetInfo = updateDatasetInfo(createdDataset);
            } catch (IllegalArgumentException e) {
                deleteDataset(createdDataset.getId());
                throw e;
            }
        }
        return loadDatasetInfo(createdDatasetInfo.getId());
    }




    /**
     * Imports the results of scanning a file.  This method does not assume anything about the CSV, but instead has
     * the caller provide the blueprint and the translation for any schema defintion or data transforms.
     *
     * @param blueprint
     * @param translation
     * @param file file that was scanned
     * @param scanResults results of the scan
     * @return The default View object for the dataset that was just created.
     */
    public DatasetInfo importScanResults(final Blueprint blueprint, final String[] translation, final File file, final ScanResults scanResults) throws SodaError, InterruptedException, IOException
    {
        final String blueprintString = mapper.writeValueAsString(blueprint);


        final String blueprintBody = "blueprint="+URLEncoder.encode(blueprintString, "UTF-8");
        return sendScanResults(blueprintBody, scanResults.getFileId(), translation, file);
    }


    /**
     * Imports the results of scanning a file.  This method does not assume anything about the Shape file, but instead has
     * the caller provide the blueprint for any layer modifications.
     *
     * @param blueprint
     * @param file file that was scanned
     * @param scanResults results of the scan
     * @return The default View object for the dataset that was just created.
     */
    public DatasetInfo importShapeScanResults(final ShapeBlueprint blueprint, final File file, final ShapeScanResults scanResults) throws SodaError, InterruptedException, IOException
    {
        return sendShapeScanResults(scanResults.getFileId(), "shapefile", blueprint, file, null);
    }


    /**
     * Replaces the a view with the results of scanning in a shape file.
     *
     * @param viewId  id of the view to overwrite
     * @param blueprint blueprint object with the name of the layers, etc.
     * @param file file that was scanned
     * @param scanResults results of the scan
     * @return  The default object for the dataset that was just updated.
     */
    public DatasetInfo replaceShapeScanResults(final String viewId, final ShapeBlueprint blueprint, final File file, final ShapeScanResults scanResults) throws SodaError, InterruptedException, IOException
    {
        return sendShapeScanResults(scanResults.getFileId(), "replaceShapefile", blueprint, file, viewId);
    }




    /**
     * This appends the contents of a file to a dataset on Socrata.  This operation requires the dataset is
     * in a working copy, so unless you are doing large updates or your dataset is small, using the UPSERT functionality
     * in Soda2Producer may give you better results.
     *
     * If you are doing frequent updates, the apis in Soda2Producer may give better results (since they don't require working copies)
     *
     * In the case of errors, if the error is an MetadataUpdateError, then the data has all been committed, but there was a problem with
     * the meta-data.  In the case of any other errors, the dataset is in an unknown state.  The only way to get it back into a clean
     * state is to remove the working copy, and start again.  The Soda2Producer API has better error semantics where all rows will be
     * either committed or rolledback.
     *
     *
     * @param datasetId  id of the dataset to append to
     * @param file file with the data in it
     * @param skip number of rows in the data to skip (normally for skipping headers)
     * @param translation an optional translation array for translating from values in the file and values in the dataset.
     * @return The info of the dataset after the append operation.
     * @throws com.socrata.exceptions.MetadataUpdateError thrown if the data was updated, but the process failed because
     * of a metadata inconsistency.  In this case, the data has already been committed.
     */
    public DatasetInfo append(String datasetId, File file, int skip, final String[] translation) throws SodaError, InterruptedException, IOException
    {


        final ScanResults results = scan(file);
        return updateFromScanResults(datasetId, "append", skip, results.getFileId(), translation, file);
    }


    /**
     * This replaces the contents of a file to a dataset on Socrata.  This operation requires the dataset is
     * in a working copy, which is an expensive operation.  If your dataset is large, you may want to figure out
     * how to figure out which rows to update, rather than doing a full replace for updates.
     *
     * If you are doing frequent updates, the apis in Soda2Producer may give better results (since they don't require working copies)
     *
     * In the case of errors, if the error is an MetadataUpdateError, then the data has all been committed, but there was a problem with
     * the meta-data.  In the case of any other errors, the dataset is in an unknown state.  The only way to get it back into a clean
     * state is to remove the working copy, and start again.  The Soda2Producer API has better error semantics where all rows will be
     * either committed or rolledback.
     *
     * @param datasetId  id of the dataset to append to
     * @param file file with the data in it
     * @param skip number of rows in the data to skip (normally for skipping headers)
     * @param translation an optional translation array for translating from values in the file and values in the dataset.
     * @return The info of the dataset after the append operation.
     * @throws com.socrata.exceptions.MetadataUpdateError thrown if the data was updated, but the process failed because
     * of a metadata inconsistency.  In this case, the data has already been committed.
     */
    public DatasetInfo replace(String datasetId, File file, int skip, final String[] translation) throws SodaError, InterruptedException, IOException
    {
        final ScanResults results = scan(file);
        return updateFromScanResults(datasetId, "replace", skip, results.getFileId(), translation, file);
    }


    protected DatasetInfo updateFromScanResults(final String datasetId, final String method, final int skip, final String fileId, final String[] translation, final File file) throws SodaError, InterruptedException, IOException
    {




        final StringBuilder updateBody = new StringBuilder();
        updateBody.append("viewUid=").append(datasetId)
                  .append("&method=").append(method)
                  .append("&skip=").append(skip);




        return sendScanResults(updateBody.toString(), fileId, translation, file);


    }


    protected DatasetInfo sendScanResults(final String basePostBody, final String fileId, final String[] translation, final File file) throws SodaError, InterruptedException, IOException
    {


        final StringBuilder postbodyBuilder = new StringBuilder(basePostBody);


        final String translationString =  (translation != null) ? "[" + StringUtils.join(translation, ",") + "]" : "";


        postbodyBuilder.append("&fileId=").append(fileId)
                       .append("&translation=").append(URLEncoder.encode(translationString, "UTF-8"))
                       .append("&name=").append(URLEncoder.encode(file.getName(), "UTF-8"));


        SodaRequest requester = new SodaRequest<String>(null, postbodyBuilder.toString())
        {
            public ClientResponse issueRequest() throws LongRunningQueryException, SodaError
            {
                return httpLowLevel.postRaw(importUri, MediaType.APPLICATION_FORM_URLENCODED_TYPE, ContentEncoding.IDENTITY, payload);
            }
        };


        try {
            final ClientResponse response = requester.issueRequest();
            return response.getEntity(DatasetInfo.class);
        } catch (LongRunningQueryException e) {
            LongRunningQueryException lrqe = e.location != null ? e :
                new LongRunningQueryException(UriBuilder.fromUri(importUri).queryParam("ticket", e.ticket).build(), e.timeToRetry, e.ticket);
            LongRunningRequest<String, DatasetInfo> longRunningRequest = new LongRunningRequest(lrqe, DatasetInfo.class, requester);
            HttpLowLevel http = getHttpLowLevel();
            return longRunningRequest.checkStatus(http, http.getStatusCheckErrorRetries(), http.getStatusCheckErrorTime());
        }
    }


    protected DatasetInfo sendShapeScanResults(final String fileId, final String method, final ShapeBlueprint shapeBlueprint, final File file, final String viewId) throws SodaError, InterruptedException, IOException
    {


        final StringBuilder postbodyBuilder = new StringBuilder();
        final String blueprintString = mapper.writeValueAsString(shapeBlueprint);
        final URI    shapeImportUri = UriBuilder.fromUri(importUri)
                                                .queryParam("method", method)
                                                .build();




        postbodyBuilder.append("&fileId=").append(fileId)
                       .append("&name=").append(URLEncoder.encode(file.getName(), "UTF-8"))
                       .append("&blueprint=").append(URLEncoder.encode(blueprintString, "UTF-8"));


        if (StringUtils.isNotEmpty(viewId)) {
            postbodyBuilder.append("&viewUid=").append(viewId);
        }


        SodaRequest requester = new SodaRequest<String>(null, postbodyBuilder.toString())
        {
            public ClientResponse issueRequest() throws LongRunningQueryException, SodaError
            {
                return httpLowLevel.postRaw(shapeImportUri, MediaType.APPLICATION_FORM_URLENCODED_TYPE, ContentEncoding.IDENTITY, payload);
            }
        };




        try {


            final ClientResponse response = requester.issueRequest();
            return response.getEntity(DatasetInfo.class);
        } catch (LongRunningQueryException e) {


            if (e.location != null) {
                return getHttpLowLevel().getAsyncResults(e.location, e.timeToRetry, Integer.MAX_VALUE, DatasetInfo.class, requester);
            } else {


                final URI ticketUri = UriBuilder.fromUri(shapeImportUri)
                                                .queryParam("ticket", e.ticket)
                                                .build();
                return getHttpLowLevel().getAsyncResults(ticketUri, e.timeToRetry, Integer.MAX_VALUE, DatasetInfo.class, requester);


            }
        }


    }




    /**
     * Imports a file that is NOT going to be used to create a datset, but is instead available for
     * downloading directly.
     *
     * @param name name of the file
     * @param description description of the file
     * @param file the file to upload
     * @return The NonDataFileDataset object that was saved to Socrata
     */
    public NonDataFileDataset importNonDataFile(final String name, final String description, final File file) throws SodaError, InterruptedException
    {


        SodaRequest requester = new SodaRequest<File>(null, file)
        {
            public ClientResponse issueRequest() throws LongRunningQueryException, SodaError
            {
                final URI scanUri = UriBuilder.fromUri(importUri)
                                              .queryParam("method", "blob")
                                              .queryParam("fileUploaderfile", file.getName())
                                              .build();


                try {
                    final InputStream   is = new FileInputStream(file);
                    try {
                        ClientResponse clientResponse = httpLowLevel.postFileRaw(scanUri, MediaType.APPLICATION_OCTET_STREAM_TYPE, MediaType.TEXT_PLAIN_TYPE, file);


                        //Funny issue with service, currently only returns MediaType.TEXT_PLAIN_TYPE, but the
                        //response needs to be processed as JSON.  So, wrap the return in a ClientResponse that acts
                        //as if the content type is JSON. There is a bug on the core server side to fix this.
                        InBoundHeaders  headers = new InBoundHeaders();
                        headers.putSingle("Content-Type", MediaType.APPLICATION_JSON);
                        return new ClientResponse(clientResponse.getStatus(), headers, clientResponse.getEntityInputStream(), clientResponse.getClient().getMessageBodyWorkers());


                    } finally {
                        is.close();
                    }
                } catch (IOException ioe) {
                    throw new SodaError("Unable to load file: " + file.getAbsolutePath());
                }
            }
        };


        NonDataFileDataset nonDataFileDataset;
        try {
            final ClientResponse response = requester.issueRequest();
            nonDataFileDataset = response.getEntity(NonDataFileDataset.class);
        } catch (LongRunningQueryException e) {
            nonDataFileDataset = getHttpLowLevel().getAsyncResults(e.location, e.timeToRetry, getHttpLowLevel().getMaxRetries(), NonDataFileDataset.class, requester);
        }


        nonDataFileDataset.setDescription(description);
        nonDataFileDataset.setName(name);
        return (NonDataFileDataset) updateDatasetInfo(nonDataFileDataset);
    }




    /**
     * Replaces the file blob for a Imports a NonDataFileDataset.  For changing other properties,
     * use SodaDdl.updateDatasetInfo
     *
     * @param id name of the file
     * @param file the file to upload
     * @return The NonDataFileDataset object that was saved to Socrata
     */
    public NonDataFileDataset replaceNonDataFile(final String id, final File file) throws SodaError, InterruptedException
    {


        SodaRequest requester = new SodaRequest<File>(null, file)
        {
            public ClientResponse issueRequest() throws LongRunningQueryException, SodaError
            {
                final URI scanUri = UriBuilder.fromUri(viewUri)
                                              .path(id + ".txt")
                                              .queryParam("method", "replaceBlob")
                                              .queryParam("fileUploaderfile", file.getName())
                                              .build();


                try {
                    final InputStream   is = new FileInputStream(file);
                    try {
                        ClientResponse clientResponse = httpLowLevel.postFileRaw(scanUri, MediaType.APPLICATION_OCTET_STREAM_TYPE, MediaType.TEXT_PLAIN_TYPE, file);


                        //Funny issue with service, currently only returns MediaType.TEXT_PLAIN_TYPE, but the
                        //response needs to be processed as JSON.  So, wrap the return in a ClientResponse that acts
                        //as if the content type is JSON. There is a bug on the core server side to fix this.
                        InBoundHeaders  headers = new InBoundHeaders();
                        headers.putSingle("Content-Type", MediaType.APPLICATION_JSON);
                        return new ClientResponse(clientResponse.getStatus(), headers, clientResponse.getEntityInputStream(), clientResponse.getClient().getMessageBodyWorkers());


                    } finally {
                        is.close();
                    }
                } catch (IOException ioe) {
                    throw new SodaError("Unable to load file: " + file.getAbsolutePath());
                }
            }
        };


        NonDataFileDataset nonDataFileDataset;
        try {
            final ClientResponse response = requester.issueRequest();
            nonDataFileDataset = response.getEntity(NonDataFileDataset.class);
        } catch (LongRunningQueryException e) {
            nonDataFileDataset = getHttpLowLevel().getAsyncResults(e.location, e.timeToRetry, getHttpLowLevel().getMaxRetries(), NonDataFileDataset.class, requester);
        }


        return (NonDataFileDataset) loadDatasetInfo(id);
    }






    /**
     * Creates a straight translation with no transforms for a  given bluprint.
     *
     * @param blueprint blueprint to build the translation from
     * @return the array of mappings to map each field to itself.  This will create a translation that will do nothing.
     */
    public String[] generateTranslation(final Blueprint blueprint) {
        final String[]    retVal = new String[blueprint.getColumns().size()];


        int i =0;
        for (BlueprintColumn column : blueprint.getColumns()) {
            //retVal[i++] = column.getName();
            retVal[i] = "col" + i;
            i++;
        }


        return retVal;
    }






}
// Source Code of com.socrata.api.SodaImporter
//
// Related Classes of com.socrata.api.SodaImporter