AutoDetectParser parser = new AutoDetectParser();
try {
parser.parse(is, contenthandler, metadata);
} catch (SAXException | TikaException e) {
logger.error("Unable to parse stream: " + e.getMessage());
throw new KarmaException("Unable to parse stream: "
+ e.getMessage());
}
MediaTypeRegistry registry = MimeTypes.getDefaultMimeTypes()
.getMediaTypeRegistry();
registry.addSuperType(new MediaType("text", "csv"), new MediaType(
"text", "plain"));
MediaType parsedType = MediaType.parse(metadata
.get(Metadata.CONTENT_TYPE));
if (registry.isSpecializationOf(registry.normalize(type), registry
.normalize(parsedType).getBaseType())) {
metadata.set(Metadata.CONTENT_TYPE, type.toString());
}
logger.info("Detected " + metadata.get(Metadata.CONTENT_TYPE));
inputType = getInputType(metadata);
encoding = metadata.get(Metadata.CONTENT_ENCODING);
} else {
encoding = EncodingDetector.detect(is);
}
is.reset();
if(inputType == null) {
throw new KarmaException("Content type unrecognized");
}
switch (inputType) {
case JSON : {
worksheet = generateWorksheetFromJSONStream(sourceName, is,
workspace, encoding, maxNumLines);
break;
}
case XML : {
worksheet = generateWorksheetFromXMLStream(sourceName, is,
workspace, encoding, maxNumLines);
break;
}
case CSV : {
worksheet = generateWorksheetFromDelimitedStream(sourceName,
is, workspace, encoding, maxNumLines);
break;
}
case AVRO : {
worksheet = generateWorksheetFromAvroStream(sourceName, is, workspace, encoding, maxNumLines);
}
}
} catch (Exception e ) {
logger.error("Error generating worksheet", e);
throw new KarmaException("Unable to generate worksheet: " + e.getMessage());
}
if(worksheet == null) {
throw new KarmaException("Content type unrecognized");
}
return worksheet;
}