//Set the already parsed content type on the metadata before it is handed
//to the parser.
metadata.set(Metadata.CONTENT_TYPE, mtas.mediaType.toString());
//Receives the plain text emitted by the PlainTextHandler created below.
final StringWriter writer = new StringWriter();
final ContentHandler textHandler = new BodyContentHandler( //only the Body
new PlainTextHandler(writer, false,skipLinebreaks)); //skip ignorable
//xhtmlHandler is left null when no XHTML output is produced; the code that
//adds the XHTML part checks for null before using it.
final ToXMLContentHandler xhtmlHandler;
final ContentHandler mainHandler;
if(!plainMediaType.equals(XHTML)){ //do not parse XHTML from XHTML
xhtmlHandler = new ToXMLContentHandler();
//MultiHandler is given both handlers (presumably forwarding every parse
//event to each of them - confirm against MultiHandler)
mainHandler = new MultiHandler(textHandler,xhtmlHandler);
} else {
//content is already XHTML: only the plain text is extracted
mainHandler = textHandler;
xhtmlHandler = null;
}
//Parse the content with the configured handler(s). Any failure raised by
//the parser is rethrown as an EngineException that names the ContentItem
//URI and its mime type and keeps the original exception as cause.
try {
    parser.parse(in, mainHandler, metadata, context);
} catch (Exception e) {
    throw new EngineException("Unable to convert ContentItem "+
        ci.getUri()+" with mimeType '"+ci.getMimeType()+"' to "+
        "plain text!",e);
} finally {
    //Close the stream on both the success and the failure path. Previously
    //the close was only reached after a successful parse, so the stream
    //leaked whenever parsing threw.
    IOUtils.closeQuietly(in);
}
//Log the extracted plain text; the guard avoids building the String when
//debug logging is disabled.
if(log.isDebugEnabled()){
log.debug("Plain Content: \n{}",writer.toString());
}
//The same random UUID is used as suffix for the URIs of both the plain
//text part and (if present) the XHTML part.
String random = randomUUID().toString();
UriRef textBlobUri = new UriRef("urn:tika:text:"+random);
//add the extracted plain text as an in-memory part of the ContentItem
ci.addPart(textBlobUri,
new InMemoryBlob(writer.toString(),
TEXT_PLAIN.toString())); //string -> no encoding
//xhtmlHandler is null when the parsed content was already XHTML
if(xhtmlHandler != null){
if(log.isDebugEnabled()){
log.debug("XML Content: \n{}",xhtmlHandler.toString());
}
UriRef xhtmlBlobUri = new UriRef("urn:tika:xhtml:"+random);
//add the XHTML serialization collected by the handler as a second part
ci.addPart(xhtmlBlobUri,
new InMemoryBlob(xhtmlHandler.toString(),
"application/xhtml+xml")); //string -> no encoding
}
//add the extracted metadata
if(log.isDebugEnabled()){
for(String name : metadata.names()){