}
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
throws IOException, TikaException, SAXException {
EmbeddedDocumentExtractor extractor = context.get(EmbeddedDocumentExtractor.class,
new ParsingEmbeddedDocumentExtractor(context));
String charsetName = "windows-1252";
metadata.set(Metadata.CONTENT_TYPE, MBOX_MIME_TYPE);
metadata.set(Metadata.CONTENT_ENCODING, charsetName);
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();
InputStreamReader isr = new InputStreamReader(stream, charsetName);
BufferedReader reader = new BufferedReader(isr);
try {
String curLine = reader.readLine();
int mailItem = 0;
do {
if (curLine.startsWith(MBOX_RECORD_DIVIDER)) {
Metadata mailMetadata = new Metadata();
Queue<String> multiline = new LinkedList<String>();
mailMetadata.add(EMAIL_FROMLINE_METADATA, curLine.substring(MBOX_RECORD_DIVIDER.length()));
mailMetadata.set(Metadata.CONTENT_TYPE, "message/rfc822");
curLine = reader.readLine();
ByteArrayOutputStream message = new ByteArrayOutputStream(100000);
do {
if (curLine.startsWith(" ") || curLine.startsWith("\t")) {
String latestLine = multiline.poll();
latestLine += " " + curLine.trim();
multiline.add(latestLine);
} else {
multiline.add(curLine);
}
message.write(curLine.getBytes(charsetName));
message.write(0x0A);
curLine = reader.readLine();
} while (curLine != null && !curLine.startsWith(MBOX_RECORD_DIVIDER) && message.size() < MAIL_MAX_SIZE);
for (String item : multiline) {
saveHeaderInMetadata(mailMetadata, item);
}
ByteArrayInputStream messageStream = new ByteArrayInputStream(message.toByteArray());
message = null;
if (extractor.shouldParseEmbedded(mailMetadata)) {
extractor.parseEmbedded(messageStream, xhtml, mailMetadata, true);
}
if (tracking) {
getTrackingMetadata().put(mailItem++, mailMetadata);
}