String collectionName, String filename, URLNormalizers urlNormalizers, URLFilters filters) throws ParseException, IOException {
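// Per-document parse state: the <DOCNO> token, the parsed TRECDoc, the raw
// body, the detected mimetype, the output metadata, and the fetch date.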
Token docNumber = null;
TRECDoc d = new TRECDoc();
StringBuffer docBody = new StringBuffer();
String mimetype = null;
Metadata metaData = new Metadata();
Date fetchDate = null;
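// Generated JavaCC loop: one iteration per <DOC>...</DOC> record; it exits
// when the lookahead token is not DOC_BEGIN.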
label_1:
while (true) {
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
case DOC_BEGIN:
;
break;
default:
jj_la1[0] = jj_gen;
break label_1;
}
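// The outer try recovers from parse failures by scanning ahead to </DOC>;
// the inner try first converts lexer Errors into ParseExceptions.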
try {
try {
jj_consume_token(DOC_BEGIN);
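// Skip any run of tokens 23-26 (apparently whitespace/EOL kinds in the
// generated grammar) following <DOC>. The same pattern repeats below.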
label_2:
while (true) {
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
case 23:
case 24:
case 25:
case 26:
;
break;
default:
jj_la1[1] = jj_gen;
break label_2;
}
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
case 23:
jj_consume_token(23);
break;
case 24:
jj_consume_token(24);
break;
case 25:
jj_consume_token(25);
break;
case 26:
jj_consume_token(26);
break;
default:
jj_la1[2] = jj_gen;
jj_consume_token(-1);
throw new ParseException();
}
}
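// One or more document-number elements: <DOCNO> is recorded (the last one
// wins), while <OLDDOCNO> is consumed and discarded.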
label_3:
while (true) {
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
case DOCNO_BEGIN:
jj_consume_token(DOCNO_BEGIN);
docNumber = jj_consume_token(DOCNO);
jj_consume_token(DOCNO_END);
label_4:
while (true) {
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
case 23:
case 24:
case 25:
case 26:
;
break;
default:
jj_la1[3] = jj_gen;
break label_4;
}
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
case 23:
jj_consume_token(23);
break;
case 24:
jj_consume_token(24);
break;
case 25:
jj_consume_token(25);
break;
case 26:
jj_consume_token(26);
break;
default:
jj_la1[4] = jj_gen;
jj_consume_token(-1);
throw new ParseException();
}
}
break;
case OLDDOCNO_BEGIN:
jj_consume_token(OLDDOCNO_BEGIN);
jj_consume_token(DOCNO);
jj_consume_token(OLDDOCNO_END);
label_5:
while (true) {
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
case 23:
case 24:
case 25:
case 26:
;
break;
default:
jj_la1[5] = jj_gen;
break label_5;
}
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
case 23:
jj_consume_token(23);
break;
case 24:
jj_consume_token(24);
break;
case 25:
jj_consume_token(25);
break;
case 26:
jj_consume_token(26);
break;
default:
jj_la1[6] = jj_gen;
jj_consume_token(-1);
throw new ParseException();
}
}
break;
default:
jj_la1[7] = jj_gen;
jj_consume_token(-1);
throw new ParseException();
}
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
case DOCNO_BEGIN:
case OLDDOCNO_BEGIN:
;
break;
default:
jj_la1[8] = jj_gen;
break label_3;
}
}
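// Parse the <DOCHDR> block into d, then capture the raw document body.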
jj_consume_token(DOCHDR_BEGIN);
docHdrBody(d);
jj_consume_token(DOCHDR_END);
docBody = new StringBuffer(body());
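// A record ends either at </DOC> or at end of input (token kind 0).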
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
case DOC_END:
jj_consume_token(DOC_END);
break;
case 0:
jj_consume_token(0);
break;
default:
jj_la1[9] = jj_gen;
jj_consume_token(-1);
throw new ParseException();
}
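// Skip trailing whitespace after the record.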
label_6:
while (true) {
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
case 23:
case 24:
case 25:
case 26:
;
break;
default:
jj_la1[10] = jj_gen;
break label_6;
}
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
case 23:
jj_consume_token(23);
break;
case 24:
jj_consume_token(24);
break;
case 25:
jj_consume_token(25);
break;
case 26:
jj_consume_token(26);
break;
default:
jj_la1[11] = jj_gen;
jj_consume_token(-1);
throw new ParseException();
}
}
}
catch (Error err) { // TODO MC - to catch
// Lexer errors (TokenMgrError) surface as java.lang.Error; convert them to
// ParseException so the recovery block below can handle them uniformly.
LOG.error("Error: " + err.getMessage());
throw new ParseException(err.getMessage());
}
}
catch (Exception e) {
// Recovery: the record failed to parse, so consume raw tokens up to the
// next </DOC> (or EOF) and keep their images as the document body.
LOG.error("Parsing exception: " + e);
StringBuilder buffer = new StringBuilder(16000); // TODO MC
Token t = null;
do {
try {
t = getNextToken();
buffer.append(t.image);
}
catch (Error err) {
// A lexical error here is usually unrecoverable; stop scanning for </DOC>.
LOG.error("Parsing nested error: " + err);
break;
}
catch (Exception enest) {
LOG.error("Parsing nested exception: " + enest);
}
}
// Null-safe: getNextToken() may fail before t is ever assigned, which
// previously caused a NullPointerException here.
while (t != null && t.kind != DOC_END && t.kind != EOF);
docBody.append(buffer.toString()); // TODO MC
if (t != null && t.kind != EOF) {
// Eat up DOC_END.
try {
t = getNextToken();
}
catch (Error err) {
LOG.error("Parsing nested error2: " + err);
}
catch (Exception enest) {
LOG.error("Parsing nested exception2: " + enest);
}
}
// If error, skip this doc completely.
//{if (true) return;} TODO MC - re-enabling this return would skip all remaining docs in the gz file
LOG.error("Parsing will continue with text:"); // TODO MC
LOG.error(""+(new String(docBody.toString().getBytes()))); // TODO MC
}
// The parse may have failed before <DOCNO> was read; without a document
// number the record cannot be keyed, so fail loudly instead of a silent NPE.
if (docNumber == null) {
throw new ParseException("Missing DOCNO in " + filename);
}
d.docNumber = docNumber.image;
d.bodyLength = Integer.valueOf(docBody.length());
/* TODO MC - normalize and filter URL */
String url = d.urlString;
try {
url = urlNormalizers.normalize(url, URLNormalizers.SCOPE_FETCHER);
url = filters.filter(url); // returns null if the URL is rejected
}
catch (Exception e) {
LOG.error("Error: " + e.getMessage());
throw new ParseException(e.getMessage());
}
LOG.info("Importing DocNo:" + d.docNumber + " url:" + url + " oldurl:" +d.urlString);
if (url != null) { // filters return null for rejected URLs, e.g. when the URL is too long
// Go through headers
for (Iterator it = d.headers.entrySet().iterator(); it.hasNext(); ) {
Map.Entry me = (Map.Entry) it.next();
String key = (String) me.getKey();
String value = (String) me.getValue();
// Find a mimetype and validate it; new MimeType() throws for malformed types.
if (key.equalsIgnoreCase(CONTENT_TYPE_KEY)) {
try {
// replaceAll() never returns null, so the old dead null check is dropped.
mimetype = value.toLowerCase().replaceAll(WHITESPACE, "-");
new MimeType(value.toLowerCase());
} catch (MimeTypeException e) {
mimetype = "no-type";
}
if (skip(mimetype)) { //XXX
// Note: the result of skip() is currently ignored; skipped types are still imported.
}
}
// Parse a fetch date from the HTTP headers
if (key.equalsIgnoreCase(DATE_KEY)) {
try {
fetchDate = parseDate(value);
} catch (ParseException e) {
// Parse failed; the default date will be inserted later.
LOG.error("Date Exception " + e.getMessage());
}
}
// Add the rest of headers to the metadata
metaData.set(key, value);
}
// Set metadata document number
metaData.set(DOCNO_KEY, d.docNumber);
// Set mimetype
metaData.set(CONTENT_TYPE_KEY, mimetype);
// Set length
metaData.set(CONTENT_LENGTH, d.bodyLength.toString());
// Set segment name
metaData.set(Nutch.SEGMENT_NAME_KEY, segmentName);
// Set MD5 signature
metaData.set(Nutch.SIGNATURE_KEY, MD5Hash.digest(docBody.toString().getBytes()).toString());
// Set collection
metaData.set(COLLECTION, collectionName); // TODO MC
// Set arc name
metaData.set(ARC_NAME, filename); // TODO MC
// If we didn't get a date, fall back to a fixed default timestamp.
// Note: Date expects milliseconds, so 1151693552 resolves to mid-January 1970;
// read as seconds it would be 2006-06-30, which may be what was intended.
if (fetchDate == null) {
fetchDate = new Date(1151693552L);
}
//start timer
// long startTime = System.currentTimeMillis();
//Make a content object
Content content = new Content(url,url, docBody.toString().getBytes(), mimetype, metaData, conf);
Parse parse = null;
ParseStatus parseStatus;
try {
parse = pu.parse(content);
parseStatus = parse.getData().getStatus();
}
catch (final Exception e) {
parseStatus = new ParseStatus(e);
LOG.error("error: unknown " + parseStatus);
if (!parseStatus.isSuccess()) { // effectively always true: ParseStatus(Throwable) is a failure status
LOG.error("parse failure");
}
}
catch (StackOverflowError soe) {
parseStatus = new ParseStatus(soe);
LOG.error("error: StackOverflowError " + parseStatus);
if (!parseStatus.isSuccess()) {
LOG.error("parse failure");
}
}
if (parseStatus.isSuccess()) {
CrawlDatum datum = new CrawlDatum();
datum.setStatus(CrawlDatum.STATUS_FETCH_SUCCESS);
datum.setFetchTime(fetchDate.getTime());
// Score at this stage is 1.0f.
metaData.set(Nutch.SCORE_KEY, Float.toString(datum.getScore())); // TODO MC
// WritableComparable outkey = new UTF8(d.urlString);
WritableComparable outkey = new Text(url);
Writable outvalue = new FetcherOutput(datum, null, new ParseImpl(parse));
// output.collect(outkey, outvalue);
Text key = Nutchwax.generateWaxKey(outkey, collectionName);
output.collect(key, outvalue);
}
}
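// Reset per-record state. Note that docNumber, mimetype and fetchDate are not
// reset, so stale values can carry over when the next record omits them.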
d = new TRECDoc();
metaData = new Metadata();
}
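// Consume <EOF> (token kind 0).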
jj_consume_token(0);
}