Package edu.cmu.lemurproject

Examples of edu.cmu.lemurproject.WarcRecord$WarcHeader


        if (whichStream == null) {
            return false;
        }

        WarcRecord newRecord = WarcRecord.readNextWarcRecord(whichStream);
        if (newRecord == null) {
            // try advancing the file
            if (openNextFile()) {
                newRecord = WarcRecord.readNextWarcRecord(whichStream);
            }

            if (newRecord == null) {
                return false;
            }
        }

        totalNumBytesRead += (long) newRecord.getTotalRecordLength();
        newRecord.setWarcFilePath(filePathList[currentFilePath].toString());

        // now, set our output variables
        value.setRecord(newRecord);
        key.set(recordNumber);
View Full Code Here


public class WritableWarcRecord implements Writable {

    WarcRecord record = null;

    public WritableWarcRecord() {
        record = new WarcRecord();
    }
View Full Code Here

    public WritableWarcRecord() {
        record = new WarcRecord();
    }

    public WritableWarcRecord(WarcRecord o) {
        record = new WarcRecord(o);
    }
View Full Code Here

    public WarcRecord getRecord() {
        return record;
    }

    public void setRecord(WarcRecord rec) {
        record = new WarcRecord(rec);
    }
View Full Code Here

  public void map(LongWritable key, WritableWarcRecord record,
      OutputCollector<Text, BehemothDocument> output, Reporter reporter)
      throws IOException {

    WarcRecord wr = record.getRecord();

    if (wr.getHeaderRecordType().equals("response") == false)
      return;

    byte[] binarycontent = wr.getContent();

    String uri = wr.getHeaderMetadataItem("WARC-Target-URI");

    // skip non http documents
    if (uri.startsWith("http") == false)
      return;

    String ip = wr.getHeaderMetadataItem("WARC-IP-Address");

    HttpResponse response;
    try {
      response = new HttpResponse(binarycontent);
    } catch (ProtocolException e) {
View Full Code Here

TOP

Related Classes of edu.cmu.lemurproject.WarcRecord$WarcHeader

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.