Examples of WARCRecord


Examples of edu.cmu.lemurproject.WarcRecord

        if (whichStream == null) {
            return false;
        }

        WarcRecord newRecord = WarcRecord.readNextWarcRecord(whichStream);
        if (newRecord == null) {
            // try advancing the file
            if (openNextFile()) {
                newRecord = WarcRecord.readNextWarcRecord(whichStream);
            }

            if (newRecord == null) {
                return false;
            }
        }

        totalNumBytesRead += (long) newRecord.getTotalRecordLength();
        newRecord.setWarcFilePath(filePathList[currentFilePath].toString());

        // now, set our output variables
        value.setRecord(newRecord);
        key.set(recordNumber);
View Full Code Here

Examples of edu.cmu.lemurproject.WarcRecord

public class WritableWarcRecord implements Writable {

    WarcRecord record = null;

    public WritableWarcRecord() {
        record = new WarcRecord();
    }
View Full Code Here

Examples of edu.cmu.lemurproject.WarcRecord

    public WritableWarcRecord() {
        record = new WarcRecord();
    }

    public WritableWarcRecord(WarcRecord o) {
        record = new WarcRecord(o);
    }
View Full Code Here

Examples of edu.cmu.lemurproject.WarcRecord

    public WarcRecord getRecord() {
        return record;
    }

    public void setRecord(WarcRecord rec) {
        record = new WarcRecord(rec);
    }
View Full Code Here

Examples of edu.cmu.lemurproject.WarcRecord

  public void map(LongWritable key, WritableWarcRecord record,
      OutputCollector<Text, BehemothDocument> output, Reporter reporter)
      throws IOException {

    WarcRecord wr = record.getRecord();

    if (wr.getHeaderRecordType().equals("response") == false)
      return;

    byte[] binarycontent = wr.getContent();

    String uri = wr.getHeaderMetadataItem("WARC-Target-URI");

    // skip non http documents
    if (uri.startsWith("http") == false)
      return;

    String ip = wr.getHeaderMetadataItem("WARC-IP-Address");

    HttpResponse response;
    try {
      response = new HttpResponse(binarycontent);
    } catch (ProtocolException e) {
View Full Code Here

Examples of it.unimi.dsi.law.warc.io.WarcRecord

    int cnt = 0;

    //      while (cnt < 10 && it.hasNext()) {
    while (it.hasNext()) {

        WarcRecord nextRecord = it.next();

        //Get the HttpResponse
        try {
      response.fromWarcRecord (nextRecord);
View Full Code Here

Examples of net.bpiwowar.mg4j.extensions.warc.WarcRecord

     * Parses a document from a WARC collection
     *
     * @throws java.io.IOException
     */
    private void parseWAR018CDocument() throws IOException {
      WarcRecord warcRecord = null;
      DataInputStream dis = new DataInputStream(rawContent);

      // Regardless of what the stream gives us, we read and return
      // the first entry which is a response.
      WarcHTMLResponseRecord warcResponse = null;
      while ((warcRecord = WarcRecord.readNextWarcRecord(dis)) != null) {
        // ignore if no WARC response type, otherwise read and finish
        if (warcRecord.getHeaderRecordType().equals("response")) {
          warcResponse = new WarcHTMLResponseRecord(warcRecord);
          break;
        }
      }

View Full Code Here

Examples of org.archive.io.warc.WARCRecord

    /* (non-Javadoc)
     * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object)
     */
    public WARCRecord adapt(ArchiveRecord o) {
      WARCRecord rec = null;
      if (o instanceof WARCRecord) {
        rec = (WARCRecord) o;
      }
      return rec;
    }
View Full Code Here

Examples of org.archive.io.warc.WARCRecord

            ArchiveRecordHeader header = record.getHeader();
            logger.warning("record at offset: " + header.getOffset()
                + " has errors: " + arcRecord.getErrors());
          }
        } else {
          WARCRecord warcRecord = (WARCRecord) record;
          warcRecord.getHeader();
        }
  }
View Full Code Here

Examples of org.archive.io.warc.WARCRecord

   * @param warcReader a WARCReader instance
   * @throws IOException
   */
  private void replayRecord(WARCReader warcReader) throws IOException {
    warcReader.setStrict(true);
    WARCRecord warcRecord = (WARCRecord) warcReader.get(this.offset);
      byte[] buffer = new byte[BUFFER_SIZE];
      if (warcRecord.available() > 0) {
        int r = -1;
        while((r = warcRecord.read(buffer, 0, BUFFER_SIZE)) != -1) {
          System.out.write(buffer, 0, r);
        }
      }
    System.out.println("record bytes available: "
        + warcRecord.available());
  }
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.