Examples of WarcRecord


Examples of org.archive.io.warc.WARCRecord

   * @throws IOException
   */
  private void indexRecord(WARCReader warcReader) throws IOException {
    warcReader.setStrict(true);
    // warcReader.setParseHttpHeaders(true);
    WARCRecord warcRecord = (WARCRecord)warcReader.get(this.offset);
    ArchiveRecordHeader header = warcRecord.getHeader();
    System.out.println("========== selected metadata:");
    warcRecord.close(); // must close record to get digest
    printMetadata(warcRecord,header);
    System.out.println("========== header: \n" + header);
  }
View Full Code Here

Examples of org.archive.io.warc.WARCRecord

       Logger l = Logger.getLogger(writer.getClass().getName());
       Level oldLevel = l.getLevel();
     try {
           l.setLevel(Level.WARNING);
       for (final Iterator<ArchiveRecord> i = reader.iterator(); i.hasNext();) {
               WARCRecord r = (WARCRecord)i.next();
               if (!isARCType(r.getHeader().getMimetype())) {
                   continue;
               }
               if (r.getHeader().getContentBegin() <= 0) {
                   // Otherwise, because length include Header-Line and
                   // Named Fields, these will end up in the ARC unless there
                   // is a non-zero content begin.
                   continue;
               }
               String ip = (String)r.getHeader().
                   getHeaderValue((WARCConstants.HEADER_KEY_IP));
               long length = r.getHeader().getLength();
               int offset = r.getHeader().getContentBegin();
               // This mimetype is not exactly what you'd expect to find in
               // an ARC though technically its 'correct'.  To get right one,
               // need to parse the HTTP Headers.  Thats messy.  Not doing for
               // now.
               String mimetype = r.getHeader().getMimetype();
               // Clean out ISO time string '-', 'T', ':', and 'Z' characters.
               String t = r.getHeader().getDate().replaceAll("[-T:Z]", "");
               long time = ArchiveUtils.getSecondsSinceEpoch(t).getTime();
               writer.write(r.getHeader().getUrl(), mimetype, ip, time,
                   (int)(length - offset), r);
       }
     } finally {
       if (reader != null) {
         reader.close();
View Full Code Here

Examples of org.archive.io.warc.WARCRecord

           + "Content-Length: " + (HTTPHEADER.length() + BODY.length()) + "\r\n"
           + "\r\n";

        final String hdr = warcHeader + HTTPHEADER + BODY;

        WARCRecord r = new WARCRecord(new ByteArrayInputStream(hdr.getBytes()),
                "READER_IDENTIFIER", 0, false, true);
        HeaderedArchiveRecord har = new HeaderedArchiveRecord(r, true);

        har.skipHttpHeader();
View Full Code Here

Examples of org.archive.io.warc.WARCRecord

    }
   
    public static Resource createTestHtmlResource(byte[] payloadBytes) throws IOException {
        WARCRecordInfo recinfo = TestWARCRecordInfo.createCompressedHttpResponse("text/html", payloadBytes);
        TestWARCReader ar = new TestWARCReader(recinfo);
        WARCRecord rec = ar.get(0);
        WarcResource resource = new WarcResource(rec, ar);
        resource.parseHeaders();
        return resource;
    }
View Full Code Here

Examples of org.archive.io.warc.WARCRecord

  }
 
  public static Resource createTestJSResource(byte[] payloadBytes) throws IOException {
    WARCRecordInfo recinfo = TestWARCRecordInfo.createHttpResponse("text/javascript", payloadBytes);
    TestWARCReader ar = new TestWARCReader(recinfo);
    WARCRecord rec = ar.get(0);
    WarcResource resource = new WarcResource(rec, ar);
    resource.parseHeaders();
    return resource;
  }
View Full Code Here

Examples of org.archive.io.warc.WARCRecord

    }
   
    public static Resource createTestHtmlResource(byte[] payloadBytes) throws IOException {
        WARCRecordInfo recinfo = TestWARCRecordInfo.createCompressedHttpResponse("text/html", payloadBytes);
        TestWARCReader ar = new TestWARCReader(recinfo);
        WARCRecord rec = ar.get(0);
        WarcResource resource = new WarcResource(rec, ar);
        resource.parseHeaders();
        return resource;
    }
View Full Code Here

Examples of org.archive.io.warc.WARCRecord

    }
    public static Resource createTestRevisitResource(byte[] payloadBytes, boolean withHeader, boolean gzipContent) throws IOException {
        WARCRecordInfo recinfo = TestWARCRecordInfo.createRevisitHttpResponse(
                "text/html", payloadBytes.length, withHeader, gzipContent);
        TestWARCReader ar = new TestWARCReader(recinfo);
        WARCRecord rec = ar.get(0);
        WarcResource resource = new WarcResource(rec, ar);
        resource.parseHeaders();
        return resource;
    }
View Full Code Here

Examples of org.archive.io.warc.WARCRecord

     */
    public void testPlainHttpRecord() throws Exception {
        String payload = "hogehogehogehogehoge";
        WARCRecordInfo recinfo = TestWARCRecordInfo.createHttpResponse(payload);
        TestWARCReader ar = new TestWARCReader(recinfo);
        WARCRecord rec = ar.get(0);
        WarcResource res = new WarcResource(rec, ar);
        res.parseHeaders();
       
        assertEquals("statusCode", 200, res.getStatusCode());
        assertEquals("content-type", "text/plain", res.getHeader("Content-Type"));
View Full Code Here

Examples of org.archive.io.warc.WARCRecord

        String payload = "hogehogehogehogehoge";
        WARCRecordInfo recinfo = new TestWARCRecordInfo(
                TestWARCRecordInfo.buildHttpResponseBlock("200 OK",
                        "text/plain", payload.getBytes("UTF-8"), true));
        TestWARCReader ar = new TestWARCReader(recinfo);
        WARCRecord rec = ar.get(0);
        WarcResource res = new WarcResource(rec, ar);
        res.parseHeaders();
       
        assertEquals("statusCode", 200, res.getStatusCode());
        assertEquals("content-type", "text/plain", res.getHeader("Content-Type"));
View Full Code Here

Examples of org.archive.io.warc.WARCRecord

        String ctype = "text/plain";
        WARCRecordInfo recinfo = new TestWARCRecordInfo(
                TestWARCRecordInfo.buildCompressedHttpResponseBlock(ctype,
                        payload.getBytes()));
        TestWARCReader ar = new TestWARCReader(recinfo);
        WARCRecord rec = ar.get(0);
        WarcResource res = new WarcResource(rec, ar);
        res.parseHeaders();
       
        assertEquals("statusCode", 200, res.getStatusCode());
        assertEquals("content-type", ctype, res.getHeader("Content-Type"));
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.