Package org.archive.io.warc

Examples of org.archive.io.warc.WARCRecordInfo


        }
    }
  
    protected void writeDnsRecords(final CrawlURI curi, WARCWriter w,
            final URI baseid, final String timestamp) throws IOException {
        WARCRecordInfo recordInfo = new WARCRecordInfo();
        recordInfo.setType(WARCRecordType.response);
        recordInfo.setUrl(curi.toString());
        recordInfo.setCreate14DigitDate(timestamp);
        recordInfo.setMimetype(curi.getContentType());
        recordInfo.setRecordId(baseid);
       
        recordInfo.setContentLength(curi.getRecorder().getRecordedInput().getSize());
        recordInfo.setEnforceLength(true);
       
        String ip = (String)curi.getData().get(A_DNS_SERVER_IP_LABEL);
        if (ip != null && ip.length() > 0) {
            recordInfo.addExtraHeader(HEADER_KEY_IP, ip);
        }
       
        ReplayInputStream ris =
            curi.getRecorder().getRecordedInput().getReplayInputStream();
        recordInfo.setContentStream(ris);
       
        try {
            w.writeRecord(recordInfo);
        } finally {
            IOUtils.closeQuietly(ris);
        }
       
        recordInfo.getRecordId();
    }
View Full Code Here


        recordInfo.getRecordId();
    }

    protected void writeWhoisRecords(WARCWriter w, CrawlURI curi, URI baseid,
            String timestamp) throws IOException {
        WARCRecordInfo recordInfo = new WARCRecordInfo();
        recordInfo.setType(WARCRecordType.response);
        recordInfo.setUrl(curi.toString());
        recordInfo.setCreate14DigitDate(timestamp);
        recordInfo.setMimetype(curi.getContentType());
        recordInfo.setRecordId(baseid);
        recordInfo.setContentLength(curi.getRecorder().getRecordedInput().getSize());
        recordInfo.setEnforceLength(true);
       
        Object whoisServerIP = curi.getData().get(CoreAttributeConstants.A_WHOIS_SERVER_IP);
        if (whoisServerIP != null) {
            recordInfo.addExtraHeader(HEADER_KEY_IP, whoisServerIP.toString());
        }
       
        ReplayInputStream ris =
            curi.getRecorder().getRecordedInput().getReplayInputStream();
        recordInfo.setContentStream(ris);
       
        try {
            w.writeRecord(recordInfo);
        } finally {
            IOUtils.closeQuietly(ris);
        }
        recordInfo.getRecordId();
    }
View Full Code Here

    protected URI writeFtpControlConversation(WARCWriter w, String timestamp,
            URI baseid, CrawlURI curi, ANVLRecord headers,
            String controlConversation) throws IOException {
       
        WARCRecordInfo recordInfo = new WARCRecordInfo();
        recordInfo.setCreate14DigitDate(timestamp);
        recordInfo.setUrl(curi.toString());
        recordInfo.setMimetype(FTP_CONTROL_CONVERSATION_MIMETYPE);
        recordInfo.setExtraHeaders(headers);
        recordInfo.setEnforceLength(true);
        recordInfo.setType(WARCRecordType.metadata);

        recordInfo.setRecordId(qualifyRecordID(baseid, TYPE, WARCRecordType.metadata.toString()));
       
        byte[] b = controlConversation.getBytes("UTF-8");
       
        recordInfo.setContentStream(new ByteArrayInputStream(b));
        recordInfo.setContentLength((long) b.length);
       
        w.writeRecord(recordInfo);
       
        return recordInfo.getRecordId();
    }
View Full Code Here

    protected URI writeRequest(final WARCWriter w,
            final String timestamp, final String mimetype,
            final URI baseid, final CrawlURI curi,
            final ANVLRecord namedFields)
    throws IOException {
        WARCRecordInfo recordInfo = new WARCRecordInfo();
        recordInfo.setType(WARCRecordType.request);
        recordInfo.setUrl(curi.toString());
        recordInfo.setCreate14DigitDate(timestamp);
        recordInfo.setMimetype(mimetype);
        recordInfo.setExtraHeaders(namedFields);
        recordInfo.setContentLength(curi.getRecorder().getRecordedOutput().getSize());
        recordInfo.setEnforceLength(true);
       
        final URI uid = qualifyRecordID(baseid, TYPE, WARCRecordType.request.toString());
        recordInfo.setRecordId(uid);
       
        ReplayInputStream
            ris = curi.getRecorder().getRecordedOutput().getReplayInputStream();
        recordInfo.setContentStream(ris);
       
        try {
            w.writeRecord(recordInfo);
        } finally {
            IOUtils.closeQuietly(ris);
        }
       
        return recordInfo.getRecordId();
    }
View Full Code Here

               String[] kv = StringUtils.split(((String)headerObj),":",2);
               namedFields.addLabelValue(kv[0].trim(), kv[1].trim());
           }
        }
       
        WARCRecordInfo recordInfo = new WARCRecordInfo();
        recordInfo.setType(WARCRecordType.response);
        recordInfo.setUrl(curi.toString());
        recordInfo.setCreate14DigitDate(timestamp);
        recordInfo.setMimetype(mimetype);
        recordInfo.setRecordId(baseid);
        recordInfo.setExtraHeaders(namedFields);
        recordInfo.setContentLength(curi.getRecorder().getRecordedInput().getSize());
        recordInfo.setEnforceLength(true);
       
        ReplayInputStream ris =
            curi.getRecorder().getRecordedInput().getReplayInputStream();
        recordInfo.setContentStream(ris);
       
        try {
            w.writeRecord(recordInfo);
        } finally {
            IOUtils.closeQuietly(ris);
        }
       
        return recordInfo.getRecordId();
    }
View Full Code Here

    protected URI writeResource(final WARCWriter w,
            final String timestamp, final String mimetype,
            final URI baseid, final CrawlURI curi,
            final ANVLRecord namedFields)
    throws IOException {
        WARCRecordInfo recordInfo = new WARCRecordInfo();
        recordInfo.setType(WARCRecordType.resource);
        recordInfo.setUrl(curi.toString());
        recordInfo.setCreate14DigitDate(timestamp);
        recordInfo.setMimetype(mimetype);
        recordInfo.setRecordId(baseid);
        recordInfo.setExtraHeaders(namedFields);
        recordInfo.setContentLength(curi.getRecorder().getRecordedInput().getSize());
        recordInfo.setEnforceLength(true);
       
        ReplayInputStream ris = curi.getRecorder().getRecordedInput().getReplayInputStream();
        recordInfo.setContentStream(ris);
        try {
            w.writeRecord(recordInfo);
        } finally {
            IOUtils.closeQuietly(ris);
        }
       
        return recordInfo.getRecordId();
    }
View Full Code Here

            final String timestamp, final String mimetype,
            final URI baseid, final CrawlURI curi,
            final ANVLRecord headers,
            final long contentLength
                  throws IOException {
        WARCRecordInfo recordInfo = new WARCRecordInfo();
        recordInfo.setType(WARCRecordType.revisit);
        recordInfo.setUrl(curi.toString());
        recordInfo.setCreate14DigitDate(timestamp);
        recordInfo.setMimetype(mimetype);
        recordInfo.setRecordId(baseid);
        recordInfo.setContentLength(contentLength);
        recordInfo.setEnforceLength(false);
     
        RevisitProfile revisitProfile = curi.getRevisitProfile();
       
        headers.addLabelValue(HEADER_KEY_PROFILE, revisitProfile.getProfileName());
        headers.addLabelValue(HEADER_KEY_TRUNCATED, NAMED_FIELD_TRUNCATED_VALUE_LENGTH);

        Map<String, String> revisitHeaders = revisitProfile.getWarcHeaders();
       
        if (!revisitHeaders.isEmpty()) {
          recordInfo.setExtraHeaders(headers);
          for ( String key : revisitHeaders.keySet()) {
              headers.addLabelValue(key, revisitHeaders.get(key));         
          }
        }
       
    ReplayInputStream ris = curi.getRecorder().getRecordedInput().getReplayInputStream();
    recordInfo.setContentStream(ris);

    try {
      w.writeRecord(recordInfo);
    } finally {
      IOUtils.closeQuietly(ris);
    }
    return recordInfo.getRecordId();
    }
View Full Code Here

  protected URI writeMetadata(final WARCWriter w,
            final String timestamp,
            final URI baseid, final CrawlURI curi,
            final ANVLRecord namedFields)
    throws IOException {
      WARCRecordInfo recordInfo = new WARCRecordInfo();
        recordInfo.setType(WARCRecordType.metadata);
        recordInfo.setUrl(curi.toString());
        recordInfo.setCreate14DigitDate(timestamp);
        recordInfo.setMimetype(ANVLRecord.MIMETYPE);
        recordInfo.setExtraHeaders(namedFields);
        recordInfo.setEnforceLength(true);
     
        recordInfo.setRecordId(qualifyRecordID(baseid, TYPE, WARCRecordType.metadata.toString()));

        // Get some metadata from the curi.
        // TODO: Get all curi metadata.
        // TODO: Use other than ANVL (or rename ANVL as NameValue or use
        // RFC822 (commons-httpclient?).
        ANVLRecord r = new ANVLRecord();
        if (curi.isSeed()) {
            r.addLabel("seed");
        } else {
          if (curi.forceFetch()) {
            r.addLabel("force-fetch");
          }
            if(StringUtils.isNotBlank(flattenVia(curi))) {
                r.addLabelValue("via", flattenVia(curi));
            }
            if(StringUtils.isNotBlank(curi.getPathFromSeed())) {
                r.addLabelValue("hopsFromSeed", curi.getPathFromSeed());
            }
            if (curi.containsDataKey(A_SOURCE_TAG)) {
                r.addLabelValue("sourceTag",
                        (String)curi.getData().get(A_SOURCE_TAG));
            }
        }
        long duration = curi.getFetchCompletedTime() - curi.getFetchBeginTime();
        if (duration > -1) {
            r.addLabelValue("fetchTimeMs", Long.toString(duration));
        }
       
        if (curi.getData().containsKey(A_FTP_FETCH_STATUS)) {
            r.addLabelValue("ftpFetchStatus", curi.getData().get(A_FTP_FETCH_STATUS).toString());
        }

        if (curi.getRecorder() != null && curi.getRecorder().getCharset() != null) {
            r.addLabelValue("charsetForLinkExtraction", curi.getRecorder().getCharset().name());
        }
       
        for (String annotation: curi.getAnnotations()) {
            if (annotation.startsWith("usingCharsetIn") || annotation.startsWith("inconsistentCharsetIn")) {
                String[] kv = annotation.split(":", 2);
                r.addLabelValue(kv[0], kv[1]);
            }
        }

        // Add outlinks though they are effectively useless without anchor text.
        Collection<CrawlURI> links = curi.getOutLinks();
        if (links != null && links.size() > 0) {
            for (CrawlURI link: links) {
                r.addLabelValue("outlink", link.getURI());
            }
        }
       
        // TODO: Other curi fields to write to metadata.
        //
        // Credentials
        //
        // fetch-began-time: 1154569278774
        // fetch-completed-time: 1154569281816
        //
        // Annotations.
       
        byte [] b = r.getUTF8Bytes();
        recordInfo.setContentStream(new ByteArrayInputStream(b));
        recordInfo.setContentLength((long) b.length);
       
        w.writeRecord(recordInfo);
       
        return recordInfo.getRecordId();
    }
View Full Code Here

     }
   }
  
   protected void write(final WARCWriter writer, final ARCRecord r)
   throws IOException {
       WARCRecordInfo recordInfo = new WARCRecordInfo();
       recordInfo.setUrl(r.getHeader().getUrl());
       recordInfo.setContentStream(r);
       recordInfo.setContentLength(r.getHeader().getLength());
       recordInfo.setEnforceLength(true);

       // convert ARC date to WARC-Date format
       String arcDateString = r.getHeader().getDate();
       String warcDateString = DateTimeFormat.forPattern("yyyyMMddHHmmss")
           .withZone(DateTimeZone.UTC)
               .parseDateTime(arcDateString)
                   .toString(ISODateTimeFormat.dateTimeNoMillis());
       recordInfo.setCreate14DigitDate(warcDateString);

       ANVLRecord ar = new ANVLRecord();
       String ip = (String)r.getHeader()
           .getHeaderValue((ARCConstants.IP_HEADER_FIELD_KEY));
       if (ip != null && ip.length() > 0) {
           ar.addLabelValue(WARCConstants.NAMED_FIELD_IP_LABEL, ip);
           r.getMetaData();
       }
       recordInfo.setExtraHeaders(ar);

       // enable reconstruction of ARC from transformed WARC
       // TODO: deferred for further analysis (see HER-1750)
       // ar.addLabelValue("ARC-Header-Line", r.getHeaderString());

       // If contentBody > 0, assume http headers.  Make the mimetype
       // be application/http.  Otherwise, give it ARC mimetype.
       if (r.getHeader().getContentBegin() > 0) {
           recordInfo.setType(WARCRecordType.response);
           recordInfo.setMimetype(WARCConstants.HTTP_RESPONSE_MIMETYPE);
           recordInfo.setRecordId(generator.getRecordID());
       } else {
           recordInfo.setType(WARCRecordType.resource);
           recordInfo.setMimetype(r.getHeader().getMimetype());
           recordInfo.setRecordId(((WARCWriterPoolSettings)writer.settings).getRecordIDGenerator().getRecordID());
       }

       writer.writeRecord(recordInfo);
   }
View Full Code Here

        result.setOriginalUrl("http://www.example.com/");
        result.setCaptureTimestamp("20100101123456");
    }
   
    public static Resource createTestHtmlResource(byte[] payloadBytes) throws IOException {
        WARCRecordInfo recinfo = TestWARCRecordInfo.createCompressedHttpResponse("text/html", payloadBytes);
        TestWARCReader ar = new TestWARCReader(recinfo);
        WARCRecord rec = ar.get(0);
        WarcResource resource = new WarcResource(rec, ar);
        resource.parseHeaders();
        return resource;
View Full Code Here

TOP

Related Classes of org.archive.io.warc.WARCRecordInfo

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.