Package org.archive.util.anvl

Examples of org.archive.util.anvl.ANVLRecord


       ByteArrayOutputStream baos =
         new ByteArrayOutputStream((int)firstRecord.getHeader().
             getLength());
       firstRecord.dump(baos);
         // Add ARC first record content as an ANVLRecord.
         ANVLRecord ar = new ANVLRecord();
         ar.addLabelValue("Filedesc", baos.toString());
         List<String> metadata = new ArrayList<String>(1);
         metadata.add(ar.toString());
         // Now create the writer.  If reader was compressed, lets write
         // a compressed WARC.
       writer = new WARCWriter(
                   new AtomicInteger(),
                   bos,
View Full Code Here


           .withZone(DateTimeZone.UTC)
               .parseDateTime(arcDateString)
                   .toString(ISODateTimeFormat.dateTimeNoMillis());
       recordInfo.setCreate14DigitDate(warcDateString);

       ANVLRecord ar = new ANVLRecord();
       String ip = (String)r.getHeader()
           .getHeaderValue((ARCConstants.IP_HEADER_FIELD_KEY));
       if (ip != null && ip.length() > 0) {
           ar.addLabelValue(WARCConstants.NAMED_FIELD_IP_LABEL, ip);
           r.getMetaData();
       }
       recordInfo.setExtraHeaders(ar);

       // enable reconstruction of ARC from transformed WARC
View Full Code Here

        recordInfo.setCreate14DigitDate(ArchiveUtils.getLog14Date());
        recordInfo.setMimetype(ANVLRecord.MIMETYPE);
        recordInfo.setExtraHeaders(null);
        recordInfo.setEnforceLength(true);
       
      ANVLRecord meta = new ANVLRecord();
      meta.addLabelValue("size", "1G");
      meta.addLabelValue("operator", "igor");
      byte [] bytes = meta.getUTF8Bytes();
      recordInfo.setContentStream(new ByteArrayInputStream(bytes));
      recordInfo.setContentLength((long) bytes.length);
     
        final URI recordid = writer.generateRecordId(WARCWriter.TYPE, WARCRecordType.warcinfo.toString());
        recordInfo.setRecordId(recordid);
View Full Code Here

      recordInfo.setUrl("http://www.archive.org/");
      recordInfo.setCreate14DigitDate(ArchiveUtils.get14DigitDate());
      recordInfo.setMimetype("no/type");
      recordInfo.setEnforceLength(true);
     
      ANVLRecord headerFields = new ANVLRecord();
      headerFields.addLabelValue("x", "y");
      headerFields.addLabelValue("a", "b");
      recordInfo.setExtraHeaders(headerFields);
     
      URI rid = (new UUIDGenerator()).getQualifiedRecordID(TYPE, WARCRecordType.metadata.toString());
      recordInfo.setRecordId(rid);
     
View Full Code Here

            final URI baseid, final String timestamp) throws IOException {
        // Add named fields for ip, checksum, and relate the metadata
        // and request to the resource field.
        // TODO: Use other than ANVL (or rename ANVL as NameValue or
        // use RFC822 (commons-httpclient?).
        ANVLRecord headers = new ANVLRecord();
        if (curi.getContentDigest() != null) {
            headers.addLabelValue(HEADER_KEY_PAYLOAD_DIGEST,
                    curi.getContentDigestSchemeString());
        }
        headers.addLabelValue(HEADER_KEY_IP, getHostAddress(curi));

        URI rid;
       
        if (curi.isRevisit()) {
            rid = writeRevisit(w, timestamp, HTTP_RESPONSE_MIMETYPE, baseid, curi, headers);
        } else {
            // Check for truncated annotation
            String value = null;
            Collection<String> anno = curi.getAnnotations();
            if (anno.contains(TIMER_TRUNC)) {
                value = NAMED_FIELD_TRUNCATED_VALUE_TIME;
            } else if (anno.contains(LENGTH_TRUNC)) {
                value = NAMED_FIELD_TRUNCATED_VALUE_LENGTH;
            } else if (anno.contains(HEADER_TRUNC)) {
                value = NAMED_FIELD_TRUNCATED_VALUE_HEAD;
            }
            // TODO: Add annotation for TRUNCATED_VALUE_UNSPECIFIED
            if (value != null) {
                headers.addLabelValue(HEADER_KEY_TRUNCATED, value);
            }
            rid = writeResponse(w, timestamp, HTTP_RESPONSE_MIMETYPE,
              baseid, curi, headers);
        }
       
        headers = new ANVLRecord();
        headers.addLabelValue(HEADER_KEY_CONCURRENT_TO,
            '<' + rid.toString() + '>');

        if (getWriteRequests()) {
            writeRequest(w, timestamp, HTTP_REQUEST_MIMETYPE,
                    baseid, curi, headers);
View Full Code Here

        }
    }

    protected void writeFtpRecords(WARCWriter w, final CrawlURI curi, final URI baseid,
            final String timestamp) throws IOException {
        ANVLRecord headers = new ANVLRecord();
        headers.addLabelValue(HEADER_KEY_IP, getHostAddress(curi));
        String controlConversation = curi.getData().get(A_FTP_CONTROL_CONVERSATION).toString();
        URI rid = writeFtpControlConversation(w, timestamp, baseid, curi, headers, controlConversation);
       
        if (curi.getContentDigest() != null) {
            headers.addLabelValue(HEADER_KEY_PAYLOAD_DIGEST,
            curi.getContentDigestSchemeString());
        }
           
        if (curi.getRecorder() != null) {
            if (curi.isRevisit()) {
                rid = writeRevisit(w, timestamp, null,
                        baseid, curi, headers, 0);
            } else {
                headers = new ANVLRecord();
                // Check for truncated annotation
                String value = null;
                Collection<String> anno = curi.getAnnotations();
                if (anno.contains(TIMER_TRUNC)) {
                    value = NAMED_FIELD_TRUNCATED_VALUE_TIME;
                } else if (anno.contains(LENGTH_TRUNC)) {
                    value = NAMED_FIELD_TRUNCATED_VALUE_LENGTH;
                } else if (anno.contains(HEADER_TRUNC)) {
                    value = NAMED_FIELD_TRUNCATED_VALUE_HEAD;
                }
                // TODO: Add annotation for TRUNCATED_VALUE_UNSPECIFIED
                if (value != null) {
                    headers.addLabelValue(HEADER_KEY_TRUNCATED, value);
                }
               
                if (curi.getContentDigest() != null) {
                    headers.addLabelValue(HEADER_KEY_PAYLOAD_DIGEST,
                            curi.getContentDigestSchemeString());
                }
                headers.addLabelValue(HEADER_KEY_CONCURRENT_TO, '<' + rid.toString() + '>');
                rid = writeResource(w, timestamp, curi.getContentType(), baseid, curi, headers);
            }
        }
        if (getWriteMetadata()) {
            headers = new ANVLRecord();
            headers.addLabelValue(HEADER_KEY_CONCURRENT_TO, '<' + rid.toString() + '>');
            writeMetadata(w, timestamp, baseid, curi, headers);
        }
    }
View Full Code Here

    protected URI writeResponse(final WARCWriter w,
            final String timestamp, final String mimetype,
            final URI baseid, final CrawlURI curi,
            final ANVLRecord suppliedFields)
    throws IOException {
        ANVLRecord namedFields = suppliedFields;
        if(curi.getData().containsKey(A_WARC_RESPONSE_HEADERS)) {
           namedFields = namedFields.clone();
           for (Object headerObj : curi.getDataList(A_WARC_RESPONSE_HEADERS)) {
               String[] kv = StringUtils.split(((String)headerObj),":",2);
               namedFields.addLabelValue(kv[0].trim(), kv[1].trim());
           }
        }
       
        WARCRecordInfo recordInfo = new WARCRecordInfo();
        recordInfo.setType(WARCRecordType.response);
View Full Code Here

        // Get some metadata from the curi.
        // TODO: Get all curi metadata.
        // TODO: Use other than ANVL (or rename ANVL as NameValue or use
        // RFC822 (commons-httpclient?).
        ANVLRecord r = new ANVLRecord();
        if (curi.isSeed()) {
            r.addLabel("seed");
        } else {
          if (curi.forceFetch()) {
            r.addLabel("force-fetch");
          }
            if(StringUtils.isNotBlank(flattenVia(curi))) {
                r.addLabelValue("via", flattenVia(curi));
            }
            if(StringUtils.isNotBlank(curi.getPathFromSeed())) {
                r.addLabelValue("hopsFromSeed", curi.getPathFromSeed());
            }
            if (curi.containsDataKey(A_SOURCE_TAG)) {
                r.addLabelValue("sourceTag",
                        (String)curi.getData().get(A_SOURCE_TAG));
            }
        }
        long duration = curi.getFetchCompletedTime() - curi.getFetchBeginTime();
        if (duration > -1) {
            r.addLabelValue("fetchTimeMs", Long.toString(duration));
        }
       
        if (curi.getData().containsKey(A_FTP_FETCH_STATUS)) {
            r.addLabelValue("ftpFetchStatus", curi.getData().get(A_FTP_FETCH_STATUS).toString());
        }

        if (curi.getRecorder() != null && curi.getRecorder().getCharset() != null) {
            r.addLabelValue("charsetForLinkExtraction", curi.getRecorder().getCharset().name());
        }
       
        for (String annotation: curi.getAnnotations()) {
            if (annotation.startsWith("usingCharsetIn") || annotation.startsWith("inconsistentCharsetIn")) {
                String[] kv = annotation.split(":", 2);
                r.addLabelValue(kv[0], kv[1]);
            }
        }

        // Add outlinks though they are effectively useless without anchor text.
        Collection<CrawlURI> links = curi.getOutLinks();
        if (links != null && links.size() > 0) {
            for (CrawlURI link: links) {
                r.addLabelValue("outlink", link.getURI());
            }
        }
       
        // TODO: Other curi fields to write to metadata.
        //
        // Credentials
        //
        // fetch-began-time: 1154569278774
        // fetch-completed-time: 1154569281816
        //
        // Annotations.
       
        byte [] b = r.getUTF8Bytes();
        recordInfo.setContentStream(new ByteArrayInputStream(b));
        recordInfo.setContentLength((long) b.length);
       
        w.writeRecord(recordInfo);
       
View Full Code Here

    public List<String> getMetadata() {
        if (cachedMetadata != null) {
            return cachedMetadata;
        }
        ANVLRecord record = new ANVLRecord();
        record.addLabelValue("software", "Heritrix/" +
                ArchiveUtils.VERSION + " http://crawler.archive.org");
        try {
            InetAddress host = InetAddress.getLocalHost();
            record.addLabelValue("ip", host.getHostAddress());
            record.addLabelValue("hostname", host.getCanonicalHostName());
        } catch (UnknownHostException e) {
            logger.log(Level.WARNING,"unable top obtain local crawl engine host",e);
        }
       
        // conforms to ISO 28500:2009 as of May 2009
        // as described at http://bibnum.bnf.fr/WARC/
        // latest draft as of November 2008
        record.addLabelValue("format","WARC File Format 1.0");
        record.addLabelValue("conformsTo","http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf");
       
        // Get other values from metadata provider

        CrawlMetadata provider = getMetadataProvider();

        addIfNotBlank(record,"operator", provider.getOperator());
        addIfNotBlank(record,"publisher", provider.getOrganization());
        addIfNotBlank(record,"audience", provider.getAudience());
        addIfNotBlank(record,"isPartOf", provider.getJobName());
        // TODO: make date match 'job creation date' as in Heritrix 1.x
        // until then, leave out (plenty of dates already in WARC
        // records
//            String rawDate = provider.getBeginDate();
//            if(StringUtils.isNotBlank(rawDate)) {
//                Date date;
//                try {
//                    date = ArchiveUtils.parse14DigitDate(rawDate);
//                    addIfNotBlank(record,"created",ArchiveUtils.getLog14Date(date));
//                } catch (ParseException e) {
//                    logger.log(Level.WARNING,"obtaining warc created date",e);
//                }
//            }
        addIfNotBlank(record,"description", provider.getDescription());
        addIfNotBlank(record,"robots", provider.getRobotsPolicyName().toLowerCase());

        addIfNotBlank(record,"http-header-user-agent",
                provider.getUserAgent());
        addIfNotBlank(record,"http-header-from",
                provider.getOperatorFrom());

        // really ugly to return as List<String>, but changing would require
        // larger refactoring
        return Collections.singletonList(record.toString());
    }
View Full Code Here

      byte GET[] = "GET".getBytes();
      byte HTTP11[] = "HTTP/1.1".getBytes();
      InetAddress addr = InetAddress.getByName(hostname);
      HttpRequestMessage requestMessage = new HttpRequestMessage(
          GET,url.getFile().getBytes(),HTTP11);
      ANVLRecord headers = new ANVLRecord();
      headers.addLabelValue("Host", hostname);
     
     
      if(offset != 0) {
        headers.addLabelValue(RANGE_HTTP_HEADER,
            HEADER_BYTES_PREFIX + String.valueOf(offset) +
              HEADER_BYTES_SUFFIX);
      }
      InetSocketAddress sockAddr = new InetSocketAddress(addr,port);
      Socket socket = new Socket();
      socket.setSoTimeout(socketTimeoutMs);
      socket.setReceiveBufferSize(BUF_SIZE);

      socket.connect(sockAddr, connectTimeoutMs);
      OutputStream socketOut = socket.getOutputStream();
      InputStream socketIn = socket.getInputStream();
      socketOut.write(requestMessage.getBytes(true));
      socketOut.write(headers.getUTF8Bytes());
      socketOut.flush();
      HttpResponse response = HttpResponse.load(socketIn);
      String contentType = response.getHeaders().asMap().get("Content-Type");
      if(contentType == null) {
        contentType = "application/unknown";
View Full Code Here

TOP

Related Classes of org.archive.util.anvl.ANVLRecord

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.