{
// The assumption is that this map is being run by ARCMapRunner;
// otherwise, the casts below fail.
String url = key.toString();
ARCRecord rec = (ARCRecord)((ObjectWritable)value).get();
ARCReporter reporter = (ARCReporter)r;
// It's null the first time map is called on an ARC.
checkArcName(rec);
if (! isIndex(rec))
{
return;
}
checkCollectionName();
final ARCRecordMetaData arcData = rec.getMetaData();
String oldUrl = url;
try
{
url = urlNormalizers.normalize(url, URLNormalizers.SCOPE_FETCHER);
url = filters.filter(url); // filter the url
}
catch (Exception e)
{
LOG.warn("Skipping record. Didn't pass normalization/filter " +
oldUrl + ": " + e.toString());
return;
}
if (url == null)
{
// Nutch URLFilters signal rejection by returning null.
LOG.warn("Skipping record. Filtered out: " + oldUrl);
return;
}
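// Content length is the record length minus the offset at which the
// content begins (i.e. minus the HTTP response header bytes). If the
// begin offset doesn't fall inside the record, fall back to the full
// length.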
final long b = arcData.getContentBegin();
final long l = arcData.getLength();
final long recordLength = (l > b)? (l - b): l;
// Look at the ARCRecord metadata line's mimetype. It can be empty. If
// so, we get two more chances at figuring it out: the HTTP headers or
// the first few bytes of the file. See below.
String mimetype =
getMimetype(arcData.getMimetype(), this.mimeTypes, url);
if (skip(mimetype))
{
return;
}
// Copy HTTP headers to Nutch metadata.
final Metadata metaData = new Metadata();
final Header[] headers = rec.getHttpHeaders();
for (int j = 0; j < headers.length; j++)
{
final Header header = headers[j];
if (mimetype == null)
{
// Special handling. If the mimetype is still null, try getting it
// from the HTTP header. I've seen ARC record lines with an empty
// content-type and a file ending MIME can't parse; e.g. .MID.
if ((header.getName() != null) &&
header.getName().toLowerCase().equals(ImportArcs.CONTENT_TYPE_KEY))
{
mimetype = getMimetype(header.getValue(), null, null);
if (skip(mimetype))
{
return;
}
}
}
metaData.set(header.getName(), header.getValue());
}
// This call to reporter.setStatus pings the tasktracker, reporting
// our status and letting it know we're still alive (so it doesn't
// time us out).
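// Whitespace in the mimetype is collapsed to '-' so the status string
// stays a single token; 'TODO' stands in when the mimetype is empty.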
final String noSpacesMimetype =
TextUtils.replaceAll(ImportArcs.WHITESPACE,
((mimetype == null || mimetype.length() <= 0)?
"TODO": mimetype),
"-");
final String recordLengthAsStr = Long.toString(recordLength);
reporter.setStatus(getStatus(url, oldUrl, recordLengthAsStr, noSpacesMimetype));
// This is a Nutch 'more' field.
metaData.set("contentLength", recordLengthAsStr);
rec.skipHttpHeader();
reporter.setStatusIfElapse("read headers on " + url);
// TODO: Skip if unindexable type.
int total = 0;
// Read in the first block. If the mimetype is still null, sniff it
// from the content's magic bytes.
int len = rec.read(this.buffer, 0, this.buffer.length);
if (mimetype == null)
{
MimeType mt = this.mimeTypes.getMimeType(this.buffer);
if (mt == null || mt.getName() == null)
{
LOG.warn("Failed to get mimetype for: " + url);
return;
}
mimetype = mt.getName();
}
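// By now the mimetype is non-null: it came from the ARC metadata
// line, an HTTP header, or the content sniffing above.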
metaData.set(ImportArcs.CONTENT_TYPE_KEY, mimetype);
// How much do we read in total? For PDFs we read more. A limit of -1
// means read everything.
int readLimit = (ImportArcs.PDF_TYPE.equals(mimetype))?
this.pdfContentLimit : this.contentLimit;
// Reset our contentBuffer so we can reuse it. Over the life of an
// ARC's processing it will grow to the maximum record size.
this.contentBuffer.reset();
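// Note the limit is checked per block, so total may overshoot
// readLimit by up to one buffer's worth.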
while ((len != -1) && ((readLimit == -1) || (total < readLimit)))
{
total += len;
this.contentBuffer.write(this.buffer, 0, len);
len = rec.read(this.buffer, 0, this.buffer.length);
reporter.setStatusIfElapse("reading " + url);
}
// Close the record. We're done with it. A side-effect is calculation
// of the digest, if we're digesting.
rec.close();
reporter.setStatusIfElapse("closed " + url);
final byte[] contentBytes = this.contentBuffer.toByteArray();
final CrawlDatum datum = new CrawlDatum();
datum.setStatus(CrawlDatum.STATUS_FETCH_SUCCESS);
// Calculate an MD5 digest or use the precalculated SHA1 from the ARC.
String digest = (this.sha1)? rec.getDigestStr():
MD5Hash.digest(contentBytes).toString();
metaData.set(Nutch.SIGNATURE_KEY, digest);
// Set the digest back into the arcData so it's available later when
// we write the CDX line.