Package org.archive.io

Examples of org.archive.io.ArchiveReader


        BasePath outputPath = platform.makePath("build/test/ParserPipeTest/out");
        Tap out = platform.makeTap(platform.makeBinaryScheme(ParsedDatum.FIELDS), outputPath, SinkMode.REPLACE);

        TupleEntryCollector write = in.openForWrite(platform.makeFlowProcess());

        ArchiveReader archiveReader = ArchiveReaderFactory.get("src/test/resources/someHtml.arc");
        Iterator<ArchiveRecord> iterator = archiveReader.iterator();
        int max = 300;
        int count = 0;
        int validRecords = 0;
        while (count++ < max && iterator.hasNext()) {
            ArchiveRecord archiveRecord = iterator.next();
View Full Code Here


       
        /**
         * @return Null if fails download.
         */
        protected ArchiveReader getArchiveReader() {
            ArchiveReader arc = null;
            // Need a thread that will keep updating TaskTracker during long
            // downloads else tasktracker will kill us.
            Thread reportingDuringDownload = null;
            try {
                this.reporter.setStatus("opening " + this.location, true);
View Full Code Here

        /**
         * Streams every record of the ARC file at {@code this.location}
         * through the wrapped mapper, reporting progress and counters to
         * the tasktracker as it goes. Bails out early when this thread is
         * interrupted or when an over-long record is encountered (see the
         * inline note below). The reader is always closed in the finally
         * block, and the mapper gets matching onARCOpen/onARCClose calls.
         */
        public void run() {
            // No input location configured; nothing to process.
            if (this.location == null || this.location.length() <= 0) {
                return;
            }
             
            // getArchiveReader() is documented to return null when the
            // download fails; in that case there is nothing to iterate.
            ArchiveReader arc = getArchiveReader();
            if (arc == null) {
                return;
            }

            try {
                ARCMapRunner.this.mapper.onARCOpen();
                this.reporter.incrCounter(Counter.ARCS_COUNT, 1);             
               
                // Iterate over each ARCRecord.
                for (final Iterator i = arc.iterator();
                        i.hasNext() && !currentThread().isInterrupted();) {
                    final ARCRecord rec = (ARCRecord)i.next();
                    this.reporter.incrCounter(Counter.ARCRECORDS_COUNT, 1);
                   
                   
                    // Per-record try: a bad record is logged and skipped so
                    // one failed parse does not abort the whole ARC.
                    try {
                        ARCMapRunner.this.mapper.map(
                            new Text(rec.getMetaData().getUrl()),
                            new ObjectWritable(rec), this.output,
                            this.reporter);
                       
                        // Content length = total length minus header offset;
                        // fall back to the raw length if offsets look odd.
                        final long b = rec.getMetaData().getContentBegin();
                        final long l = rec.getMetaData().getLength();
                        final long recordLength = (l > b)? (l - b): l;
                        if (recordLength >
                                ARCConstants.DEFAULT_MAX_ARC_FILE_SIZE) {
                            // Now, if the content length is larger than a
                            // standard ARC, then it is most likely the last
                            // record in the ARC because ARC is closed after we
                            // exceed 100MB (DEFAULT_MAX_ARC...). Calling
                            // hasNext above will make us read through the
                            // whole record, even if its a 1.7G video. On a
                            // loaded machine, this might cause us timeout with
                            // tasktracker -- so, just skip out here.
                            this.reporter.setStatus("skipping " +
                                this.location + " -- very long record " +
                                rec.getMetaData());
                            this.reporter.
                                incrCounter(Counter.LONG_ARCRECORDS_COUNT, 1);
                            break;
                        }
                    } catch (final Throwable e) {
                        // Failed parse of record. Keep going.
                        LOG.warn("Error processing " + rec.getMetaData(), e);
                    }
                }
                if (currentThread().isInterrupted()) {
                    LOG.info(currentThread().getName() + " interrupted");
                }               
                this.reporter.setStatus("closing " + this.location, true);
               
            } catch (final Throwable e) {
                // Problem parsing arc file.
                this.reporter.incrCounter(Counter.BAD_ARC_PARSE_COUNT, 1);
                final String msg = "Error parsing " + this.location;
                //try {
                    this.reporter.setStatus(msg, true);
                /* TODO MC - to be compitable with hadoop 0.14
                } catch (final IOException ioe) {
                    ioe.printStackTrace();
                }
                */
                LOG.warn("ARCMapRunner - Throwable:"+ msg, e);           
            }
            // Always close the reader and notify the mapper, even on error.
      finally {
                try {
                    arc.close();
                    ARCMapRunner.this.mapper.onARCClose();
                } catch (final IOException e) {
                    e.printStackTrace();
                }
            }
View Full Code Here

        testGetArcStream(false);
    }

    protected void testGetArcStream(boolean compress) throws IOException, FileNotFoundException {
        File arc = ARCWriterTest.createARCFile(getTmpDir(), true);
        ArchiveReader reader = ARCReaderFactory.get(null, new FileInputStream(arc), compress);
        assertNotNull(reader);
        Iterator<ArchiveRecord> i = reader.iterator();
       
        // ARC header
        assertTrue(i.hasNext());
        ARCRecord r = (ARCRecord)i.next();
        assertEquals("filedesc://test.arc", r.getHeader().getHeaderValue("subject-uri"));
       
        // 1 fake http record
        assertTrue(i.hasNext());
        r = (ARCRecord)i.next();
        assertEquals(200, r.getStatusCode());
        assertEquals("http://www.archive.org/test/", r.getHeader().getHeaderValue("subject-uri"));
       
        assertFalse(i.hasNext());
        reader.close();
    }
View Full Code Here

     * {@link HttpParser#parseHeaders(java.io.InputStream, String)}, which
     * wayback cdx indexer depends on
     */
    public void testBadArcHeaders() throws IOException {
        ByteArrayInputStream in = new ByteArrayInputStream(ARC_RECORD_BAD_HEADERS.getBytes("UTF-8"));
        ArchiveReader reader = ARCReaderFactory.get(null, in, false);
        assertNotNull(reader);

        Iterator<ArchiveRecord> i = reader.iterator();

        // ARC header
        assertTrue(i.hasNext());
        ARCRecord r = (ARCRecord)i.next();
        assertEquals("filedesc://NARA-PEOT-2004-20041014205819-00000-crawling009.archive.org.arc", r.getHeader().getHeaderValue("subject-uri"));

        // record with 2 http status lines
        assertTrue(i.hasNext());
        r = (ARCRecord)i.next();
        assertEquals("http://schrock.house.gov/PollTemplate_PollAnswers_1", r.getHeader().getHeaderValue("subject-uri"));
        assertEquals(302, r.getStatusCode());
        assertEquals("HttpClient-Bad-Header-Line-Failed-Parse", r.getHttpHeaders()[3].getName());
        assertEquals("HTTP/1.1 404 Object Not Found", r.getHttpHeaders()[3].getValue());

        assertFalse(i.hasNext());
        reader.close();
    }
View Full Code Here

    return r;
  }
 
  protected Resource loadResource(String path, InputStream is) throws IOException, ResourceNotAvailableException
  {
    ArchiveReader archiveReader = ArchiveReaderFactory.get(path, is, false);
   
    if (archiveReader instanceof ARCReader) {
      return new ArcResource((ARCRecord)archiveReader.get(), archiveReader);
    } else if (archiveReader instanceof WARCReader) {
      return new WarcResource((WARCRecord)archiveReader.get(), archiveReader)
    } else {
      throw new IOException("Unknown ArchiveReader");
    }
  }
View Full Code Here

    FSDataInputStream is = hdfsSys.open( path );
    is.seek( offset );

    if (isArc(path.getName()))
      {
        ArchiveReader reader = ARCReaderFactory.get(path.getName(), is, false);
        r = ARCArchiveRecordToResource(reader.get(), reader);
      }
    else if (isWarc(path.getName()))
      {
        ArchiveReader reader = WARCReaderFactory.get(path.getName(), is, false);
        r = WARCArchiveRecordToResource(reader.get(), reader);
      }
    else
      {
      is.close();
        throw new ResourceNotAvailableException("Unknown extension");
View Full Code Here

    RandomAccessFile raf = new RandomAccessFile(file, "r");
    raf.seek(offset);
    InputStream is = new FileInputStream(raf.getFD());
    String fPath = file.getAbsolutePath();
    if (isArc(name)) {
      ArchiveReader reader = ARCReaderFactory.get(fPath, is, false);
      r = ARCArchiveRecordToResource(reader.get(), reader);

    } else if (isWarc(name)) {

      ArchiveReader reader = WARCReaderFactory.get(fPath, is, false);
      r = WARCArchiveRecordToResource(reader.get(), reader);

    } else {
      is.close();
      raf.close();
      throw new ResourceNotAvailableException("Unknown extension");
View Full Code Here

  throws IOException, ResourceNotAvailableException {
   
    Resource r = null;
    long start = System.currentTimeMillis();
    TimeoutArchiveReaderFactory tarf = defaultTimeoutReader;
    ArchiveReader reader = tarf.getArchiveReader(url,offset);
    if(reader instanceof ARCReader) {
      ARCReader areader = (ARCReader) reader;
      r = ARCArchiveRecordToResource(areader.get(),areader);
   
    } else if(reader instanceof WARCReader) {
View Full Code Here

        final byte[] block = "blahblahblah\n".getBytes();
        WARCRecordInfo recinfo = new TestWARCRecordInfo(block);
        recinfo.setType(WARCRecordType.resource);
        recinfo.setUrl("ftp://ftp.example.com/afile.txt");
        recinfo.setMimetype(ct);
        ArchiveReader ar = new TestWARCReader(recinfo);
        WARCRecord rec = (WARCRecord)ar.get(0);
        WarcResource res = new WarcResource(rec, ar);
        res.parseHeaders();
               
        int scode = res.getStatusCode();
        assertEquals("statusCode", 200, scode);
View Full Code Here

TOP

Related Classes of org.archive.io.ArchiveReader

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.