Package org.archive.io

Examples of org.archive.io.ReplayInputStream


     * @param curi CrawlURI to process.
     */
    protected boolean innerExtract(CrawlURI curi){
        int links = 0;
        InputStream contentStream = null;
        ReplayInputStream documentStream = null;
        SeekReader docReader = null;

        // Get the doc as a repositionable reader
        try
        {
            contentStream = curi.getRecorder().getContentReplayInputStream();
            if (contentStream==null) {
                // TODO: note problem
                return false;
            }
            documentStream = new ReplayInputStream(contentStream);
          
           
            docReader = Doc.getText(documentStream);
        } catch(Exception e){
            curi.getNonFatalFailures().add(e);
            return false;
        } finally {
            IOUtils.closeQuietly(contentStream);
        }

        CharSequence cs = new SeekReaderCharSequence(docReader, 0);
        Matcher m = PATTERN.matcher(cs);
        while (m.find()) {
            links++;
            addLink(curi, m.group(1));
        }
        documentStream.destroy();
        logger.fine(curi + " has " + links + " links.");
        return true;
    }
View Full Code Here


      }
    }

    // now write the content, or a fake record:
    ARCWriter writer = null;
    ReplayInputStream replayIS = null;
    try {
      writer = cache.getWriter();
      if(gotUrl) {

        RecordingInputStream ris = recorder.getRecordedInput();
View Full Code Here

TOP

Related Classes of org.archive.io.ReplayInputStream

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.