Package net.sf.regain.crawler.document

Examples of net.sf.regain.crawler.document.RawDocument

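Both excerpts below come from the regain crawler sources and are truncated fragments. The first prepares documents read from the local file system; the second is part of the crawler's main job loop, in which a RawDocument is created for each queued URL, optionally indexed and parsed for links, and finally disposed of.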

    for (int i = 0; i < docFileArr.length; i++) {
      if (docFileArr[i].isFile()) {
        String url = RegainToolkit.fileToUrl(docFileArr[i]);
        mLog.info("Preparing document: " + url);
        try {
          RawDocument doc = new RawDocument(url, sourceUrl, null, null);

          profiler.startMeasuring();
          String content;
          try {
            prep.prepare(doc);
            // … (excerpt truncated)
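Note the profiling pattern used in both excerpts: startMeasuring() before the work, stopMeasuring(byteCount) on success, and abortMeasuring() when the work fails. Below is a minimal, self-contained sketch of that pattern. The Profiler import path and its constructor arguments are assumptions, and loadDocument() is a hypothetical stand-in for the measured work; only the three measuring calls are taken directly from the excerpts.

import net.sf.regain.crawler.Profiler; // assumed location of the profiler class

public class ProfilerPatternDemo {

  public static void main(String[] args) {
    // The (name, unit) constructor arguments are an assumption.
    Profiler profiler = new Profiler("document preparation", "docs");

    profiler.startMeasuring();
    try {
      byte[] content = loadDocument();        // hypothetical stand-in for the real work
      profiler.stopMeasuring(content.length); // record the number of processed bytes
    } catch (Exception exc) {
      profiler.abortMeasuring();              // discard the failed measurement
    }
  }

  private static byte[] loadDocument() {
    return new byte[1024];
  }
}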


          // … (start of excerpt truncated; this closes a preceding check)
          continue;
        }
      }

      // Create a raw document
      RawDocument rawDocument;
      try {
        rawDocument = new RawDocument(url, mCurrentJob.getSourceUrl(),
          mCurrentJob.getSourceLinkText(),
          CrawlerToolkit.findAuthenticationValuesForURL(url, accountPasswordStore));
       
      } catch (RedirectException exc) {
        String redirectUrl = exc.getRedirectUrl();
        mLog.info("Redirect '" + url + "' -> '" + redirectUrl + "'");
        mUrlChecker.setIgnored(url);
        // The redirect URL inherits the shouldBeParsed and shouldBeIndexed
        // properties from the source URL. This may not match the whitelist
        // definitions for the redirect target.
        addJob(redirectUrl, mCurrentJob.getSourceUrl(), shouldBeParsed,
               shouldBeIndexed, mCurrentJob.getSourceLinkText());
        mCrawlerJobProfiler.stopMeasuring(0);
        continue;
      }
      catch (RegainException exc) {
        // Check whether the exception was caused by a dead link
        handleDocumentLoadingException(exc, mCurrentJob);

        // This document does not exist -> We can't parse or index anything
        // -> continue
        mCrawlerJobProfiler.abortMeasuring();
        continue;
      }

      if (shouldBeIndexed || shouldBeParsed) {
        if (mLog.isDebugEnabled()) {
          mLog.debug("Parsing and indexing " + rawDocument.getUrl());
        }
        mHtmlParsingProfiler.startMeasuring();

        // Parse and index content and metadata
        if (shouldBeIndexed) {
           try {
            mIndexWriterManager.addToIndex(rawDocument, this);
          }
          catch (RegainException exc) {
            logError("Indexing failed for: " + rawDocument.getUrl(), exc, false);
          }
        }

        // Extract links from the document (parse == true). In this context
        // "parse" really means link extraction; the document is parsed into an
        // HTML node tree in any case.
        if (shouldBeParsed) {
          if (!shouldBeIndexed) {
            // The document was not run through the document factory during
            // indexing, so create the parsed document now.
            mIndexWriterManager.getDocumentFactory().createDocument(rawDocument, this);
          }
          try {
            //parseHtmlDocument(rawDocument);
            createCrawlerJobs(rawDocument);
          }
          catch (RegainException exc) {
            logError("CrawlerJob creation failed for: " + rawDocument.getUrl(), exc, false);
          }
        }
        mHtmlParsingProfiler.stopMeasuring(rawDocument.getLength());
      }
      // Stop the time measurement (this needs the document length, so it must
      // happen before dispose()).
      mCrawlerJobProfiler.stopMeasuring(rawDocument.getLength());

      // Release the system resources held by the RawDocument.
      rawDocument.dispose();
      mCurrentJob = null;
     
      // Check whether to create a breakpoint
      int breakpointInterval = mConfiguration.getBreakpointInterval();
      boolean breakpointIntervalIsOver = (breakpointInterval > 0)
          // … (excerpt truncated)
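Pulling the pattern out of the loop above: a RawDocument is constructed from a URL (optionally with source URL, source link text, and authentication values), used, and then disposed of. Below is a minimal, self-contained sketch of that lifecycle. The import paths, the exception types thrown by the constructor, and the example URL are assumptions based on the excerpts above.

import net.sf.regain.RegainException;
import net.sf.regain.crawler.RedirectException;
import net.sf.regain.crawler.document.RawDocument;

public class RawDocumentLifecycleDemo {

  public static void main(String[] args) {
    RawDocument doc = null;
    try {
      // Loads the document behind the URL. Source URL, source link text and
      // authentication values may be null, as in the first excerpt.
      doc = new RawDocument("http://example.com/index.html", null, null, null);
      System.out.println("Loaded " + doc.getUrl()
          + " (" + doc.getLength() + " bytes)");
    } catch (RedirectException exc) {
      System.out.println("Redirected to " + exc.getRedirectUrl());
    } catch (RegainException exc) {
      exc.printStackTrace(); // e.g. a dead link
    } finally {
      if (doc != null) {
        doc.dispose(); // release buffers and temporary files
      }
    }
  }
}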
