Package net.sf.regain.crawler.document

Examples of net.sf.regain.crawler.document.RawDocument

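Both excerpts below come from the regain crawler sources and are truncated fragments. The first prepares documents read from the local file system; the second is part of the crawler's main job loop, in which a RawDocument is created for each queued URL, optionally indexed and parsed for links, and finally disposed of.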

    for (int i = 0; i < docFileArr.length; i++) {
      if (docFileArr[i].isFile()) {
        String url = RegainToolkit.fileToUrl(docFileArr[i]);
        mLog.info("Preparing document: " + url);
        try {
          RawDocument doc = new RawDocument(url, sourceUrl, null, null);

          profiler.startMeasuring();
          String content;
          try {
            prep.prepare(doc);
            // … (excerpt truncated)
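Note the profiling pattern used in both excerpts: startMeasuring() before the work, stopMeasuring(byteCount) on success, and abortMeasuring() when the work fails. Below is a minimal, self-contained sketch of that pattern. The Profiler import path and its constructor arguments are assumptions, and loadDocument() is a hypothetical stand-in for the measured work; only the three measuring calls are taken directly from the excerpts.

import net.sf.regain.crawler.Profiler; // assumed location of the profiler class

public class ProfilerPatternDemo {

  public static void main(String[] args) {
    // The (name, unit) constructor arguments are an assumption.
    Profiler profiler = new Profiler("document preparation", "docs");

    profiler.startMeasuring();
    try {
      byte[] content = loadDocument();        // hypothetical stand-in for the real work
      profiler.stopMeasuring(content.length); // record the number of processed bytes
    } catch (Exception exc) {
      profiler.abortMeasuring();              // discard the failed measurement
    }
  }

  private static byte[] loadDocument() {
    return new byte[1024];
  }
}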


          // … (start of excerpt truncated; this closes a preceding check)
          continue;
        }
      }

      // Create a raw document
      RawDocument rawDocument;
      try {
        rawDocument = new RawDocument(url, mCurrentJob.getSourceUrl(),
          mCurrentJob.getSourceLinkText(),
          CrawlerToolkit.findAuthenticationValuesForURL(url, accountPasswordStore));
       
      } catch (RedirectException exc) {
        String redirectUrl = exc.getRedirectUrl();
        mLog.info("Redirect '" + url + "' -> '" + redirectUrl + "'");
        mUrlChecker.setIgnored(url);
        // The redirect URL inherits the shouldBeParsed and shouldBeIndexed
        // properties from the source URL. This may not match the whitelist
        // definitions for the redirect target.
        addJob(redirectUrl, mCurrentJob.getSourceUrl(), shouldBeParsed,
               shouldBeIndexed, mCurrentJob.getSourceLinkText());
        mCrawlerJobProfiler.stopMeasuring(0);
        continue;
      }
      catch (RegainException exc) {
        // Check whether the exception was caused by a dead link
        handleDocumentLoadingException(exc, mCurrentJob);

        // This document does not exist -> We can't parse or index anything
        // -> continue
        mCrawlerJobProfiler.abortMeasuring();
        continue;
      }

      if (shouldBeIndexed || shouldBeParsed) {
        if (mLog.isDebugEnabled()) {
          mLog.debug("Parsing and indexing " + rawDocument.getUrl());
        }
        mHtmlParsingProfiler.startMeasuring();

        // Parse and index content and metadata
        if (shouldBeIndexed) {
           try {
            mIndexWriterManager.addToIndex(rawDocument, this);
          }
          catch (RegainException exc) {
            logError("Indexing failed for: " + rawDocument.getUrl(), exc, false);
          }
        }

        // Extract links from the document (parse == true). In this context
        // "parse" really means link extraction; the document is parsed into an
        // HTML node tree in any case.
        if (shouldBeParsed) {
          if (!shouldBeIndexed) {
            // The document was not run through the document factory during
            // indexing, so create the parsed document now.
            mIndexWriterManager.getDocumentFactory().createDocument(rawDocument, this);
          }
          try {
            //parseHtmlDocument(rawDocument);
            createCrawlerJobs(rawDocument);
          }
          catch (RegainException exc) {
            logError("CrawlerJob creation failed for: " + rawDocument.getUrl(), exc, false);
          }
        }
        mHtmlParsingProfiler.stopMeasuring(rawDocument.getLength());
      }
      // Stop the time measurement (this needs the document length, so it must
      // happen before dispose()).
      mCrawlerJobProfiler.stopMeasuring(rawDocument.getLength());

      // Release the system resources held by the RawDocument.
      rawDocument.dispose();
      mCurrentJob = null;
     
      // Check whether to create a breakpoint
      int breakpointInterval = mConfiguration.getBreakpointInterval();
      boolean breakpointIntervalIsOver = (breakpointInterval > 0)
          // … (excerpt truncated)
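Pulling the pattern out of the loop above: a RawDocument is constructed from a URL (optionally with source URL, source link text, and authentication values), used, and then disposed of. Below is a minimal, self-contained sketch of that lifecycle. The import paths, the exception types thrown by the constructor, and the example URL are assumptions based on the excerpts above.

import net.sf.regain.RegainException;
import net.sf.regain.crawler.RedirectException;
import net.sf.regain.crawler.document.RawDocument;

public class RawDocumentLifecycleDemo {

  public static void main(String[] args) {
    RawDocument doc = null;
    try {
      // Loads the document behind the URL. Source URL, source link text and
      // authentication values may be null, as in the first excerpt.
      doc = new RawDocument("http://example.com/index.html", null, null, null);
      System.out.println("Loaded " + doc.getUrl()
          + " (" + doc.getLength() + " bytes)");
    } catch (RedirectException exc) {
      System.out.println("Redirected to " + exc.getRedirectUrl());
    } catch (RegainException exc) {
      exc.printStackTrace(); // e.g. a dead link
    } finally {
      if (doc != null) {
        doc.dispose(); // release buffers and temporary files
      }
    }
  }
}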
