Examples of ExtractorSourceLevelException
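
ExtractorSourceLevelException is one of the harvest exception classes nested in com.ikanow.infinit.e.data_model.InfiniteEnums. Harvesters throw it to abandon a source for the rest of the day, for example when a source names an extractor that doesn't exist, or when every document in a batch errors. Its sibling ExtractorSourceLevelMajorException is the escalated form, thrown in the snippets below once more than 20 documents have failed.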


Examples of com.ikanow.infinit.e.data_model.InfiniteEnums.ExtractorSourceLevelException

        String errorMsg = new StringBuffer().append(feed_count).append(" docs, ").append(error_on_feed_count).append(" errors").toString();
        if (error_on_feed_count > 20) {
          throw new ExtractorSourceLevelMajorException(errorMsg);
        }
        else {
          throw new ExtractorSourceLevelException(errorMsg);
        }//TESTED (copy/paste from legacy HarvestController.extractTextAndEntities)
      }
    }//TESTED (web_errors_test)
    catch (ExtractorSourceLevelMajorException e) {
      this.handleDocOrSourceError(source, null, null, e, true);
    }
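
Both examples hinge on the same decision: if every document in a non-trivial batch errored, abort the source, escalating to the "major" exception past an arbitrary 20-error threshold. A minimal sketch of that decision, assuming ExtractorSourceLevelMajorException is nested in InfiniteEnums alongside ExtractorSourceLevelException and that both take a message string, as the snippets suggest:

    import com.ikanow.infinit.e.data_model.InfiniteEnums.ExtractorSourceLevelException;
    import com.ikanow.infinit.e.data_model.InfiniteEnums.ExtractorSourceLevelMajorException;

    public class BatchErrorCheck {
      // Abort the source if every document in a non-trivial batch errored;
      // escalate to the "major" variant past the arbitrary 20-error threshold.
      static void checkBatchErrors(int feedCount, int errorOnFeedCount)
          throws ExtractorSourceLevelException, ExtractorSourceLevelMajorException
      {
        if ((errorOnFeedCount == feedCount) && (feedCount > 5)) {
          String errorMsg = feedCount + " docs, " + errorOnFeedCount + " errors";
          if (errorOnFeedCount > 20) {
            throw new ExtractorSourceLevelMajorException(errorMsg);
          }
          throw new ExtractorSourceLevelException(errorMsg);
        }
      }
    }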

Examples of com.ikanow.infinit.e.data_model.InfiniteEnums.ExtractorSourceLevelException

          // ie specified one but it doesn't exist....
          StringBuffer errMsg = new StringBuffer("Skipping source=").append(source.getKey()).append(" no_extractor=").append(source.useExtractor());
          logger.warn(errMsg.toString());

          // No point trying this for the rest of the day
          throw new ExtractorSourceLevelException(errMsg.toString());         
        }
        else if (null == source.useExtractor()) { // Didn't specify one, just use default:
          currentEntityExtractor = default_entity_extractor;
        }
      }//TESTED         

      if (bFinalizeBatchOnly) {
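        // (presumably, passing null tells batching entity extractors to flush/finalize their current batch)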
        try {
          currentEntityExtractor.extractEntities(null);
        }
          catch (Exception e) {} // do nothing, e.g. entity extractors that don't handle the finalize (null) call well
        return;
      }

      // A teeny bit of complex logic:
      // - feed docs use a text extractor by default
      // - DB/file docs by default don't (but can override)

      ITextExtractor currentTextExtractor = null;
      boolean bUseRawContentWhereAvailable = false; // (only applies for feeds)
      if (null != source.useTextExtractor()) {
        currentTextExtractor = text_extractor_mappings.get(source.useTextExtractor().toLowerCase());
        if (null == currentTextExtractor) { // (second chance)
          currentTextExtractor = (ITextExtractor) lookForDynamicExtractor(source, true);
        }
      }
      if (null == currentTextExtractor) { // none specified or didn't find it (<-latter is error)       
        if (null != source.useTextExtractor()) {                   

          if ((null == source.getStructuredAnalysisConfig()) && (null == source.getUnstructuredAnalysisConfig())
              && (null == source.getProcessingPipeline()))
          {
            //(UAH and SAH get raw access to the data if they need it, so can carry on - ditto processing pipeline)

            StringBuffer errMsg = new StringBuffer("Skipping source=").append(source.getKey()).append(" no_txt_extractor=").append(source.useTextExtractor());
            logger.warn(errMsg.toString());

            // No point trying this for the rest of the day
            throw new ExtractorSourceLevelException(errMsg.toString());
          }
          else {
            bUseRawContentWhereAvailable = true; // (only checked for feeds)           
          }//TESTED
        }
        else if (source.getExtractType().equalsIgnoreCase("feed")) // (DB/files just use their existing fullText)
        {
          if (null != currentEntityExtractor) {
            String selfExtraction = currentEntityExtractor.getCapability(EntityExtractorEnum.URLTextExtraction);
            // Leave as null unless have no built-in capability
            if ((null == selfExtraction) || !selfExtraction.equals("true"))
            {
              currentTextExtractor = default_text_extractor;
            }
          }
          else {
            currentTextExtractor = default_text_extractor;           
          }
        }//TESTED   
      }

      // EXTRACTION
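      // (loop over the candidate docs: extract text and/or entities, throttle feed fetches, drop docs that error)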
      Iterator<DocumentPojo> i = toAdd.iterator(); // (iterator used so elements can be removed from the toAdd list within the loop)
      while ( i.hasNext() )
      {
        long nTime_ms = System.currentTimeMillis();
        DocumentPojo doc = i.next();
        boolean bExtractedText = false;

        // If I've been stopped then just remove all remaining documents
        // (pick them up next time through)
        if (bIsKilled) {
          i.remove();
          if (!calledFromPipeline) {
            doc.setTempSource(null); // (can safely corrupt this doc since it's been removed)
          }
          continue;
        }

        if ( calledFromPipeline || !urlsThatError.contains(doc.getUrl()) ) //only attempt if url is okay
        {       
          feed_count++;

          try {
            // (Check for truncation)
            if ((null != currentEntityExtractor) && (null != doc.getFullText())) {
              try {
                String s = currentEntityExtractor.getCapability(EntityExtractorEnum.MaxInputBytes);
                if (null != s) {
                  int maxLength = Integer.parseInt(s);
                  if (doc.getFullText().length() > maxLength) { //just warn, it's up to the extractor to sort it out
                    getHarvestStatus().logMessage("Warning: truncating document to max length: " + s, false);
                  }
                }
              }
              catch (Exception e) {} // max length not reported, just carry on
            }
           
            if (null != currentTextExtractor)
            { 
              bExtractedText = true;
              currentTextExtractor.extractText(doc);
              if (null != currentEntityExtractor) {
                currentEntityExtractor.extractEntities(doc);
              }

            }//TESTED
            else //db/filesys should already have full text extracted (unless otherwise specified)
            {
              if (source.getExtractType().equalsIgnoreCase("feed")) { // Need full text so get from current
               
                if ((null == doc.getFullText()) || !bUseRawContentWhereAvailable) {
                  bExtractedText = true;
                  if (null != currentEntityExtractor) {
                    currentEntityExtractor.extractEntitiesAndText(doc);
                  }
                }//TESTED (AlchemyAPI case)
                else { // Feed for which we've already extracted data
                  if (null != currentEntityExtractor) {
                    currentEntityExtractor.extractEntities(doc);
                  }
                }//TESTED
              }
              else { // DB/File => use full text
                if (null != currentEntityExtractor) {
                  currentEntityExtractor.extractEntities(doc);
                }
              }//TESTED
            }

            //statistics counting
            if ( doc.getEntities() != null )
              num_ent_extracted.addAndGet(doc.getEntities().size());
            if ( doc.getAssociations() != null )
              num_event_extracted.addAndGet(doc.getAssociations().size());

          }
          catch (ExtractorDailyLimitExceededException e) {

            //extractor can't do anything else today, return
            i.remove();
            if (!calledFromPipeline) {
              doc.setTempSource(null); // (can safely corrupt this doc since it's been removed)
            }

            // Source error, ignore all other documents
            while (i.hasNext()) {
              doc = i.next();
              if (!calledFromPipeline) {
                doc.setTempSource(null); // (can safely corrupt this doc since it's been removed)
              }
              i.remove();
            }
            //TESTED

            throw e; // (ie stop processing this source)
          }//TESTED
          catch (Exception e) { // Anything except daily limit exceeded, expect it to be ExtractorDocumentLevelException

            //TODO (INF-1922): put this in a separate function and call that from pipeline on failure...
            // (not sure what to do about error_on_feed_count though, need to maintain a separate one of those in pipeline?)
           
            // This can come from (sort-of/increasingly) "user" code so provide a bit more information
            StringBuffer errMessage = HarvestExceptionUtils.createExceptionMessage(e);
            _harvestStatus.logMessage(errMessage.toString(), true);
            num_error_url.incrementAndGet();
            nUrlErrorsThisSource++;
           
            if (!calledFromPipeline) {
              urlsThatError.add(doc.getUrl());
            }

            error_on_feed_count++;
            i.remove();
            if (!calledFromPipeline) {
              doc.setTempSource(null); // (can safely corrupt this doc since it's been removed)
            }
          }
          //TESTED
        }
        // (note this is only ever called in legacy mode - it's handled in the HarvestControllerPipeline)
        if ((null != source.getExtractType()) && (source.getExtractType().equalsIgnoreCase("feed"))) {
          if (i.hasNext() && bExtractedText) {
            nTime_ms = nBetweenFeedDocs_ms - (System.currentTimeMillis() - nTime_ms); // (ie delay time - processing time)
            if (nTime_ms > 0) {
              try { Thread.sleep(nTime_ms); } catch (Exception e) {}
              // (wait 10s between web-site accesses for politeness)
            }
          }
        }//(TESTED)

      } // end loop over documents 
      //check if all toAdd docs errored; escalate if more than 20 (an arbitrary threshold) did
      //NOTE: this is duplicated in HarvestControllerPipeline for non-legacy cases
      if ((error_on_feed_count == feed_count) && (feed_count > 5))
      {
        String errorMsg = new StringBuffer().append(feed_count).append(" docs, ").append(error_on_feed_count).append(" errors").toString();
        if (error_on_feed_count > 20) {
          throw new ExtractorSourceLevelMajorException(errorMsg);
        }
        else {
          throw new ExtractorSourceLevelException(errorMsg);
        }//TESTED
      }
    }
    catch (ExtractorDailyLimitExceededException e) {
      // Percolate upwards!
      throw e;
    }
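
On the caller side, the fragments suggest the handling pattern: the daily-limit exception is rethrown ("percolate upwards") so the whole harvest cycle stops, while source-level exceptions are routed to an error handler. A hedged sketch of that shape - handleDocOrSourceError and its boolean flag for the major case come from the first fragment; the enclosing method, the extraction call's signature, and the flag value for the milder case are assumptions:

    void harvestSource(SourcePojo source, List<DocumentPojo> toAdd) throws ExtractorDailyLimitExceededException {
      try {
        extractTextAndEntities(toAdd, source); // hypothetical call into the logic shown above
      }
      catch (ExtractorDailyLimitExceededException e) {
        throw e; // percolate upwards: nothing more can be extracted today
      }
      catch (ExtractorSourceLevelMajorException e) {
        this.handleDocOrSourceError(source, null, null, e, true); // per the first fragment
      }
      catch (ExtractorSourceLevelException e) {
        this.handleDocOrSourceError(source, null, null, e, false); // assumed flag: milder error, retry the source later
      }
    }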