Examples of IEntityExtractor


Examples of com.ikanow.infinit.e.data_model.interfaces.harvest.IEntityExtractor

        for (String customEntity: customEntityArray) {
          if (!customExtractors.containsKey(customEntity)) {
            // (else already have this extractor - but may have it for text, so some work to do)
            try {
              Class customEntityExtractor = customExtractorClassLoader.loadClass(customEntity);
              IEntityExtractor obj = (IEntityExtractor)customEntityExtractor.newInstance();
              entity_extractor_mappings.put(obj.getName().toLowerCase(), obj);
              customExtractors.put(customEntity, customEntityExtractor);
            }
            catch (Exception e) {
              logger.error("IEntityExtractor: Couldn't load " + customEntity +": " + e.getMessage(), e);
            }
            catch(NoClassDefFoundError e) {
              logger.error("IEntityExtractor: Couldn't load " + customEntity +": " + e.getMessage(), e);
            }       
          }
          else { // If this object exists and if it's a text extractor, then see if it's also an entity extractor
            try {
              Class customEntityExtractor = customExtractors.get(customEntity);           
              IEntityExtractor obj = (IEntityExtractor)customEntityExtractor.newInstance();
              entity_extractor_mappings.put(obj.getName(), obj);
            }
            catch (Exception e) {
              logger.error("IEntityExtractor: Couldn't use already loaded " + customEntity +": " + e.getMessage(), e);           
            }
            catch(NoClassDefFoundError e) {
View Full Code Here

Examples of com.ikanow.infinit.e.data_model.interfaces.harvest.IEntityExtractor

   */
  public void extractTextAndEntities(List<DocumentPojo> toAdd, SourcePojo source, boolean bFinalizeBatchOnly, boolean calledFromPipeline)
  throws ExtractorDocumentLevelException, ExtractorSourceLevelException,
  ExtractorDailyLimitExceededException, ExtractorSourceLevelMajorException, ExtractorSourceLevelTransientException
  {
    IEntityExtractor currentEntityExtractor = null;
    try {
      int error_on_feed_count = 0, feed_count = 0;

      // EXTRACTOR SELECTION LOGIC

      if (null != source.useExtractor()) {
        currentEntityExtractor = entity_extractor_mappings.get(source.useExtractor().toLowerCase());
        if (null == currentEntityExtractor) { // (second chance)
          currentEntityExtractor = (IEntityExtractor) lookForDynamicExtractor(source, false);
        }
      }
      if (currentEntityExtractor == null) // none specified or didn't find it (<-latter is error)
      {
        if ((null != source.useExtractor()) && !source.useExtractor().equalsIgnoreCase("none")) {         

          // ie specified one but it doesn't exist....
          StringBuffer errMsg = new StringBuffer("Skipping source=").append(source.getKey()).append(" no_extractor=").append(source.useExtractor());
          logger.warn(errMsg.toString());

          // No point trying this for the rest of the day
          throw new ExtractorSourceLevelException(errMsg.toString());         
        }
        else if (null == source.useExtractor()) { // Didn't specify one, just use default:
          currentEntityExtractor = default_entity_extractor;
        }
      }//TESTED         

      if (bFinalizeBatchOnly) {
        try {
          currentEntityExtractor.extractEntities(null);
        }
        catch (Exception e) {} // do nothing, eg handle entity extractors that don't handle things well
        return;
      }

      // A teeny bit of complex logic:
      // toAdd by default use a text extractor
      // DB/Files by default don't (but can override)

      ITextExtractor currentTextExtractor = null;
      boolean bUseRawContentWhereAvailable = false; // (only applies for feeds)
      if (null != source.useTextExtractor()) {
        currentTextExtractor = text_extractor_mappings.get(source.useTextExtractor().toLowerCase());
        if (null == currentTextExtractor) { // (second chance)
          currentTextExtractor = (ITextExtractor) lookForDynamicExtractor(source, true);
        }
      }
      if (null == currentTextExtractor) { // none specified or didn't find it (<-latter is error)       
        if (null != source.useTextExtractor()) {                   

          if ((null == source.getStructuredAnalysisConfig()) && (null == source.getUnstructuredAnalysisConfig())
              && (null == source.getProcessingPipeline()))
          {
            //(UAH and SAH get raw access to the data if they need it, so can carry on - ditto processing pipeline)

            StringBuffer errMsg = new StringBuffer("Skipping source=").append(source.getKey()).append(" no_txt_extractor=").append(source.useTextExtractor());
            logger.warn(errMsg.toString());

            // No point trying this for the rest of the day
            throw new ExtractorSourceLevelException(errMsg.toString());
          }
          else {
            bUseRawContentWhereAvailable = true; // (only checked for feeds)           
          }//TESTED
        }
        else if (source.getExtractType().equalsIgnoreCase("feed")) // (DB/files just use their existing fullText)
        {
          if (null != currentEntityExtractor) {
            String selfExtraction = currentEntityExtractor.getCapability(EntityExtractorEnum.URLTextExtraction);
            // Leave as null unless have no built-in capability
            if ((null == selfExtraction) || !selfExtraction.equals("true"))
            {
              currentTextExtractor = default_text_extractor;
            }
          }
          else {
            currentTextExtractor = default_text_extractor;           
          }
        }//TESTED   
      }

      // EXTRACTION
      Iterator<DocumentPojo> i = toAdd.iterator(); //iterator created so that elements in the toAdd list can be
      // removed within the loop
      while ( i.hasNext() )
      {
        long nTime_ms = System.currentTimeMillis();
        DocumentPojo doc = i.next();
        boolean bExtractedText = false;

        // If I've been stopped then just remove all remaining documents
        // (pick them up next time through)
        if (bIsKilled) {
          i.remove();
          if (!calledFromPipeline) {
            doc.setTempSource(null); // (can safely corrupt this doc since it's been removed)
          }
          continue;
        }

        if ( calledFromPipeline || !urlsThatError.contains(doc.getUrl()) ) //only attempt if url is okay
        {       
          feed_count++;

          try {
            // (Check for truncation)
            if ((null != currentEntityExtractor) && (null != doc.getFullText())) {
              try {
                String s = currentEntityExtractor.getCapability(EntityExtractorEnum.MaxInputBytes);
                if (null != s) {
                  int maxLength = Integer.parseInt(s);
                  if (doc.getFullText().length() > maxLength) { //just warn, it's up to the extractor to sort it out
                    getHarvestStatus().logMessage("Warning: truncating document to max length: " + s, false);
                  }
                }
              }
              catch (Exception e) {} // max length not reported just carry on
            }
           
            if (null != currentTextExtractor)
            { 
              bExtractedText = true;
              currentTextExtractor.extractText(doc);
              if (null != currentEntityExtractor) {
                currentEntityExtractor.extractEntities(doc);
              }

            }//TESTED
            else //db/filesys should already have full text extracted (unless otherwise specified)
            {
              if (source.getExtractType().equalsIgnoreCase("feed")) { // Need full text so get from current
               
                if ((null == doc.getFullText()) || !bUseRawContentWhereAvailable) {
                  bExtractedText = true;
                  if (null != currentEntityExtractor) {
                    currentEntityExtractor.extractEntitiesAndText(doc);
                  }
                }//TESTED (AlchemyAPI case)
                else { // Feed for which we've already extracted data
                  if (null != currentEntityExtractor) {
                    currentEntityExtractor.extractEntities(doc);
                  }
                }//TESTED
              }
              else { // DB/File => use full text
                if (null != currentEntityExtractor) {
                  currentEntityExtractor.extractEntities(doc);
                }
              }//TESTED
            }

            //statistics counting
View Full Code Here

Examples of com.ikanow.infinit.e.data_model.interfaces.harvest.IEntityExtractor

          ITextExtractor txtExtractor = (ITextExtractor )classToLoad.newInstance();
          text_extractor_mappings.put(source.useTextExtractor(), txtExtractor);
          outClassInstance = txtExtractor;
        }
        else {
          IEntityExtractor entExtractor = (IEntityExtractor)classToLoad.newInstance();
          entity_extractor_mappings.put(source.useExtractor(), entExtractor);         
          outClassInstance = entExtractor;
        }
      }
    }
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.