// Not allowed to stop paginating on duplicate in success_iteration/error cases
if ((null == src.getHarvestStatus()) || (HarvestEnum.success != src.getHarvestStatus().getHarvest_status())) {
searchConfig.setStopPaginatingOnDuplicate(false);
}//TESTED
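// Save the source's current UAH config and the feed config's user agent / HTTP fields / wait time,
// since the search logic overrides them below (presumably they are restored once the search completes)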
UnstructuredAnalysisConfigPojo savedUAHconfig = src.getUnstructuredAnalysisConfig(); // (can be null)
String savedUserAgent = feedConfig.getUserAgent();
LinkedHashMap<String, String> savedHttpFields = feedConfig.getHttpFields();
Integer savedWaitTimeOverride_ms = feedConfig.getWaitTimeOverride_ms();
// Create a deduplication set to ensure URLs derived from the search pages don't duplicate the originals
// (or other derived URLs)
HashSet<String> dedupSet = new HashSet<String>();
if (null != src.getRssConfig().getExtraUrls()) {
Iterator<ExtraUrlPojo> itDedupUrls = src.getRssConfig().getExtraUrls().iterator();
while (itDedupUrls.hasNext()) {
ExtraUrlPojo itUrl = itDedupUrls.next();
if (null != itUrl.title) {
String dedupUrl = itUrl.url;
dedupSet.add(dedupUrl);
if (maxDocsPerCycle != Integer.MAX_VALUE) {
maxDocsPerCycle++; // (bump the limit so these pre-specified URLs don't eat into the per-cycle budget)
}
}
}
}//TESTED
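// (only extra URLs with a title are pre-seeded: per the loop below, titled extra URLs are also
// harvested as documents in their own right, so links derived from the search pages must not duplicate them)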
Iterator<ExtraUrlPojo> itUrls = null;
// Spider parameters used in conjunction with itUrls
List<ExtraUrlPojo> iteratingList = null;
List<ExtraUrlPojo> waitingList = null;
int nIteratingDepth = 0;
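// Spidering state used together with itUrls: iteratingList holds the URLs at the current depth,
// waitingList presumably collects newly discovered URLs for the next depth, and nIteratingDepth
// tracks how deep the spider currently is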
// (ie no URL was specified, so use the extra URLs as search URLs - and optionally also as documents in their own right)
if ((null == savedUrl) && (null != src.getRssConfig().getExtraUrls()) && !src.getRssConfig().getExtraUrls().isEmpty()) {
// Spider logic:
iteratingList = src.getRssConfig().getExtraUrls();
// (end spidering logic)
itUrls = iteratingList.iterator();
src.getRssConfig().setExtraUrls(new LinkedList<ExtraUrlPojo>());
// (ie overwrite the original list)
}//TESTED
for (;;) { // Each iteration processes one search URL - either the single saved URL, or the next extra/spidered URL when itUrls is set
if (dedupSet.size() >= maxDocsPerCycle) {
break;
}
String currTitle = null;
String currFullText = null;
String currDesc = null;
if (null != itUrls) {
ExtraUrlPojo urlPojo = itUrls.next();
savedUrl = urlPojo.url;
if (0 == nIteratingDepth) {
if (null != urlPojo.title) { // Also harvest this
src.getRssConfig().getExtraUrls().add(urlPojo);
if (maxDocsPerCycle != Integer.MAX_VALUE) {
maxDocsPerCycle--; // (now that it has been added, undo the earlier increment)
}
}
}
currTitle = urlPojo.title;
currDesc = urlPojo.description;
currFullText = urlPojo.fullText;
}//TESTED
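// savedUrl (plus currTitle/currDesc/currFullText when the URL came from the extra-URL list) now
// describes the search page to process in this iteration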
try { // If we error out, we're probably going to abandon the entire search
// We're going to loop over pages
// Apply the regex to the URL for pagination, part 1
int nResultOffset = 0;
int nMaxPages = 1;
Pattern pageChangeRegex = null;
Matcher pageChangeRegexMatcher = null;
if (null != feedConfig.getSearchConfig().getPageChangeRegex()) {
pageChangeRegex = Pattern.compile(feedConfig.getSearchConfig().getPageChangeRegex(), Pattern.CASE_INSENSITIVE);
pageChangeRegexMatcher = pageChangeRegex.matcher(savedUrl);
nMaxPages = feedConfig.getSearchConfig().getNumPages();
if (pageChangeRegexMatcher.find()) {
String group = pageChangeRegexMatcher.group(1);
if (null != group) {
try {
nResultOffset = Integer.parseInt(group);
}
catch (NumberFormatException e) {} // not a valid number - just carry on with offset 0
}
}
else { // URL doesn't match
pageChangeRegexMatcher = null;
}//TESTED
}//TESTED
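// nResultOffset now holds any result offset already embedded in the URL, and pageChangeRegexMatcher
// is non-null only if the URL actually matched the pagination regex (both drive the URL rewrite below)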
// Page limit check (see also nLinksFound/nCurrDedupSetSize inside the loop)
int nMinLinksToExitLoop = 10; // (used to check one iteration past the point at which nothing new happens)
// If checking against duplicates then use a flag to exit early (note: only applies to the current URL)
boolean stopPaginating = false;
boolean stopLinkFollowing = false;
// (if told to stop paginating but only link-following occurs, treat that like pagination too, eg a "next page" style URL)
for (int nPage = 0; nPage < nMaxPages; ++nPage) {
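// Stop if the per-cycle document budget is exhausted (flagging that on the source), or if a
// duplicate/limit check below has told us to stop paginating this URL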
if ((dedupSet.size() >= maxDocsPerCycle) || stopPaginating) {
if (dedupSet.size() >= maxDocsPerCycle) {
src.setReachedMaxDocs();
}
break;
}
// Will use this to check if we reached a page limit (eg some sites will just repeat the same page over and over again)
int nLinksFound = 0;
int nCurrDedupSetSize = dedupSet.size();
String url = savedUrl;
// Apply the regex to the URL for pagination, part 2
if ((null != pageChangeRegex) && (null != feedConfig.getSearchConfig().getPageChangeReplace())) {
int nResultStart = nPage*feedConfig.getSearchConfig().getNumResultsPerPage() + nResultOffset;
String replace = feedConfig.getSearchConfig().getPageChangeReplace().replace("$1", Integer.toString(nResultStart));
if (null == pageChangeRegexMatcher) {
url += replace;
}
else {
url = pageChangeRegexMatcher.replaceFirst(replace);
}
}//TESTED
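// url now points at the current page of results: savedUrl unchanged, savedUrl with the replacement
// appended (no regex match), or savedUrl with the matched section replaced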
//DEBUG
//System.out.println("URL=" + url);
// Create a custom UAH object to fetch and parse the search results
UnstructuredAnalysisConfigPojo dummyUAHconfig = new UnstructuredAnalysisConfigPojo();
if (null == feedConfig.getSearchConfig().getScriptflags()) { // Set flags if necessary
if (null == feedConfig.getSearchConfig().getExtraMeta()) {
feedConfig.getSearchConfig().setScriptflags("dt");
}
else {
feedConfig.getSearchConfig().setScriptflags("dtm");
}
}
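// Default script flags: "dt" normally, or "dtm" when extra metadata fields are configured
// (only applied when the source hasn't set its own flags)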
if (null != feedConfig.getSearchConfig().getExtraMeta()) {
dummyUAHconfig.CopyMeta(feedConfig.getSearchConfig().getExtraMeta());
// Legacy -> Pipeline port
for (metaField extraMeta: dummyUAHconfig.getMeta()) {
if (null == extraMeta.context) { // context is mandatory in legacy configs but discarded in pipeline configs, so default it here
extraMeta.context = Context.First;
}
}
}
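// (the extra meta fields now sit on the temporary UAH config, alongside the searchEngineSubsystem
// field added below)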
dummyUAHconfig.setScript(feedConfig.getSearchConfig().getGlobals());
dummyUAHconfig.AddMetaField("searchEngineSubsystem", Context.All, feedConfig.getSearchConfig().getScript(), "javascript", feedConfig.getSearchConfig().getScriptflags());
src.setUnstructuredAnalysisConfig(dummyUAHconfig);
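// The temporary config, carrying the search script, replaces the source's own UAH config for the
// search; savedUAHconfig above keeps the original, presumably so it can be restored afterwards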
if (null != searchConfig.getProxyOverride()) {
feedConfig.setProxyOverride(searchConfig.getProxyOverride());
}
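// (the search config's proxy and, presumably, user agent settings override the feed-level values
// saved earlier)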
if (null != searchConfig.getUserAgent()) {