// otherwise get everything and worry about max docs in the main feed harvester
// (probably slightly less efficient than checking duplicates here, but much simpler, can
// always change it later)
String savedUrl = src.getUrl();
SourceRssConfigPojo feedConfig = src.getRssConfig();
if ((null == feedConfig) || (null == feedConfig.getSearchConfig())) {
// (nothing to do if this source has no search-engine configuration; check before dereferencing)
return;
}
SourceSearchFeedConfigPojo searchConfig = feedConfig.getSearchConfig();
String savedProxyOverride = feedConfig.getProxyOverride();
// Not allowed to stop paginating on duplicate in success_iteration/error cases
if ((null == src.getHarvestStatus()) || (HarvestEnum.success != src.getHarvestStatus().getHarvest_status())) {
searchConfig.setStopPaginatingOnDuplicate(false);
}//TESTED
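// Save the source's original fetch configuration (UAH config, user agent, HTTP fields, wait time;
// the proxy override was saved above) so the temporary overrides applied inside the loop below
// can be restored in the finally block at the end.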
UnstructuredAnalysisConfigPojo savedUAHconfig = src.getUnstructuredAnalysisConfig(); // (can be null)
String savedUserAgent = feedConfig.getUserAgent();
LinkedHashMap<String, String> savedHttpFields = feedConfig.getHttpFields();
Integer savedWaitTimeOverride_ms = feedConfig.getWaitTimeOverride_ms();
// Create a deduplication set to ensure URLs derived from the search pages don't duplicate the originals
// (and also derived URLs)
HashSet<String> dedupSet = new HashSet<String>();
if (null != src.getRssConfig().getExtraUrls()) {
Iterator<ExtraUrlPojo> itDedupUrls = src.getRssConfig().getExtraUrls().iterator();
while (itDedupUrls.hasNext()) {
ExtraUrlPojo itUrl = itDedupUrls.next();
if (null != itUrl.title) {
String dedupUrl = itUrl.url;
dedupSet.add(dedupUrl);
if (maxDocsPerCycle != Integer.MAX_VALUE) {
maxDocsPerCycle++; // (ensure we get as far as adding these)
}
}
}
}//TESTED
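// Spidering state: 'iteratingList' holds the URLs being processed at the current depth, and
// 'waitingList' collects links returned with spiderOut=true, which are picked up as the next
// depth's search URLs once the current iterator is exhausted (see the hasNext() check at the
// bottom of the outer loop).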
Iterator<ExtraUrlPojo> itUrls = null;
// Spider parameters used in conjunction with itUrls
List<ExtraUrlPojo> iteratingList = null;
List<ExtraUrlPojo> waitingList = null;
int nIteratingDepth = 0;
// (ie no URL specified, so using extra URLs as search URLs - and optionally as real URLs also)
if ((null == savedUrl) && (null != src.getRssConfig().getExtraUrls()) && !src.getRssConfig().getExtraUrls().isEmpty()) {
// Spider logic:
iteratingList = src.getRssConfig().getExtraUrls();
// (end spidering logic)
itUrls = iteratingList.iterator();
src.getRssConfig().setExtraUrls(new LinkedList<ExtraUrlPojo>());
// (ie overwrite the original list)
}//TESTED
for (;;) { // One pass if a single search URL is configured (src.getUrl()), otherwise one pass per extra URL when spidering
if (dedupSet.size() >= maxDocsPerCycle) {
break;
}
String currTitle = null;
String currFullText = null;
String currDesc = null;
if (null != itUrls) {
ExtraUrlPojo urlPojo = itUrls.next();
savedUrl = urlPojo.url;
if (0 == nIteratingDepth) {
if (null != urlPojo.title) { // Also harvest this
src.getRssConfig().getExtraUrls().add(urlPojo);
if (maxDocsPerCycle != Integer.MAX_VALUE) {
maxDocsPerCycle--; // (now added, can remove)
}
}
}
currTitle = urlPojo.title;
currDesc = urlPojo.description;
currFullText = urlPojo.fullText;
}//TESTED
try { // If we error out, we're probably going to abandon the entire search
// We're going to loop over pages
// Apply the regex to the URL for pagination, part 1
int nResultOffset = 0;
int nMaxPages = 1;
Pattern pageChangeRegex = null;
Matcher pageChangeRegexMatcher = null;
if (null != feedConfig.getSearchConfig().getPageChangeRegex()) {
pageChangeRegex = Pattern.compile(feedConfig.getSearchConfig().getPageChangeRegex(), Pattern.CASE_INSENSITIVE);
pageChangeRegexMatcher = pageChangeRegex.matcher(savedUrl);
nMaxPages = feedConfig.getSearchConfig().getNumPages();
if (pageChangeRegexMatcher.find()) {
String group = pageChangeRegexMatcher.group(1);
if (null != group) {
try {
nResultOffset = Integer.parseInt(group);
}
catch (Exception e) {} // just carry on
}
}
else { // URL doesn't match
pageChangeRegexMatcher = null;
}//TESTED
}//TESTED
// Page limit check (see also nLinksFound/nCurrDedupSetSize inside loop)
int nMinLinksToExitLoop = 10; // (used to allow one iteration past the point at which nothing new happens)
// If checking vs duplicates then have a flag to exit (note: only applies to the current URL)
boolean stopPaginating = false;
boolean stopLinkFollowing = false;
// (if we're told to stop paginating but the site only does link-following, treat that like pagination too, eg a "nextUrl"-style link)
for (int nPage = 0; nPage < nMaxPages; ++nPage) {
if ((dedupSet.size() >= maxDocsPerCycle) || stopPaginating) {
if (dedupSet.size() >= maxDocsPerCycle) {
src.setReachedMaxDocs();
}
break;
}
// Will use this to check if we reached a page limit (eg some sites will just repeat the same page over and over again)
int nLinksFound = 0;
int nCurrDedupSetSize = dedupSet.size();
String url = savedUrl;
// Apply the regex to the URL for pagination, part 2
if ((null != pageChangeRegex) && (null != feedConfig.getSearchConfig().getPageChangeReplace())) {
int nResultStart = nPage*feedConfig.getSearchConfig().getNumResultsPerPage() + nResultOffset;
String replace = feedConfig.getSearchConfig().getPageChangeReplace().replace("$1", Integer.toString(nResultStart));
if (null == pageChangeRegexMatcher) {
url += replace;
}
else {
url = pageChangeRegexMatcher.replaceFirst(replace);
}
}//TESTED
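// Illustrative example (hypothetical config): with pageChangeRegex="start=(\\d+)",
// pageChangeReplace="start=$1" and numResultsPerPage=10, page 0 of ".../search?q=x&start=0"
// keeps start=0, page 1 becomes start=10, and so on (plus any offset parsed from the original
// URL in part 1 above). Note that "$1" in pageChangeReplace is a placeholder substituted with
// the computed result offset, not a regex back-reference; if the URL doesn't match the regex
// at all, the replace string is appended to the URL instead.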
//DEBUG
//System.out.println("URL=" + url);
// Create a custom UAH object to fetch and parse the search results
UnstructuredAnalysisConfigPojo dummyUAHconfig = new UnstructuredAnalysisConfigPojo();
if (null == feedConfig.getSearchConfig().getScriptflags()) { // Set flags if necessary
if (null == feedConfig.getSearchConfig().getExtraMeta()) {
feedConfig.getSearchConfig().setScriptflags("dt");
}
else {
feedConfig.getSearchConfig().setScriptflags("dtm");
}
}
if (null != feedConfig.getSearchConfig().getExtraMeta()) {
dummyUAHconfig.CopyMeta(feedConfig.getSearchConfig().getExtraMeta());
// Legacy -> Pipeline port
for (metaField extraMeta: dummyUAHconfig.getMeta()) {
if (null == extraMeta.context) { // mandatory in legacy, discarded in pipeline!
extraMeta.context = Context.First;
}
}
}
dummyUAHconfig.setScript(feedConfig.getSearchConfig().getGlobals());
dummyUAHconfig.AddMetaField("searchEngineSubsystem", Context.All, feedConfig.getSearchConfig().getScript(), "javascript", feedConfig.getSearchConfig().getScriptflags());
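// The "searchEngineSubsystem" meta field runs the configured search script against each results
// page; its output (an array of {url, title, description, ...} objects) is read back out of the
// document metadata under the same key after executeHarvest() below.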
src.setUnstructuredAnalysisConfig(dummyUAHconfig);
if (null != searchConfig.getProxyOverride()) {
feedConfig.setProxyOverride(searchConfig.getProxyOverride());
}
if (null != searchConfig.getUserAgent()) {
feedConfig.setUserAgent(searchConfig.getUserAgent());
}
if (null != searchConfig.getHttpFields()) {
feedConfig.setHttpFields(searchConfig.getHttpFields());
}
if (null != searchConfig.getWaitTimeBetweenPages_ms()) {
// Web etiquette: don't hit the same site too often
// (applies this value to sleeps inside UAH.executeHarvest)
feedConfig.setWaitTimeOverride_ms(searchConfig.getWaitTimeBetweenPages_ms());
}
//TESTED (including RSS-level value being written back again and applied in SAH/UAH code)
DocumentPojo searchDoc = docToSplit;
Object[] savedMeta = null;
if (null == searchDoc) {
searchDoc = new DocumentPojo();
// Required terms:
searchDoc.setUrl(url);
searchDoc.setScore((double)nIteratingDepth); // (spidering param)
// Handy terms
if (null != src.getHarvestStatus()) {
searchDoc.setModified(src.getHarvestStatus().getHarvested()); // the last time the source was harvested - can use to determine how far back to go
}
// If these exist (they won't normally), fill them:
searchDoc.setFullText(currFullText);
searchDoc.setDescription(currDesc);
searchDoc.setTitle(currTitle);
}//TOTEST
else if (null != searchDoc.getMetadata()){
savedMeta = searchDoc.getMetadata().remove("searchEngineSubsystem");
// (this is normally null)
}//TOTEST
UnstructuredAnalysisHarvester dummyUAH = new UnstructuredAnalysisHarvester();
boolean bMoreDocs = (nPage < nMaxPages - 1);
Object[] searchResults = null;
try {
dummyUAH.executeHarvest(context, src, searchDoc, false, bMoreDocs);
// (the leading false means that we never sleep *before* the query, only after)
searchResults = searchDoc.getMetadata().get("searchEngineSubsystem");
}
finally {
if (null != savedMeta) { // (really obscure case: restore a "searchEngineSubsystem" meta field that already existed on the doc)
searchDoc.getMetadata().put("searchEngineSubsystem", savedMeta);
}
else if ((null != searchDoc) && (null != searchDoc.getMetadata())) {
searchDoc.getMetadata().remove("searchEngineSubsystem");
}
}//TOTEST
//DEBUG
//System.out.println("NEW DOC MD: " + new com.google.gson.GsonBuilder().setPrettyPrinting().create().toJson(searchDoc.getMetadata()));
// Create extraUrl entries from the metadata
if ((null != searchResults) && (searchResults.length > 0)) {
for (Object searchResultObj: searchResults) {
try {
BasicDBObject bsonObj = (BasicDBObject)searchResultObj;
// 3 fields: url, title, description(=optional)
String linkUrl = bsonObj.getString(DocumentPojo.url_);
nLinksFound++;
if (!dedupSet.contains(linkUrl)) {
dedupSet.add(linkUrl);
String linkTitle = bsonObj.getString(DocumentPojo.title_);
String linkDesc = bsonObj.getString(DocumentPojo.description_);
String linkPubDate = bsonObj.getString(DocumentPojo.publishedDate_);
String linkFullText = bsonObj.getString(DocumentPojo.fullText_);
String spiderOut = bsonObj.getString("spiderOut");
if (null != linkUrl) {
SourceRssConfigPojo.ExtraUrlPojo link = new SourceRssConfigPojo.ExtraUrlPojo();
link.url = linkUrl;
link.title = linkTitle;
link.description = linkDesc;
link.publishedDate = linkPubDate;
link.fullText = linkFullText;
if (!stopLinkFollowing && (null != itUrls) && (null != spiderOut) && spiderOut.equalsIgnoreCase("true")) {
// In this case, add it back to the original list for chained processing
if (null == waitingList) {
waitingList = new LinkedList<ExtraUrlPojo>();
}
waitingList.add(link);
// (can't result in an infinite loop like this because we check
// dedupSet.size() and only allow links not already in dedupSet)
} //TESTED
if (null != linkTitle) {
boolean isDuplicate = false;
if (!stopPaginating && searchConfig.getStopPaginatingOnDuplicate()) {
// Quick duplicate check (full one gets done later)
isDuplicate = context.getDuplicateManager().isDuplicate_Url(linkUrl, src, null);
}//TESTED
if (!isDuplicate) {
if (null == feedConfig.getExtraUrls()) {
feedConfig.setExtraUrls(new ArrayList<ExtraUrlPojo>(searchResults.length));
}
feedConfig.getExtraUrls().add(link);
}
else {
stopPaginating = true;
if (null == feedConfig.getSearchConfig().getPageChangeRegex()) {
stopLinkFollowing = true;
}//TESTED
}//TESTED
}
}
}//(end if URL not already found)
}
catch (Exception e) {
// (just carry on)
//DEBUG
//e.printStackTrace();
}
}
}//TESTED
else if (0 == nPage) { // No links returned: if this is the first page and the script saved an _ONERROR_ message, escalate it
Object[] onError = searchDoc.getMetadata().get("_ONERROR_");
if ((null != onError) && (onError.length > 0) && (onError[0] instanceof String) && !(((String)(onError[0]))).isEmpty()) {
throw new ExtractorSourceLevelTransientException("generateFeedFromSearch: _ONERROR_: " + onError[0]);
}
}//TESTED
if (context.isStandalone()) { // debug mode, will display some additional logging
Object[] onDebug = searchDoc.getMetadata().get("_ONDEBUG_");
if ((null != onDebug) && (onDebug.length > 0)) {
for (Object debug: onDebug) {
if (debug instanceof String) {
context.getHarvestStatus().logMessage("_ONDEBUG_: " + (String)debug, true);
}
else {
context.getHarvestStatus().logMessage("_ONDEBUG_: " + new com.google.gson.Gson().toJson(debug), true);
}
}
}
}//TESTED
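// (At this point every new, non-duplicate link with a title has been queued in
// feedConfig.getExtraUrls(); the main feed harvester fetches those as real documents and
// enforces maxDocsPerCycle there - see the comment at the top of this block.)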
// PAGINATION BREAK LOGIC:
// 1: All the links are duplicates of links already in the DB
// 2: No new links from last page
// LOGIC CASE 1: (All the links are duplicates of links already in the DB)
//(already handled above)
// LOGIC CASE 2: (No new links from last page)
//DEBUG
//System.out.println("LINKS_SIZE=" + feedConfig.getExtraUrls().size());
//System.out.println("LINKS=\n"+new com.google.gson.GsonBuilder().setPrettyPrinting().create().toJson(feedConfig.getExtraUrls()));
if (dedupSet.size() == nCurrDedupSetSize) { // All links were duplicate
//DEBUG
//System.out.println("FOUND " + nLinksFound + " vs " + nMinLinksToExitLoop + " duplicate URLs (" + nCurrDedupSetSize + ")");
if (nLinksFound >= nMinLinksToExitLoop) { // (at least 10 found so insta-quit)
break;
}
else { // (fewer than 10 found - lower the threshold so the next all-duplicate page exits)
nMinLinksToExitLoop = 0; // (also handles the no links found case)
}
}//TESTED
else {
nMinLinksToExitLoop = 10; // (reset)
}//TESTED
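// (Net effect: if a whole page yields no new URLs, quit immediately when it returned a healthy
// number of links (>= 10); otherwise allow exactly one more page - the threshold of 0 guarantees
// the next all-duplicate page breaks out.)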
}// end loop over pages
}
catch (Exception e) {
//DEBUG
//e.printStackTrace();
if (dedupSet.isEmpty()) { // (dedupSet is always non-null here)
throw new ExtractorSourceLevelTransientException("generateFeedFromSearch: " + e.getMessage());
}
else {
throw new ExtractorDocumentLevelException("generateFeedFromSearch: " + e.getMessage());
}
// (don't log since these errors will appear in the log under the source, ie more usefully)
}//TESTED
finally {
// Fix any temp changes we made to the source
src.setUnstructuredAnalysisConfig(savedUAHconfig);
feedConfig.setUserAgent(savedUserAgent);
feedConfig.setHttpFields(savedHttpFields);
feedConfig.setWaitTimeOverride_ms(savedWaitTimeOverride_ms);
feedConfig.setProxyOverride(savedProxyOverride);
}
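// If there's no extra-URL iterator we were harvesting the single configured search URL, so one
// pass is enough; otherwise move on to the next extra URL (and, when spidering, the next depth).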
if (null == itUrls) {
break;
}
else if (!itUrls.hasNext()) {