Examples of DuplicateManager

com.ikanow.infinit.e.harvest.extraction.document.DuplicateManager
@author cmorgan

Examples of com.ikanow.infinit.e.harvest.extraction.document.DuplicateManager

          if (duplicate) {
            continue;
          }


          try {          
            DuplicateManager qr = _context.getDuplicateManager();
            if (null != entry.getDescription()) {
              duplicate = qr.isDuplicate_UrlTitleDescription(url, title.replaceAll("\\<.*?\\>", "").trim(), desc.replaceAll("\\<.*?\\>", "").trim(), source, duplicateSources);
            }
            else {
              duplicate = qr.isDuplicate_UrlTitleDescription(url, title.replaceAll("\\<.*?\\>", "").trim(), null, source, duplicateSources);            
              //^^^(this is different to isDuplicate_UrlTitle because it enforces that the description be null, vs just checking the title)
            }
            if (duplicate && (null != source.getRssConfig()) && (null != source.getRssConfig().getUpdateCycle_secs())) { 
              // Check modified times...
              Date dupModDate = qr.getLastDuplicateModifiedTime();
              ObjectId dupId = qr.getLastDuplicateId();
              
              if ((null != dupModDate) && (null != dupId)) {
                if (dupModDate.getTime() + source.getRssConfig().getUpdateCycle_secs()*1000 < nNow) {
                  
                  DocumentPojo doc = buildDocument(url, entry, source, duplicateSources);

View Full Code Here

Examples of com.ikanow.infinit.e.harvest.extraction.document.DuplicateManager

      throw e;
    }


    try {      
      //Dedup code, ironically enough partly duplicated in parse(), probably unnecessarily
      DuplicateManager qr = _context.getDuplicateManager();
      for(DocumentPojo doc: files)
      {
        try {      
          duplicateSources.clear();
          if (null != doc.getSourceUrl()) { 


            boolean add = true;


            // However still need to check for duplicates so can update entities correctly (+maintain _ids, etc)
            // We only do this if the source URL changes (unless URL is taken from the object in which case all bets are off) 
            
            boolean sourceUrlUpdated = sourceUrlsGettingUpdated.contains(doc.getSourceUrl());
            if (!doc.getHasDefaultUrl() || sourceUrlUpdated) { // src URL for a given URL              
              // (only if the sourceUrl is not new...)
              if (qr.isDuplicate_Url(doc.getUrl(), source, duplicateSources)) {
                doc.setUpdateId(qr.getLastDuplicateId()); // (set _id to doc we're going to replace)
                
                if (!sourceUrlUpdated && !_deleteExistingFilesBySourceKey) {
                  // Here update instead so we delete the old doc and add the new one
                  add = false;
                  docsToUpdate.add(doc);
                }//TESTED
                else {
                  // (else *still* don't add this to updates because we've added the source URL or source key to the delete list)
                  // (hence approximate create with the updateId...)
                  if (null != doc.getUpdateId()) {
                    doc.setCreated(new Date(doc.getUpdateId().getTime()));
                  }//TESTED                  
                }//TESTED
              }
              //(note we don't care about duplicate sources in this case - just too complex+rare a case)
              
            }//TESTED (src url changing, different src url, non-default URL)
            
            // For composite files we (almost always) delete everything that already exists (via docsToRemove) and then add new docs
            if (add) {
              docsToAdd.add(doc);
            }            
            //TESTED
          }
          else if (qr.isDuplicate_Url(doc.getUrl(), source, duplicateSources)) {
            // Other files, if the file already exists then update it (essentially, delete/add)
            doc.setUpdateId(qr.getLastDuplicateId()); // (set _id to doc we're going to replace)
            docsToUpdate.add(doc);
          }
          else { // if duplicateSources is non-empty then this URL is a duplicate of one from a different source 
            if (!duplicateSources.isEmpty()) { 
              doc.setDuplicateFrom(duplicateSources.getFirst());

View Full Code Here

Examples of com.ikanow.infinit.e.harvest.extraction.document.DuplicateManager




  /**
   * Thin wrapper that asks the DuplicateManager obtained from {@code _context}
   * whether the given source URL needs to be updated.
   * <p>
   * The arguments are passed through unchanged to
   * {@code DuplicateManager.needsUpdated_SourceUrl(mod, sourceUrl, source)} and its
   * result is returned as-is.
   * <p>
   * Any {@code Exception} thrown while obtaining the manager or during the
   * delegate call is swallowed, in which case {@code false} is returned.
   * NOTE(review): callers therefore cannot distinguish "no update needed" from
   * "the check failed" - confirm this is the intended behaviour.
   *
   * @param mod       date forwarded to the duplicate manager
   * @param sourceUrl source URL forwarded to the duplicate manager
   * @param source    source forwarded to the duplicate manager
   * @return the duplicate manager's answer, or {@code false} if an exception occurred
   */
  private boolean needsUpdated_SourceUrl(Date mod, String sourceUrl, SourcePojo source)
  {
    try {          
      DuplicateManager qr = _context.getDuplicateManager();
      return qr.needsUpdated_SourceUrl(mod, sourceUrl, source);
    } 
    catch (Exception e) {
      // Do nothing (the exception is swallowed; execution falls through to "return false" below)
    } 
    return false;

View Full Code Here

Examples of com.ikanow.infinit.e.harvest.extraction.document.DuplicateManager

  }


  /**
   * Thin wrapper that asks the DuplicateManager obtained from {@code _context}
   * whether the given document URL needs to be updated.
   * <p>
   * The arguments are passed through unchanged to
   * {@code DuplicateManager.needsUpdated_Url(mod, url, source)} and its result is
   * returned as-is.
   * <p>
   * Any {@code Exception} thrown while obtaining the manager or during the
   * delegate call is swallowed, in which case {@code false} is returned.
   * NOTE(review): callers therefore cannot distinguish "no update needed" from
   * "the check failed" - confirm this is the intended behaviour.
   *
   * @param mod    date forwarded to the duplicate manager
   * @param url    URL forwarded to the duplicate manager
   * @param source source forwarded to the duplicate manager
   * @return the duplicate manager's answer, or {@code false} if an exception occurred
   */
  private boolean needsUpdated_Url(Date mod, String url, SourcePojo source)
  {
    try {          
      DuplicateManager qr = _context.getDuplicateManager();


      return qr.needsUpdated_Url(mod, url, source);
    } 
    catch (Exception e) {
      // Do nothing (the exception is swallowed; execution falls through to "return false" below)
    } 
    return false;

View Full Code Here

Examples of com.ikanow.infinit.e.harvest.extraction.document.DuplicateManager

         * source.url + primary key fields. See example below
         * jdbc:mysql://<IP ADDRESS>:3306/washingtondc/987863
         */
        try 
        {
          DuplicateManager qr = _context.getDuplicateManager();
          
            String primaryKey = null;
            if (null != source.getDatabaseConfig().getPrimaryKey()) {
              primaryKey = rs.getString(source.getDatabaseConfig().getPrimaryKey());
            }
            if (null == primaryKey) { // Just pick something unique, to avoid URL collisions
              primaryKey = new ObjectId().toString();
            }
            String docUrl = source.getUrl() + "/" + primaryKey;
            
          // Check to see if the record has already been added
          // If it has been added then we need to update it with the new information
          if (!qr.isDuplicate_Url(docUrl, source, duplicateSources)) 
          {
            nAdded++;
            DocumentPojo newDoc = createDoc(CommitType.insert, rs, md, source, docUrl);
            if (!duplicateSources.isEmpty()) {
              newDoc.setDuplicateFrom(duplicateSources.getFirst());

View Full Code Here

Examples of com.ikanow.infinit.e.harvest.extraction.document.DuplicateManager

       * source.url + primary key fields. See example below
       * jdbc:mysql://<IP ADDRESS>:3306/washingtondc/987863
       */
      try 
      {
        DuplicateManager qr = _context.getDuplicateManager();
        
          String primaryKey = null;
          if (null != source.getDatabaseConfig().getPrimaryKey()) {
            primaryKey = rs.getString(source.getDatabaseConfig().getPrimaryKey());
          }
          if (null == primaryKey) { // Just pick something unique, to avoid URL collisions
            primaryKey = new ObjectId().toString();
          }
          String docUrl = source.getDatabaseConfig().getPrimaryKeyValue();
          if (null == docUrl) {
            docUrl = source.getUrl() + "/" + primaryKey;
          }
          else {
            docUrl = docUrl + primaryKey;
          }
          
        // Check to see if the record has already been added
        // If it has been added then we need to update it with the new information
        if (!qr.isDuplicate_Url(docUrl, source, duplicateSources)) 
        {
          nAdded++;
          DocumentPojo newDoc = createDoc(CommitType.insert, rs, md, source, docUrl);
          if (!duplicateSources.isEmpty()) {
            newDoc.setDuplicateFrom(duplicateSources.getFirst());

View Full Code Here

TOP

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.