Package com.ikanow.infinit.e.data_model.store.document

Examples of com.ikanow.infinit.e.data_model.store.document.DocumentPojo
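As orientation before the snippets, here is a minimal, hypothetical sketch of the pojo's core lifecycle, using only setters that appear in the examples below (the URL and field values are invented for illustration):

  // Hypothetical orientation sketch (values invented for illustration):
  DocumentPojo doc = new DocumentPojo();
  doc.setUrl("http://example.com/records/1234");     // unique document URL
  doc.setTitle("Example record");
  doc.setDescription("Short summary of the record");
  doc.setFullText("Full text of the record goes here...");
  doc.setCreated(new java.util.Date());
  doc.setModified(new java.util.Date());
  doc.addToMetadata("customField", "customValue");   // arbitrary metadata entry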


          firstIgnoreField = false;
        }
        if (bMatched) continue;
      }//TESTED
     
      DocumentPojo newDoc = new DocumentPojo();
      String primaryKey = null;
      if (null != parser) {
        JsonObject json = new JsonObject();
        try {
          String[] records = parser.parseLine(line);
          for (int i = 0; i < records.length; ++i) {
            String record = records[i];
            if ((record.length() > 0) && (i < indexToField.length)) {
              String fieldName = (String) indexToField[i];
              if ((null != fieldName) && (fieldName.length() > 0)) {
                json.addProperty(fieldName, record);
                if (fieldName.equals(source.getFileConfig().XmlPrimaryKey)) {
                  primaryKey = record;
                }
              }
            }
          }
          if ((null != primaryKey) && (null != source.getFileConfig().XmlSourceName)) {
            newDoc.setUrl(source.getFileConfig().XmlSourceName + primaryKey);
          }//TESTED
          newDoc.addToMetadata("csv", JsonToMetadataParser.convertJsonObjectToLinkedHashMap(json, _memUsage));         
        }
        catch (Exception e) {} // can just skip over the line and carry on
       
      }//TESTED
     
      newDoc.setFullText(line);
      if (line.length() > 128) {
        newDoc.setDescription(line.substring(0, 128));
      }
      else {
        newDoc.setDescription(line);
      }
      partials.add(newDoc);
      docs++;
      if (docs >= _debugMaxDocs) { // debug mode only, otherwise commit to all docs in this file
        break;
View Full Code Here
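In the CSV example above, each parsed line becomes one document: columns are mapped onto metadata fields via indexToField, the URL is synthesized from a configured source name plus primary key, and the raw line doubles as full text with its first 128 characters as the description. A condensed, hypothetical sketch of that shape (field names and values invented):

  // Hypothetical condensed sketch of the CSV-line-to-document pattern above:
  String line = "1234,Example record,more fields";   // one raw CSV line (invented)
  DocumentPojo newDoc = new DocumentPojo();
  newDoc.addToMetadata("id", "1234");                // per-column metadata (field name invented)
  newDoc.setUrl("jdbc://mydb/records?" + "1234");    // assumed source name + primary key
  newDoc.setFullText(line);                          // keep the raw line as full text
  newDoc.setDescription(line.length() > 128 ? line.substring(0, 128) : line);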


        }
        if (!_isParsingScriptInitialized) {
          _securityManager.eval(_scriptEngine, PARSING_SCRIPT);
          _isParsingScriptInitialized = true;
        }
        DocumentPojo doc = DocumentPojo.fromDb(docObj, DocumentPojo.class);
        _scriptEngine.put("old_document", _gson.toJson(doc));
        try {
          _securityManager.eval(_scriptEngine, JavaScriptUtils.initOnUpdateScript);
          Object returnVal = _securityManager.eval(_scriptEngine, onUpdateScript);
          BasicDBList outList = JavaScriptUtils.parseNativeJsObject(returnVal, _scriptEngine);
          f.addToMetadata("_PERSISTENT_", outList.toArray());
        }
        catch (Exception e) {
          // Extra step here...
          if (null != doc.getMetadata()) { // Copy persistent metadata across...
            Object[] persist = doc.getMetadata().get("_PERSISTENT_");
            if (null != persist) {
              f.addToMetadata("_PERSISTENT_", persist);
            }
          }
          this._context.getHarvestStatus().logMessage("SAH::onUpdateScript: " + e.getMessage(), true);
          //DEBUG (don't output log messages per doc)
View Full Code Here
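The update-script example above evaluates a source-supplied onUpdateScript against the old and new versions of a document; if the script fails, it falls back to copying the "_PERSISTENT_" metadata block from the old document. A sketch of just that fallback (the field name comes from the snippet; the helper shape is assumed):

  // Hypothetical helper for the fallback in the catch block above:
  static void copyPersistentMetadata(DocumentPojo oldDoc, DocumentPojo newDoc) {
    if (null != oldDoc.getMetadata()) {
      Object[] persist = oldDoc.getMetadata().get("_PERSISTENT_");
      if (null != persist) {
        newDoc.addToMetadata("_PERSISTENT_", persist); // carry user state across updates
      }
    }
  }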

      // to add to the feed using the source entity and association spec pojos
      Iterator<DocumentPojo> it = docs.iterator();
      int nDocs = 0;
      while (it.hasNext())
      {
        DocumentPojo f = it.next();
        nDocs++;
        try
        {
          resetEntityCache();
          _document = null;
          _docPojo = null;
            // (don't create this until needed, since it might need to be (re)serialized after a call
            //  to the UAH which would obviously be undesirable)
                                 
          // If the script engine has been instantiated pass the feed document and any scripts
          if (_scriptEngine != null)
          {
            List<String> scriptList = null;
            List<String> scriptFileList = null;
            try {
              // Script code embedded in source
              scriptList = Arrays.asList(s.getScript());
            }
            catch (Exception e) {}
            try {
              // scriptFiles - can contain String[] of script files to import into the engine
              scriptFileList = Arrays.asList(s.getScriptFiles());             
            }
            catch (Exception e) {}             
            this.loadGlobalFunctions(scriptFileList, scriptList, s.getScriptEngine());           
          }//TESTED
         
      // 1. Document level fields
         
          // Extract Title if applicable
          boolean bTryTitleLater = false;
          try {
            if (s.getTitle() != null)
            {
              intializeDocIfNeeded(f, g);
              if (JavaScriptUtils.containsScript(s.getTitle()))
              {
                f.setTitle((String)getValueFromScript(s.getTitle(), null, null));
              }
              else
              {
                f.setTitle(getFormattedTextFromField(s.getTitle(), null));
              }
              if (null == f.getTitle()) {
                bTryTitleLater = true;
              }
            }
          }
          catch (Exception e)
          {
            this._context.getHarvestStatus().logMessage("title: " + e.getMessage(), true);           
            //DEBUG (don't output log messages per doc)
            //logger.error("title: " + e.getMessage(), e);
          }

          // Extract Display URL if applicable
          boolean bTryDisplayUrlLater = false;
          try {
            if (s.getDisplayUrl() != null)
            {
              intializeDocIfNeeded(f, g);
              if (JavaScriptUtils.containsScript(s.getDisplayUrl()))
              {
                f.setDisplayUrl((String)getValueFromScript(s.getDisplayUrl(), null, null));
              }
              else
              {
                f.setDisplayUrl(getFormattedTextFromField(s.getDisplayUrl(), null));
              }
              if (null == f.getDisplayUrl()) {
                bTryDisplayUrlLater = true;
              }
            }
          }
          catch (Exception e)
          {
            this._context.getHarvestStatus().logMessage("displayUrl: " + e.getMessage(), true);           
            //DEBUG (don't output log messages per doc)
            //logger.error("displayUrl: " + e.getMessage(), e);
          }
          //TOTEST

          // Extract Description if applicable
          boolean bTryDescriptionLater = false;
          try {
            if (s.getDescription() != null)
            {
              intializeDocIfNeeded(f, g);
              if (JavaScriptUtils.containsScript(s.getDescription()))
              {
                f.setDescription((String)getValueFromScript(s.getDescription(), null, null));
              }
              else
              {
                f.setDescription(getFormattedTextFromField(s.getDescription(), null));
              }
              if (null == f.getDescription()) {
                bTryDescriptionLater = true;
              }
            }
          }
          catch (Exception e)
          {
            this._context.getHarvestStatus().logMessage("description: " + e.getMessage(), true);           
            //DEBUG (don't output log messages per doc)
            //logger.error("description: " + e.getMessage(), e);
          }
         

          // Extract fullText if applicable
          boolean bTryFullTextLater = false;
          try {
            if (s.getFullText() != null)
            {
              intializeDocIfNeeded(f, g);
              if (JavaScriptUtils.containsScript(s.getFullText()))
              {
                f.setFullText((String)getValueFromScript(s.getFullText(), null, null));
              }
              else
              {
                f.setFullText(getFormattedTextFromField(s.getFullText(), null));
              }
              if (null == f.getFullText()) {
                bTryFullTextLater = true;
              }
            }
          }
          catch (Exception e)
          {
            this._context.getHarvestStatus().logMessage("fullText: " + e.getMessage(), true);           
            //DEBUG (don't output log messages per doc)
            //logger.error("fullText: " + e.getMessage(), e);
          }
 
          // Published date is done after the UAH
          // (since the UAH can't access it, and it might be populated via the UAH)
         
      // 2. UAH/extraction properties
         
          // Add fields to metadata that can be used to create entities and associations
          // (Either with the UAH, or with the entity extractor)
          try {
            boolean bMetadataChanged = false;
            if (null != this._unstructuredHandler)
            {
              try
              {
                this._unstructuredHandler.set_sahEngine(_scriptEngine);
                bMetadataChanged = this._unstructuredHandler.executeHarvest(_context, source, f, (1 == nDocs), it.hasNext());
              }
              catch (Exception e) {
                contextController.handleExtractError(e, source); //handle extractor error if need be   
               
                it.remove(); // remove the document from the list...
                f.setTempSource(null); // (can safely corrupt this doc since it's been removed)
               
                // (Note: this can't be source level error, so carry on harvesting - unlike below)
                continue;
              }
            } 
            if (contextController.isEntityExtractionRequired(source))
            {
              bMetadataChanged = true;
             
              // Text/Entity Extraction
              List<DocumentPojo> toAdd = new ArrayList<DocumentPojo>(1);
              toAdd.add(f);
              try {
                contextController.extractTextAndEntities(toAdd, source, false, false);
                if (toAdd.isEmpty()) { // this failed...
                  it.remove(); // remove the document from the list...
                  f.setTempSource(null); // (can safely corrupt this doc since it's been removed)
                  continue;
                }//TESTED
              }
              catch (Exception e) {
                contextController.handleExtractError(e, source); //handle extractor error if need be       
                it.remove(); // remove the document from the list...
                f.setTempSource(null); // (can safely corrupt this doc since it's been removed)
               
                if (source.isHarvestBadSource())
                {
                  // Source error, ignore all other documents
                  while (it.hasNext()) {
                    f = it.next();
                    f.setTempSource(null); // (can safely corrupt this doc since it's been removed)
                    it.remove();
                  }
                  break;
                }
                else {
                  continue;
                }
                //TESTED
              }
            }
            if (bMetadataChanged) {
              // Ugly, but need to re-create doc json because metadata has changed
              String sTmpFullText = f.getFullText();
              f.setFullText(null); // (no need to serialize this, can save some cycles)
              _document = null;
              _docPojo = null;
              intializeDocIfNeeded(f, g);
              f.setFullText(sTmpFullText); // (restore)
            }
           
            // Can copy metadata from old documents to new ones:           
            handleDocumentUpdates(s.getOnUpdateScript(), f);
           
            // Check (based on the metadata and entities so far) whether to retain the doc
            if (rejectDoc(s.getRejectDocCriteria(), f)) {
              it.remove(); // remove the document from the list...
              f.setTempSource(null); // (can safely corrupt this doc since it's been removed)
              continue;                             
            }
          }
          catch (Exception e) {
            this._context.getHarvestStatus().logMessage("SAH->UAH: " + e.getMessage(), true);           
            //DEBUG (don't output log messages per doc)
            //logger.error("SAH->UAH: " + e.getMessage(), e);
          }
           
          // Now create document since there's no risk of having to re-serialize
          intializeDocIfNeeded(f, g);
         
      // 3. final doc-level metadata fields:
         
          // If title was null before, we might need to get it from a UAH field
          if (bTryTitleLater) {
            try {
              if (s.getTitle() != null)
              {
                intializeDocIfNeeded(f, g);
                if (JavaScriptUtils.containsScript(s.getTitle()))
                {
                  f.setTitle((String)getValueFromScript(s.getTitle(), null, null));
                }
                else
                {
                  f.setTitle(getFormattedTextFromField(s.getTitle(), null));
                }
              }
            }
            catch (Exception e)
            {
              this._context.getHarvestStatus().logMessage("title: " + e.getMessage(), true);           
              //DEBUG (don't output log messages per doc)
              //logger.error("title: " + e.getMessage(), e);
            }
          }
         
          // Extract Display URL if needed
          if (bTryDisplayUrlLater) {
            try {
              if (s.getDisplayUrl() != null)
              {
                intializeDocIfNeeded(f, g);
                if (JavaScriptUtils.containsScript(s.getDisplayUrl()))
                {
                  f.setDisplayUrl((String)getValueFromScript(s.getDisplayUrl(), null, null));
                }
                else
                {
                  f.setDisplayUrl(getFormattedTextFromField(s.getDisplayUrl(), null));
                }
              }
            }
            catch (Exception e)
            {
              this._context.getHarvestStatus().logMessage("displayUrl: " + e.getMessage(), true);           
              //DEBUG (don't output log messages per doc)
              //logger.error("displayUrl: " + e.getMessage(), e);
            }
          }         
          //TOTEST
         
          // If description was null before, we might need to get it from a UAH field
          if (bTryDescriptionLater) {
            try {
              if (s.getDescription() != null)
              {
                intializeDocIfNeeded(f, g);
                if (JavaScriptUtils.containsScript(s.getDescription()))
                {
                  f.setDescription((String)getValueFromScript(s.getDescription(), null, null));
                }
                else
                {
                  f.setDescription(getFormattedTextFromField(s.getDescription(), null));
                }
              }
            }
            catch (Exception e)
            {
              this._context.getHarvestStatus().logMessage("description2: " + e.getMessage(), true);           
              //DEBUG (don't output log messages per doc)
              //logger.error("description2: " + e.getMessage(), e);
            }           
          }
         
          // If fullText was null before, we might need to get it from a UAH field
          if (bTryFullTextLater) {
            try {
              if (s.getFullText() != null)
              {
                intializeDocIfNeeded(f, g);
                if (JavaScriptUtils.containsScript(s.getFullText()))
                {
                  f.setFullText((String)getValueFromScript(s.getFullText(), null, null));
                }
                else
                {
                  f.setFullText(getFormattedTextFromField(s.getFullText(), null));
                }
              }
            }
            catch (Exception e)
            {
              this._context.getHarvestStatus().logMessage("fullText2: " + e.getMessage(), true);           
              //DEBUG (don't output log messages per doc)
              //logger.error("fullText2: " + e.getMessage(), e);
            }           
          }
         
          // Extract Published Date if applicable
          if (s.getPublishedDate() != null)
          {
            if (JavaScriptUtils.containsScript(s.getPublishedDate()))
            {
              try
              {
                f.setPublishedDate(new Date(
                    DateUtility.parseDate((String)getValueFromScript(s.getPublishedDate(), null, null))));
              }
              catch (Exception e)
              {
                this._context.getHarvestStatus().logMessage("publishedDate: " + e.getMessage(), true);           
              }
            }
            else
            {
              try
              {
                f.setPublishedDate(new Date(
                    DateUtility.parseDate((String)getFormattedTextFromField(s.getPublishedDate(), null))));
              }
              catch (Exception e)
              {
                this._context.getHarvestStatus().logMessage("publishedDate: " + e.getMessage(), true);           
              }
            }
          }
         
      // 4. Entity level fields   
         
          // Extract Document GEO if applicable
         
          if (s.getDocumentGeo() != null)
          {
            try
            {
              f.setDocGeo(getDocGeo(s.getDocumentGeo()));
            }
            catch (Exception e)
            {
              this._context.getHarvestStatus().logMessage("docGeo: " + e.getMessage(), true);           
            }
          }

          // Extract Entities
          if (s.getEntities() != null)
          {
            f.setEntities(getEntities(s.getEntities(), f));
          }

          // Extract Associations
          if (s.getAssociations() != null)
          {
            f.setAssociations(getAssociations(s.getAssociations(), f));
          }
         
      // 5. Remove unwanted metadata fields
         
          removeUnwantedMetadataFields(s.getMetadataFields(), f);         
View Full Code Here
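Title, displayUrl, description and fullText above all follow one two-branch pattern: if the source's field spec contains script, evaluate it; otherwise treat the spec as a text template. A hedged sketch of that shared shape (getValueFromScript and getFormattedTextFromField are the harvester's own methods, assumed in scope; the helper itself is hypothetical):

  // Hypothetical helper naming the script-vs-template pattern repeated above:
  private String resolveFieldSpec(String fieldSpec) throws Exception {
    if (JavaScriptUtils.containsScript(fieldSpec)) {
      return (String) getValueFromScript(fieldSpec, null, null); // evaluate embedded script
    }
    else {
      return getFormattedTextFromField(fieldSpec, null); // expand the text template
    }
  }
  // e.g. f.setTitle(resolveFieldSpec(s.getTitle())); retried after the UAH if it returned null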

        // Check to see if the record has already been added
        // If it has been added then we need to update it with the new information
        if (!qr.isDuplicate_Url(docUrl, source, duplicateSources))
        {
          nAdded++;
          DocumentPojo newDoc = createDoc(CommitType.insert, rs, md, source, docUrl);
          if (!duplicateSources.isEmpty()) {
            newDoc.setDuplicateFrom(duplicateSources.getFirst());
          }
          this.docsToAdd.add(newDoc);
        }
        else {
          //TODO (INF-1300): update, I guess need to check if the record has changed?
View Full Code Here

        // Set up the primary variables
          String primaryKey = rs.getString(source.getDatabaseConfig().getPrimaryKey());
          String docUrl = source.getUrl() + "/" + primaryKey;
         
          // Get our system id from the record
          DocumentPojo docToRemove = new DocumentPojo();
          docToRemove.setUrl(docUrl);
          docsToDelete.add(docToRemove);
      }
    }
      catch (SQLException e)
      {
View Full Code Here
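Deletion needs only the URL: the snippet reconstructs it from the source URL plus the primary-key column, wraps it in an otherwise-empty DocumentPojo, and queues that stub. A sketch with invented values:

  // Hypothetical sketch of queueing a delete-by-URL stub, as above:
  List<DocumentPojo> docsToDelete = new ArrayList<DocumentPojo>();
  String docUrl = "jdbc://mydb/records" + "/" + "1234"; // assumed source URL + primary key
  DocumentPojo docToRemove = new DocumentPojo();
  docToRemove.setUrl(docUrl); // only the URL is needed to identify the doc
  docsToDelete.add(docToRemove);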

   */
  private DocumentPojo createDoc(CommitType commitType, ResultSet rs, ResultSetMetaData md,
      SourcePojo source, String docUrl)
  {
    // Set up the document object used to hold the record's information
    DocumentPojo doc = null;
 
    // Check to see if the commit type is an insert or an update
    if (commitType == CommitType.insert)
    {
      // create the doc pojo
      doc = new DocumentPojo();
      doc.setUrl(docUrl);
      doc.setCreated(new Date());       
    }
    else
    {
      //TODO (INF-1300): support for updated docs (will want to save old creation time, zap everything else?)
    }
    doc.setModified(new Date());
     
    try
    {
      // Strip out html if it is present
      if (null != source.getDatabaseConfig().getTitle()) {
        if (rs.getString(source.getDatabaseConfig().getTitle()) != null)
        {
          doc.setTitle(rs.getString(source.getDatabaseConfig().getTitle()).replaceAll("\\<.*?\\>", ""));
        }
      }
    }
    catch (SQLException e)
    {
      //DEBUG (don't log per record exceptions)
      //logger.error("Error on add: Exception Message: " + e.getMessage(), e);
      _context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage("Error on add: ", e).toString(), true);
    }
     
    try
    {
      if (null != source.getDatabaseConfig().getSnippet()) {
        if (rs.getString(source.getDatabaseConfig().getSnippet()) != null)
        {
          doc.setDescription(rs.getString(source.getDatabaseConfig().getSnippet()).replaceAll("\\<.*?\\>", ""));
        }
      }
    }
    catch (SQLException e)
    {
      //DEBUG (don't log per record exceptions)
      //logger.error("Error on add: Exception Message: " + e.getMessage(), e);
      _context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage("Error on add: ", e).toString(), true);
    }

    try
    {
      if (null != source.getDatabaseConfig().getPublishedDate()) {
        if (rs.getString(source.getDatabaseConfig().getPublishedDate()) != null)
        {
          Object d = null;
          try
          {
            Object o = rs.getDate(source.getDatabaseConfig().getPublishedDate());
            d = convertJdbcTypes(null, o);
          }
          catch (Exception e)
          {
            d = new Date();
          }
          doc.setPublishedDate((Date) d);
        }
        else
        {
          doc.setPublishedDate(new Date());
        }
      }
      else
      {
        doc.setPublishedDate(new Date());
      }
    }
    catch (SQLException e)
    {
      //DEBUG (don't log per record exceptions)
      //logger.error("Error on add: Exception Message: " + e.getMessage(), e);
      _context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage("Error on add: ", e).toString(), true);
    }
     
    try
    {
      // Create a list of metadata to be added to the doc
      for ( int i = 1; i <= md.getColumnCount(); i++ )
      {
        String column = md.getColumnLabel(i);
        Object value = rs.getObject(i);
       
        // Convert value to standard Java type from JDBC type if needed
        value = convertJdbcTypes(column, value);
       
        if (  (column != null) && (value != null) )
        {
          if (!source.getDatabaseConfig().getPreserveCase()) {
            column = column.toLowerCase();
          }
          doc.addToMetadata(column, value);
        }
      }
    }
    catch (SQLException e)
    {
View Full Code Here
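The last loop in createDoc is the generic JDBC mapping: every column label/value pair becomes a metadata entry, optionally lower-cased. A standalone sketch of it (note the original additionally routes each value through its convertJdbcTypes normalizer):

  // Hypothetical standalone version of the column-to-metadata loop above:
  static void addRowToMetadata(DocumentPojo doc, ResultSet rs, ResultSetMetaData md,
      boolean preserveCase) throws SQLException {
    for (int i = 1; i <= md.getColumnCount(); i++) {
      String column = md.getColumnLabel(i);
      Object value = rs.getObject(i); // (the original also applies convertJdbcTypes here)
      if ((column != null) && (value != null)) {
        if (!preserveCase) {
          column = column.toLowerCase();
        }
        doc.addToMetadata(column, value);
      }
    }
  }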

      {
        if (_nCurrBatchedDocs < _nBatchSize) {
          _batchedDocuments[_nCurrBatchedDocs] = null; // (null-terminated array)
        }
       
        DocumentPojo megaDoc = new DocumentPojo();
        megaDoc.setUrl(_batchedDocuments[0].doc.getUrl() + "|" + _batchedDocuments[_nCurrBatchedDocs-1].doc.getUrl());
        megaDoc.setFullText(_batchText.toString());
       
        _alch.setNumKeywords(_nNumKeywords*_nCurrBatchedDocs);
        int nSavedBatchSize = _nBatchSize;
        // Recurse, but only once since setting _nBatchSize to 1
        _nBatchSize = 1;
        try {
          this.extractEntities(megaDoc);
         
          // Apply megaDoc results to individual docs
          handleBatchProcessing(megaDoc, _batchedDocuments);
        }
        catch (Exception e) {         
          String strError = "Exception Message (0): doc=" + megaDoc.getUrl() + " error=" +  e.getMessage();
          logger.error(strError, e);
          throw new InfiniteEnums.ExtractorDocumentLevelException(strError);
        }
        finally {
          _alch.setNumKeywords(_nNumKeywords); // (<- probably not necessary)
View Full Code Here

                break;
              }
              // Handle cloning on "duplicate docs" from different sources
              boolean bDuplicated = false;
              if (null != doc.getDuplicateFrom() && (null == doc.getUpdateId())) {
                DocumentPojo newDoc = enrichDocByDuplicating(doc);
                // (Note this is compatible with the cloning case whose logic is below:
                //  this document gets fully populated here then added to dup list (with dupFrom==null), with a set of slaves
                //  with dupFrom==sourceKey. When the dup list is traversed (after bypassing enrichment), the slaves are
                //  then created from this master)
                if (null != newDoc) {
                  doc = newDoc;
                  bDuplicated = true;
                }
              }
              else { // if the update id is non-null then ignore the above logic
                doc.setDuplicateFrom(null);
              }
              // Copy over material from source pojo:
              doc.setSource(source.getTitle());
              doc.setTempSource(source);
              doc.setMediaType(source.getMediaType());
              if ((null == source.getAppendTagsToDocs()) || source.getAppendTagsToDocs()) {
                doc.setTags(source.getTags());
              }
              ObjectId sCommunityId = source.getCommunityIds().iterator().next(); // (multiple communities handled below)
              String sIndex = new StringBuffer("doc_").append(sCommunityId.toString()).toString();
              doc.setCommunityId(sCommunityId);               
              doc.setIndex(sIndex);
              if (normalCase) { // Normal case (or test case)
                doc.setSourceKey(source.getKey());
              }
              else { // Many communities for a single source, not a pleasant case
                String sMasterDocSourceKey = null;
                for (ObjectId id: source.getCommunityIds()) {
                  if (null == sMasterDocSourceKey) {
                    sMasterDocSourceKey = (source.getKey());
                    doc.setSourceKey(sMasterDocSourceKey);
                  }
                  else { // Will defer these until after the master doc has been added to the database
                    DocumentPojo cloneDoc = new DocumentPojo();

                    // Will need these fields
                    cloneDoc.setIndex(new StringBuffer("doc_").append(id).toString());
                    cloneDoc.setCommunityId(id);
                    cloneDoc.setSourceKey(source.getKey());
                    cloneDoc.setSource(source.getTitle());
                    cloneDoc.setUrl(doc.getUrl());
                    if ((null == source.getAppendTagsToDocs()) || source.getAppendTagsToDocs()) {
                      cloneDoc.setTags(source.getTags());
                    }

                    cloneDoc.setCloneFrom(doc);
                    toDup.add(cloneDoc);
                  }
                }//TESTED (both in clone and clone+duplicate)
              }             
              // Normally add to enrichment list (for duplicates, bypass this)
View Full Code Here
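For a source shared across several communities, the snippet above promotes the first community's document to master and builds lightweight clones for the rest, each pointing back via setCloneFrom. A sketch of the clone construction (the "doc_" index prefix and the copied fields are taken from the snippet; the helper shape is hypothetical):

  // Hypothetical sketch of building one per-community clone, as above:
  static DocumentPojo buildClone(DocumentPojo masterDoc, SourcePojo source, ObjectId communityId) {
    DocumentPojo cloneDoc = new DocumentPojo();
    cloneDoc.setIndex("doc_" + communityId);  // per-community index name
    cloneDoc.setCommunityId(communityId);
    cloneDoc.setSourceKey(source.getKey());
    cloneDoc.setSource(source.getTitle());
    cloneDoc.setUrl(masterDoc.getUrl());
    cloneDoc.setTags(source.getTags());       // (the original checks getAppendTagsToDocs() first)
    cloneDoc.setCloneFrom(masterDoc);         // defer full population until the master is stored
    return cloneDoc;
  }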

      Iterator<DocumentPojo> i = toAdd.iterator(); //iterator created so that elements in the toAdd list can be
      // removed within the loop
      while ( i.hasNext() )
      {
        long nTime_ms = System.currentTimeMillis();
        DocumentPojo doc = i.next();
        boolean bExtractedText = false;

        // If I've been stopped then just remove all remaining documents
        // (pick them up next time through)
        if (bIsKilled) {
          i.remove();
          if (!calledFromPipeline) {
            doc.setTempSource(null); // (can safely corrupt this doc since it's been removed)
          }
          continue;
        }

        if ( calledFromPipeline || !urlsThatError.contains(doc.getUrl()) ) //only attempt if url is okay
        {       
          feed_count++;

          try {
            // (Check for truncation)
            if ((null != currentEntityExtractor) && (null != doc.getFullText())) {
              try {
                String s = currentEntityExtractor.getCapability(EntityExtractorEnum.MaxInputBytes);
                if (null != s) {
                  int maxLength = Integer.parseInt(s);
                  if (doc.getFullText().length() > maxLength) { //just warn, it's up to the extractor to sort it out
                    getHarvestStatus().logMessage("Warning: truncating document to max length: " + s, false);
                  }
                }
              }
              catch (Exception e) {} // max length not reported just carry on
            }
           
            if (null != currentTextExtractor)
            { 
              bExtractedText = true;
              currentTextExtractor.extractText(doc);
              if (null != currentEntityExtractor) {
                currentEntityExtractor.extractEntities(doc);
              }

            }//TESTED
            else //db/filesys should already have full text extracted (unless otherwise specified)
            {
              if (source.getExtractType().equalsIgnoreCase("feed")) { // Need full text so get from current
               
                if ((null == doc.getFullText()) || !bUseRawContentWhereAvailable) {
                  bExtractedText = true;
                  if (null != currentEntityExtractor) {
                    currentEntityExtractor.extractEntitiesAndText(doc);
                  }
                }//TESTED (AlchemyAPI case)
                else { // Feed for which we've already extracted data
                  if (null != currentEntityExtractor) {
                    currentEntityExtractor.extractEntities(doc);
                  }
                }//TESTED
              }
              else { // DB/File => use full text
                if (null != currentEntityExtractor) {
                  currentEntityExtractor.extractEntities(doc);
                }
              }//TESTED
            }

            //statistics counting
            if ( doc.getEntities() != null )
              num_ent_extracted.addAndGet(doc.getEntities().size());
            if ( doc.getAssociations() != null )
              num_event_extracted.addAndGet(doc.getAssociations().size());

          }
          catch (ExtractorDailyLimitExceededException e) {

            //extractor can't do anything else today, return
            i.remove();
            if (!calledFromPipeline) {
              doc.setTempSource(null); // (can safely corrupt this doc since it's been removed)
            }

            // Source error, ignore all other documents
            while (i.hasNext()) {
              doc = i.next();
              if (!calledFromPipeline) {
                doc.setTempSource(null); // (can safely corrupt this doc since it's been removed)
              }
              i.remove();
            }
            //TESTED

            throw e; // (ie stop processing this source)
          }//TESTED
          catch (Exception e) { // Anything except daily limit exceeded, expect it to be ExtractorDocumentLevelException

            //TODO (INF-1922): put this in a separate function and call that from pipeline on failure...
            // (not sure what to do about error_on_feed_count though, need to maintain a separate one of those in pipeline?)
           
            // This can come from (sort-of/increasingly) "user" code so provide a bit more information
            StringBuffer errMessage = HarvestExceptionUtils.createExceptionMessage(e);
            _harvestStatus.logMessage(errMessage.toString(), true);
            num_error_url.incrementAndGet();
            nUrlErrorsThisSource++;
           
            if (!calledFromPipeline) {
              urlsThatError.add(doc.getUrl());
            }

            error_on_feed_count++;
            i.remove();
            if (!calledFromPipeline) {
              doc.setTempSource(null); // (can safely corrupt this doc since it's been removed)
            }
          }
          //TESTED
        }
        // (note this is only ever called in legacy mode - it's handled in the HarvestControllerPipeline)
View Full Code Here
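Note the removal discipline in the extraction loop above: failed or killed documents are dropped through the iterator's own remove() (never the list's), and their temporary source back-reference is cleared so the doc can be safely discarded. A minimal sketch of that pattern:

  // Hypothetical minimal sketch of the remove-on-failure pattern above:
  Iterator<DocumentPojo> i = toAdd.iterator();
  while (i.hasNext()) {
    DocumentPojo doc = i.next();
    try {
      currentEntityExtractor.extractEntities(doc); // may throw per document
    }
    catch (Exception e) {
      i.remove();              // drop just this document, keep harvesting the rest
      doc.setTempSource(null); // safe to clear once removed from the list
    }
  }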

  // Utility to handle the various multiple community problems:
  // - Different sources, same URL ("duplicates") ... get the doc from the DB (it's there by definition)
  // - Same source, multiple communities ("clones") ... get the doc from the first community processed

  private static DocumentPojo enrichDocByDuplicating(DocumentPojo docToReplace) {
    DocumentPojo newDoc = null;
    BasicDBObject dbo = getDocumentMetadataFromWhichToDuplicate(docToReplace);
    if (null != dbo) {
      String sContent = getDocumentContentFromWhichToDuplicate(docToReplace);
      if (null != sContent) {
        newDoc = duplicateDocument(docToReplace, dbo, sContent, false);
View Full Code Here
