HashSet<String> unstoredFields = new HashSet<String>();
int error_on_feed_count = 0, feed_count = 0;
LinkedList<DocumentPojo> splitterList = null;
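// (documents spawned by any 'splitter' pipeline elements are queued in splitterList and re-injected
// into the iterator once the original document list has been exhausted - see the end-of-list handling below)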
for (;;) {
DocumentPojo doc = null;
HashSet<String> currentBranches = null;
unstoredFields.clear();
if (!docIt.hasNext()) {
if ((null == splitterList) || (splitterList.isEmpty())) {
break;
} // all done!
else { // add all splitterList elements to toAdd
while (!splitterList.isEmpty()) {
docIt.add(splitterList.removeLast());
doc = docIt.previous();
}
}//TESTED (doc_splitter_test)
}
else {
doc = docIt.next();
}//TESTED
boolean processSpawnedDocOrNotSpawnedDoc = null == doc.getSpawnedFrom(); // (initially: only true if not spawned doc...)
// (Do this at the top so don't get foxed by any continues in the code)
long currTime = new Date().getTime();
if ( HarvestController.isHarvestKilled() // (harvest manually killed or because of global time)
||
((currTime - pipelineStartTime) > nMaxTimeSpentInPipeline_ms))
// Don't let any source spend too long in one iteration...
{
source.setReachedMaxDocs(); // (move to success iteration)
// Remove the rest of the documents
doc.setTempSource(null); // (can safely corrupt this doc since it's been removed)
docIt.remove();
while (docIt.hasNext()) {
doc = docIt.next();
doc.setTempSource(null); // (can safely corrupt this doc since it's been removed)
docIt.remove();
}
// Exit loop
break;
}//TESTED
feed_count++;
try {
// For cases where we grab the full text for early processing and then want it back
_cachedRawFullText = null;
_cachedRawFullText_available = true; // (this latch gets cleared if the raw full text is never seen, eg if a non-raw text engine is called)
if (null != _uah) {
_uah.resetForNewDoc();
}
if (null != _sah) {
_sah.resetForNewDoc();
}
_lastDocInPipeline = !docIt.hasNext();
//NOTE: inter-doc waiting needs to happen before the following processing elements:
// pxPipe.textEngine: always
// pxPipe.text: only if doc.fullText==null
// pxPipe.contentMetadata: only if doc.fullText==null
// pxPipe.featureEngine: only if doc.fullText==null
for (SourcePipelinePojo pxPipe: source.getProcessingPipeline()) { // (must be non-null if we get here)
//DEBUG
//System.out.println("PX EL: " + pxPipe.display + ", " + processSpawnedDocOrNotSpawnedDoc + ", " + doc.getUrl() + ": " + toAdd.size());
// Spawned documents only enter at their spot in the pipeline:
if (!processSpawnedDocOrNotSpawnedDoc) {
if (pxPipe == doc.getSpawnedFrom()) { // (intentionally ptr ==)
processSpawnedDocOrNotSpawnedDoc = true; // (next pipeline element, start processing)
}
continue; // (skip past elements, including the spawnee)
}//TESTED (doc_splitter_test);
// Run criteria for this pipeline element:
if ((null != pxPipe.criteria) && !pxPipe.criteria.isEmpty()) {
// Check branches (read)
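// (each branch name matched by BRANCH_MAP_GET is one this element requires the document to have entered;
// if any required branch is missing from currentBranches the element is skipped)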
boolean moddedCriteria = false;
String newCriteria = pxPipe.criteria;
Matcher m1 = this.BRANCH_MAP_GET.matcher(newCriteria);
boolean modCriteria = false;
boolean branchMismatch = false;
while (m1.find()) {
modCriteria = true;
if ((null == currentBranches) || !currentBranches.contains(m1.group(1))) {
branchMismatch = true;
break;
}
}
if (branchMismatch) {
continue;
}
if (modCriteria) {
newCriteria = m1.replaceAll("");
moddedCriteria = true;
}
//TESTED (complex_criteria_test)
// Check branches (write)
String branchYes = null;
String branchNo = null;
Matcher m2 = BRANCH_MAP_SET.matcher(newCriteria);
modCriteria = false;
if (m2.find()) {
modCriteria = true;
branchYes = m2.group(1);
branchNo = m2.group(2);
}
if (modCriteria) {
newCriteria = m2.replaceAll("");
moddedCriteria = true;
}
//TESTED (complex_criteria_test)
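// Only invoke the script engine if some criteria text remains once the branch directives have been
// stripped (ie skip evaluation when the criteria consisted purely of branch directives)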
if (!moddedCriteria || !newCriteria.isEmpty()) {
if (!newCriteria.startsWith("$SCRIPT")) {
newCriteria= "$SCRIPT(" + newCriteria + ")";
}//TESTED (basic_criteria_test)
if (((null != branchYes) || (null != branchNo)) && (null == currentBranches)) {
currentBranches = new HashSet<String>();
}
if (!_sah.rejectDoc(newCriteria, doc, false)) {
if (null != branchNo) {
currentBranches.add(branchNo);
Set<String> parentBranches = this._branchMappings.get(branchNo);
if (null != parentBranches) {
currentBranches.addAll(parentBranches);
}
}
continue;
}
else {
if (null != branchYes) {
currentBranches.add(branchYes);
Set<String> parentBranches = this._branchMappings.get(branchYes);
if (null != parentBranches) {
currentBranches.addAll(parentBranches);
}
}
}
//TESTED (complex_criteria_test)
}
}//TESTED (basic_criteria_test)
//TODO (INF-2218): improve performance of doc serialization by only updating spec'd fields (note: need to change the js engine)
// and by sharing engine state between the SAH and UAH
// Save metadata state so we know if we need to re-serialize the document
int nCurrMetaFields = 0;
Object ptr = doc.getMetadata();
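// (the metadata object reference and field count are re-checked after the pipeline element runs,
// see below, to decide whether the SAH document cache needs to be reset)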
// (Only needed for text engine or feature engine - otherwise the SAH cache is reset as needed)
if ((null != pxPipe.featureEngine) || (null != pxPipe.textEngine)) {
if ((null != _sah) && (null != ptr)) {
nCurrMetaFields = doc.getMetadata().size();
}
}//TESTED (metadata_doc_cache_reset)
try {
// 3] Create new documents from existing ones
if (null != pxPipe.splitter) {
if (null == splitterList) {
splitterList = new LinkedList<DocumentPojo>();
}
try {
splitDocuments(doc, source, pxPipe, splitterList);
}
catch (Exception e) {} // do nothing, still want to keep doc unless otherwise specified below
if ((null == pxPipe.splitter.getDeleteExisting()) || pxPipe.splitter.getDeleteExisting()) {
// Don't keep original doc
docIt.remove();
doc.setTempSource(null); // (can safely corrupt this doc since it's been removed)
break;
}//TESTED (test1,test2)
}//TESTED (doc_splitter)
// 4] Text and linked document extraction
if (null != pxPipe.text) {
// IN: doc (xpath/regex) or json(doc) (js)
// OUT: doc.fullText, doc.title, doc.desc, (less common) doc.metadata.*
// POST: reset
updateInterDocDelayState(doc, false);
String cachedFullText = _uah.doManualTextEnrichment(doc, pxPipe.text, source.getRssConfig());
if (null != _sah) {
_sah.resetDocumentCache();
}
// Cache the full text if available
if ((null == _cachedRawFullText) && _cachedRawFullText_available) {
_cachedRawFullText = cachedFullText;
}//(TESTED: cache available: text_raw_to_boilerpipe, no cache available: text_then_raw_then_content*)
}
//TESTED (fulltext_regexTests.json, basic_web_uahRawText.json, text_raw_to_boilerpipe)
if (null != pxPipe.textEngine) {
// IN: doc
// OUT: doc.*
// POST: reset sah ent cache (_should_ change only metadata and text (+ents/assocs) so don't need to reset sah doc cache)
if (!handleTextEngine(pxPipe, doc, source)) {
error_on_feed_count++;
if ((null == pxPipe.textEngine.exitOnError) || pxPipe.textEngine.exitOnError) {
doc.setTempSource(null); // (can safely corrupt this doc since it's been removed)
docIt.remove();
break; // (no more processing)
}//TESTED (engines_exit_on_error)
}
} //TESTED (basic_web_test_ocOptions.json, basic_web_test_textaaOptions.json)
// 5] Document level fields
if (null != pxPipe.docMetadata) {
// IN: sah.doc
// OUT: doc.*
// POST: reset
_sah.setDocumentMetadata(doc, pxPipe.docMetadata);
_sah.resetDocumentCache();
}
//TESTED (fulltext_docMetaTest.json)
if (null != pxPipe.contentMetadata) {
// IN: doc (xpath/regex) or json(doc) (js)
// OUT: doc.meta.*
// POST: reset
updateInterDocDelayState(doc, false);
_uah.processMetadataChain(doc, pxPipe.contentMetadata, source.getRssConfig(), unstoredFields);
if (null != _sah) {
_sah.resetDocumentCache();
}
// Cache the full text if available
if ((null == _cachedRawFullText) && _cachedRawFullText_available) {
_cachedRawFullText = doc.getFullText();
}//(TESTED: (cache available) text_content_then_raw_to_boilerpipe, (not available) text_default_then_content_then_default_test.json)
}
//TESTED (fulltext_regexTests.json, basic_web_uahRawText.json)
// 6] Entities and Associations
if (null != pxPipe.entities) {
// IN: sah.doc.*, sah.doc.metadata.*,
//(recalculate from scratch then use: sah.entityMap, sah.geoMap)
// OUT: doc.entities, sah.entityMap, sah.geoMap
// POST: no need to reset anything, sah.entities never read
_sah.setEntities(doc, pxPipe.entities);
}
//TESTED (fulltext_ents_and_assocs.json)
if (null != pxPipe.associations) {
// IN: sah.doc.*, sah.doc.metadata.*, doc.entities, sah.entityMap, sah.geoMap
// OUT: doc.associations
// POST: no need to reset anything, sah.associations never read
_sah.setAssociations(doc, pxPipe.associations);
}
//TESTED (fulltext_ents_and_assocs.json)
if (null != pxPipe.featureEngine) {
// IN: doc
// OUT: doc.*
// POST: reset sah ent cache (_should_ change only metadata, ents and assocs so don't need to reset sah doc cache)
if (!handleFeatureEngine(pxPipe, doc, source)) {
error_on_feed_count++;
if ((null == pxPipe.featureEngine.exitOnError) || pxPipe.featureEngine.exitOnError) {
doc.setTempSource(null); // (can safely corrupt this doc since it's been removed)
docIt.remove();
break; // (no more processing)
}//TESTED (engines_exit_on_error_test)
}
} //TESTED (basic_web_test_ocOptions.json, basic_web_test_textaaOptions.json)
// 7] Finishing steps:
if (null != pxPipe.storageSettings) {
// IN: doc
// OUT: doc.metadata.*
// POST: reset if metadata settings present
if (!handleStorageSettings(pxPipe, doc)) {
// (this is a manual rejection not an error so we're good)
doc.setTempSource(null); // (can safely corrupt this doc since it's been removed)
docIt.remove();
break; // (no more processing for this document)
}
if ((null != pxPipe.storageSettings.exitPipeline) && pxPipe.storageSettings.exitPipeline) {
break; // (no more processing for this document)
}//TESTED (basic_criteria_test)
}//TESTED (storageSettings_test; not update - need more infrastructure)
}
catch (Exception e) { // For now we'll just handle any exception by nuking either the doc or the source
// (in the future we could consider continuing, depending on which pipeline element failed,
// or perhaps even adding a common "continue on error" option)
throw e;
}
// Check metadata state so we know if we need to re-ingest the document
// (Only needed for text engine or feature engine - otherwise the SAH cache is reset as needed)
if ((null != pxPipe.featureEngine) || (null != pxPipe.textEngine)) {
Object ptrAfter = doc.getMetadata();
int nCurrMetaFieldsAfter = 0;
if (null != _sah) {
if (null != ptrAfter) {
nCurrMetaFieldsAfter = doc.getMetadata().size();
}
if ((ptr != ptrAfter) || (nCurrMetaFieldsAfter != nCurrMetaFields))
{
_sah.resetDocumentCache();
}
}
}//TESTED (metadata_doc_cache_reset)
}//end loop over per-document processing pipeline elements
}
catch (ExtractorSourceLevelException e) { // Delete all docs, log
this.handleDocOrSourceError(source, doc, docIt, e, true);
break;
} //TESTED (c/p file_textError)
catch (ExtractorDailyLimitExceededException e) {
this.handleDocOrSourceError(source, doc, docIt, e, true);
break;
} //TESTED (c/p file_textError)
catch (ExtractorSourceLevelMajorException e) {
this.handleDocOrSourceError(source, doc, docIt, e, true);
break;
} //TESTED (file_textError)
catch (ExtractorSourceLevelTransientException e) {
this.handleDocOrSourceError(source, doc, docIt, e, true);
break;
} //TESTED (c/p file_textError)
catch (Exception e) { // Misc doc error
//e.printStackTrace();
error_on_feed_count++;
this.handleDocOrSourceError(source, doc, docIt, e, false);
// (don't break)
} //TESTED (web_errors_test)
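// Strip out any metadata fields that were flagged during processing as not to be stored with the doc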
if (!unstoredFields.isEmpty()) {
if (null != doc.getMetadata()) {
for (String fieldToDelete: unstoredFields) {
doc.getMetadata().remove(fieldToDelete);
}
}
} //TESTED (storageSettings_advanced.json)
}//end loop over documents