Package com.ikanow.infinit.e.data_model.store.document

Examples of com.ikanow.infinit.e.data_model.store.document.DocumentPojo

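A minimal sketch of how a DocumentPojo is typically built and round-tripped to/from its MongoDB form. It only uses accessors that appear in the harvested examples below (setId, setIndex, setFullText, setTags, toDb, fromDb, getId); the class wrapper, import list, literal values, and the cast of toDb()'s result to BasicDBObject are illustrative assumptions, not part of the project code.

    import java.util.Arrays;
    import java.util.TreeSet;
    import org.bson.types.ObjectId;
    import com.mongodb.BasicDBObject;
    import com.mongodb.DBObject;
    import com.ikanow.infinit.e.data_model.store.document.DocumentPojo;

    public class DocumentPojoSketch {
      public static void main(String[] args) {
        // Build a document (literal values are illustrative only)
        DocumentPojo doc = new DocumentPojo();
        doc.setId(new ObjectId());        // assign the _id up front so MongoDB and the search index share it
        doc.setIndex("document_index");   // default index name used throughout the examples below
        doc.setFullText("Example document body");
        doc.setTags(new TreeSet<String>(Arrays.asList("example", "sketch")));

        // Serialize for MongoDB, then rebuild the pojo from the stored form
        // (the cast assumes toDb() returns a BasicDBObject, as its usage in the examples suggests)
        DBObject dbo = doc.toDb();
        DocumentPojo roundTripped = DocumentPojo.fromDb((BasicDBObject) dbo, DocumentPojo.class);
        System.out.println(roundTripped.getId());
      }
    }
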

    // Store the knowledge in the feeds collection in the harvester db     
    int errors = 0;
    Exception savedException = null;
    Iterator<DocumentPojo> it = docs.iterator();
    while (it.hasNext()) {
      DocumentPojo f = it.next();
     
      // Set an _id before writing it to the datastore,
      // so the same _id gets written to the index
      // NOTE WE OVERWRITE ANY TRANSIENT IDS THAT MIGHT HAVE BEEN SET eg BY REMOVE CODE
      f.setId(new ObjectId());
     
      // Check geo-size: need to add to a different index if so, for memory usage reasons
      if (null == f.getLocs()) { // (can be set by update/deletion code also)
        if (DocumentPojoIndexMap.hasManyGeos(f)) {
          f.setIndex(DocumentPojoIndexMap.manyGeoDocumentIndex_);
          // (note this check isn't stateless, it actually populates "locs" at the same time)
          // therefore grab "locs" below, strip it before writing to Mongo, and restore it afterwards
        }
      }
      Set<String> locs = f.getLocs();
      f.setLocs(null);
     
      try {
        addToDatastore(col, f);
      }
      catch (Exception e) {
        errors++;
        if (null == savedException) { // only keep the first exception, so the "1sterror" log below is accurate
          savedException = e;
        }
        it.remove();
        continue;
      }
     
      f.setLocs(locs);
    }
    if (errors > 0) {
      logger.error("addToDatastore: errors=" + errors + " 1sterror=" + savedException.getMessage());
    }
  }//TESTED
View Full Code Here


       
        if (bStoreContent && bDocHasExternalContent) {
          try
          {
            String rawText = this.bStoreRawContent ? doc.getRawFullText() : null;
            DocumentPojo meta = bStoreMetadataAsContent ? doc : null;
            CompressedFullTextPojo gzippedContent = new CompressedFullTextPojo(doc.getUrl(), doc.getSourceKey(), doc.getCommunityId(),
                                              doc.getFullText(), rawText, meta, nMaxContentLen_bytes);
           
            if (null != gzippedContent.getUrl())  {
              // Be efficient and write field-by-field vs using JSON conversion
View Full Code Here

    TreeMap<String,DocumentPojo> sourceUrlToKeyMap = null;
    HashSet<String> deletedSources = null;
    // Store the knowledge in the feeds collection in the harvester db
    Iterator<DocumentPojo> docIt = docs.iterator();
    while (docIt.hasNext()) {
      DocumentPojo f = docIt.next();
      nextId = f.getId(); // (only interested in the pruneSource case, in which case _id is set on input)
     
      if ((null != f.getSourceUrl()) && (null == f.getUrl())) { // special case ... delete all these documents...
        if ((null == deletedSources) || !deletedSources.contains(f.getSourceKey())) { // (don't bother deleting sourceURL if deleting source)
          if (null == sourceUrlToKeyMap) {
            sourceUrlToKeyMap = new TreeMap<String,DocumentPojo>();
          }
          sourceUrlToKeyMap.put(f.getSourceUrl(), f);       
        }//TESTED

        docIt.remove(); // (so don't miscount number of docs; processed below)
      }
      else if (null != f.getSourceKey() && (null == f.getSourceUrl()) && (null == f.getUrl())) {
        // Even more special case: delete entire sourceKey
        if (null == deletedSources) {
          deletedSources = new HashSet<String>();
        }
        if (!deletedSources.contains(f.getSourceKey())) {
          deletedSources.add(f.getSourceKey());
          long srcRemoved = removeFromDatastoreAndIndex_bySourceKey(f.getSourceKey(), f.getId(), true, f.getCommunityId().toString());
          if (srcRemoved > 0) {
            updateDocCountsOnTheFly(-srcRemoved, f.getSourceKey(), f.getCommunityId());           
          }
        }
        docIt.remove(); // (so don't miscount number of docs)
      }//TESTED
      else {
        removeFromDatastore_byURL(col, f, fields,
            StoreAndIndexManager.docHasExternalContent(f.getUrl(), f.getSourceUrl()));
          // (adds "_id", "index")
      }
    }//TESTED

    // Now tidy up sourceUrls, do some caching across sourceKey/community for performance
    String sourceKey = null; // (if deleting sourceKey don't bother deleting any sourceUrls)
    long removed = 0; // (from special operations)
    String cachedSourceKey = null; // (will handle multiple source keys, although that can't currently happen in practice)
    ObjectId communityId = null;
    if (null != sourceUrlToKeyMap) for (Map.Entry<String, DocumentPojo> entry: sourceUrlToKeyMap.entrySet()) {
      String srcUrl = entry.getKey();
      DocumentPojo doc = entry.getValue();
      sourceKey = doc.getSourceKey();
      communityId = doc.getCommunityId();
      if (sourceKey != cachedSourceKey) { // ptr comparison by design
        if (removed > 0) {
          updateDocCountsOnTheFly(-removed, sourceKey, communityId);
          removed = 0;
        }//TESTED
View Full Code Here

    // 2.1.1] Only adding:
    // (+Check that the feeds' entities statistics have been updated:)
//    pxControl_feed.processDocuments(InfiniteEnums.FEEDS, toAdd_feed, toUpdate_feed, toDelete_feed); // (add, update, delete)
//    System.out.println(new GsonBuilder().setPrettyPrinting().create().toJson(toAdd_feed));
    // 2.1.2] Updating:
    DocumentPojo docToUpdate1 = toAdd_feed.get(0);
    DocumentPojo docToUpdate2 = toAdd_feed.get(1);
    // Couple of options:
    docToUpdate1.setId(null); // (should be unnecessary, _id shouldn't have been set)
    docToUpdate2.setId(null);
    docToUpdate2.setUpdateId(new ObjectId("5277fec1256f16a6d3def633")); // (Doesn't matter what it is, just check the diagnostic from StoreAndIndexManager.removeFromSearch matches)
    toUpdate_feed.add(docToUpdate1);
    toUpdate_feed.add(docToUpdate2);
    pxControl_feed.processDocuments(InfiniteEnums.FEEDS, toAdd_feed, toUpdate_feed, toDelete_feed); // (add, update, delete)
    // 2.1.3] Deleting some docs, adding others
//    toDelete_feed.add(toAdd_feed.pop());
View Full Code Here

        {
          if (bKillMeNow) {
            return fixcount;
          }
                 
          DocumentPojo doc = DocumentPojo.fromDb(cur.next(), DocumentPojo.class);
          if (null != doc.getId()) {
            dbCache.add(doc.getId().toString());
          }
         
          // Get index of doc to check in:
          String sNewIndex = doc.getIndex();
          if (null == sNewIndex) {
            sIndex = null;
            esm = esm_base;
          }
          else if ((null == sIndex) || (!sNewIndex.equals(sIndex))) {
            sIndex = sNewIndex;
            if (sNewIndex.equals("document_index")) {
              esm = esm_base;
            }
            else {
              esm = ElasticSearchManager.getIndex(sNewIndex + "/document_index");
            }
          }       
         
          //Compare mongo doc to search doc
          Map<String, GetField> results = esm.getDocument(doc.getId().toString(),DocumentPojo.url_);
          if ( null == results || results.isEmpty() )
          {
            //either too many entries (duplicates) or no entry
            //delete this doc from both
            logger.info("db sync removing doc: " + doc.getId() + "/" + doc.getSourceKey() + " not found in search (or duplicate)");           
            docs_to_remove.add(doc);         
            documentDb.remove(new BasicDBObject(DocumentPojo._id_, doc.getId()));
            BasicDBObject contentQ = new BasicDBObject(CompressedFullTextPojo.url_, doc.getUrl());
            contentQ.put(CompressedFullTextPojo.sourceKey_, new BasicDBObject(MongoDbManager.in_, Arrays.asList(null, doc.getSourceKey())));           
            contentDb.remove(contentQ);
            fixcount++;
          }
        } //end loop over new docs for this source
        storeManager.removeFromSearch(docs_to_remove);
       
        //NOW VERIFY ALL OLD FEEDS
        int iteration = 1;
        boolean removedAll = true;
        docs_to_remove.clear();
        while (removedAll)
        {
          int rows = iteration*iteration*10; // 10*i^2, check quadratically more docs each pass
          int oldfixes = 0;
          BasicDBObject queryOLD = new BasicDBObject();
          queryOLD.put(DocumentPojo.sourceKey_, new BasicDBObject(MongoDbManager.in_, sourceKeyList) ); //source aspect
          BasicDBObject sortOLD = new BasicDBObject(DocumentPojo._id_, 1);
         
          DBCursor curOLD = documentDb.find(queryOLD, queryFields).sort(sortOLD).limit(rows);
          while (curOLD.hasNext())
          {
            DocumentPojo doc = DocumentPojo.fromDb(curOLD.next(), DocumentPojo.class);       
            if (null != doc.getId()) {
              dbCache.add(doc.getId().toString());
            }
           
            // Get index of doc to check in:
            String sNewIndex = doc.getIndex();
            if (null == sNewIndex) {
              sIndex = null;
              esm = esm_base;
            }
            else if ((null == sIndex) || (!sNewIndex.equals(sIndex))) {
              sIndex = sNewIndex;
              if (sNewIndex.equals("document_index")) {
                esm = esm_base;
              }
              else {
                esm = ElasticSearchManager.getIndex(sNewIndex + "/document_index");
              }
            }
           
            //Compare mongo doc to search doc
            Map<String, GetField> results = esm.getDocument(doc.getId().toString(),DocumentPojo.url_);
            if ( null == results || results.isEmpty() )
            {
              //either too many entries (duplicates) or no entry
              //delete this doc from both
              logger.info("db sync removing doc: " + doc.getId() + "/" + doc.getSourceKey() + " not found in search (or duplicate)");           
              docs_to_remove.add(doc);           
              documentDb.remove(new BasicDBObject(DocumentPojo._id_, doc.getId()));
              contentDb.remove(new BasicDBObject(DocumentPojo.url_, doc.getUrl()));
              fixcount++;
              oldfixes++;
            }         
          }
          if ( oldfixes != rows )
View Full Code Here

              found = dbCache.contains(idStr);
            }//TESTED
            if (!found)
            {       
              ObjectId id = new ObjectId(idStr);
              DocumentPojo doc = new DocumentPojo();
              doc.setId(id);
              doc.setIndex(hit.getIndex() + "/document_index");
              docs_to_remove.add(doc);
              logger.info("db sync removing doc: " + id + "/" + hit.getIndex() + "/" + source_index + " not found in mongo");
              fixcount++;
            } // end if not found
          } // end loop over docs to check
         
          nSkip += docs.length;
        }// until no more hits
        if (!docs_to_remove.isEmpty()) {
          storeManager.removeFromSearch(docs_to_remove);
          docs_to_remove.clear();
        }
       
        //CHECK OLD FEEDS 10 at a time
        int iteration = 1;
        boolean removedAll = true;
        while (removedAll )
        {
          int rows = iteration*iteration*10; // quadratic scaling (10*i^2), check more docs each pass
          iteration++;
          int oldfixes = 0;
         
          //get old docs from es
          SearchRequestBuilder searchOptionsOLD = esm.getSearchOptions();
          BoolQueryBuilder boolQueryOLD = QueryBuilders.boolQuery();
          boolQueryOLD.must(QueryBuilders.rangeQuery(DocumentPojo.created_).from(cleanseStartTime));
          boolQueryOLD.must(QueryBuilders.termQuery(DocumentPojo.sourceKey_, sp.getKey()));
          searchOptionsOLD.addSort(DocumentPojo.created_, SortOrder.ASC);
          searchOptionsOLD.setSize(rows);
          SearchResponse rspOLD = esm.doQuery(boolQueryOLD, searchOptionsOLD);
          SearchHit[] docsOLD = rspOLD.getHits().getHits();
         
          //Check all elasticsearch docs against mongodb
         
          for (SearchHit hit: docsOLD)        
          {
            String idStr = hit.getId();
            boolean found = true;
            if (null == dbCache) {
              //OBSOLETED, USE DBCACHE INSTEAD (WHERE AVAILABLE):
              ObjectId id = new ObjectId(idStr);
              BasicDBObject queryOLD = new BasicDBObject(DocumentPojo._id_, id);
              DBObject dbo = documentDb.findOne(queryOLD, queryFields);
              found = (dbo != null);
            }//TESTED
            else {
               found = dbCache.contains(idStr);
            }//TESTED
            if (!found)
            {       
              // Also need to check the DB since dbCache is not guaranteed to be populated with the same
              // number of "final" docs
              ObjectId id = new ObjectId(idStr);
              if (rows > 10) { // (dbCache always loaded with the first 10 rows)
                BasicDBObject queryOLD = new BasicDBObject(DocumentPojo._id_, id);
                if (null != documentDb.findOne(queryOLD, queryFields)) { // it is actually present
                  continue;
                }
              }
              DocumentPojo doc = new DocumentPojo();
              doc.setId(id);
              doc.setIndex(hit.getIndex() + "/document_index");
              docs_to_remove.add(doc);
              logger.info("db sync removing doc: " + idStr + "/" + source_index + " not found in mongo");
              oldfixes++;
              fixcount++;
            }
View Full Code Here

    LinkedList<DocumentPojo> docsToTransfer = new LinkedList<DocumentPojo>();
    Map<ObjectId, LinkedList<DocumentPojo>> communityList = null;
    ObjectId currCommunityId = null;
    while (dbc.hasNext()) {
      BasicDBObject dbo = (BasicDBObject)dbc.next();
      DocumentPojo doc = DocumentPojo.fromDb(dbo, DocumentPojo.class);
      String sDocIndex = doc.getIndex();
      if (null == sDocIndex) {
        sDocIndex = "document_index";
      }
      if ((null != _deletedIndex) && !_deletedIndex.contains(sDocIndex)) {
        _deletedIndex.add(sDocIndex);
        rebuildIndex(sDocIndex);
        try { // (Just in case the index requires some time to sort itself out)
          Thread.sleep(1000);
        } catch (InterruptedException e) {}
      }
     
      //Debug:
      //System.out.println("Getting content..." + feed.getTitle() + " / " + feed.getUrl());
     
      // Get the content:
      if ((0 != nMaxContentSize_bytes) && StoreAndIndexManager.docHasExternalContent(doc.getUrl(), doc.getSourceUrl()))
      {
        BasicDBObject contentQ = new BasicDBObject(CompressedFullTextPojo.url_, doc.getUrl());
        contentQ.put(CompressedFullTextPojo.sourceKey_, new BasicDBObject(MongoDbManager.in_, Arrays.asList(null, doc.getSourceKey())));
        BasicDBObject fields = new BasicDBObject(CompressedFullTextPojo.gzip_content_, 1);

        BasicDBObject dboContent = (BasicDBObject) contentDB.findOne(contentQ, fields);
        if (null != dboContent) {
          byte[] compressedData = ((byte[])dboContent.get(CompressedFullTextPojo.gzip_content_));
          ByteArrayInputStream in = new ByteArrayInputStream(compressedData);
          GZIPInputStream gzip = new GZIPInputStream(in);       
          int nRead = 0;
          StringBuffer output = new StringBuffer();
          while (nRead >= 0) {
            nRead = gzip.read(storageArray, 0, 200000);
            if (nRead > 0) {
              String s = new String(storageArray, 0, nRead, "UTF-8");
              output.append(s);
            }
          }
          doc.setFullText(output.toString());
        }
      }
      // (else document has full text already)
     
      // Get tags, if necessary:
      // Always overwrite tags - one of the reasons we might choose to migrate
      // Also may need source in order to support source index filtering
      SourcePojo src = _sourceCache.get(doc.getSourceKey());
      if (null == src) {
        //TODO (INF-2265): handle search index settings in pipeline mode... (also didn't seem to work?)
        BasicDBObject srcDbo = (BasicDBObject) sourcesDB.findOne(new BasicDBObject(SourcePojo.key_, doc.getSourceKey()));
        if (null != srcDbo) {
          src = SourcePojo.fromDb(srcDbo, SourcePojo.class);
         
          if (null != src.getProcessingPipeline()) {
            try {
              // Set the index settings
              HarvestController hc = new HarvestController();
              HarvestControllerPipeline hcPipe = new HarvestControllerPipeline();
              hcPipe.extractSource_preProcessingPipeline(src, hc);
            }
            catch (Exception e) {
              //DEBUG
              e.printStackTrace();
            }
          }//TESTED (by hand)
         
          _sourceCache.put(doc.getSourceKey(), src);
        }
      }
      doc.setTempSource(src); // (needed for source index filtering)
      if (null != src) {
        if (null != src.getTags()) {
          Set<String> tagsTidied = new TreeSet<String>();
          for (String s: src.getTags()) {
            String ss = s.trim().toLowerCase();
            tagsTidied.add(ss);
          }
         
          // May also want to write this back to the DB:
          //TODO (INF-2223): Handle append tags or not in the pipeline...
          if ((null == src.getAppendTagsToDocs()) || src.getAppendTagsToDocs()) {         
            if ((null == doc.getTags()) || (doc.getTags().size() < tagsTidied.size())) {
              BasicDBObject updateQuery = new BasicDBObject(DocumentPojo.sourceKey_, doc.getSourceKey());
              updateQuery.put(DocumentPojo._id_, doc.getId());
              docsDB.update(updateQuery, new BasicDBObject(DbManager.addToSet_, new BasicDBObject(
                          DocumentPojo.tags_, new BasicDBObject(DbManager.each_, tagsTidied))));
            }
            doc.setTags(tagsTidied); // (just copy ptr across)
          }
        }
      }

// 2. Update the index with the new document       
     
      // (Optionally also update entity and assoc features)
     
      if (bAggregate) {
        if (null == currCommunityId) {
          currCommunityId = doc.getCommunityId();
        }
        else if (!currCommunityId.equals(doc.getCommunityId())) {         
          LinkedList<DocumentPojo> perCommunityDocList = null;
          if (null == communityList) { // (very first time we see > 1 community)
            communityList = new TreeMap<ObjectId, LinkedList<DocumentPojo>>();
            perCommunityDocList = new LinkedList<DocumentPojo>();
            perCommunityDocList.addAll(docsToTransfer); // (NOT including doc, which hasn't been added to docsToTransfer yet)
            communityList.put(currCommunityId, perCommunityDocList);
          }
          currCommunityId = doc.getCommunityId();
          perCommunityDocList = communityList.get(currCommunityId);
          if (null == perCommunityDocList) {
            perCommunityDocList = new LinkedList<DocumentPojo>();
            communityList.put(currCommunityId, perCommunityDocList);
          }
View Full Code Here

        aggManager.runScheduledSynchronization();
      }
    }//TESTED
   
    // Finally, need to update all the docs (ick)
    DocumentPojo dummy = new DocumentPojo();
    for (DocumentPojo doc: singleList) {
      boolean bEnts = (null != doc.getEntities()) && !doc.getEntities().isEmpty();
      boolean bAssocs = (null != doc.getAssociations()) && !doc.getAssociations().isEmpty();
     
      if (bEnts || bAssocs) {       
        dummy.setEntities(doc.getEntities());
        dummy.setAssociations(doc.getAssociations());
        DBObject toWrite = dummy.toDb();
        BasicDBObject updateQuery = new BasicDBObject(DocumentPojo.sourceKey_, doc.getSourceKey());
        updateQuery.put(DocumentPojo._id_, doc.getId());
        MongoDbManager.getDocument().getMetadata().update(updateQuery, new BasicDBObject(MongoDbManager.set_, toWrite));
      }//TESTED
     
View Full Code Here

     
      byte[] storageArray = new byte[200000];
     
      while (dbc.hasNext()) {
        BasicDBObject dbo = (BasicDBObject)dbc.next();
        DocumentPojo doc = DocumentPojo.fromDb(dbo, DocumentPojo.class);
       
        //Debug:
        System.out.println("Getting content..." + doc.getTitle() + " / " + doc.getUrl());
       
        // Get the content:
        BasicDBObject contentQ = new BasicDBObject(CompressedFullTextPojo.url_, doc.getUrl());
        contentQ.put(CompressedFullTextPojo.sourceKey_, new BasicDBObject(MongoDbManager.in_, Arrays.asList(null, doc.getSourceKey())));
        BasicDBObject dboContent = (BasicDBObject) contentDB.findOne(contentQ);
        if (null != dboContent) {
          byte[] compressedData = ((byte[])dboContent.get("gzip_content"));       
          ByteArrayInputStream in = new ByteArrayInputStream(compressedData);
          GZIPInputStream gzip = new GZIPInputStream(in);       
          // Read the whole gzip stream (a single read can return less than the full content)
          int nRead = 0;
          StringBuffer output = new StringBuffer();
          while (nRead >= 0) {
            nRead = gzip.read(storageArray, 0, 200000);
            if (nRead > 0) {
              output.append(new String(storageArray, 0, nRead, "UTF-8"));
            }
          }
          doc.setFullText(output.toString());
        }       
        // Get tag:
        SourcePojo src = _sourceCache.get(doc.getSourceKey());
        if (null == src) {
          BasicDBObject srcDbo = (BasicDBObject) sourcesDB.findOne(new BasicDBObject("key", doc.getSourceKey()));
          if (null != srcDbo) {
            src = new Gson().fromJson(srcDbo.toString(), SourcePojo.class);
           
            _sourceCache.put(doc.getSourceKey(), src);
          }
        }
        if ((null != src) && (null != src.getTags())) { // (guard against sources with no tags)
          Set<String> tagsTidied = new TreeSet<String>();
          for (String s: src.getTags()) {
            String ss = s.trim().toLowerCase();
            tagsTidied.add(ss);
          }
          doc.setTags(tagsTidied);
        }
       
        //TEST: set dynamic field
        // Lots of testing of dynamic dates:
//        feed.addToMetadata("my_dateISO", Date.parse(feed.getCreated().toGMTString()));
View Full Code Here

        public void map( Object key, BSONObject value, Context context ) throws IOException, InterruptedException
        {         
          value.removeField("associations");
          value.removeField("entities");
          value.removeField("metadata");
          DocumentPojo doc = DocumentPojo.fromDb( (BasicDBObject) value, DocumentPojo.class );
          String source = doc.getSource();
         
          if ( source != null )
          {
            one.put("countobject", new BasicDBObject("count", 1));
            word.set(source); // (source is already a String)
View Full Code Here
