/*******************************************************************************
* Copyright 2012, The Infinit.e Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
package com.ikanow.infinit.e.utility;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
import org.bson.BSONObject;
import org.bson.types.ObjectId;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.ImmutableSettings.Builder;
import com.google.gson.Gson;
import com.ikanow.infinit.e.data_model.index.ElasticSearchManager;
import com.ikanow.infinit.e.data_model.index.document.DocumentPojoIndexMap;
import com.ikanow.infinit.e.data_model.store.DbManager;
import com.ikanow.infinit.e.data_model.store.MongoDbManager;
import com.ikanow.infinit.e.data_model.store.config.source.SourcePojo;
import com.ikanow.infinit.e.data_model.store.document.CompressedFullTextPojo;
import com.ikanow.infinit.e.data_model.store.document.DocumentPojo;
import com.ikanow.infinit.e.data_model.store.social.community.CommunityPojo;
import com.ikanow.infinit.e.harvest.HarvestController;
import com.ikanow.infinit.e.harvest.HarvestControllerPipeline;
import com.ikanow.infinit.e.processing.generic.GenericProcessingController;
import com.ikanow.infinit.e.processing.generic.aggregation.AggregationManager;
import com.ikanow.infinit.e.processing.generic.aggregation.AssociationBackgroundAggregationManager;
import com.ikanow.infinit.e.processing.generic.aggregation.EntityBackgroundAggregationManager;
import com.ikanow.infinit.e.processing.generic.store_and_index.StoreAndIndexManager;
import com.ikanow.infinit.e.processing.generic.utils.PropertiesManager;
import com.mongodb.BasicDBObject;
import com.mongodb.DBCollection;
import com.mongodb.DBCursor;
import com.mongodb.DBObject;
import com.mongodb.MongoException;
public class MongoDocumentTxfer {
//___________________________________________________________________________________________________
// MAIN
/**
 * Transfers (and optionally deletes or re-indexes) documents from MongoDB into the ElasticSearch index
 * @param sConfigPath location of the configuration directory (null to use the default)
 * @param sQuery JSON query selecting the documents to process (null or "{}" for all)
 * @param bDelete if true, delete the matching documents instead of transferring them
 * @param bRebuildIndex if true, rebuild the document index (in bulk, or on the fly when a query is specified)
 * @param bVerifyIndex if true, verify the index during initialization
 * @param bUpdateFeatures if true, re-aggregate entity/association features while transferring
 * @param nSkip number of matching documents to skip
 * @param nLimit maximum number of documents to process (0 for no limit)
 * @param chunksDescription if non-null, process the collection in shard-key chunks
 * @throws MongoException
 * @throws NumberFormatException
 * @throws IOException
 */
public static void main(String sConfigPath, String sQuery, boolean bDelete, boolean bRebuildIndex, boolean bVerifyIndex, boolean bUpdateFeatures, int nSkip, int nLimit, String chunksDescription) throws NumberFormatException, MongoException, IOException {
// Command line processing
com.ikanow.infinit.e.data_model.Globals.setIdentity(com.ikanow.infinit.e.data_model.Globals.Identity.IDENTITY_SERVICE);
if (null != sConfigPath) {
com.ikanow.infinit.e.data_model.Globals.overrideConfigLocation(sConfigPath);
}
boolean bRebuildIndexOnFly = false;
if (bRebuildIndex && ((null == sQuery) || sQuery.equals("{}"))) { // (else will do them 1-by-1)
new GenericProcessingController().InitializeIndex(true, false, false);
}
else {
// Have seen odd transport timeouts on occasion: this should ensure they never happen
new GenericProcessingController().InitializeIndex(false, false, false, bVerifyIndex);
// (don't delete anything, but do recalc)
if (bRebuildIndex) {
bRebuildIndexOnFly = true;
}
}
if (bVerifyIndex && (0 == nLimit) && (null == sQuery)) {
// Index verification with nothing else to do
return;
}
MongoDocumentTxfer txferManager = new MongoDocumentTxfer(bRebuildIndexOnFly);
BasicDBObject query = null;
if (null == sQuery) {
query = new BasicDBObject();
}
else {
query = (BasicDBObject) com.mongodb.util.JSON.parse(sQuery);
}
if (!bDelete) {
if (null != chunksDescription) {
txferManager.doChunkedTransfer(query, nSkip, nLimit, bUpdateFeatures, chunksDescription);
}
else {
txferManager.doTransfer(query, nSkip, nLimit, bUpdateFeatures, null);
}
}
else {
txferManager.doDelete(query, nLimit);
}
}
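// Example invocation (illustrative sketch only - the actual CLI driver, config path and source key are hypothetical):
//   MongoDocumentTxfer.main("/opt/infinite-home/config", "{\"sourceKey\":\"example.source.key\"}",
//                           false, true, false, false, 0, 1000, null);
// ie re-index (on the fly) up to 1000 documents from one source, without deleting or re-aggregating.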
public MongoDocumentTxfer(boolean bRebuildIndexOnFly) {
if (bRebuildIndexOnFly) {
_deletedIndex = new TreeSet<String>();
_deletedIndex.add(DocumentPojoIndexMap.manyGeoDocumentIndex_); // (don't ever delete this on the fly, it contains docs matching other queries)
}
}
//___________________________________________________________________________________________________
// Wrapper for doing transfer in chunks:
private void doChunkedTransfer(BasicDBObject query, int nSkip, int nLimit, boolean bAggregate, String chunksDescription) throws IOException
{
List<BasicDBObject> chunkList = MongoIndexerUtils.getChunks("doc_metadata.metadata", chunksDescription);
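// (Each chunk returned by getChunks is expected to carry a "$id" plus $min/$max shard-key bounds,
//  which doTransfer applies to the cursor via addSpecial - see the chunk handling below)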
System.out.println("CHUNKS: Found " + chunkList.size() + " chunks");
//DEBUG
//System.out.println("Chunklist= " + chunkList);
for (BasicDBObject chunk: chunkList) {
BasicDBObject cleanQuery = new BasicDBObject();
cleanQuery.putAll((BSONObject)query);
String id = null;
try {
id = (String) chunk.remove("$id");
System.out.println("CHUNK: " + id);
doTransfer(cleanQuery, 0, 0, bAggregate, chunk);
}
catch (Exception e) {
System.out.println("FAILED CHUNK: " + id + " ... " + e.getMessage());
}
}
}//TESTED
//___________________________________________________________________________________________________
// PROCESSING LOOP (new interface)
private Map<String, SourcePojo> _sourceCache = new HashMap<String, SourcePojo>();
private TreeSet<String> _deletedIndex = null;
private void doTransfer(BasicDBObject query, int nSkip, int nLimit, boolean bAggregate, BasicDBObject chunk) throws IOException
{
PropertiesManager pm = new PropertiesManager();
int nMaxContentSize_bytes = pm.getMaxContentSize();
// Initialize the DB:
DBCollection docsDB = DbManager.getDocument().getMetadata();
DBCollection contentDB = DbManager.getDocument().getContent();
DBCollection sourcesDB = DbManager.getIngest().getSource();
ElasticSearchManager.setDefaultClusterName("infinite-aws");
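// (Note: the ES cluster name is hard-coded here; adjust if your cluster is named differently)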
// 1. Get the documents from the DB (combining data + metadata and refreshing source meta)
// (Ignore soft-deleted records:)
if (null == query) {
query = new BasicDBObject();
}
// Optimize communityId into sourceKeys...
if ((null != query.get(DocumentPojo.communityId_)) && (null == query.get(DocumentPojo.sourceKey_)))
{
try {
ObjectId commId = query.getObjectId(DocumentPojo.communityId_);
DBCursor dbc = sourcesDB.find(new BasicDBObject(SourcePojo.communityIds_, commId));
String[] sourceKeys = new String[dbc.count()];
int added = 0;
for (DBObject dbo: dbc) {
sourceKeys[added++] = (String) dbo.get(SourcePojo.key_);
}
query.put(DocumentPojo.sourceKey_, new BasicDBObject(DbManager.in_, sourceKeys));
System.out.println("(Optimized simple community query to " + sourceKeys.length + " source key(s))");
}
catch (Exception e) {
System.out.println("(Can't optimize complex community query)");
}
}
// Ignore (soft-)deleted objects, ie sourceKeys starting with '?':
Object sourceKeyQuery = query.get(DocumentPojo.sourceKey_);
if (null == sourceKeyQuery) {
query.put(DocumentPojo.sourceKey_, Pattern.compile("^[^?]")); // (ie nothing starting with ?)
}//TESTED
else if (sourceKeyQuery instanceof BasicDBObject) {
((BasicDBObject) sourceKeyQuery).append("$regex", "^[^?]");
}//TESTED
//DEBUG
//System.out.println("COMBINED QUERY= " + query.toString());
// If aggregating, kick off the background aggregation thread
if (bAggregate) {
EntityBackgroundAggregationManager.startThread();
AssociationBackgroundAggregationManager.startThread();
}
DBCursor dbc = docsDB.find(query);
if (null != chunk) {
if (chunk.containsField(DbManager.min_)) {
dbc = dbc.addSpecial(DbManager.min_, chunk.get(DbManager.min_));
}
if (chunk.containsField(DbManager.max_)) {
dbc = dbc.addSpecial(DbManager.max_, chunk.get(DbManager.max_));
}
}
dbc = dbc.skip(nSkip).limit(nLimit).batchSize(1000);
if (null == chunk) {
int nCount = dbc.count() - nSkip;
if (nCount < 0) nCount = 0;
System.out.println("Found " + nCount + " records to sync, process first " + (0==nLimit?nCount:nLimit));
if (0 == nCount) { // Nothing to do...
return;
}
}
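// 200KB scratch buffer used below to decompress the stored gzipped full text in chunks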
byte[] storageArray = new byte[200000];
int nSynced = 0;
LinkedList<DocumentPojo> docsToTransfer = new LinkedList<DocumentPojo>();
Map<ObjectId, LinkedList<DocumentPojo>> communityList = null;
ObjectId currCommunityId = null;
while (dbc.hasNext()) {
BasicDBObject dbo = (BasicDBObject)dbc.next();
DocumentPojo doc = DocumentPojo.fromDb(dbo, DocumentPojo.class);
String sDocIndex = doc.getIndex();
if (null == sDocIndex) {
sDocIndex = "document_index";
}
if ((null != _deletedIndex) && !_deletedIndex.contains(sDocIndex)) {
_deletedIndex.add(sDocIndex);
rebuildIndex(sDocIndex);
try { // (Just in case the index requires some time to sort itself out)
Thread.sleep(1000);
} catch (InterruptedException e) {}
}
//Debug:
//System.out.println("Getting content..." + feed.getTitle() + " / " + feed.getUrl());
// Get the content:
if ((0 != nMaxContentSize_bytes) && StoreAndIndexManager.docHasExternalContent(doc.getUrl(), doc.getSourceUrl()))
{
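// Look up the compressed full text by URL, matching either a content record with no sourceKey
// (eg older records) or one stamped with this doc's sourceKey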
BasicDBObject contentQ = new BasicDBObject(CompressedFullTextPojo.url_, doc.getUrl());
contentQ.put(CompressedFullTextPojo.sourceKey_, new BasicDBObject(MongoDbManager.in_, Arrays.asList(null, doc.getSourceKey())));
BasicDBObject fields = new BasicDBObject(CompressedFullTextPojo.gzip_content_, 1);
BasicDBObject dboContent = (BasicDBObject) contentDB.findOne(contentQ, fields);
if (null != dboContent) {
byte[] compressedData = ((byte[])dboContent.get(CompressedFullTextPojo.gzip_content_));
ByteArrayInputStream in = new ByteArrayInputStream(compressedData);
GZIPInputStream gzip = new GZIPInputStream(in);
int nRead = 0;
StringBuffer output = new StringBuffer();
while (nRead >= 0) {
nRead = gzip.read(storageArray, 0, 200000);
if (nRead > 0) {
String s = new String(storageArray, 0, nRead, "UTF-8");
output.append(s);
}
}
gzip.close();
doc.setFullText(output.toString());
}
}
// (else document has full text already)
// Get the source (and its tags):
// Always overwrite the doc's tags - tag changes are one of the reasons we might choose to migrate
// The source is also needed to support source index filtering
SourcePojo src = _sourceCache.get(doc.getSourceKey());
if (null == src) {
//TODO (INF-2265): handle search index settings in pipeline mode... (also didn't seem to work?)
BasicDBObject srcDbo = (BasicDBObject) sourcesDB.findOne(new BasicDBObject(SourcePojo.key_, doc.getSourceKey()));
if (null != srcDbo) {
src = SourcePojo.fromDb(srcDbo, SourcePojo.class);
if (null != src.getProcessingPipeline()) {
try {
// Set the index settings
HarvestController hc = new HarvestController();
HarvestControllerPipeline hcPipe = new HarvestControllerPipeline();
hcPipe.extractSource_preProcessingPipeline(src, hc);
}
catch (Exception e) {
//DEBUG
e.printStackTrace();
}
}//TESTED (by hand)
_sourceCache.put(doc.getSourceKey(), src);
}
}
doc.setTempSource(src); // (needed for source index filtering)
if (null != src) {
if (null != src.getTags()) {
Set<String> tagsTidied = new TreeSet<String>();
for (String s: src.getTags()) {
String ss = s.trim().toLowerCase();
tagsTidied.add(ss);
}
// May also want to write this back to the DB:
//TODO (INF-2223): Handle append tags or not in the pipeline...
if ((null == src.getAppendTagsToDocs()) || src.getAppendTagsToDocs()) {
if ((null == doc.getTags()) || (doc.getTags().size() < tagsTidied.size())) {
BasicDBObject updateQuery = new BasicDBObject(DocumentPojo.sourceKey_, doc.getSourceKey());
updateQuery.put(DocumentPojo._id_, doc.getId());
docsDB.update(updateQuery, new BasicDBObject(DbManager.addToSet_, new BasicDBObject(
DocumentPojo.tags_, new BasicDBObject(DbManager.each_, tagsTidied))));
}
doc.setTags(tagsTidied); // (just copy ptr across)
}
}
}
// 2. Update the index with the new document
// (Optionally also update entity and assoc features)
if (bAggregate) {
if (null == currCommunityId) {
currCommunityId = doc.getCommunityId();
}
else if (!currCommunityId.equals(doc.getCommunityId())) {
LinkedList<DocumentPojo> perCommunityDocList = null;
if (null == communityList) { // (very first time we see > 1 community)
communityList = new TreeMap<ObjectId, LinkedList<DocumentPojo>>();
perCommunityDocList = new LinkedList<DocumentPojo>();
perCommunityDocList.addAll(docsToTransfer); //(NOT including doc, this hasn't been added to docsToTransfer yet)
communityList.put(currCommunityId, perCommunityDocList);
}
currCommunityId = doc.getCommunityId();
perCommunityDocList = communityList.get(currCommunityId);
if (null == perCommunityDocList) {
perCommunityDocList = new LinkedList<DocumentPojo>();
communityList.put(currCommunityId, perCommunityDocList);
}
perCommunityDocList.add(doc);
}
}//TESTED
nSynced++;
docsToTransfer.add(doc);
if (0 == (nSynced % 10000)) {
StoreAndIndexManager manager = new StoreAndIndexManager();
if (bAggregate) {
// Loop over communities and aggregate each one then store the modified entities/assocs
doAggregation(communityList, docsToTransfer);
communityList = null; // (in case the next 10,000 docs are all in the same community!)
currCommunityId = null;
}//TOTEST
manager.addToSearch(docsToTransfer);
docsToTransfer.clear();
System.out.println("(Synced " + nSynced + " records)");
}
} // (End loop over docs)
// Sync remaining docs
if (!docsToTransfer.isEmpty()) {
if (bAggregate) {
// Loop over communities and aggregate each one then store the modified entities/assocs
doAggregation(communityList, docsToTransfer);
}
StoreAndIndexManager manager = new StoreAndIndexManager();
manager.addToSearch(docsToTransfer);
}
if (null != chunk) {
System.out.println("Found " + nSynced + " records to sync in chunk");
}
if (bAggregate) {
System.out.println("Completed. You can hit CTRL+C at any time.");
System.out.println("By default it will keep running for 5 minutes while the background aggregation runs to update the documents' entities.");
try {
Thread.sleep(300000);
} catch (InterruptedException e) {}
// Turn off so we can exit
EntityBackgroundAggregationManager.stopThreadAndWait();
AssociationBackgroundAggregationManager.stopThreadAndWait();
}
}
//___________________________________________________________________________________________________
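// Aggregates entity/association features either over a single doc list (all one community) or per community,
// writes the updated feature entries, applies the aggregated stats back onto the docs, and finally
// persists the modified entities/associations to each doc's DB record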
private void doAggregation(Map<ObjectId, LinkedList<DocumentPojo>> communityList, LinkedList<DocumentPojo> singleList) {
if (null == communityList) { // Just one community - this is the easy case
AggregationManager aggManager = new AggregationManager();
aggManager.doAggregation(singleList, new LinkedList<DocumentPojo>());
aggManager.createOrUpdateFeatureEntries();
aggManager.applyAggregationToDocs(singleList);
aggManager.runScheduledDocumentUpdates();
aggManager.runScheduledSynchronization();
}
else {
for (Map.Entry<ObjectId, LinkedList<DocumentPojo>> entry: communityList.entrySet()) {
AggregationManager aggManager = new AggregationManager();
aggManager.doAggregation(entry.getValue(), new LinkedList<DocumentPojo>());
aggManager.createOrUpdateFeatureEntries();
aggManager.applyAggregationToDocs(entry.getValue());
aggManager.runScheduledDocumentUpdates();
aggManager.runScheduledSynchronization();
}
}//TESTED
// Finally, need to update all the docs (ick)
DocumentPojo dummy = new DocumentPojo();
for (DocumentPojo doc: singleList) {
boolean bEnts = (null != doc.getEntities()) && !doc.getEntities().isEmpty();
boolean bAssocs = (null != doc.getAssociations()) && !doc.getAssociations().isEmpty();
if (bEnts || bAssocs) {
dummy.setEntities(doc.getEntities());
dummy.setAssociations(doc.getAssociations());
DBObject toWrite = dummy.toDb();
BasicDBObject updateQuery = new BasicDBObject(DocumentPojo.sourceKey_, doc.getSourceKey());
updateQuery.put(DocumentPojo._id_, doc.getId());
MongoDbManager.getDocument().getMetadata().update(updateQuery, new BasicDBObject(MongoDbManager.set_, toWrite));
}//TESTED
}// (end loop over docs)
}//TESTED
//___________________________________________________________________________________________________
// Utility function for the above, rebuilds an index
private void rebuildIndex(String indexName) {
if (indexName.startsWith("doc_")) { // Else not eligible...
try {
ObjectId communityId = new ObjectId(indexName.substring(4));
//Note: can't simply rebuild a child community's index here, because doing so would also delete the parent community's index
BasicDBObject query = new BasicDBObject("_id", communityId);
BasicDBObject fields = new BasicDBObject("parentId", 1);
fields.put("name", 1);
CommunityPojo community = CommunityPojo.fromDb(DbManager.getSocial().getCommunity().findOne(query, fields), CommunityPojo.class);
if (null == community) {
System.out.println("WARNING_COMM_EXIST: community " + communityId + " does not exist, this will likely cause problems");
return;
}
if (null != community.getParentId()) {
if (null == community.getParentName()) {
CommunityPojo parentComm = CommunityPojo.fromDb(DbManager.getSocial().getCommunity().findOne(new BasicDBObject("_id", community.getParentId())), CommunityPojo.class);
if (null == parentComm) {
System.out.println("WARNING_COMM_EXIST: community " + community.getParentId() + " does not exist, this will likely cause problems");
}
else {
community.setParentName(parentComm.getName());
}
}
System.out.println("WARNING_CHILD_COMM: " + "commid=" + communityId + ", community" + community.getName() + " has a parent, parent_id=" + community.getParentId() + " (name " + community.getParentName() + "). " +
"This community will not be rebuilt, and you should ensure that it is re-indexed if the parent community is subsequently rebuilt.");
return;
}//TESTED (by hand - works normally on non-child communities, refuses to delete child communities)
GenericProcessingController.recreateCommunityDocIndex_unknownFields(communityId, true);
}
catch (Exception e) { // I guess this wasn't a valid community?!
e.printStackTrace();
}
}
}
//TESTED (by hand, it's a straight call of tested GPC code anyway)
//___________________________________________________________________________________________________
// DELETE DOCUMENTS FROM A QUERY
private void doDelete(BasicDBObject query, int nLimit)
{
try {
// Get the documents to delete
BasicDBObject queryFields = new BasicDBObject(DocumentPojo.sourceKey_, 1);
queryFields.put(DocumentPojo.sourceUrl_, 1);
queryFields.put(DocumentPojo.url_, 1);
queryFields.put(DocumentPojo.communityId_, 1);
queryFields.put(DocumentPojo.index_, 1);
DBCursor cur = DbManager.getDocument().getMetadata().find(query, queryFields).limit(nLimit);
// (this internally works in batches of 1000)
System.out.println("Found " + cur.count() + " records to delete");
if (nLimit > 0) {
System.out.println("(limited to " + nLimit + " records)");
}
List<DocumentPojo> docs = DocumentPojo.listFromDb(cur, DocumentPojo.listType());
// Keep track of number of docs per community getting deleted
Map<ObjectId, Integer> communityMap = new HashMap<ObjectId, Integer>();
Map<String, Integer> sourceKeyMap = new HashMap<String, Integer>();
for (DocumentPojo doc: docs) {
if (null != doc.getSourceKey()) { // (should always be present - null only via error - but check anyway)
ObjectId community = doc.getCommunityId();
Integer count = communityMap.get(community);
communityMap.put(community, (count == null ? 1 : count + 1));
int nSpecialFormat = doc.getSourceKey().indexOf('#');
String sourceKey = doc.getSourceKey();
if (nSpecialFormat > 0) {
sourceKey = sourceKey.substring(0, nSpecialFormat);
}
Integer count2 = sourceKeyMap.get(sourceKey);
sourceKeyMap.put(sourceKey, (count2 == null ? 1 : count2 + 1));
}
}
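// Remove the docs from the data store and search index, update aggregated entity features from the
// deleted docs, then hard-delete the soft-deleted records and update doc-level entities accordingly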
StoreAndIndexManager dataStore = new StoreAndIndexManager();
dataStore.removeFromDatastore_byURL(docs);
AggregationManager.updateEntitiesFromDeletedDocuments(dataStore.getUUID());
dataStore.removeSoftDeletedDocuments();
AggregationManager.updateDocEntitiesFromDeletedDocuments(dataStore.getUUID());
// Actually update the DB counts:
for (Map.Entry<ObjectId, Integer> communityInfo: communityMap.entrySet()) {
System.out.println("Removed " + communityInfo.getValue() + " records from community " + communityInfo.getKey());
DbManager.getDocument().getCounts().update(new BasicDBObject("_id", communityInfo.getKey()),
new BasicDBObject("$inc", new BasicDBObject("doccount", -communityInfo.getValue())));
}
for (Map.Entry<String, Integer> sourceInfo: sourceKeyMap.entrySet()) {
System.out.println("Removed " + sourceInfo.getValue() + " records from source " + sourceInfo.getKey());
DbManager.getIngest().getSource().update(new BasicDBObject("key", sourceInfo.getKey()),
new BasicDBObject("$inc", new BasicDBObject("harvest.doccount", -sourceInfo.getValue())));
}
} catch (Exception e) {
e.printStackTrace();
}
}
//___________________________________________________________________________________________________
//___________________________________________________________________________________________________
//___________________________________________________________________________________________________
//___________________________________________________________________________________________________
// UNIT/FUNCTIONAL/COVERAGE TEST CODE
@SuppressWarnings("unused")
private void doUnitTest(String sMongoDbHost, String sMongoDbPort, String sElasticHost, String sElasticPort,
BasicDBObject query, int nLimit)
{
ElasticSearchManager elasticManager = null;
try {
// Initialize the DB:
DBCollection feedsDB = DbManager.getDocument().getMetadata();
DBCollection contentDB = DbManager.getDocument().getContent();
DBCollection sourcesDB = DbManager.getIngest().getSource();
String indexName = "document_index";
// Test/debug recreate the index
if (true) {
// (delete the index)
System.out.println("Deleting index...");
elasticManager = ElasticSearchManager.getIndex(indexName, sElasticHost + ":" + sElasticPort);
elasticManager.deleteMe();
//(also deletes the child index - same index, different type)
// Create the index if necessary
String sMapping = new Gson().toJson(new DocumentPojoIndexMap.Mapping(), DocumentPojoIndexMap.Mapping.class);
Builder localSettings = ImmutableSettings.settingsBuilder();
localSettings.put("number_of_shards", 10).put("number_of_replicas", 2);
System.out.println("Creating index..." + sMapping);
elasticManager = ElasticSearchManager.createIndex
(indexName, null, false,
sElasticHost + ":" + sElasticPort,
sMapping, localSettings);
}
// Get the index (necessary if already created)
if (null == elasticManager) {
elasticManager = ElasticSearchManager.getIndex(indexName, sElasticHost + ":" + sElasticPort);
}
// Get the feeds from the DB:
//Debug:
// System.out.println("Querying DB...");
DBCursor dbc = feedsDB.find(query).limit(nLimit);
byte[] storageArray = new byte[200000];
while (dbc.hasNext()) {
BasicDBObject dbo = (BasicDBObject)dbc.next();
DocumentPojo doc = DocumentPojo.fromDb(dbo, DocumentPojo.class);
//Debug:
System.out.println("Getting content..." + doc.getTitle() + " / " + doc.getUrl());
// Get the content:
BasicDBObject contentQ = new BasicDBObject(CompressedFullTextPojo.url_, doc.getUrl());
contentQ.put(CompressedFullTextPojo.sourceKey_, new BasicDBObject(MongoDbManager.in_, Arrays.asList(null, doc.getSourceKey())));
BasicDBObject dboContent = (BasicDBObject) contentDB.findOne(contentQ);
if (null != dboContent) {
byte[] compressedData = ((byte[])dboContent.get("gzip_content"));
ByteArrayInputStream in = new ByteArrayInputStream(compressedData);
GZIPInputStream gzip = new GZIPInputStream(in);
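// (Test simplification: only the first read - up to 200KB - of the decompressed content is used here;
//  the production path in doTransfer loops until EOF)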
int nRead = gzip.read(storageArray, 0, 200000);
if (nRead > 0) {
String s = new String(storageArray, 0, nRead, "UTF-8");
doc.setFullText(s);
}
gzip.close();
}
// Get tag:
SourcePojo src = _sourceCache.get(doc.getSourceKey());
if (null == src) {
BasicDBObject srcDbo = (BasicDBObject) sourcesDB.findOne(new BasicDBObject("key", doc.getSourceKey()));
if (null != srcDbo) {
src = new Gson().fromJson(srcDbo.toString(), SourcePojo.class);
_sourceCache.put(doc.getSourceKey(), src);
}
}
if (null != src) {
Set<String> tagsTidied = new TreeSet<String>();
for (String s: src.getTags()) {
String ss = s.trim().toLowerCase();
tagsTidied.add(ss);
}
doc.setTags(tagsTidied);
}
//TEST: set dynamic field
// Lots of testing of dynamic dates:
// feed.addToMetadata("my_dateISO", Date.parse(feed.getCreated().toGMTString()));
// String s1 = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss").format(feed.getCreated());
// feed.addToMetadata("another_dateISO", s1);
// String s1_5 = new SimpleDateFormat().format(feed.getCreated());
// feed.addToMetadata("another_dateTimeJava", s1_5);
// String s2 = new SimpleDateFormat("yyyyMMdd").format(feed.getCreated());
// feed.addToMetadata("another_dateYYYYMMDD", s2);
// String s3 = new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss Z").format(feed.getCreated());
// feed.addToMetadata("another_dateRFC822", s3);
// feed.addToMetadata("another_dateGMT", feed.getCreated().toGMTString());
// // Testing of the string field types
// feed.addToMetadata("my_comment", "Testing this ABCDEFG");
// feed.addToMetadata("my_term", "Testing this UVWXYZ");
// feed.addToMetadata("my_text", "Testing this 123456");
// // Test an array of longs:
// Long tl[] = new Long[4]; tl[0] = 0L; tl[1] = 1L; tl[2] = 2L; tl[3] = 3L;
// feed.addToMetadata("md_long", tl);
//TEST: some dummy event timestamp adding code (not seeing much/any in the data)
// if (null != feed.getEvents()) {
// int i = 0;
// for (EventPojo evt: feed.getEvents()) {
// //1: Add single date
// if (0 == i) {
// evt.time_start = "2011-01-01";
// }
// //2: Add short span
// if (1 == i) {
// evt.time_start = "2010-04-06";
// evt.time_end = "2010-08-09";
// }
// //3: Add cross-yr span
// if (2 == i) {
// evt.time_start = "2012-06-05";
// evt.time_end = "2013-09-05";
// }
// //4: Add too long span
// if (3 == i) {
// evt.time_start = "2012-04-06";
// evt.time_end = "2014-04-09";
// }
// i++;
// }
// }
// For event adding, see data_model.test.TestCode
}
} catch (IOException e) {
e.printStackTrace();
}
finally {
//nothing to do
}
}
}