// Documents gathered during this pass
// (NOTE(review): the append to this list is not visible in this section - presumably it happens later in the loop body)
LinkedList<DocumentPojo> docsToTransfer = new LinkedList<DocumentPojo>();
// Per-community document lists: stays null until aggregation is on and a second community id is encountered
Map<ObjectId, LinkedList<DocumentPojo>> communityList = null;
// Community id of the document most recently seen (only maintained when bAggregate is set)
ObjectId currCommunityId = null;
// Walk every record available from dbc
while (dbc.hasNext()) {
// Deserialize the next DB record into a document object
BasicDBObject dbo = (BasicDBObject)dbc.next();
DocumentPojo doc = DocumentPojo.fromDb(dbo, DocumentPojo.class);
// Documents that carry no index name fall back to "document_index"
String sDocIndex = doc.getIndex();
if (null == sDocIndex) {
    sDocIndex = "document_index";
}
// When index tracking is enabled (non-null _deletedIndex), rebuild each index exactly once:
// the first time a document belonging to it is encountered
if ((null != _deletedIndex) && !_deletedIndex.contains(sDocIndex)) {
    _deletedIndex.add(sDocIndex);
    rebuildIndex(sDocIndex);
    try { // (Just in case the index requires some time to sort itself out)
        Thread.sleep(1000);
    } catch (InterruptedException e) {
        // The pause is best-effort, so carry on - but restore the interrupt status
        // so that callers further up the stack can still observe the interruption
        Thread.currentThread().interrupt();
    }
}
//Debug:
//System.out.println("Getting content..." + feed.getTitle() + " / " + feed.getUrl());
// Get the content:
// (only attempted when content retrieval is enabled, ie a non-zero max content size,
//  and docHasExternalContent reports that the text for this url/source url is stored separately)
if ((0 != nMaxContentSize_bytes) && StoreAndIndexManager.docHasExternalContent(doc.getUrl(), doc.getSourceUrl()))
{
    // Match on URL, with the stored source key either null or equal to this document's source key
    BasicDBObject contentQ = new BasicDBObject(CompressedFullTextPojo.url_, doc.getUrl());
    contentQ.put(CompressedFullTextPojo.sourceKey_, new BasicDBObject(MongoDbManager.in_, Arrays.asList(null, doc.getSourceKey())));
    // Only the compressed payload is needed
    BasicDBObject fields = new BasicDBObject(CompressedFullTextPojo.gzip_content_, 1);
    BasicDBObject dboContent = (BasicDBObject) contentDB.findOne(contentQ, fields);
    if (null != dboContent) {
        byte[] compressedData = ((byte[])dboContent.get(CompressedFullTextPojo.gzip_content_));
        if (null != compressedData) { // (record may exist without a payload - leave the doc's text alone)
            GZIPInputStream gzip = new GZIPInputStream(new ByteArrayInputStream(compressedData));
            try {
                // Collect the raw decompressed bytes first and decode them in one go: decoding each
                // chunk separately corrupts any multi-byte UTF-8 character that straddles two reads
                java.io.ByteArrayOutputStream rawText = new java.io.ByteArrayOutputStream();
                int nRead = 0;
                while (nRead >= 0) {
                    nRead = gzip.read(storageArray, 0, 200000);
                    if (nRead > 0) {
                        rawText.write(storageArray, 0, nRead);
                    }
                }
                doc.setFullText(rawText.toString("UTF-8"));
            }
            finally {
                // Release the inflater promptly, whether or not decompression succeeded
                gzip.close();
            }
        }
    }
}
// (else document has full text already)
// Resolve the document's source, used below to refresh the document's tags:
// (tags are always overwritten - one of the reasons we might choose to migrate;
//  the source may also be needed to support source index filtering)
SourcePojo src = _sourceCache.get(doc.getSourceKey());
if (null == src) {
    // Cache miss: look the source up by key in the sources collection
    //TODO (INF-2265): handle search index settings in pipeline mode... (also didn't seem to work?)
    BasicDBObject sourceQuery = new BasicDBObject(SourcePojo.key_, doc.getSourceKey());
    BasicDBObject sourceRecord = (BasicDBObject) sourcesDB.findOne(sourceQuery);
    if (null != sourceRecord) {
        src = SourcePojo.fromDb(sourceRecord, SourcePojo.class);
        if (null != src.getProcessingPipeline()) {
            // Pipeline-mode source: run the pre-processing step so the index settings get applied
            try {
                HarvestController harvester = new HarvestController();
                new HarvestControllerPipeline().extractSource_preProcessingPipeline(src, harvester);
            }
            catch (Exception pipelineError) {
                //DEBUG (failure here is not fatal, the source is still cached and used)
                pipelineError.printStackTrace();
            }
        }//TESTED (by hand)
        // Remember the source so later documents with the same key skip the DB lookup
        _sourceCache.put(doc.getSourceKey(), src);
    }
}
// (needed for source index filtering - note this is null if the source could not be found)
doc.setTempSource(src);
if (null != src) {
if (null != src.getTags()) {
Set<String> tagsTidied = new TreeSet<String>();
for (String s: src.getTags()) {
String ss = s.trim().toLowerCase();
tagsTidied.add(ss);
}
// May also want to write this back to the DB:
//TODO (INF-2223): Handle append tags or not in the pipeline...
if ((null == src.getAppendTagsToDocs()) || src.getAppendTagsToDocs()) {
if ((null == doc.getTags()) || (doc.getTags().size() < tagsTidied.size())) {
BasicDBObject updateQuery = new BasicDBObject(DocumentPojo.sourceKey_, doc.getSourceKey());
updateQuery.put(DocumentPojo._id_, doc.getId());
docsDB.update(updateQuery, new BasicDBObject(DbManager.addToSet_, new BasicDBObject(
DocumentPojo.tags_, new BasicDBObject(DbManager.each_, tagsTidied))));
}
doc.setTags(tagsTidied); // (just copy ptr across)
}
}
}
// 2. Update the index with the new document
// (Optionally also update entity and assoc features)
// When aggregating, keep track of which community each document belongs to
if (bAggregate) {
// First document of the pass: just remember its community
if (null == currCommunityId) {
currCommunityId = doc.getCommunityId();
}
// This document's community differs from the previous document's
else if (!currCommunityId.equals(doc.getCommunityId())) {
LinkedList<DocumentPojo> perCommunityDocList = null;
if (null == communityList) { // (very first time we see > 1 community)
// Create the map lazily, and file everything collected so far under the previous community id
communityList = new TreeMap<ObjectId, LinkedList<DocumentPojo>>();
perCommunityDocList = new LinkedList<DocumentPojo>();
perCommunityDocList.addAll(docsToTransfer); //(NOT including doc, this hasn't been added to docsToTransfer yet)
communityList.put(currCommunityId, perCommunityDocList);
}
// Switch to the new community and fetch its list, creating it on first sight
currCommunityId = doc.getCommunityId();
perCommunityDocList = communityList.get(currCommunityId);
if (null == perCommunityDocList) {
perCommunityDocList = new LinkedList<DocumentPojo>();
communityList.put(currCommunityId, perCommunityDocList);
}