/*******************************************************************************
* Copyright 2012 The Infinit.e Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
/**
*
*/
package com.ikanow.infinit.e.data_model.store.document;
import java.lang.reflect.Type;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import org.apache.commons.lang.ArrayUtils;
import org.bson.types.ObjectId;
import com.google.gson.GsonBuilder;
import com.google.gson.JsonArray;
import com.google.gson.JsonDeserializationContext;
import com.google.gson.JsonDeserializer;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import com.google.gson.JsonParseException;
import com.google.gson.JsonSerializationContext;
import com.google.gson.JsonSerializer;
import com.google.gson.reflect.TypeToken;
import com.ikanow.infinit.e.data_model.store.BaseDbPojo;
import com.ikanow.infinit.e.data_model.store.MongoDbUtil;
import com.ikanow.infinit.e.data_model.store.config.source.SourcePipelinePojo;
import com.ikanow.infinit.e.data_model.store.config.source.SourcePojo;
import com.mongodb.BasicDBList;
/**
* @author apiggott
* The generic document data model
*/
public class DocumentPojo extends BaseDbPojo {
// Standard static function for readability
@SuppressWarnings("unchecked")
static public TypeToken<List<DocumentPojo>> listType() { return new TypeToken<List<DocumentPojo>>(){}; }
//*** IMPORTANT: don't add to this list without considering the ES mapping in DocumentIndexPojoMap
// Storage (Mongo) data model
// API data model is the same except where otherwise specified (DocumentApiPojoMap converts)
// For index data model see DocumentIndexPojoMap
////////////////////////////////////////////////////////////////////////////////
// Stored Fields:
// Basic metadata
private ObjectId _id = null;
final public static String _id_ = "_id";
// (API-side, this is an immutable id for the doc, DB-side this the DB _id and changes with every update)
private ObjectId updateId = null;
final public static String updateId_ = "updateId";
// (API-side, this is the current DB id, DB-side this is the original _id, or null if this doc has never been updated)
private String title = null;
final public static String title_ = "title";
private String url = null;
final public static String url_ = "url";
private Date created = null;
final public static String created_ = "created";
private Date modified = null;
final public static String modified_ = "modified";
private Date publishedDate = null;
final public static String publishedDate_ = "publishedDate";
// Data source
private String source = null; // (API side is Set<String>)
final public static String source_ = "source";
private String sourceKey = null; // (API side is Set<String>)
final public static String sourceKey_ = "sourceKey";
private String mediaType = null; // (API side is Set<String>)
final public static String mediaType_ = "mediaType";
transient String sourceType = null; //feed, db, or filesys
final public static String sourceType_ = "sourceType";
// Content
private String description = null;
final public static String description_ = "description";
// Enriched content
private List<EntityPojo> entities = null;
final public static String entities_ = "entities";
// (moved metadata to beta because of wholesale changes)
// Data source/Content
private Set<String> tags = null;
final public static String tags_ = "tags";
private String displayUrl = null;
final public static String displayUrl_ = "displayUrl";
// Data source
private ObjectId communityId = null;
final public static String communityId_ = "communityId";
// (note as far as the API is concerned this a Set<String>)
//currently only used for xml files
private String sourceUrl = null;
final public static String sourceUrl_ = "sourceUrl";
// Enriched content
private List<AssociationPojo> associations = null;
final public static String associations_ = "associations";
private LinkedHashMap<String, Object[]> metadata = null; // has to be [] to allow for 1+
final public static String metadata_ = "metadata";
private GeoPojo docGeo = null; // holds the location of the document, if it has one separate to its entities and events
final public static String docGeo_ = "docGeo";
// Mongo/Elasticsearch-specific field
private String index = null; // The name of the index to which the feed's been added
final public static String index_ = "index";
// Only used for query responses
private Object explain = null;
final public static String explain_ = "explain";
/////////////////////////////////////////////////////////////////////////////////////////////////
// The following won't be stored in the DB (either created by index map or transient)
// Alpha unstored (eg index or API fields)
// Content
private String fullText = null;
final public static String fullText_ = "fullText";
// Per query (transient, created on the way to the API for query, not currently stored anywhere)
private Double aggregateSignif; // The document significance normalized against Lucene relevance
final public static String aggregateSignif_ = "aggregateSignif";
private Double queryRelevance; // The Lucene relevance normalized against Infinit.e significance
final public static String queryRelevance_ = "queryRelevance";
private Double score; // The combined scores (vs the query weighting)
final public static String score_ = "score";
// Alpha transient:
private transient String tmpFullText = null; // (temporary storage until obj written to MongoDB)
private transient String rawFullText = null; // (stores a pointer to the first full text set, ie normally directly from URL/file)
// Beta unstored (eg index or API fields)
// Index-specific fields (ElasticSearch):
private Set<String> locs = null;
final public static String locs_ = "locs";
@SuppressWarnings("unused")
private List<GeoPojo> timeRanges = null; // (won't be used for beta - allow encapsulation of time ranges as 2d points)
final public static String timeRanges_ = "timeRanges";
private Set<Integer> months = null; // (dates represented as YYYYMM - used to generate histograms, nothing else)
final public static String months_ = "months";
// Beta transient:
private transient SourcePojo _source = null; // (handy accessor for the "parent" source info)
//header & Footer Data (doesn't persist in the DB - used for extraction and enrichment)
private transient int headerEndIndex = 0; // (obv starts at 0)
private transient int footerStartIndex = Integer.MAX_VALUE; // (obv ends at the end of the document)
private transient Set<String> headerFields = null;
private transient Set<String> footerFields = null;
private transient String headerText = null; // (\n-separated list of headerFields)
private transient String footerText = null; // (\n-separated list of headerFields)
// V0 transient
// multi-community/source handling
private transient String duplicateFrom = null; // Indicates this document should be cloned from the DB entry with matching URL, "duplicateFrom" source
private transient DocumentPojo cloneFrom = null; // Indicate this document should be cloned from the "cloneFrom" in memory copy after enrichment
private transient SourcePipelinePojo spawnedFrom = null; // Indicates this document was spawned from a "document splitter" (so should ignore previous pipeline elements)
private transient boolean hasDefaultUrl = false; // (for files only) if true then can skip an extra dedup step
////////////////////////////////////////////////////////////////////////////////
// Alpha gets and sets
public DocumentPojo()
{
}
public ObjectId getId() {
return _id;
}
public void setId(ObjectId _id) {
this._id = _id;
}
public String getTitle() {
return title;
}
public void setTitle(String title) {
this.title = title;
}
public String getDescription() {
return description;
}
public void setDescription(String description) {
this.description = description;
}
public String getUrl() {
return url;
}
public void setUrl(String url) {
this.url = url;
}
/**
* @param created the created to set
*/
public Date getCreated() {
return this.created;
}
public void setCreated(Date created) {
this.created = created;
}
public Date getModified() {
return this.modified;
}
public void setModified(Date modified) {
this.modified = modified;
}
public Date getPublishedDate() {
return this.publishedDate;
}
public void setPublishedDate(Date publishedDate) {
this.publishedDate = publishedDate;
}
public String getSource() {
return source;
}
public void setSource(String source) {
this.source = source;
}
public String getSourceKey() {
return sourceKey;
}
public void setSourceKey(String sourceKey) {
this.sourceKey = sourceKey;
}
public void setEntities(List<EntityPojo> entities) {
this.entities = entities;
}
public List<EntityPojo> getEntities() {
return entities;
}
public String getMediaType() {
return mediaType;
}
public void setMediaType(String mediaType) {
this.mediaType = mediaType;
}
public String getFullText() {
return (null == fullText)?tmpFullText:fullText;
}
public void setFullText(String fullText) {
if (null == this.rawFullText) { // very first time, set the raw full text
rawFullText = fullText;
}
this.tmpFullText = fullText;
}
public void makeFullTextNonTransient() {
this.fullText = this.tmpFullText;
}
// This is used for convenience, also used as a hacky flag to spot update documents
// that have been discarded from the update list.
public SourcePojo getTempSource() { return _source; }
public void setTempSource(SourcePojo tempSource) { _source = tempSource; }
////////////////////////////////////////////////////////////////////////////////
// Alpha utility
////////////////////////////////////////////////////////////////////////////////
// Beta gets and sets
public void setAssociations(List<AssociationPojo> events)
{
this.associations = events;
}
public List<AssociationPojo> getAssociations()
{
return this.associations;
}
public void addToMetadata(String fieldName, Object fieldVal) {
if (null == metadata) {
metadata = new LinkedHashMap<String, Object[]>();
}
Object obj[] = new Object[1]; obj[0] = fieldVal;
Object[] current = metadata.get(fieldName);
if (null != current) {
metadata.put(fieldName, ArrayUtils.add(current, obj));
}
else {
metadata.put(fieldName, obj);
}
}
public void addToMetadata(String fieldName, Object[] fieldVals) {
if (null == metadata) {
metadata = new LinkedHashMap<String, Object[]>();
}
Object[] current = metadata.get(fieldName);
if (null != current) {
metadata.put(fieldName, ArrayUtils.addAll(current, fieldVals));
}
else {
metadata.put(fieldName, fieldVals);
}
}
public void setMetadata(LinkedHashMap<String, Object[]> metadata)
{
this.metadata = metadata;
}
public LinkedHashMap<String, Object[]> getMetadata()
{
return this.metadata;
}
public LinkedHashMap<String, Object[]> getMetaData() {
return metadata;
}
public Set<String> getTags() {
return tags;
}
public void setTags(Set<String> tags_) {
tags = tags_;
}
public void addTags(Set<String> tags_) {
tags.addAll(tags_);
}
public void setCommunityId(ObjectId communityId) {
this.communityId = communityId;
}
public ObjectId getCommunityId() {
return this.communityId;
}
public GeoPojo getDocGeo() {
return docGeo;
}
public void setDocGeo(GeoPojo docGeo) {
this.docGeo = GeoPojo.cleanseBadGeotag(docGeo);
}
/**
* @param locs the locs to set
*/
public void setLocs(Set<String> locs) {
this.locs = locs;
}
/**
* @return the locs
*/
public Set<String> getLocs() {
return locs;
}
/**
* @param months the months to set
*/
public void setMonths(Set<Integer> months) {
this.months = months;
}
/**
* @return the months
*/
public Set<Integer> getMonths() {
return months;
}
/**
* @param sourceUrl the sourceUrl to set
*/
public void setSourceUrl(String sourceUrl) {
this.sourceUrl = sourceUrl;
}
/**
* @return the sourceUrl
*/
public String getSourceUrl() {
return sourceUrl;
}
/**
* @return the index
*/
public String getIndex() {
return index;
}
/**
* @param index the index to set
*/
public void setIndex(String index) {
this.index = index;
}
////////////////////////////////////////////////////////////////////////////////
// Beta utility
// Add the metadata as separate lines to perform extraction on them
public String metaDataToText() {
StringBuffer sb = new StringBuffer();
for ( Object md : metadata.values())
{
sb.append(md).append('\n');
}
return sb.toString();
}//TOTEST - to be done during DB integration
////////////////////////////////////////////////////////////////////////////////
//(Still beta) Header Footer Stuff ... can be used by entity extractors
/**
* @return the headerStartIndex
*/
@Deprecated
public int getHeaderEndIndex() {
return headerEndIndex;
}
/**
* @param headerStartIndex the headerStartIndex to set
*/
@Deprecated
public void setHeaderEndIndex(int headerEndIndex) {
this.headerEndIndex = headerEndIndex;
}
/**
* @return the footerStartIndex
*/
@Deprecated
public int getFooterStartIndex() {
return footerStartIndex;
}
/**
* @param footerEndIndex the footerEndIndex to set
*/
@Deprecated
public void setFooterStartIndex(int footerStartIndex) {
this.footerStartIndex = footerStartIndex;
}
@Deprecated
public void addToHeader(String sHeaderField) {
if (headerFields == null)
headerFields = new HashSet<String>();
headerFields.add(sHeaderField.toLowerCase());
}
@Deprecated
public void addToFooter(String sFooterField) {
if (footerFields == null)
footerFields = new HashSet<String>();
footerFields.add(sFooterField.toLowerCase());
}
@Deprecated
public Set<String> getHeaderFields() {
return headerFields;
}
@Deprecated
public Set<String> getFooterFields() {
return footerFields;
}
@Deprecated
public String getHeader() {
if (null == headerFields) {
return "";
}
return headerText;
}
@Deprecated
public String getFooter() {
if (null == footerFields) {
return "";
}
return footerText;
}
@Deprecated
public String getBody() {
if (null == getFullText())
{
return null;
}
else{
if (footerStartIndex == Integer.MAX_VALUE && headerEndIndex == 0 )
{
return getFullText();
}
else if (footerStartIndex > getFullText().length()) {
return getFullText().substring(headerEndIndex);
}
else {
return getFullText().substring(headerEndIndex, footerStartIndex);
}
}
}
////////////////////////////////////////////////////////////////////////////////
// V0 gets and sets
public void setDuplicateFrom(String sourceKey) {
duplicateFrom = sourceKey;
}
public String getDuplicateFrom() {
return duplicateFrom;
}
public void setCloneFrom(DocumentPojo masterClone) {
cloneFrom = masterClone;
}
public DocumentPojo getCloneFrom() {
return cloneFrom;
}
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// Base overrides:
public GsonBuilder extendBuilder(GsonBuilder gp) {
return gp.registerTypeAdapter(DocumentPojo.class, new DocumentPojoDeserializer()).
registerTypeAdapter(DocumentPojo.class, new DocumentPojoSerializer());
}
protected static class DocumentPojoSerializer implements JsonSerializer<DocumentPojo>
{
@Override
public JsonElement serialize(DocumentPojo doc, Type typeOfT, JsonSerializationContext context)
{
// GSON transformation:
JsonElement je = DocumentPojo.getDefaultBuilder().create().toJsonTree(doc, typeOfT);
// Convert object names in metadata
if ((null != doc.getMetadata()) && !doc.getMetadata().isEmpty()) {
if (je.isJsonObject()) {
JsonElement metadata = je.getAsJsonObject().get("metadata");
if (null != metadata) {
enforceTypeNamingPolicy(metadata, 0);
}
}
}
return je;
}
//////////////////////////////////////////////////////////////////////////////////////////
// Utility function for encoding "."s and "%"s (also duplicate in index)
private static boolean enforceTypeNamingPolicy(JsonElement je, int nDepth) {
if (je.isJsonPrimitive()) {
return false; // Done
}
else if (je.isJsonArray()) {
JsonArray ja = je.getAsJsonArray();
if (0 == ja.size()) {
return false; // No idea, carry on
}
JsonElement jaje = ja.get(0);
return enforceTypeNamingPolicy(jaje, nDepth + 1); // keep going until you find primitive/object
}
else if (je.isJsonObject()) {
JsonObject jo = je.getAsJsonObject();
// Nested variables:
Iterator<Entry<String, JsonElement>> it = jo.entrySet().iterator();
Map<String, JsonElement> toFixList = null;
while (it.hasNext()) {
boolean bFix = false;
Entry<String, JsonElement> el = it.next();
String currKey = el.getKey();
if ((currKey.indexOf('.') >= 0) || (currKey.indexOf('%') >= 0)) {
it.remove();
currKey = currKey.replace("%", "%25").replace(".", "%2e");
bFix = true;
}
if (null == el.getValue()) {
if (!bFix) it.remove(); // nice easy case, just get rid of it (if bFix, it's already removed)
bFix = false;
}
else {
enforceTypeNamingPolicy(el.getValue(), nDepth + 1);
}
if (bFix) {
if (null == toFixList) {
toFixList = new HashMap<String, JsonElement>();
}
toFixList.put(currKey, el.getValue());
}
} // (end loop over params)
if (null != toFixList) {
for (Entry<String, JsonElement> el: toFixList.entrySet()) {
jo.add(el.getKey(), el.getValue());
}
}
return true; // (in any case, I get renamed by calling parent)
}
return false;
}
//TESTED (see DOC_META in test/TestCode)
}
protected static class DocumentPojoDeserializer implements JsonDeserializer<DocumentPojo>
{
@Override
public DocumentPojo deserialize(JsonElement json, Type typeOfT, JsonDeserializationContext context) throws JsonParseException
{
JsonObject metadata = json.getAsJsonObject().getAsJsonObject("metadata");
if (null != metadata) {
json.getAsJsonObject().remove("metadata");
}
DocumentPojo doc = BaseDbPojo.getDefaultBuilder().create().fromJson(json, DocumentPojo.class);
if (null != metadata) {
for (Entry<String, JsonElement> entry: metadata.entrySet()) {
if (entry.getValue().isJsonArray()) {
doc.addToMetadata(entry.getKey(), MongoDbUtil.encodeArray(entry.getValue().getAsJsonArray()).toArray());
}
else {
BasicDBList dbl = new BasicDBList();
dbl.add(MongoDbUtil.encodeUnknown(entry.getValue()));
doc.addToMetadata(entry.getKey(), dbl);
}
}//TESTED
}
return doc;
}
}
////////////////////////////////////////////////////////////////////////////////
// Per query (transient, created on the way to the API for query, not currently stored anywhere)
public Double getAggregateSignif() {
return aggregateSignif;
}
public void setAggregateSignif(Double aggregateSignif) {
this.aggregateSignif = aggregateSignif;
}
public Double getQueryRelevance() {
return queryRelevance;
}
public void setQueryRelevance(Double queryRelevance) {
this.queryRelevance = queryRelevance;
}
public Double getScore() {
return score;
}
public void setScore(Double score) {
this.score = score;
}
public void setUpdateId(ObjectId updateId) {
this.updateId = updateId;
}
public ObjectId getUpdateId() {
return updateId;
}
public void setDisplayUrl(String displayUrl) {
this.displayUrl = displayUrl;
}
public String getDisplayUrl() {
return displayUrl;
}
public void setExplain(Object explain) {
this.explain = explain;
}
public Object getExplain() {
return explain;
}
public void resetRawFullText() {
this.rawFullText = null;
}
public String getRawFullText() {
return rawFullText;
}
public SourcePipelinePojo getSpawnedFrom() {
return spawnedFrom;
}
public void setSpawnedFrom(SourcePipelinePojo spawnedFrom) {
this.spawnedFrom = spawnedFrom;
}
public boolean getHasDefaultUrl() {
return hasDefaultUrl;
}
public void setHasDefaultUrl(boolean hasDefaultUrl) {
this.hasDefaultUrl = hasDefaultUrl;
}
}