Package com.ikanow.infinit.e.data_model.store.document

Source Code of com.ikanow.infinit.e.data_model.store.document.DocumentPojo

/*******************************************************************************
* Copyright 2012 The Infinit.e Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*   http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
/**
*
*/
package com.ikanow.infinit.e.data_model.store.document;

import java.lang.reflect.Type;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import org.apache.commons.lang.ArrayUtils;
import org.bson.types.ObjectId;

import com.google.gson.GsonBuilder;
import com.google.gson.JsonArray;
import com.google.gson.JsonDeserializationContext;
import com.google.gson.JsonDeserializer;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import com.google.gson.JsonParseException;
import com.google.gson.JsonSerializationContext;
import com.google.gson.JsonSerializer;
import com.google.gson.reflect.TypeToken;
import com.ikanow.infinit.e.data_model.store.BaseDbPojo;
import com.ikanow.infinit.e.data_model.store.MongoDbUtil;
import com.ikanow.infinit.e.data_model.store.config.source.SourcePipelinePojo;
import com.ikanow.infinit.e.data_model.store.config.source.SourcePojo;
import com.mongodb.BasicDBList;

/**
* @author apiggott
* The generic document data model
*/
public class DocumentPojo extends BaseDbPojo {
  // Standard static function for readability
  @SuppressWarnings("unchecked")
  static public TypeToken<List<DocumentPojo>> listType() { return new TypeToken<List<DocumentPojo>>(){}; }

  //*** IMPORTANT: don't add to this list without considering the ES mapping in DocumentIndexPojoMap

  // Storage (Mongo) data model
  // API data model is the same except where otherwise specified (DocumentApiPojoMap converts)
  // For index data model see DocumentIndexPojoMap

  //////////////////////////////////////////////////////////////////////////////// 

  // Stored Fields:

  // Basic metadata
  private ObjectId _id = null;
  final public static String _id_ = "_id";
    // (API-side, this is an immutable id for the doc, DB-side this the DB _id and changes with every update)
  private ObjectId updateId = null;
  final public static String updateId_ = "updateId";
    // (API-side, this is the current DB id, DB-side this is the original _id, or null if this doc has never been updated)
  private String title = null;
  final public static String title_ = "title";
  private String url = null;
  final public static String url_ = "url";
  private Date created = null;
  final public static String created_ = "created";
  private Date modified = null;
  final public static String modified_ = "modified";
  private Date publishedDate = null;
  final public static String publishedDate_ = "publishedDate";

  // Data source
  private String source = null; // (API side is Set<String>)
  final public static String source_ = "source";
  private String sourceKey = null; // (API side is Set<String>)
  final public static String sourceKey_ = "sourceKey";
  private String mediaType = null; // (API side is Set<String>)
  final public static String mediaType_ = "mediaType";
  transient String sourceType = null; //feed, db, or filesys 
  final public static String sourceType_ = "sourceType";

  // Content
  private String description = null;
  final public static String description_ = "description";
  // Enriched content
  private List<EntityPojo> entities = null;
  final public static String entities_ = "entities";
  // (moved metadata to beta because of wholesale changes)
 
  // Data source/Content
  private Set<String> tags = null;
  final public static String tags_ = "tags";
  private String displayUrl = null;
  final public static String displayUrl_ = "displayUrl";

  // Data source
  private ObjectId communityId = null;
  final public static String communityId_ = "communityId";
    // (note as far as the API is concerned this a Set<String>)

  //currently only used for xml files
  private String sourceUrl = null;
  final public static String sourceUrl_ = "sourceUrl";

  // Enriched content
  private List<AssociationPojo> associations = null;
  final public static String associations_ = "associations";
  private LinkedHashMap<String, Object[]> metadata = null; // has to be [] to allow for 1+ 
  final public static String metadata_ = "metadata";
  private GeoPojo docGeo = null; // holds the location of the document, if it has one separate to its entities and events
  final public static String docGeo_ = "docGeo";

  // Mongo/Elasticsearch-specific field
  private String index = null; // The name of the index to which the feed's been added
  final public static String index_ = "index";

  // Only used for query responses
  private Object explain = null;
  final public static String explain_ = "explain";
 
///////////////////////////////////////////////////////////////////////////////////////////////// 
 
// The following won't be stored in the DB (either created by index map or transient)
 
  // Alpha unstored (eg index or API fields)
 
  // Content
  private String fullText = null
  final public static String fullText_ = "fullText";

  // Per query (transient, created on the way to the API for query, not currently stored anywhere)
 
  private Double aggregateSignif; // The document significance normalized against Lucene relevance
  final public static String aggregateSignif_ = "aggregateSignif";
  private Double queryRelevance; // The Lucene relevance normalized against Infinit.e significance
  final public static String queryRelevance_ = "queryRelevance";
  private Double score; // The combined scores (vs the query weighting) 
  final public static String score_ = "score";
 
  // Alpha transient:
 
  private transient String tmpFullText = null; // (temporary storage until obj written to MongoDB)
  private transient String rawFullText = null; // (stores a pointer to the first full text set, ie normally directly from URL/file)

  // Beta unstored (eg index or API fields)
 
  // Index-specific fields (ElasticSearch):
  private Set<String> locs = null;
  final public static String locs_ = "locs";

  @SuppressWarnings("unused")
  private List<GeoPojo> timeRanges = null; // (won't be used for beta - allow encapsulation of time ranges as 2d points)
  final public static String timeRanges_ = "timeRanges";
  private Set<Integer> months = null; // (dates represented as YYYYMM - used to generate histograms, nothing else)
  final public static String months_ = "months";

  // Beta transient:
 
  private transient SourcePojo _source = null; // (handy accessor for the "parent" source info)

  //header & Footer Data (doesn't persist in the DB - used for extraction and enrichment)
  private transient int headerEndIndex = 0; // (obv starts at 0)
  private transient int footerStartIndex = Integer.MAX_VALUE; // (obv ends at the end of the document)
  private transient Set<String> headerFields = null;
  private transient Set<String> footerFields = null;
  private transient String headerText = null; // (\n-separated list of headerFields)
  private transient String footerText = null; // (\n-separated list of headerFields)

  // V0 transient
 
  // multi-community/source handling
  private transient String duplicateFrom = null; // Indicates this document should be cloned from the DB entry with matching URL, "duplicateFrom" source
  private transient DocumentPojo cloneFrom = null; // Indicate this document should be cloned from the "cloneFrom" in memory copy after enrichment
  private transient SourcePipelinePojo spawnedFrom = null; // Indicates this document was spawned from a "document splitter" (so should ignore previous pipeline elements)
  private transient boolean hasDefaultUrl = false; // (for files only) if true then can skip an extra dedup step 
 
  ////////////////////////////////////////////////////////////////////////////////

  // Alpha gets and sets 

  public DocumentPojo()
  {
  }

  public ObjectId getId() {
    return _id;
  }
  public void setId(ObjectId _id) {
    this._id = _id;
  }
  public String getTitle() {
    return title;
  }
  public void setTitle(String title) {
    this.title = title;
  }
  public String getDescription() {
    return description;
  }
  public void setDescription(String description) {
    this.description = description;
  }
  public String getUrl() {
    return url;
  }
  public void setUrl(String url) {
    this.url = url;
  }
  /**
   * @param created the created to set
   */
  public Date getCreated() {
    return this.created;
  }
  public void setCreated(Date created) {
    this.created = created;
  }
  public Date getModified() {
    return this.modified;
  }
  public void setModified(Date modified) {
    this.modified = modified;
  }
  public Date getPublishedDate() {
    return this.publishedDate;
  }
  public void setPublishedDate(Date publishedDate) {
    this.publishedDate = publishedDate;
  }
  public String getSource() {
    return source;
  }
  public void setSource(String source) {
    this.source = source;
  }
  public String getSourceKey() {
    return sourceKey;
  }
  public void setSourceKey(String sourceKey) {
    this.sourceKey = sourceKey;
  }

  public void setEntities(List<EntityPojo> entities) {
    this.entities = entities;
  }
  public List<EntityPojo> getEntities() {
    return entities;
  }

  public String getMediaType() {
    return mediaType;
  }
  public void setMediaType(String mediaType) {
    this.mediaType = mediaType;
  }
  public String getFullText() {
    return (null == fullText)?tmpFullText:fullText;
  }
  public void setFullText(String fullText) {
    if (null == this.rawFullText) { // very first time, set the raw full text
      rawFullText = fullText;
    }
    this.tmpFullText = fullText;
  }
  public void makeFullTextNonTransient() {
    this.fullText = this.tmpFullText;
  }

  // This is used for convenience, also used as a hacky flag to spot update documents
  // that have been discarded from the update list.
  public SourcePojo getTempSource() { return _source; }
  public void setTempSource(SourcePojo tempSource) { _source = tempSource; }

  ////////////////////////////////////////////////////////////////////////////////

  // Alpha utility 

  ////////////////////////////////////////////////////////////////////////////////

  // Beta gets and sets

  public void setAssociations(List<AssociationPojo> events)
  {
    this.associations = events;
  }

  public List<AssociationPojo> getAssociations()
  {
    return this.associations;
  }
  public void addToMetadata(String fieldName, Object fieldVal) {
    if (null == metadata) {
      metadata = new LinkedHashMap<String, Object[]>();
    }
    Object obj[] = new Object[1]; obj[0] = fieldVal;
    Object[] current = metadata.get(fieldName);
    if (null != current) {
      metadata.put(fieldName, ArrayUtils.add(current, obj));
    }
    else {
      metadata.put(fieldName, obj);
    }
  }
  public void addToMetadata(String fieldName, Object[] fieldVals) {
    if (null == metadata) {
      metadata = new LinkedHashMap<String, Object[]>();
    }
    Object[] current = metadata.get(fieldName);
    if (null != current) {
      metadata.put(fieldName, ArrayUtils.addAll(current, fieldVals));
    }
    else {
      metadata.put(fieldName, fieldVals);
    }
  }

  public void setMetadata(LinkedHashMap<String, Object[]> metadata)
  {
    this.metadata = metadata;
  }

  public LinkedHashMap<String, Object[]> getMetadata()
  {
    return this.metadata;
  }

  public LinkedHashMap<String, Object[]> getMetaData() {
    return metadata;
  }

  public Set<String> getTags() {
    return tags;
  }
  public void setTags(Set<String> tags_) {
    tags = tags_;
  }
  public void addTags(Set<String> tags_) {
    tags.addAll(tags_);
  }
  public void setCommunityId(ObjectId communityId) {
    this.communityId = communityId;
  }
  public ObjectId getCommunityId() {
    return this.communityId;
  }

  public GeoPojo getDocGeo() {
    return docGeo;
  }

  public void setDocGeo(GeoPojo docGeo) {
    this.docGeo = GeoPojo.cleanseBadGeotag(docGeo);
  }

  /**
   * @param locs the locs to set
   */
  public void setLocs(Set<String> locs) {
    this.locs = locs;
  }

  /**
   * @return the locs
   */
  public Set<String> getLocs() {
    return locs;
  }

  /**
   * @param months the months to set
   */
  public void setMonths(Set<Integer> months) {
    this.months = months;
  }

  /**
   * @return the months
   */
  public Set<Integer> getMonths() {
    return months;
  }

  /**
   * @param sourceUrl the sourceUrl to set
   */
  public void setSourceUrl(String sourceUrl) {
    this.sourceUrl = sourceUrl;
  }

  /**
   * @return the sourceUrl
   */
  public String getSourceUrl() {
    return sourceUrl;
  }

  /**
   * @return the index
   */
  public String getIndex() {
    return index;
  }

  /**
   * @param index the index to set
   */
  public void setIndex(String index) {
    this.index = index;
  }

  ////////////////////////////////////////////////////////////////////////////////

  // Beta utility

  // Add the metadata as separate lines to perform extraction on them

  public String metaDataToText() {
    StringBuffer sb = new StringBuffer();
    for ( Object md : metadata.values())
    {
      sb.append(md).append('\n');
    }
    return sb.toString();
  }//TOTEST - to be done during DB integration

  ////////////////////////////////////////////////////////////////////////////////

  //(Still beta) Header Footer Stuff ... can be used by entity extractors

  /**
   * @return the headerStartIndex
   */
  @Deprecated
  public int getHeaderEndIndex() {
    return headerEndIndex;
  }

  /**
   * @param headerStartIndex the headerStartIndex to set
   */
  @Deprecated
  public void setHeaderEndIndex(int headerEndIndex) {
    this.headerEndIndex = headerEndIndex;
  }

  /**
   * @return the footerStartIndex
   */
  @Deprecated
  public int getFooterStartIndex() {
    return footerStartIndex;
  }

  /**
   * @param footerEndIndex the footerEndIndex to set
   */
  @Deprecated
  public void setFooterStartIndex(int footerStartIndex) {
    this.footerStartIndex = footerStartIndex;
  }

  @Deprecated
  public void addToHeader(String sHeaderField) {
    if (headerFields == null)
      headerFields = new HashSet<String>();
    headerFields.add(sHeaderField.toLowerCase());
  }
  @Deprecated
  public void addToFooter(String sFooterField) {
    if (footerFields == null)
      footerFields = new HashSet<String>();
    footerFields.add(sFooterField.toLowerCase());
  }
  @Deprecated
  public Set<String> getHeaderFields() {
    return headerFields;
  }
  @Deprecated
  public Set<String> getFooterFields() {
    return footerFields;
  }
  @Deprecated
  public String getHeader() {
    if (null == headerFields) {
      return "";
    }
    return headerText;
  }
  @Deprecated
  public String getFooter() {
    if (null == footerFields) {
      return "";
    }
    return footerText;
  }
  @Deprecated
  public String getBody() {
    if (null == getFullText())
    {
      return null;
    }
    else{
      if (footerStartIndex == Integer.MAX_VALUE && headerEndIndex == 0 )
      {
        return getFullText();
      }
      else if (footerStartIndex > getFullText().length()) {
        return getFullText().substring(headerEndIndex);
      }
      else {
        return getFullText().substring(headerEndIndex, footerStartIndex);
      }
    }
  }

  ////////////////////////////////////////////////////////////////////////////////

  // V0 gets and sets
 
  public void setDuplicateFrom(String sourceKey) {
    duplicateFrom = sourceKey;
  }
  public String getDuplicateFrom() {
    return duplicateFrom;
  }
  public void setCloneFrom(DocumentPojo masterClone) {
    cloneFrom = masterClone;
  }
  public DocumentPojo getCloneFrom() {
    return cloneFrom;
  }
 
  ////////////////////////////////////////////////////////////////////////////////
  ////////////////////////////////////////////////////////////////////////////////

  // Base overrides:

  public GsonBuilder extendBuilder(GsonBuilder gp) {
    return gp.registerTypeAdapter(DocumentPojo.class, new DocumentPojoDeserializer()).
        registerTypeAdapter(DocumentPojo.class, new DocumentPojoSerializer());
  }
  protected static class DocumentPojoSerializer implements JsonSerializer<DocumentPojo>
  {
    @Override
    public JsonElement serialize(DocumentPojo doc, Type typeOfT, JsonSerializationContext context)
    {
      // GSON transformation:
      JsonElement je = DocumentPojo.getDefaultBuilder().create().toJsonTree(doc, typeOfT)

      // Convert object names in metadata
      if ((null != doc.getMetadata()) && !doc.getMetadata().isEmpty()) {
        if (je.isJsonObject()) {
          JsonElement metadata = je.getAsJsonObject().get("metadata");
          if (null != metadata) {
            enforceTypeNamingPolicy(metadata, 0);
          }
        }
      }
      return je;
    }   
    //////////////////////////////////////////////////////////////////////////////////////////
   
    // Utility function for encoding "."s and "%"s (also duplicate in index)
   
    private static boolean enforceTypeNamingPolicy(JsonElement je, int nDepth) {
     
      if (je.isJsonPrimitive()) {
        return false; // Done
      }
      else if (je.isJsonArray()) {
        JsonArray ja = je.getAsJsonArray();
        if (0 == ja.size()) {
          return false; // No idea, carry on
        }
        JsonElement jaje = ja.get(0);
        return enforceTypeNamingPolicy(jaje, nDepth + 1); // keep going until you find primitive/object
      }
      else if (je.isJsonObject()) {
        JsonObject jo = je.getAsJsonObject();
        // Nested variables:
        Iterator<Entry<String, JsonElement>> it = jo.entrySet().iterator();
        Map<String, JsonElement> toFixList = null;
        while (it.hasNext()) {
          boolean bFix = false;
          Entry<String, JsonElement> el = it.next();
          String currKey = el.getKey();
         
          if ((currKey.indexOf('.') >= 0) || (currKey.indexOf('%') >= 0)) {
            it.remove();
            currKey = currKey.replace("%", "%25").replace(".", "%2e");
            bFix = true;
          }       
          if (null == el.getValue()) {
            if (!bFix) it.remove(); // nice easy case, just get rid of it (if bFix, it's already removed)
            bFix = false;
          }
          else {
            enforceTypeNamingPolicy(el.getValue(), nDepth + 1);
          }
          if (bFix) {
            if (null == toFixList) {
              toFixList = new HashMap<String, JsonElement>();
            }
            toFixList.put(currKey, el.getValue());         
          }
        } // (end loop over params) 
        if (null != toFixList) {
          for (Entry<String, JsonElement> el: toFixList.entrySet()) {
            jo.add(el.getKey(), el.getValue());
          }
        }
        return true; // (in any case, I get renamed by calling parent)
      }
      return false;
    }
    //TESTED (see DOC_META in test/TestCode)
  }
  protected static class DocumentPojoDeserializer implements JsonDeserializer<DocumentPojo>
  {
    @Override
    public DocumentPojo deserialize(JsonElement json, Type typeOfT, JsonDeserializationContext context) throws JsonParseException
    {
      JsonObject metadata = json.getAsJsonObject().getAsJsonObject("metadata");
      if (null != metadata) {
        json.getAsJsonObject().remove("metadata");
      }
      DocumentPojo doc = BaseDbPojo.getDefaultBuilder().create().fromJson(json, DocumentPojo.class)
      if (null != metadata) {       
        for (Entry<String, JsonElement> entry: metadata.entrySet()) {
          if (entry.getValue().isJsonArray()) {
            doc.addToMetadata(entry.getKey(), MongoDbUtil.encodeArray(entry.getValue().getAsJsonArray()).toArray());
          }
          else {
            BasicDBList dbl = new BasicDBList();
            dbl.add(MongoDbUtil.encodeUnknown(entry.getValue()));
            doc.addToMetadata(entry.getKey(), dbl);
          }
        }//TESTED       
      }
      return doc;
    }
  }
  //////////////////////////////////////////////////////////////////////////////// 

  // Per query (transient, created on the way to the API for query, not currently stored anywhere)
 
  public Double getAggregateSignif() {
    return aggregateSignif;
  }

  public void setAggregateSignif(Double aggregateSignif) {
    this.aggregateSignif = aggregateSignif;
  }

  public Double getQueryRelevance() {
    return queryRelevance;
  }

  public void setQueryRelevance(Double queryRelevance) {
    this.queryRelevance = queryRelevance;
  }

  public Double getScore() {
    return score;
  }

  public void setScore(Double score) {
    this.score = score;
  }

  public void setUpdateId(ObjectId updateId) {
    this.updateId = updateId;
  }

  public ObjectId getUpdateId() {
    return updateId;
  }

  public void setDisplayUrl(String displayUrl) {
    this.displayUrl = displayUrl;
  }

  public String getDisplayUrl() {
    return displayUrl;
  }

  public void setExplain(Object explain) {
    this.explain = explain;
  }

  public Object getExplain() {
    return explain;
  }

  public void resetRawFullText() {
    this.rawFullText = null;
  }

  public String getRawFullText() {
    return rawFullText;
  }

  public SourcePipelinePojo getSpawnedFrom() {
    return spawnedFrom;
  }

  public void setSpawnedFrom(SourcePipelinePojo spawnedFrom) {
    this.spawnedFrom = spawnedFrom;
  }

  public boolean getHasDefaultUrl() {
    return hasDefaultUrl;
  }

  public void setHasDefaultUrl(boolean hasDefaultUrl) {
    this.hasDefaultUrl = hasDefaultUrl;
  }

}
TOP

Related Classes of com.ikanow.infinit.e.data_model.store.document.DocumentPojo

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.