Package com.ikanow.infinit.e.harvest.extraction.document.file

Source Code of com.ikanow.infinit.e.harvest.extraction.document.file.JsonToMetadataParser$ObjectLength

package com.ikanow.infinit.e.harvest.extraction.document.file;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.lang.StringEscapeUtils;

import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import com.google.gson.stream.JsonReader;
import com.google.gson.stream.JsonToken;
import com.ikanow.infinit.e.data_model.store.document.DocumentPojo;

public class JsonToMetadataParser {

  private HashSet<String> objectIdentifiers = new HashSet<String>();
  private HashSet<String> recursiveObjectIdentifiers = new HashSet<String>();
  private HashSet<String> fieldsThatNeedToExist = new HashSet<String>();
  private String primaryKey = null;
  private String sourceName = null;
  private boolean bRecurse = false;
  private int nMaxDocs = Integer.MAX_VALUE;
  private int nCurrDocs = 0;
 
  // Track approximate memory usage
  private ObjectLength _memUsage = new ObjectLength();     
  public long getMemUsage() {
    return _memUsage.memory*10; // 5x for overhead, 2x for string->byte
  }
 
  public JsonToMetadataParser(String sourceName, List<String> objectIdentifiers, String primaryKey, List<String> fieldsThatNeedToExist, int nMaxDocs)
  {
    if (nMaxDocs > 0) {
      this.nMaxDocs = nMaxDocs;
    }
    this.primaryKey = primaryKey;
    this.sourceName = sourceName;
    if (null != objectIdentifiers) {
      for (String objectId: objectIdentifiers) {
        if (objectId.startsWith("*")) {
          this.bRecurse = true;
          this.recursiveObjectIdentifiers.add(objectId.substring(1).toLowerCase());
        }
        this.objectIdentifiers.add(objectId.toLowerCase());
      }
    }
    if (null != fieldsThatNeedToExist) {
      this.fieldsThatNeedToExist.addAll(fieldsThatNeedToExist);
    }
  }
 
  /**
   * Parses XML and returns a new feed with the resulting HashMap as Metadata
   * @param reader XMLStreamReader using Stax to avoid out of memory errors
   * @return List of Feeds with their Metadata set
   * @throws IOException
   */
  public List<DocumentPojo> parseDocument(JsonReader reader) throws IOException {
    return parseDocument(reader, false);
  }//TESTED (used by FileHarvester in this form, UAH::meta (stream) with textOnly==true below)
 
  public List<DocumentPojo> parseDocument(JsonReader reader, boolean textOnly) throws IOException {
    List<DocumentPojo> docList = new ArrayList<DocumentPojo>();
    JsonParser parser = new JsonParser();
    nCurrDocs = 0;
    _memUsage.memory = 0;
   
    // Different cases:
    // {}
    // ^^ many of these
    // [ {}, {}, {} ]
    // For each of these 2/3 cases, you might either want to grab the entire object, or a field
    // within the object
   
    try {
      while (true) { // (use exceptions to get outta here)

        JsonToken tok = JsonToken.BEGIN_OBJECT;
        try {
          tok = reader.peek();
        }
        catch (Exception e) {
          // EOF or end of object, keep going and find out...
          tok = reader.peek();
        }
        //TESTED
       
        if (JsonToken.BEGIN_ARRAY == tok) {
          reader.beginArray();
          if (objectIdentifiers.isEmpty()) {
            while (reader.hasNext()) {
              DocumentPojo doc = convertJsonToDocument(reader, parser, textOnly);
              if (null != doc) {
                docList.add(doc);
                if (++nCurrDocs >= nMaxDocs) {
                  return docList;
                }
              }
            }
          }//TESTED
          else {
            while (reader.hasNext()) {
              getDocumentsFromJson(reader, parser, docList, false, textOnly);
            }
          }//TESTED
        }
        else if (JsonToken.BEGIN_OBJECT == tok) {
          if (objectIdentifiers.isEmpty()) {
            DocumentPojo doc = convertJsonToDocument(reader, parser, textOnly);
            if (null != doc) {
              docList.add(doc);
              if (++nCurrDocs >= nMaxDocs) {
                return docList;
              }
            }
          }//TESTED (single and multiple doc case)
          else {
            getDocumentsFromJson(reader, parser, docList, false, textOnly);
          }//TESTED (single and multiple doc case) 
        }
        else if ((JsonToken.END_DOCUMENT == tok) || (JsonToken.END_ARRAY == tok) || (JsonToken.END_OBJECT == tok))  {
          return docList;
        }
      } // (end loop forever - exception out)
    }
    catch (Exception e) {} // This is our EOF
   
    return docList;
  }
 
  private DocumentPojo convertJsonToDocument(JsonReader reader, JsonParser parser, boolean textOnly) {
   
    JsonElement meta = parser.parse(reader);
   
    // Check if all required fields exist:
    if (!textOnly && !checkIfMandatoryFieldsExist(meta)) {
      return null;
    }
    //TESTED
   
    // Primary key and create doc
    DocumentPojo doc = new DocumentPojo();
    if ((null != primaryKey) && (null != sourceName)) {
      String primaryKey = getPrimaryKey(meta);
      if (null != primaryKey) {
        doc.setUrl(sourceName + primaryKey);
      }
    }
    if (textOnly) {
      doc.setFullText(meta.toString());
    }
    else {
      doc.setFullText("");
      if (meta.isJsonArray()) {
        ArrayList<Object> metaArray = new ArrayList<Object>(meta.getAsJsonArray().size());
        for (JsonElement je: meta.getAsJsonArray()) {
          if (je.isJsonObject()) {
            metaArray.add(convertJsonObjectToLinkedHashMap(je.getAsJsonObject(), _memUsage));
          }
        }
        doc.addToMetadata("json", metaArray.toArray());
      }
      else if (meta.isJsonObject()) {
        doc.addToMetadata("json", convertJsonObjectToLinkedHashMap(meta.getAsJsonObject(), _memUsage));
      }     
    }
    return doc;
   
  } //TESTED
 
  ////////////////////////////

  // Look into the JSON object and find the object with the specified name
  // (for now the "path" is ignored - maybe later we allow "x.y" terminology)
 
  private void getDocumentsFromJson(JsonReader reader, JsonParser parser, List<DocumentPojo> docList, boolean bRecursing, boolean textOnly) throws IOException {   
    reader.beginObject();
   
    while (reader.hasNext()) {
      String name = reader.nextName();
     
      boolean bMatch = false;
      if (bRecursing) {
        bMatch = recursiveObjectIdentifiers.contains(name.toLowerCase());
      }
      else {
        bMatch = objectIdentifiers.contains(name.toLowerCase());       
      }//TESTED
     
      if (bMatch) {
        JsonElement meta = parser.parse(reader);
       
        if (meta.isJsonObject()) { // (basically duplicates logic from convertJsonToDocument)
          if (textOnly || checkIfMandatoryFieldsExist(meta)) {
            DocumentPojo doc = new DocumentPojo();
            if ((null != primaryKey) && (null != sourceName)) {
              String primaryKey = getPrimaryKey(meta);
              if (null != primaryKey) {
                doc.setUrl(sourceName + primaryKey);
              }
            }
            if (textOnly) {
              doc.setFullText(meta.toString());
            }
            else {
              doc.addToMetadata("json", convertJsonObjectToLinkedHashMap(meta.getAsJsonObject(), _memUsage));
            }
            docList.add(doc);
            if (++nCurrDocs >= nMaxDocs) {
              while (reader.hasNext()) {
                reader.skipValue();
              }
              reader.endObject();
              return;
            }
          }
        }//TESTED
        else if (meta.isJsonArray()) {
          for (JsonElement meta2: meta.getAsJsonArray()) {
            if (textOnly || checkIfMandatoryFieldsExist(meta2)) {
              DocumentPojo doc = new DocumentPojo();
              if ((null != primaryKey) && (null != sourceName)) {
                String primaryKey = getPrimaryKey(meta2);
                if (null != primaryKey) {
                  doc.setUrl(sourceName + primaryKey);
                }
              }
              if (textOnly) {
                doc.setFullText(meta2.toString());
              }
              else {
                doc.addToMetadata("json", convertJsonObjectToLinkedHashMap(meta2.getAsJsonObject(), _memUsage));
              }
              docList.add(doc);           
              if (++nCurrDocs >= nMaxDocs) {
                while (reader.hasNext()) {
                  reader.skipValue();
                }
                reader.endObject();
                return;
              }
            }
          }
        }//TESTED
       
      }//TESTED
      else {
        if (bRecurse) {
       
          JsonToken tok = reader.peek();
          if (JsonToken.BEGIN_OBJECT == tok) {
            getDocumentsFromJson(reader, parser, docList, true, textOnly);
          }//TESTED
          else if (JsonToken.BEGIN_ARRAY == tok) {
            reader.beginArray();         
            while (reader.hasNext()) {
              JsonToken tok2 = reader.peek();
             
              if (JsonToken.BEGIN_OBJECT == tok2) {
                getDocumentsFromJson(reader, parser, docList, true, textOnly);             
              }
              else {
                reader.skipValue();
              }//TESTED
             
            }//TESTED   
            reader.endArray();         
          }
          else {
            reader.skipValue();
          }//TESTED
        }
        else {
          reader.skipValue();
        }//TESTED
      }
     
    }//(end loop over reader)
   
    reader.endObject();
   
  } //TESTED

  /////////////////////////////////

  // Check the object is well formed
 
  private boolean checkIfMandatoryFieldsExist(JsonElement meta) {
    if ((null != this.fieldsThatNeedToExist) && !this.fieldsThatNeedToExist.isEmpty())
    {
      boolean fieldsExist = false;
   
      for (String field: this.fieldsThatNeedToExist) {
        String exists = getKey(meta, field, false);
        if (null != exists) {
          fieldsExist = true;
          break;
        }
      }     
      return fieldsExist;
    }
    return true;
  }//TESTED
 
  /////////////////////////////////
 
  // Utility - get the primary key (does handle recursion)
 
  private String getPrimaryKey(JsonElement meta) {
    return getKey(meta, primaryKey, true);
  }
  //TOTEST
 
  /////////////////////////////////
 
  // Utility - get an arbitrary key (does handle recursion)
 
  private String getKey(JsonElement meta, String key, boolean bPrimitiveOnly) {
    try {
      String[] components = key.split("\\.")
      JsonObject metaObj = meta.getAsJsonObject();
      for (String comp: components) {       
        meta = metaObj.get(comp);
       
        if (null == meta) {
          return null;
        }//TESTED
        else if (meta.isJsonObject()) {
          metaObj = meta.getAsJsonObject();
        }//TESTED
        else if (meta.isJsonPrimitive()) {
          return meta.getAsString();
        }//TESTED
        else if (bPrimitiveOnly) { // (meta isn't allowed to be an array, then you'd have too many primary keys!)
          return null;
        }//TOTEST
        else { // Check with first instance
          JsonArray array = meta.getAsJsonArray();
          meta = array.get(0);
          if (meta.isJsonObject()) {
            metaObj = meta.getAsJsonObject();
          }
        }//TESTED       
      }
      if (!bPrimitiveOnly) { // allow objects, we just care if the field exists...
        if (null != metaObj) {
          return "[Object]";
        }
      }//TESTED
    }
    catch (Exception e) {} // no primary key
   
    return null;
  }
  //TOTEST

  /////////////////////////////////

  // Utility - conversion

  public static class ObjectLength {
    public long memory = 0;
  }
 
  /**
   * Converts a JsonObject to a LinkedHashMap.
   * @param json  JSONObject to convert
   */
  static private int capacity(int expectedSize) {
      if (expectedSize < 3) {
          return expectedSize + 1;
      }
        return expectedSize + expectedSize / 3;
  }
  static public LinkedHashMap<String,Object> convertJsonObjectToLinkedHashMap(JsonObject json)
  {
    return convertJsonObjectToLinkedHashMap(json, false);
  }
  static public LinkedHashMap<String,Object> convertJsonObjectToLinkedHashMap(JsonObject json, ObjectLength size)
  {
    return convertJsonObjectToLinkedHashMap(json, false, size);
  }
  static public LinkedHashMap<String,Object> convertJsonObjectToLinkedHashMap(JsonObject json, boolean bHtmlUnescape)
  {
    return convertJsonObjectToLinkedHashMap(json, bHtmlUnescape, null);
  }
  static public LinkedHashMap<String,Object> convertJsonObjectToLinkedHashMap(JsonObject json, boolean bHtmlUnescape, ObjectLength size)
  {
    int length = json.entrySet().size();
    LinkedHashMap<String,Object> list = new LinkedHashMap<String,Object>(capacity(length));
    for (Map.Entry<String, JsonElement> jsonKeyEl: json.entrySet())
    {
      JsonElement jsonEl = jsonKeyEl.getValue();
      if (null != size) {
        size.memory += jsonKeyEl.getKey().length();
      }
      if (jsonEl.isJsonArray()) {
        list.put(jsonKeyEl.getKey(), handleJsonArray(jsonEl.getAsJsonArray(), bHtmlUnescape, size));
      }
      else if (jsonEl.isJsonObject()) {
        list.put(jsonKeyEl.getKey(), convertJsonObjectToLinkedHashMap(jsonEl.getAsJsonObject(), bHtmlUnescape, size));       
      }
      else if (jsonEl.isJsonPrimitive()) {
        String val;
        if (bHtmlUnescape) {
          val = StringEscapeUtils.unescapeHtml(jsonEl.getAsString());
        }
        else {
          val = jsonEl.getAsString();
        }
        if (null != size) {
          size.memory += val.length();
        }
        list.put(jsonKeyEl.getKey(), val);
      }
    }
    if (list.size() > 0)
    {
      return list;
    }
    return null;
  }
  //TESTED
 
  static private Object[] handleJsonArray(JsonArray jarray, boolean bHtmlUnescape, ObjectLength size)
  {
    Object o[] = new Object[jarray.size()];
    for (int i = 0; i < jarray.size(); i++)
    {
      JsonElement jsonEl = jarray.get(i);
      if (jsonEl.isJsonObject()) {
        o[i] = convertJsonObjectToLinkedHashMap(jsonEl.getAsJsonObject(), bHtmlUnescape, size);
      }
      else if (jsonEl.isJsonArray()) {
        o[i] = handleJsonArray(jsonEl.getAsJsonArray(), bHtmlUnescape, size);       
      }
      else if (jsonEl.isJsonPrimitive()) {
        if (bHtmlUnescape) {
          o[i] = StringEscapeUtils.unescapeHtml(jsonEl.getAsString());         
        }
        else {
          o[i] = jsonEl.getAsString();
        }
        if (null != size) {
          size.memory += ((String)o[i]).length();
        }
      }
    }
    return o;
  }
  //TESTED
}
TOP

Related Classes of com.ikanow.infinit.e.harvest.extraction.document.file.JsonToMetadataParser$ObjectLength

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.