Package fr.eolya.crawler.cache.mongodb

Source Code of fr.eolya.crawler.cache.mongodb.MongoDBDocumentCache

package fr.eolya.crawler.cache.mongodb;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.DocumentFactory;
import org.dom4j.Element;
import org.dom4j.io.SAXReader;

import com.mongodb.BasicDBObject;

import fr.eolya.crawler.cache.DocumentCacheItem;
import fr.eolya.crawler.cache.IDocumentCache;
import fr.eolya.utils.Base64;
import fr.eolya.utils.XMLConfig;
import fr.eolya.utils.nosql.mongodb.MongoDBCollection;
import fr.eolya.utils.nosql.mongodb.MongoDBConnection;
import fr.eolya.utils.nosql.mongodb.MongoDBDatabase;

public class MongoDBDocumentCache implements IDocumentCache {

  private MongoDBConnection con = null;
  private MongoDBDatabase db = null;
  private String collName; 
  private MongoDBCollection coll = null;
  private final Object collMonitor = new Object();
  private long size;
  private long maxDocSize;

  public MongoDBDocumentCache(String sourceId, MongoDBConnection con, String dbName, String collName) {
    this.con = con;
    this.db = new MongoDBDatabase(this.con, dbName);
    this.collName = collName + "_" + sourceId;
    this.coll = new MongoDBCollection(db,this.collName);
    this.size = this.coll.size();
    this.maxDocSize = 0;
  }

  @Override
  public String put(String itemId, InputStream dataStream, long dataSize,
      HashMap<String, String> params, HashMap<String, String> metas, XMLConfig extra) {

    if (coll == null) return null;
    if (maxDocSize > 0 && dataSize > 0 && dataSize > maxDocSize) return null;

    BasicDBObject docsearch = new BasicDBObject();
    docsearch.put("item_id", itemId);

    BasicDBObject doc = new BasicDBObject();
    doc.put("item_id", itemId);
    doc.put("item_extra", extra.asXml());

    Document xml_params = DocumentFactory.getInstance().createDocument("utf-8");
    xml_params.setXMLEncoding("utf-8");
    Element xml_params_items = xml_params.addElement("params");

    for (Map.Entry<String, String> item : params.entrySet()) {
      String key = item.getKey();
      if (item.getValue()!=null) xml_params_items.addElement(key).addText(item.getValue());
    }
    doc.put("item_params", xml_params.asXML());

    Document xml_metas = DocumentFactory.getInstance().createDocument("utf-8");
    xml_metas.setXMLEncoding("utf-8");
    Element xml_metas_items = xml_metas.addElement("metas");

    for (Map.Entry<String, String> item : metas.entrySet()) {
      String key = item.getKey();
      if (item.getKey().startsWith("meta_")) {
        key = key.replace(':', '_').replace('-', '_').replace('.', '_').replace('/', '_');
      }
      if (item.getValue()!=null) xml_metas_items.addElement(key).addText(item.getValue());
   
    doc.put("item_metas", xml_metas.asXML());

    if (dataStream!=null) {
      String contentBase64 = "";
      try {
        //dataStream.reset();
        contentBase64 = Base64.inputStreamToStringBase64(dataStream);
        doc.put("content_base64", contentBase64);

      } catch (IOException e) {
        System.out.println(itemId);
        e.printStackTrace();
      }
    }
    remove(itemId);
    return coll.add(doc);
  }

  @Override
  public void remove(String itemId) {
    if (coll == null) return;
    BasicDBObject docsearch = new BasicDBObject();
    docsearch.put("item_id", itemId);
    coll.remove(docsearch);       
  }

  @Override
  public DocumentCacheItem get(String itemId) {
    if (coll == null) return null;
    BasicDBObject docsearch = new BasicDBObject();
    docsearch.put("item_id", itemId);
    BasicDBObject doc = coll.get(docsearch);
    return doc2DocumentCacheItem(doc);
  }

  @Override
  public boolean contains(String itemId) {
    if (coll == null) return false;
    BasicDBObject docsearch = new BasicDBObject();
    docsearch.put("item_id", itemId);
    return coll.contains(docsearch);
  }

  public static DocumentCacheItem doc2DocumentCacheItem(BasicDBObject doc) {
    try {
      String itemId = doc.get("item_id").toString();

      XMLConfig extra = null;
      if (doc.get("item_extra")!=null) {
        extra = new XMLConfig();
        extra.loadString(doc.get("item_extra").toString());
      }

      HashMap<String, String> params = new HashMap<String, String>();
      HashMap<String, String> metas = new HashMap<String, String>();

      Document document = null;
      SAXReader reader = new SAXReader();
      reader.setValidation(false);
      try {
        document = reader.read(new StringReader(doc.get("item_params").toString()));

        Element elmParams = (Element) document.selectSingleNode("params");
        if (elmParams!=null) {
          List<Element> items = elmParams.elements();
          Iterator<Element> iterItems = items.iterator();
          while (iterItems.hasNext()) {
            Element item = (Element) iterItems.next();
            params.put(item.getName(), item.getText());
          }
        }
      } catch (DocumentException de) {
        IOException ioe = new IOException();
        ioe.initCause(de);
        throw ioe;
      }

      reader = new SAXReader();
      reader.setValidation(false);
      try {
        document = reader.read(new StringReader(doc.get("item_metas").toString()));

        Element elmMetas = (Element) document.selectSingleNode("metas");
        if (elmMetas!=null) {
          List<Element> items = elmMetas.elements();
          Iterator<Element> iterItems = items.iterator();
          while (iterItems.hasNext()) {
            Element item = (Element) iterItems.next();
            metas.put(item.getName().replace(':', '_').replace('-', '_').replace('.', '_').replace('/', '_'), item.getText());
          }
        }
      } catch (DocumentException de) {
        IOException ioe = new IOException();
        ioe.initCause(de);
        throw ioe;
      }

      InputStream inputSource = new ByteArrayInputStream(doc.get("content_base64").toString().getBytes());
      InputStream streamData = new Base64.InputStream(inputSource, Base64.DECODE);

      return new DocumentCacheItem( itemId, streamData, params, metas, extra);
    } catch (IOException e) {
      e.printStackTrace();
    }
    return null;
  }

  @Override
  public long size() {
    return size;
  }

  @Override
  public void reset() {
    synchronized (collMonitor) {
      coll.drop();
      coll = new MongoDBCollection(db,this.collName);
    }   
    size = 0;
  }

  public long getMaxDocSize() {
    return maxDocSize;
  }

  public void setMaxDocSize(long maxDocSize) {
    this.maxDocSize = maxDocSize;
  }

//  @Override
//  public Iterator<DocumentCacheItem> getIterator() {
//    if (coll == null) return null;
//    DBCursor cur = coll.getCursor(new BasicDBObject());
//    return new  MongoDBDocumentCacheIterator(cur);
//  }

}
TOP

Related Classes of fr.eolya.crawler.cache.mongodb.MongoDBDocumentCache

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.