Package spiderman.plugin.duplicate

Source Code of spiderman.plugin.duplicate.DocIDServer

package spiderman.plugin.duplicate;

import java.io.File;

import org.eweb4j.spiderman.spider.Settings;
import org.eweb4j.spiderman.spider.SpiderListener;
import org.eweb4j.spiderman.task.TaskDbServer;
import org.eweb4j.util.CommonUtil;
import org.eweb4j.util.FileUtil;

import com.sleepycat.je.Database;
import com.sleepycat.je.DatabaseConfig;
import com.sleepycat.je.DatabaseEntry;
import com.sleepycat.je.DatabaseException;
import com.sleepycat.je.Environment;
import com.sleepycat.je.EnvironmentConfig;
import com.sleepycat.je.OperationStatus;

public class DocIDServer implements TaskDbServer{
 
  private String name = null;
  public Environment env = null;
  public Database db = null;
//  private final Object mutex = new Object();
  private int lastDocID = 0;
 
  public DocIDServer(String name, SpiderListener listener) {
    this.name = name;
    File _dbEnv = new File(Settings.website_visited_folder());
    if (!_dbEnv.exists()) {
      String error = "dbEnv folder -> " + _dbEnv.getAbsolutePath() + " not found !";
      RuntimeException e = new RuntimeException(error);
      listener.onError(Thread.currentThread(), null, error, e);
      throw e;
    }
    File dir = new File(_dbEnv.getAbsolutePath()+"/"+name);
    if (!dir.exists())
      dir.mkdir();
   
    for (File f : dir.listFiles()){
      boolean flag = FileUtil.deleteFile(f);
      if (!flag) {
        String error = "file -> " + f.getAbsolutePath() + " can not delete !";
        RuntimeException e = new RuntimeException(error);
        listener.onError(Thread.currentThread(), null, error, e);
        throw e;
      }
      listener.onInfo(Thread.currentThread(), null, "file -> " + f.getAbsolutePath() + " delete success !");
    }
   
    DatabaseConfig dbConfig = new DatabaseConfig();
    dbConfig.setAllowCreate(true);
    EnvironmentConfig ec = new EnvironmentConfig();
    ec.setAllowCreate(true);
    env = new Environment(dir, ec);
    db = env.openDatabase(null, name, dbConfig);
    lastDocID = 0;
  }
 
  /**
   * Returns the docid of an already seen url.
   *
   * @param url
   *            the URL for which the docid is returned.
   * @return the docid of the url if it is seen before. Otherwise -1 is
   *         returned.
   */
  public synchronized int getDocId(String url) {
    OperationStatus result;
    DatabaseEntry value = new DatabaseEntry();
    try {
      DatabaseEntry key = new DatabaseEntry(url.getBytes());
      result = db.get(null, key, value, null);

      if (result == OperationStatus.SUCCESS
          && value.getData().length > 0) {
        return CommonUtil.byteArray2Int(value.getData());
      }
    } catch (Exception e) {
      e.printStackTrace();
    }
    return -1;
  }

  public synchronized int newDocID(String url) {
    try {
      // Make sure that we have not already assigned a docid for this
      // URL
      int docid = getDocId(url);
      if (docid > 0) {
        return docid;
      }

      lastDocID++;
      db.put(null, new DatabaseEntry(url.getBytes()), new DatabaseEntry(CommonUtil.int2ByteArray(lastDocID)));
      return lastDocID;
    } catch (Exception e) {
      e.printStackTrace();
    }
    return -1;
  }

  public synchronized void addUrlAndDocId(String url, int docId) throws Exception {
    if (docId <= lastDocID) {
      throw new Exception("Requested doc id: " + docId + " is not larger than: " + lastDocID);
    }
   
    // Make sure that we have not already assigned a docid for this URL
    int prevDocid = getDocId(url);
    if (prevDocid > 0) {
      if (prevDocid == docId) {
        return;
      }
      throw new Exception("Doc id: " + prevDocid + " is already assigned to URL: " + url);
    }
   
    db.put(null, new DatabaseEntry(url.getBytes()), new DatabaseEntry(CommonUtil.int2ByteArray(docId)));
    lastDocID = docId;
  }
 
  public boolean isSeenBefore(String url) {
    return getDocId(url) != -1;
  }

  public int getDocCount() {
    try {
      return (int) db.count();
    } catch (DatabaseException e) {
      e.printStackTrace();
    }
    return -1;
  }

  public void sync() {
    try {
      db.sync();
    } catch (DatabaseException e) {
      e.printStackTrace();
    }
  }

  public void close() {
    try {
      db.close();
      env.removeDatabase(null, name);
      env.cleanLog();
      env.close();
    } catch (DatabaseException e) {
      e.printStackTrace();
    }
  }
}
TOP

Related Classes of spiderman.plugin.duplicate.DocIDServer

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.