package org.apache.lucene.search.memcached;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.MapFieldSelector;
import org.apache.lucene.search.features.PwaIRankingFunction;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
//import org.apache.lucene.util.Base32;
import org.apache.commons.codec.binary.Base64;
import java.io.*;
import java.math.BigInteger;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
/**
 * Loads per-URL version statistics from a Lucene index into memcached.
 * <p>
 * For every live document with an accepted mime sub-type and a static URL
 * (no '?' in it), a {@code UrlRow} holding the number of versions seen and the
 * smallest and largest version dates is stored under a key derived from the URL.
 * Global aggregates are kept under the {@link #MAX_VERSIONS}, {@link #MAX_SPAN},
 * {@link #MIN_DATE} and {@link #MAX_DATE} keys.
 * @author Miguel Costa
 */
public class MemcachedTransactions {

    // accepted mime sub-types
    private final static String SUBTYPES[]={"html","plain","pdf","postscript","xml","x-shockwave-flash","xhtml+xml","sgml","msword","mspowerpoint","vnd","vnd.ms-powerpoint","rtf","richtext"};
    // lookup set built once from SUBTYPES instead of on every load() call
    private final static Set<String> VALID_SUBTYPES=Collections.unmodifiableSet(new HashSet<String>(Arrays.asList(SUBTYPES)));
    // a progress line is printed every DUMP_FACTOR newly stored urls
    private final static long DUMP_FACTOR=10000000;
    // value stored under the collection identifier once a load has been started
    private final static String COLLECTION_STORED="STORED";

    public final static String MAX_VERSIONS="MAX_VERSIONS";
    public final static String MAX_SPAN="MAX_SPAN";
    public final static String MIN_DATE="MIN_DATE";
    public final static String MAX_DATE="MAX_DATE";

    // SimpleDateFormat is not thread-safe, so each thread gets its own instance
    private final static ThreadLocal<SimpleDateFormat> DATE_FORMAT=new ThreadLocal<SimpleDateFormat>() {
        @Override
        protected SimpleDateFormat initialValue() {
            return new SimpleDateFormat("yyyyMMdd");
        }
    };
    // MessageDigest is stateful and not thread-safe; filled lazily per thread by getUrlKey
    private final static ThreadLocal<MessageDigest> MD5_DIGEST=new ThreadLocal<MessageDigest>();

    /**
     * Constructor
     */
    public MemcachedTransactions() {
    }

    /**
     * Load memcached with the version statistics of the urls found in an index.
     * <p>
     * Does nothing (besides printing a message) when a value is already stored under
     * <code>collectionId</code>. Deleted documents are skipped. Documents with a sub-type
     * that is missing or not accepted, documents whose url contains '?', and documents
     * without a url or date are not stored. The memcached client is always closed
     * before returning, also when the load fails.
     * @param reader index reader; it is not closed by this method
     * @param addresses memcached servers addresses
     * @param collectionId collection identifier
     * @throws IOException if the index cannot be read, an unexpected field is returned or a stored date cannot be parsed
     */
    public static void load(IndexReader reader, String addresses, String collectionId) throws IOException {
        Memcached cache=new Memcached(addresses);
        try {
            // check if this collection is already stored in memcached
            Object marker=cache.get(collectionId);
            if (marker!=null) {
                System.out.println("Collection is already stored in memcached.");
                return;
            }
            // NOTE(review): the marker is written before loading, so a load that fails
            // half-way leaves the collection flagged as stored - confirm this is intended
            cache.set(collectionId,COLLECTION_STORED);

            // global aggregates, continued from a previous collection when present
            int maxVersions=intOrDefault(cache.get(MAX_VERSIONS),1);
            int maxSpan=intOrDefault(cache.get(MAX_SPAN),0);
            int minDate=intOrDefault(cache.get(MIN_DATE),Integer.MAX_VALUE);
            int maxDate=intOrDefault(cache.get(MAX_DATE),0);

            long count=0;
            long countWrongType=0;
            long countDynamic=0;
            long countIncomplete=0;

            // read index
            int maxDoc=reader.maxDoc();
            System.out.println("Reading index with "+maxDoc+" documents ...");
            MapFieldSelector selector=new MapFieldSelector(new String[]{"date","url","subType"});
            for (int i=0;i<maxDoc;i++) {
                // maxDoc() also counts deleted documents, which cannot be fetched
                if (reader.isDeleted(i)) {
                    continue;
                }

                int idate=-1;
                String url=null;
                String subtype=null;
                Document doc=reader.document(i,selector);
                Enumeration e=doc.fields();
                while (e.hasMoreElements()) {
                    Field field=(Field)e.nextElement();
                    String name=field.name();
                    if (name.equals("date")) {
                        idate=stringdateToInt(field.stringValue());
                    }
                    else if (name.equals("url")) {
                        url=field.stringValue();
                    }
                    else if (name.equals("subType")) {
                        subtype=field.stringValue();
                    }
                    else {
                        throw new IOException("Wrong field read.");
                    }
                }

                if (subtype==null || !VALID_SUBTYPES.contains(subtype)) {
                    countWrongType++;
                }
                else if (url==null) {
                    countIncomplete++;
                }
                else if (url.indexOf('?')!=-1) {
                    countDynamic++;
                }
                else if (idate<0) { // no date field: storing -1 would corrupt the global minimum date
                    countIncomplete++;
                }
                else { // store in cache
                    String key=getUrlKey(url);
                    try {
                        UrlRow row=cache.getRow(key);
                        if (row==null) { // first version of this url
                            cache.addRow(key,new UrlRow(1,idate,idate));
                            count++;
                            if (count%DUMP_FACTOR==0) {
                                System.out.println("Stored "+count+" urls.");
                            }
                            // set global minDate and maxDate
                            maxDate=Math.max(maxDate,idate);
                            minDate=Math.min(minDate,idate);
                        }
                        else { // one more version of a known url
                            int minDateURL=Math.min(row.getMin(),idate);
                            int maxDateURL=Math.max(row.getMax(),idate);
                            UrlRow updated=new UrlRow(row.getNVersions()+1,minDateURL,maxDateURL);
                            cache.replaceRow(key,updated);

                            // set maxVersions and maxSpan
                            int nVersions=updated.getNVersions();
                            if (nVersions>maxVersions) {
                                maxVersions=nVersions;
                            }
                            long lMaxDateURL=intToLongdate(maxDateURL);
                            long lMinDateURL=intToLongdate(minDateURL);
                            float span=(lMaxDateURL-lMinDateURL)/PwaIRankingFunction.DAY_MILLISEC;
                            if (span>maxSpan) {
                                maxSpan=(int)span;
                            }

                            // set global minDate and maxDate
                            maxDate=Math.max(maxDate,maxDateURL);
                            minDate=Math.min(minDate,minDateURL);
                        }
                    }
                    catch (IllegalArgumentException ex) { // "Key is too long (maxlen = 250)"
                        System.err.println(ex.getMessage());
                    }
                }
            }

            // store global aggregates
            cache.set(MAX_VERSIONS,Integer.valueOf(maxVersions));
            cache.set(MAX_SPAN,Integer.valueOf(maxSpan));
            cache.set(MIN_DATE,Integer.valueOf(minDate));
            cache.set(MAX_DATE,Integer.valueOf(maxDate));

            // report
            System.out.println("Stored "+count+" urls in memcached.");
            System.out.println(countWrongType+" urls with wrong mime type filtered.");
            System.out.println(countDynamic+" dynamic urls filtered.");
            if (countIncomplete>0) {
                System.out.println(countIncomplete+" documents without url or date skipped.");
            }
            System.out.println(maxVersions+" is the maximum number of versions.");
            System.out.println(maxSpan+" is the maximum span between versions.");
            System.out.println(minDate+" is the minimum date of a version.");
            System.out.println(maxDate+" is the maximum date of a version.");
        }
        finally {
            // release the client on every path, including failures
            cache.close();
        }
    }

    /**
     * Unbox a cached Integer, falling back to a default when nothing is cached.
     * @param cached value read from the cache, or null
     * @param fallback value to use when <code>cached</code> is null
     * @return the cached int or the fallback
     */
    private static int intOrDefault(Object cached, int fallback) {
        if (cached==null) {
            return fallback;
        }
        return ((Integer)cached).intValue();
    }

    /**
     * Get url key: the base64 encoding of the md5 digest of the url.
     * <p>
     * The url is always encoded as UTF-8 before hashing, so the same url yields the
     * same key on every host regardless of the platform default charset.
     * @param url URL string, must not be null
     * @return key to use for this url
     * @throws IOException if no md5 digester is available
     */
    public static String getUrlKey(String url) throws IOException {
        MessageDigest digest=MD5_DIGEST.get();
        if (digest==null) {
            try {
                digest=MessageDigest.getInstance("MD5");
            }
            catch (NoSuchAlgorithmException e) {
                IOException wrapped=new IOException("Failed to get md5 digester: " + e.getMessage());
                wrapped.initCause(e); // keep the original cause for diagnosis
                throw wrapped;
            }
            MD5_DIGEST.set(digest);
        }
        byte[] hash=digest.digest(url.getBytes("UTF-8")); // digest() also resets the digester for the next call
        return new String(Base64.encodeBase64(hash),"US-ASCII"); // base64 output is plain ASCII
    }

    /**
     * Convert date in string format from index to a 4 bytes integer.
     * <p>
     * The input is read as seconds since the epoch and formatted as <code>yyyyMMdd</code>
     * in the default time zone of the JVM.
     * @param date seconds since the epoch, as a decimal string
     * @return the date as the integer yyyyMMdd
     * @throws NumberFormatException if <code>date</code> is not a valid long
     */
    public static int stringdateToInt(String date) {
        long millis=Long.parseLong(date)*1000;
        String formatted=DATE_FORMAT.get().format(new Date(millis));
        return Integer.parseInt(formatted); // only 4 bytes to cache
    }

    /**
     * Convert date from a 4 bytes integer to a 8 bytes long.
     * @param date date as the integer yyyyMMdd
     * @return milliseconds since the epoch of that date, parsed in the default time zone of the JVM
     * @throws IOException if <code>date</code> cannot be parsed as yyyyMMdd
     */
    public static long intToLongdate(int date) throws IOException {
        try {
            return DATE_FORMAT.get().parse(""+date).getTime();
        }
        catch (ParseException e) {
            IOException wrapped=new IOException(e.getMessage());
            wrapped.initCause(e); // keep the original cause for diagnosis
            throw wrapped;
        }
    }

    /**
     * Main
     * <p>
     * Expects <code>load</code>, the index path and at least one memcached address.
     * The index path is also used as the collection identifier.
     * @param args arguments
     */
    public static void main(String[] args) throws Exception {
        String usage="usage: load [index path] [address1=127.0.0.1:11211] [address2] ... [addressn]";

        if (args.length<3) {
            System.out.println(usage);
            System.exit(0);
        }
        if (!args[0].equals("load")) {
            System.out.println(usage);
            return;
        }

        Directory idx = FSDirectory.getDirectory(args[1], false);
        IndexReader reader = IndexReader.open(idx);
        try {
            // every address is prefixed by a space, the same string the servers list always had
            StringBuilder addresses=new StringBuilder();
            for (int i=2;i<args.length;i++) {
                addresses.append(' ').append(args[i]);
            }
            MemcachedTransactions.load(reader,addresses.toString(),args[1]);
        }
        finally {
            // close the reader even when the load fails
            reader.close();
        }
    }
}