package org.apache.cassandra.loader;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;
import javax.xml.bind.JAXBContext;
import javax.xml.bind.Unmarshaller;
import org.apache.cassandra.config.DatabaseDescriptor;
import org.apache.cassandra.db.RowMutation;
import org.apache.cassandra.db.Table;
import org.apache.cassandra.io.SSTable;
import org.apache.cassandra.net.EndPoint;
import org.apache.cassandra.service.StorageService;
import org.apache.cassandra.utils.LogUtil;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
/**
* This class is used to load the storage endpoints with the relevant data,
* i.e. both the data they are responsible for and the data that should be
* replicated onto them. Population is driven by an XML file which should
* adhere to the importer schema.
*
* Author : Avinash Lakshman ( alakshman@facebook.com) & Prashant Malik ( pmalik@facebook.com )
*/
public class Loader
{
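/*
* Sketch of what an importer definition file might look like. This is an
* assumption for illustration only: the element names below are inferred from
* the Importer/ColumnType fields this class reads (table, key.fields.field,
* key.combiner, key.optimizeIt, columnFamily.name/directory/delimiter,
* columnFamily.superColumn, columnFamily.column.{name,field,value,timestamp});
* the actual JAXB-generated schema may use different names.
*
* <importer>
*   <table>Mailbox</table>
*   <key>
*     <fields><field>0</field></fields>
*     <combiner>:</combiner>
*     <optimizeIt>true</optimizeIt>
*   </key>
*   <columnFamily>
*     <name>MessageList</name>
*     <directory>/path/to/input/files</directory>
*     <delimiter>,</delimiter>
*     <column><field>1</field><value><field>2</field></value><timestamp><field>3</field></timestamp></column>
*   </columnFamily>
* </importer>
*/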
private static final long siesta_ = 60 * 1000; /* pause (in milliseconds) to let compaction/relocation settle across the cluster */
private static Logger logger_ = Logger.getLogger( Loader.class );
private Importer importer_;
private StorageService storageService_;
public Loader(StorageService storageService)
{
storageService_ = storageService;
}
/*
* This method loads all the keys into a special column family
* called "RecycleBin". This column family is used for temporary
* processing of data and can then be recycled. The idea is that
* after the load is complete we have all the keys in the system.
* We then force a compaction and examine the single index file
* that is generated to determine how the nodes need to relocate
* in order to be perfectly load balanced.
*
* @param rootDirectory - root directory at which the parsing begins.
* @param table - table that will be populated.
* @param cfName - name of the column family that will be populated. This is
* passed in so that we do not unnecessarily allocate temporary String objects.
*/
private void preParse(File rootDirectory, String table, String cfName) throws Throwable
{
File[] files = rootDirectory.listFiles();
for ( File file : files )
{
if ( file.isDirectory() )
preParse(file, table, cfName);
else
{
String fileName = file.getName();
RowMutation rm = new RowMutation(table, fileName);
rm.add(cfName, fileName.getBytes(), 0);
rm.apply();
}
}
}
/*
* Merges a list of strings with a particular combiner.
*/
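/* Example: merge(Arrays.asList("a", "b", "c"), ":") returns "a:b:c"; an empty list yields null. */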
String merge( List<String> listFields, String combiner)
{
if(listFields.size() == 0 )
return null;
if(listFields.size() == 1)
return listFields.get(0);
String mergedKey = null;
for(String field: listFields)
{
if(mergedKey == null)
{
mergedKey = field;
}
else
{
mergedKey = mergedKey + combiner + field;
}
}
return mergedKey;
}
/*
* This function checks if the local storage endpoint
* is responsible for storing this key.
*/
boolean checkIfProcessKey(String key)
{
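/* getNStorageEndPoint() returns the endpoints (replicas) responsible for this key; the key is processed locally only if this node is one of them. */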
EndPoint[] endPoints = storageService_.getNStorageEndPoint(key);
EndPoint localEndPoint = StorageService.getLocalStorageEndPoint();
for(EndPoint endPoint : endPoints)
{
if(endPoint.equals(localEndPoint))
return true;
}
return false;
}
/*
* This function parses each file based on the delimiter specified in the
* XML file. It also looks at all the parameters specified in the XML and,
* based on them, populates the internal Row structure.
*/
void parse(String filepath) throws Throwable
{
BufferedReader bufReader = new BufferedReader(new InputStreamReader(
new FileInputStream(filepath)), 16 * 1024 * 1024);
String line = null;
String delimiter_ = ",";
RowMutation rm = null;
Map<String, RowMutation> rms = new HashMap<String, RowMutation>();
if(importer_.columnFamily.delimiter != null)
{
delimiter_ = importer_.columnFamily.delimiter;
}
while ((line = bufReader.readLine()) != null)
{
StringTokenizer st = new StringTokenizer(line, delimiter_);
List<String> tokenList = new ArrayList<String>();
String key = null;
while (st.hasMoreTokens())
{
tokenList.add(st.nextToken());
}
/* Construct the Key */
List<String> keyFields = new ArrayList<String> ();
for(int fieldId: importer_.key.fields.field)
{
keyFields.add(tokenList.get(fieldId));
}
key = merge(keyFields, importer_.key.combiner);
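/* When optimizeIt is explicitly false, each row key is checked against the endpoints responsible for it and skipped if this node is not one of them (when optimizeIt is true, the check is done per file in parseFileList, using the file name as the key). */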
if(importer_.key.optimizeIt != null && !importer_.key.optimizeIt)
{
if(!checkIfProcessKey(key))
{
continue;
}
}
rm = rms.get(key);
if( rm == null)
{
rm = new RowMutation(importer_.table, key);
rms.put(key, rm);
}
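/* If a super column is configured, columns are added under "<columnFamily>:<superColumnName>:<columnName>", where the super column name may first be tokenized with Lucene's StandardAnalyzer; otherwise columns are added directly under "<columnFamily>:<columnName>". */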
if(importer_.columnFamily.superColumn != null)
{
List<String> superColumnList = new ArrayList<String>();
for(int fieldId : importer_.columnFamily.superColumn.fields.field)
{
superColumnList.add(tokenList.get(fieldId));
}
String superColumnName = merge(superColumnList, " ");
superColumnList.clear();
if(importer_.columnFamily.superColumn.tokenize)
{
Analyzer analyzer = new StandardAnalyzer();
TokenStream ts = analyzer.tokenStream("superColumn", new StringReader(superColumnName));
Token token = null;
token = ts.next();
while(token != null)
{
superColumnList.add(token.termText());
token = ts.next();
}
}
else
{
superColumnList.add(superColumnName);
}
for(String sName : superColumnList)
{
String cfName = importer_.columnFamily.name + ":" + sName;
if(importer_.columnFamily.column != null)
{
for(ColumnType column : importer_.columnFamily.column )
{
String cfColumn = cfName +":" + (column.name == null ? tokenList.get(column.field):column.name);
rm.add(cfColumn, tokenList.get(column.value.field).getBytes(), Integer.parseInt(tokenList.get(column.timestamp.field)));
}
}
}
}
else
{
if(importer_.columnFamily.column != null)
{
for(ColumnType column : importer_.columnFamily.column )
{
String cfColumn = importer_.columnFamily.name +":" + (column.name == null ? tokenList.get(column.field):column.name);
rm.add(cfColumn, tokenList.get(column.value.field).getBytes(), Integer.parseInt(tokenList.get(column.timestamp.field)));
}
}
}
}
bufReader.close();
// Now apply the mutations for all keys.
// TODO: check the accumulated data size and apply in batches if it grows too large.
Set<String> keys = rms.keySet();
for(String pKey : keys)
{
rm = rms.get(pKey);
if( rm != null)
{
rm.apply();
}
}
}
void parseFileList(File dir)
{
File[] files = dir.listFiles();
for ( File file : files )
{
if ( file.isDirectory() )
{
parseFileList(file);
}
else
{
try
{
if(importer_.key.optimizeIt != null && importer_.key.optimizeIt)
{
/* The file name is the key; parse the file only if this node is responsible for it. */
if(checkIfProcessKey(file.getName()))
{
parse(file.getAbsolutePath());
}
}
else
{
parse(file.getAbsolutePath());
}
}
catch ( Throwable ex )
{
logger_.error(LogUtil.throwableToString(ex));
}
}
}
}
void preLoad(File rootDirectory) throws Throwable
{
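/*
* Pre-load phase: (1) populate the "RecycleBin" column family with just the keys,
* (2) flush the memtables and force a compaction so that all keys end up in a
* single index file, (3) relocate the nodes based on the observed key
* distribution, and (4) clean up the temporary SSTables and reset the load state.
*/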
String table = DatabaseDescriptor.getTables().get(0);
String cfName = Table.recycleBin_ + ":" + "Keys";
/* populate just the keys. */
preParse(rootDirectory, table, cfName);
/* dump the memtables */
Table.open(table).flush(false);
/* force a compaction of the files. */
Table.open(table).forceCompaction(null,null,null);
/*
* This is a hack to let everyone finish: just sleep
* for a while (siesta_).
*/
logger_.info("Taking a nap after forcing a compaction ...");
Thread.sleep(Loader.siesta_);
/* Figure out the keys in the index file to relocate the node */
List<String> ssTables = Table.open(table).getAllSSTablesOnDisk();
/* Load the indexes into memory */
for ( String df : ssTables )
{
SSTable ssTable = new SSTable(df);
ssTable.close();
}
/* We should have only one file since we just compacted. */
List<String> indexedKeys = SSTable.getIndexedKeys();
storageService_.relocate(indexedKeys.toArray( new String[0]) );
/*
* This is a hack to let everyone relocate and learn about
* each other: just sleep for a while (siesta_).
*/
logger_.info("Taking a nap after relocating ...");
Thread.sleep(Loader.siesta_);
/*
* Do the necessary cleanup: delete the compacted SSTable and
* reset the load state in the StorageService. (Commit log
* deletion is currently disabled.)
*/
SSTable.delete(ssTables.get(0));
// File commitLogDirectory = new File( DatabaseDescriptor.getLogFileLocation() );
// FileUtils.delete(commitLogDirectory.listFiles());
storageService_.resetLoadState();
logger_.info("Finished all the requisite clean up ...");
}
void load(String xmlFile) throws Throwable
{
try
{
JAXBContext jc = JAXBContext.newInstance(this.getClass().getPackage().getName());
Unmarshaller u = jc.createUnmarshaller();
importer_ = (Importer)u.unmarshal(new FileInputStream( xmlFile ) );
String directory = importer_.columnFamily.directory;
File rootDirectory = new File(directory);
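/* First rebalance the nodes based on the key distribution (preLoad), then parse and load the actual data. */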
preLoad(rootDirectory);
parseFileList(rootDirectory);
}
catch (Exception e)
{
logger_.error(LogUtil.throwableToString(e));
}
}
/**
* @param args
*/
public static void main(String[] args) throws Throwable
{
LogUtil.init();
StorageService s = StorageService.instance();
s.start();
Loader loader = new Loader(s);
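/* Note: the importer definition file name is hard-coded; the command line arguments are currently unused. */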
loader.load("mbox_importer.xml");
}
}