Source Code of org.apache.cassandra.bulkloader.CassandraBulkLoader

/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
  * Cassandra has a back door called the Binary Memtable. Its purpose is to
  * mass-import large amounts of data without going through the Thrift interface.
  *
  * Inserting data through the Binary Memtable lets you skip the commit log overhead and the
  * per-insert ack from Thrift. The example below uses Hadoop to generate all the data to be
  * sent to Cassandra, and ships it over the Binary Memtable interface. In effect, Hadoop
  * builds the same data that would end up in an SSTable had you inserted it via Thrift.
  * With enough Hadoop nodes inserting in parallel, the bottleneck should become the network.
  *
  * We recommend setting the compaction threshold to 0 while the import is running. After the
  * import, run `nodeprobe -host <IP> flush_binary <Keyspace>` on every node to flush the data
  * still left in memory to disk, then restore the compaction threshold to its original value.
  *
  * The example below is a sample Hadoop job that inserts SuperColumns; it can be tweaked to
  * work with standard Columns.
  *
  * Construct the data you want to import as newline-delimited rows. Group by <Key> in the
  * mapper, so that the end result turns the data set into a column-oriented subset. In the
  * reduce phase you can then build the ColumnFamilies you want inserted and send them to your
  * nodes (a hypothetical sample input line is sketched in the comment after this block).
  *
  * For Cassandra 0.6.4, we modified this example to wait for acks from all Cassandra nodes for
  * each row before proceeding to the next. This means that, to keep Cassandra similarly busy,
  * you can:
  * 1) add more reducer tasks,
  * 2) remove the "wait for acks" block of code, or
  * 3) parallelize the writing of rows to Cassandra, e.g. with an Executor.
  *
  * THIS CANNOT RUN ON THE SAME IP ADDRESS AS A CASSANDRA INSTANCE.
  */
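
/*
 * Hypothetical sample input (an assumption inferred from the reduce() code below):
 * KeyValueTextInputFormat splits each line on the first tab into key and value,
 * and the reducer splits the value on \1 (ctrl-A), using fields[1..3]:
 *
 *   row42<TAB>row42\1SuperColumnA\1column1\1some value
 */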
 
package org.apache.cassandra.bulkloader;

import java.io.IOException;
import java.net.InetAddress;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

import org.apache.cassandra.config.CFMetaData;
import org.apache.cassandra.config.DatabaseDescriptor;
import org.apache.cassandra.db.*;
import org.apache.cassandra.db.clock.TimestampReconciler;
import org.apache.cassandra.db.filter.QueryPath;
import org.apache.cassandra.io.util.DataOutputBuffer;
import org.apache.cassandra.net.IAsyncResult;
import org.apache.cassandra.net.Message;
import org.apache.cassandra.net.MessagingService;
import org.apache.cassandra.service.StorageService;
import org.apache.cassandra.utils.FBUtilities;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.*;

public class CassandraBulkLoader {
    public static class Map extends MapReduceBase implements Mapper<Text, Text, Text, Text> {
        private Text word = new Text();

        public void map(Text key, Text value, OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
            // This is a simple key/value mapper.
            output.collect(key, value);
        }
    }

    /**
     * Reducer lifecycle: configure() boots an in-process, storage-less Cassandra client;
     * reduce() builds a ColumnFamily per key and sends it to every natural endpoint via
     * the BINARY verb; close() shuts the client down.
     */
    public static class Reduce extends MapReduceBase implements Reducer<Text, Text, Text, Text> {
        private Path[] localFiles;
        private ArrayList<String> tokens = new ArrayList<String>();
        private JobConf jobconf;

        public void configure(JobConf job) {
            this.jobconf = job;
            String cassConfig;

            // Get the cached files
            try
            {
                localFiles = DistributedCache.getLocalCacheFiles(job);
            }
            catch (IOException e)
            {
                throw new RuntimeException(e);
            }
            cassConfig = localFiles[0].getParent().toString();

            // Point Cassandra at the directory holding the storage-conf.xml that we
            // pulled out of the DistributedCache.
            System.setProperty("storage-config", cassConfig);

            try
            {
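                // Start an in-process, storage-less Cassandra client: it joins gossip to
                // learn the ring but holds no data of its own.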
                StorageService.instance.initClient();
            }
            catch (IOException e)
            {
                throw new RuntimeException(e);
            }
            try
            {
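                // Give gossip a moment to settle before we start asking for endpoints.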
                Thread.sleep(10*1000);
            }
            catch (InterruptedException e)
            {
                throw new RuntimeException(e);
            }
        }

        public void close()
        {
            try
            {
                // release the cache
                DistributedCache.releaseCache(new URI("/cassandra/storage-conf.xml#storage-conf.xml"), this.jobconf);
            }
            catch (IOException e)
            {
                throw new RuntimeException(e);
            }
            catch (URISyntaxException e)
            {
                throw new RuntimeException(e);
            }
            try
            {
                // Sleep just in case the number of keys we send over is small
                Thread.sleep(3*1000);
            }
            catch (InterruptedException e)
            {
                throw new RuntimeException(e);
            }
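            // Note that stopping the client does not flush the servers' binary memtables;
            // you still need `nodeprobe flush_binary` on each node (see the header comment).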
            StorageService.instance.stopClient();
        }

        public void reduce(Text key, Iterator<Text> values, OutputCollector<Text, Text> output, Reporter reporter) throws IOException
        {
            ColumnFamily columnFamily;
            String keyspace = "Keyspace1";
            String cfName = "Super1";
            Message message;
            List<ColumnFamily> columnFamilies;
            columnFamilies = new LinkedList<ColumnFamily>();
            String line;

            /* Create a column family */
            columnFamily = ColumnFamily.create(keyspace, cfName);
            while (values.hasNext()) {
                // Split the line on the field delimiter (here \1, i.e. ctrl-A;
                // substitute your own delimiter as needed)
                line = values.next().toString();
                String[] fields = line.split("\1");
                String superColumnName = fields[1];
                String columnName = fields[2];
                String columnValue = fields[3];
                int timestamp = 0;
                columnFamily.addColumn(new QueryPath(cfName, superColumnName.getBytes("UTF-8"), columnName.getBytes("UTF-8")),
                                       columnValue.getBytes("UTF-8"), new TimestampClock(timestamp));
            }

            columnFamilies.add(columnFamily);

            /* Get serialized message to send to cluster */
            // Text.getBytes() exposes the backing array, which may be longer than the
            // logical value, so copy only the valid bytes.
            byte[] keyBytes = Arrays.copyOf(key.getBytes(), key.getLength());
            message = createMessage(keyspace, keyBytes, cfName, columnFamilies);
            List<IAsyncResult> results = new ArrayList<IAsyncResult>();
            for (InetAddress endpoint: StorageService.instance.getNaturalEndpoints(keyspace, keyBytes))
            {
                /* Send message to end point */
                results.add(MessagingService.instance.sendRR(message, endpoint));
            }
            /* wait for acks */
            for (IAsyncResult result : results)
            {
                try
                {
                    result.get(DatabaseDescriptor.getRpcTimeout(), TimeUnit.MILLISECONDS);
                }
                catch (TimeoutException e)
                {
                    // you should probably add retry logic here
                    throw new RuntimeException(e);
                }
            }
           
            output.collect(key, new Text(" inserted into Cassandra node(s)"));
        }
    }

    public static void runJob(String[] args)
    {
        JobConf conf = new JobConf(CassandraBulkLoader.class);

        if (args.length >= 4)
        {
            conf.setNumReduceTasks(Integer.parseInt(args[3]));
        }

        try
        {
            // storage-conf.xml lives on HDFS; the #fragment names the local symlink
            // the DistributedCache exposes to each task.
            DistributedCache.addCacheFile(new URI("/cassandra/storage-conf.xml#storage-conf.xml"), conf);
        }
        catch (URISyntaxException e)
        {
            throw new RuntimeException(e);
        }
        conf.setInputFormat(KeyValueTextInputFormat.class);
        conf.setJobName("CassandraBulkLoader_v2");
        conf.setMapperClass(Map.class);
        conf.setReducerClass(Reduce.class);

        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(Text.class);

        FileInputFormat.setInputPaths(conf, new Path(args[1]));
        FileOutputFormat.setOutputPath(conf, new Path(args[2]));
        try
        {
            JobClient.runJob(conf);
        }
        catch (IOException e)
        {
            throw new RuntimeException(e);
        }
    }

    public static Message createMessage(String Keyspace, byte[] Key, String CFName, List<ColumnFamily> ColumnFamilies)
    {
        ColumnFamily baseColumnFamily;
        DataOutputBuffer bufOut = new DataOutputBuffer();
        RowMutation rm;
        Message message;
        Column column;

        /* Get the first column family from list, this is just to get past validation */
        baseColumnFamily = new ColumnFamily(ColumnFamilyType.Standard,
                                            ClockType.Timestamp,
                                            DatabaseDescriptor.getComparator(Keyspace, CFName),
                                            DatabaseDescriptor.getSubComparator(Keyspace, CFName),
                                            new TimestampReconciler(),
                                            CFMetaData.getId(Keyspace, CFName));
       
        // Pack each serialized column family into one column on the base CF, keyed by
        // its cf id; the receiver unpacks these values on the binary (BMT) write path.
        for (ColumnFamily cf : ColumnFamilies) {
            bufOut.reset();
            ColumnFamily.serializer().serializeWithIndexes(cf, bufOut);
            byte[] data = new byte[bufOut.getLength()];
            System.arraycopy(bufOut.getData(), 0, data, 0, bufOut.getLength());

            column = new Column(FBUtilities.toByteArray(cf.id()), data, new TimestampClock(0));
            baseColumnFamily.addColumn(column);
        }
        rm = new RowMutation(Keyspace, Key);
        rm.add(baseColumnFamily);

        try
        {
            /* Make message */
            message = rm.makeRowMutationMessage(StorageService.Verb.BINARY);
        }
        catch (IOException e)
        {
            throw new RuntimeException(e);
        }

        return message;
    }
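
    // Plausible invocation (inferred from runJob's argument indexing: args[1] is the
    // HDFS input path, args[2] the output path, and the optional args[3] the number
    // of reduce tasks; args[0] is unused here):
    //
    //   hadoop jar bulkloader.jar org.apache.cassandra.bulkloader.CassandraBulkLoader \
    //       unused /user/me/input /user/me/output 16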
    public static void main(String[] args) throws Exception
    {
        runJob(args);
    }
}