/*
* Copyright 2012 NGDATA nv
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.lilyproject.repository.bulk;
import java.io.Closeable;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import com.google.common.collect.Sets;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTableInterface;
import org.apache.hadoop.hbase.client.Put;
import org.apache.zookeeper.KeeperException;
import org.lilyproject.client.LilyClient;
import org.lilyproject.repository.api.Blob;
import org.lilyproject.repository.api.BlobAccess;
import org.lilyproject.repository.api.BlobException;
import org.lilyproject.repository.api.BlobManager;
import org.lilyproject.repository.api.BlobReference;
import org.lilyproject.repository.api.BlobStoreAccess;
import org.lilyproject.repository.api.FieldType;
import org.lilyproject.repository.api.FieldTypes;
import org.lilyproject.repository.api.IdGenerator;
import org.lilyproject.repository.api.QName;
import org.lilyproject.repository.api.Record;
import org.lilyproject.repository.api.RecordFactory;
import org.lilyproject.repository.api.RecordId;
import org.lilyproject.repository.api.Repository;
import org.lilyproject.repository.api.RepositoryException;
import org.lilyproject.repository.api.RepositoryManager;
import org.lilyproject.repository.api.TypeManager;
import org.lilyproject.repository.impl.HBaseRepository;
import org.lilyproject.repository.impl.HBaseRepository.FieldValueWriter;
import org.lilyproject.repository.impl.HBaseRepositoryManager;
import org.lilyproject.repository.impl.HBaseTypeManager;
import org.lilyproject.repository.impl.RecordFactoryImpl;
import org.lilyproject.repository.impl.id.IdGeneratorImpl;
import org.lilyproject.repository.model.api.RepositoryModel;
import org.lilyproject.repository.model.impl.RepositoryModelImpl;
import org.lilyproject.util.exception.ExceptionUtil;
import org.lilyproject.util.hbase.HBaseTableFactory;
import org.lilyproject.util.hbase.HBaseTableFactoryImpl;
import org.lilyproject.util.hbase.LilyHBaseSchema;
import org.lilyproject.util.hbase.RepoAndTableUtil;
import org.lilyproject.util.repo.RecordEvent;
import org.lilyproject.util.repo.RecordEvent.Type;
import org.lilyproject.util.zookeeper.ZkUtil;
import org.lilyproject.util.zookeeper.ZooKeeperItf;
import org.python.google.common.collect.Lists;
/**
* Writes Lily records in bulk to HBase. Also provides methods for creating HBase {@code Put} or
* {@code KeyValue} objects for alternative methods of writing (e.g. via MapReduce).
*/
public class BulkIngester implements Closeable {
/** Number of buffered {@code Put}s at which {@link #write(Record)} triggers a {@link #flush()}. */
public static final int PUT_BUFFER_SIZE = 1000;

/**
 * Bulk mode is default. If not in bulk mode, the bulk ingester merely delegates all operations to lily client.
 */
private boolean bulkMode = true;

/** Lily client: used by {@link #write(Record)} outside bulk mode, returned by {@link #getRepositoryManager()}. */
private final LilyClient lilyClient;

/** Repository used to build {@code Put}s and to look up the repository name, table name and id generator. */
private final HBaseRepository hbaseRepo;

/** Factory behind {@link #newRecord()}; obtained from {@code hbaseRepo} in the constructor. */
private final RecordFactory recordFactory;

/** HBase table that {@link #flush()} writes the buffered {@code Put}s to. */
private final HTableInterface recordTable;

/** Field types snapshot obtained from the repository's type manager in the constructor. */
private final FieldTypes fieldTypes;

/**
 * Puts waiting to be flushed. This is a plain {@code java.util.ArrayList} (deliberately not the Guava
 * {@code Lists} helper that this file imports from a Jython-shaded package), so access is not synchronized.
 */
private final List<Put> putBuffer = new ArrayList<Put>(PUT_BUFFER_SIZE);
/**
 * Reports whether this ingester operates in bulk mode.
 *
 * @return the bulk-mode flag; when {@code false}, {@link #write(Record)} delegates to the Lily client instead
 *         of buffering {@code Put}s
 */
public boolean isBulkMode() {
    return this.bulkMode;
}
/**
 * Creates a bulk-mode {@code BulkIngester} for the default repository and its record table.
 * <p>
 * Shorthand for the five-argument factory method, called with {@code RepoAndTableUtil.DEFAULT_REPOSITORY},
 * {@code LilyHBaseSchema.Table.RECORD.name} and bulk mode enabled.
 *
 * @param zkConnString Connection string for ZooKeeper
 * @param timeout ZooKeeper session timeout
 * @return a new BulkIngester
 */
public static BulkIngester newBulkIngester(String zkConnString, int timeout) {
    final String defaultRepository = RepoAndTableUtil.DEFAULT_REPOSITORY;
    final String defaultRecordTable = LilyHBaseSchema.Table.RECORD.name;
    return newBulkIngester(zkConnString, timeout, defaultRepository, defaultRecordTable, true);
}
/**
 * Factory method for creation of a {@code BulkIngester} that operates on a non-default repository table.
 * <p>
 * This connects to ZooKeeper, creates a {@code LilyClient} (used for non-bulk access) and an
 * {@code HBaseRepository} (used for bulk access). If any step fails, the already-created Lily client is
 * closed on a best-effort basis before the failure is rethrown.
 *
 * @param zkConnString connection string for ZooKeeper; also set as {@code hbase.zookeeper.quorum}
 * @param timeout ZooKeeper session timeout
 * @param repositoryName name of the repository to write to
 * @param tableName name of the repository table to write to; when {@code null}, the object returned for
 *        {@code repositoryName} is used directly
 * @param bulkMode {@code true} to write {@code Put}s directly to HBase, {@code false} to delegate writes to
 *        the Lily client
 * @return a new BulkIngester
 * @throws RuntimeException wrapping any exception raised during set-up
 */
public static BulkIngester newBulkIngester(String zkConnString, int timeout, String repositoryName, String tableName,
        boolean bulkMode) {
    // Declared outside the try block so that the failure path can release it.
    LilyClient lilyClient = null;
    try {
        // NOTE(review): this ZooKeeper handle is neither stored nor closed by this class - confirm
        // whether LilyClient takes ownership of it.
        ZooKeeperItf zk = ZkUtil.connect(zkConnString, timeout);

        // we need a lily client for non bulk access
        lilyClient = new LilyClient(zk);

        // we need an HBaseRepository for bulk access
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", zkConnString);
        HBaseTableFactory hbaseTableFactory = new HBaseTableFactoryImpl(conf);
        HBaseRepository hbaseRepository = createHBaseRepository(repositoryName, tableName, zk, conf, hbaseTableFactory);

        return new BulkIngester(
                lilyClient,
                hbaseRepository,
                LilyHBaseSchema.getRecordTable(hbaseTableFactory, hbaseRepository.getRepositoryName(),
                        hbaseRepository.getTableName()),
                bulkMode);
    } catch (Exception e) {
        if (lilyClient != null) {
            try {
                lilyClient.close();
            } catch (Exception ignored) {
                // Best-effort cleanup only: the original failure is the one reported below.
            }
        }
        ExceptionUtil.handleInterrupt(e);
        throw new RuntimeException(e);
    }
}
/**
 * Assembles an {@code HBaseRepositoryManager} and looks up the requested repository (table) from it.
 * <p>
 * The manager is created with a {@code BlobsNotSupportedBlobManager}.
 *
 * @param repositoryName name of the repository to look up
 * @param tableName name of the table within the repository; when {@code null}, the object returned by
 *        {@code getRepository(repositoryName)} is used directly
 * @param zk ZooKeeper handle passed to the repository model and the type manager
 * @param conf configuration passed to the type manager and the repository manager
 * @param hbaseTableFactory table factory passed to the type manager and the repository manager
 * @return the looked-up repository, cast to {@code HBaseRepository}
 */
private static HBaseRepository createHBaseRepository(String repositoryName, String tableName, ZooKeeperItf zk,
        Configuration conf, HBaseTableFactory hbaseTableFactory)
        throws KeeperException, InterruptedException, IOException, RepositoryException {
    RepositoryModel model = new RepositoryModelImpl(zk);
    IdGenerator ids = new IdGeneratorImpl();
    TypeManager types = new HBaseTypeManager(ids, conf, zk, hbaseTableFactory);
    RecordFactory records = new RecordFactoryImpl();
    RepositoryManager manager = new HBaseRepositoryManager(types, ids, records, hbaseTableFactory,
            new BlobsNotSupportedBlobManager(), conf, model);

    if (tableName == null) {
        return (HBaseRepository) manager.getRepository(repositoryName);
    }
    return (HBaseRepository) manager.getRepository(repositoryName).getTable(tableName);
}
/**
 * Creates an ingester from ready-made collaborators. The record factory and the field types snapshot are
 * taken from {@code hbaseRepo}.
 *
 * @param lilyClient client used for non-bulk writes
 * @param hbaseRepo repository used to build {@code Put}s
 * @param recordTable HBase table that buffered {@code Put}s are written to
 * @param bulkMode whether writes go directly to HBase ({@code true}) or via the Lily client ({@code false})
 * @throws InterruptedException propagated from obtaining the field types snapshot
 */
BulkIngester(LilyClient lilyClient, HBaseRepository hbaseRepo, HTableInterface recordTable, boolean bulkMode)
        throws InterruptedException {
    // Plain parameter hand-over first ...
    this.bulkMode = bulkMode;
    this.lilyClient = lilyClient;
    this.recordTable = recordTable;
    this.hbaseRepo = hbaseRepo;

    // ... then the values derived from the repository.
    this.recordFactory = hbaseRepo.getRecordFactory();
    this.fieldTypes = hbaseRepo.getTypeManager().getFieldTypesSnapshot();
}
/**
 * Creates a new record via the record factory of the underlying repository, with the same semantics as
 * {@link Repository#newRecord()}.
 *
 * @return A newly-created record
 */
public Record newRecord() {
    final Record created = recordFactory.newRecord();
    return created;
}
/**
 * Gives access to the id generator, same as {@link Repository#getIdGenerator()}.
 *
 * @return The IdGenerator of the underlying repository
 */
public IdGenerator getIdGenerator() {
    final IdGenerator generator = hbaseRepo.getIdGenerator();
    return generator;
}
/**
 * When in bulk mode, write a single record directly to HBase, circumventing any indexing or other secondary actions
 * that are performed when using the standard Lily API.
 * <p>
 * <b>WARNING:</b> This method is not thread-safe.
 * <p>
 * Puts are first written to a buffer, which is flushed once it holds at least
 * {@link BulkIngester#PUT_BUFFER_SIZE} entries.
 * <p>
 * When not in bulk mode, this merely delegates to {@code createOrUpdate} on the table obtained from the Lily
 * client.
 *
 * @param record Record to be written
 * @throws IOException if flushing the buffer fails
 */
public void write(Record record) throws InterruptedException, RepositoryException, IOException {
    if (!bulkMode) {
        lilyClient.getRepository(hbaseRepo.getRepositoryName()).getTable(hbaseRepo.getTableName()).createOrUpdate(record);
        return;
    }

    putBuffer.add(buildPut(record));
    // ">=" rather than "==": a failed flush() leaves the buffer filled, so the next write pushes the size past
    // the threshold. With an exact-equality test the automatic flush would then never trigger again and the
    // buffer would grow without bound.
    if (putBuffer.size() >= PUT_BUFFER_SIZE) {
        flush();
    }
}
/**
 * Flush buffered Puts to the Lily record table.
 * <p>
 * Does nothing when the buffer is empty. The buffer is only cleared after the table accepted the Puts, so its
 * contents are retained when the write fails.
 * <p>
 * This method is not thread-safe.
 *
 * @throws IOException if writing to the record table fails
 */
public void flush() throws IOException {
    if (putBuffer.isEmpty()) {
        return;
    }
    // Hand the table its own copy so that clearing the buffer below cannot affect the list passed to put().
    // java.util.ArrayList is used instead of the Jython-shaded Guava Lists helper imported by this file.
    recordTable.put(new ArrayList<Put>(putBuffer));
    putBuffer.clear();
}
/**
 * Translates a record into the HBase {@code Put} that inserts it.
 * <p>
 * A record without an id gets a freshly generated one assigned (the passed-in record is modified). The
 * returned {@code Put} also carries a {@code CREATE} record event, serialized as JSON, in the payload column.
 *
 * @param record The record to be translated into an HBase {@code Put}
 * @return Put which can be directly written to HBase
 */
public Put buildPut(Record record) throws InterruptedException, RepositoryException {
    if (record.getId() == null) {
        record.setId(getIdGenerator().newRecordId());
    }

    RecordEvent createEvent = new RecordEvent();
    createEvent.setType(Type.CREATE);
    createEvent.setTableName(hbaseRepo.getTableName());
    // set empty IndexRecordFilterData to omit the warnings in the IndexEditFilter
    createEvent.setIndexRecordFilterData(new RecordEvent.IndexRecordFilterData());

    // NOTE(review): the two 1L arguments and the two empty blob-reference sets are passed exactly as before;
    // their meaning is defined by HBaseRepository.buildPut - confirm there.
    Put recordPut = hbaseRepo.buildPut(record, 1L, fieldTypes, createEvent, Sets.<BlobReference>newHashSet(),
            Sets.<BlobReference>newHashSet(), 1L);
    recordPut.add(LilyHBaseSchema.RecordCf.DATA.bytes, LilyHBaseSchema.RecordColumn.PAYLOAD.bytes,
            createEvent.toJsonBytes());
    return recordPut;
}
/**
 * Build a {@code Put} to update a record. No metadata updates are performed, and any existing metadata on the
 * fields will be overwritten.
 * <p>
 * The record to be updated must exist, otherwise a "partial" record will be created. No checking is done to ensure
 * that the record to be updated exists.
 * <p>
 * Additionally, records updated in this manner must be unversioned records.
 * <p>
 * In other words, use this method at your own risk. Unless you are very certain about the context you are working
 * in, updates should go via the Lily API.
 *
 * @param recordId identifier of the record to be updated
 * @param fieldValues map of field names and values to be updated on the record
 * @return Put containing all field updates
 * @throws RuntimeException wrapping any exception raised while looking up a field type or adding its value
 */
public Put buildRecordUpdate(RecordId recordId, Map<QName, Object> fieldValues) {
    Put put = new Put(recordId.toBytes());
    FieldValueWriter fieldValueWriter = hbaseRepo.newFieldValueWriter(put, null);
    for (Entry<QName, Object> fieldEntry : fieldValues.entrySet()) {
        try {
            fieldValueWriter.addFieldValue(fieldTypes.getFieldType(fieldEntry.getKey()), fieldEntry.getValue(), null);
        } catch (Exception e) {
            // Same interrupt handling as newBulkIngester, so an interruption is not silently lost when the
            // exception gets wrapped.
            ExceptionUtil.handleInterrupt(e);
            throw new RuntimeException("Failed to add value for field " + fieldEntry.getKey(), e);
        }
    }
    return put;
}
/**
 * Flushes any buffered Puts and closes the Lily client.
 * <p>
 * The client is closed even when the flush fails. If both steps throw, the exception from closing the client
 * is the one that propagates.
 *
 * @throws IOException if flushing or closing fails
 */
@Override
public void close() throws IOException {
    try {
        flush();
    } finally {
        // Must run even when flush() throws, otherwise the client would never be released.
        lilyClient.close();
    }
}
/**
 * Exposes the repository manager backing this ingester, which is the Lily client it was created with. This
 * allows performing any other kind of repository-based task within the context of a bulk import job.
 *
 * @return the underlying RepositoryManager
 */
public RepositoryManager getRepositoryManager() {
    final RepositoryManager manager = lilyClient;
    return manager;
}
/**
 * {@code BlobManager} that rejects blob operations, because blobs aren't supported in bulk import.
 * <p>
 * Every operation throws {@code UnsupportedOperationException}, except {@link #handleBlobReferences}, which
 * does nothing.
 * <p>
 * In the future, it may be interesting to support blobs, but at the moment it seems as this
 * would likely lead to poor import performance.
 */
private static class BlobsNotSupportedBlobManager implements BlobManager {

    private static final String NOT_SUPPORTED_MESSAGE = "Blobs are not supported for bulk imports";

    /** Builds the exception thrown by all unsupported operations. */
    private static UnsupportedOperationException unsupported() {
        return new UnsupportedOperationException(NOT_SUPPORTED_MESSAGE);
    }

    /** Intentionally a no-op. */
    @Override
    public void handleBlobReferences(RecordId recordId, Set<BlobReference> referencedBlobs,
            Set<BlobReference> unReferencedBlobs) {
        // no op
    }

    /** Not supported; always throws. */
    @Override
    public void incubateBlob(byte[] blobKey) throws IOException {
        throw unsupported();
    }

    /** Not supported; always throws. */
    @Override
    public Set<BlobReference> reserveBlobs(Set<BlobReference> blobs) throws IOException {
        throw unsupported();
    }

    /** Not supported; always throws. */
    @Override
    public OutputStream getOutputStream(Blob blob) throws BlobException {
        throw unsupported();
    }

    /** Not supported; always throws. */
    @Override
    public BlobAccess getBlobAccess(Record record, QName fieldName, FieldType fieldType, int... indexes)
            throws BlobException {
        throw unsupported();
    }

    /** Not supported; always throws. */
    @Override
    public void register(BlobStoreAccess blobStoreAccess) {
        throw unsupported();
    }

    /** Not supported; always throws. */
    @Override
    public void delete(byte[] blobKey) throws BlobException {
        throw unsupported();
    }
}
}