/* This file is part of VoltDB.
* Copyright (C) 2008-2010 VoltDB Inc.
*
* VoltDB is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* VoltDB is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with VoltDB. If not, see <http://www.gnu.org/licenses/>.
*/
package org.voltdb.sysprocs.saverestore;
import java.io.EOFException;
import java.io.IOException;
import java.nio.BufferUnderflowException;
import java.nio.BufferOverflowException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.util.ArrayDeque;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.Semaphore;
import java.util.zip.CRC32;
import org.voltdb.messaging.FastDeserializer;
import org.voltdb.utils.DBBPool;
import org.voltdb.utils.DBBPool.BBContainer;
import org.voltdb.EELibraryLoader;
/**
 * An abstraction around a table's save file for restore. Deserializes the
 * meta-data that was stored when the table was saved and makes it available to
 * clients. This follows the structure in src/ee/storage/TableDiskHeader.{h,cpp}
 * and looks like:
 *
 * Header length - 4 octet integer
 * Version - 4 octet integer
 * Host ID - 4 octet integer (this is the name, *not* the GUID)
 * Cluster name - VoltDB serialized string (2 octet length followed by chars)
 * Database name - VoltDB serialized string
 * Table name - VoltDB serialized string
 * isReplicated - 1 octet, indicates whether the table was replicated
 *
 * The following fields are conditional on isReplicated == false:
 * Partition Ids - Array of 4 octet integer ids for partitions in this file
 * Total Hosts - The number of hosts for this table when it was saved
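 *
 * <p>
 * Typical usage, as a sketch (the file name is illustrative, and the caller is
 * assumed to own the channel, since TableSaveFile only closes it once the
 * reader thread has started):
 *
 * <pre>
 * FileChannel channel = new FileInputStream("table-save-file").getChannel();
 * TableSaveFile save = new TableSaveFile(channel, 3, null);
 * try {
 *     while (save.hasMoreChunks()) {
 *         BBContainer chunk = save.getNextChunk();
 *         if (chunk == null) {
 *             break;
 *         }
 *         try {
 *             // chunk.b holds one chunk as a complete VoltTable
 *             // serialization, ready to be deserialized or forwarded
 *         } finally {
 *             chunk.discard(); // return the buffer to the shared pool
 *         }
 *     }
 * } finally {
 *     save.close();
 *     channel.close();
 * }
 * </pre>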
*/
public class TableSaveFile {
private static class Container extends BBContainer {
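        // Wraps a buffer from the shared static pool (m_buffers). The origin
        // container is retained so the underlying direct allocation cannot be
        // garbage collected while this wrapper is in use; discard() returns
        // the wrapper to the pool instead of freeing the memory.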
@SuppressWarnings("unused")
private final BBContainer m_origin;
Container(ByteBuffer b, long pointer, BBContainer origin) {
super(b, pointer);
m_origin = origin;
}
@Override
public void discard() {
m_buffers.add(this);
}
}
    /**
     * It is actually possible to make a bigger chunk than this if the table
     * header is big enough...
     */
private static final int DEFAULT_CHUNKSIZE = org.voltdb.SnapshotSiteProcessor.m_snapshotBufferLength + (1024 * 256);
public TableSaveFile(FileChannel dataIn, int readAheadChunks, int relevantPartitionIds[]) throws IOException {
this(dataIn, readAheadChunks, relevantPartitionIds, false);
}
// XXX maybe consider an IOException subclass at some point
public TableSaveFile(FileChannel dataIn, int readAheadChunks, int relevantPartitionIds[], boolean continueOnCorruptedChunk) throws IOException {
try {
EELibraryLoader.loadExecutionEngineLibrary(true);
if (relevantPartitionIds == null) {
m_relevantPartitionIds = null;
} else {
m_relevantPartitionIds = new HashSet<Integer>();
for (Integer i : relevantPartitionIds) {
m_relevantPartitionIds.add(i);
}
}
m_chunkReads = new Semaphore(readAheadChunks);
m_saveFile = dataIn;
m_continueOnCorruptedChunk = continueOnCorruptedChunk;
final CRC32 crc = new CRC32();
            /*
             * A second CRC computed in case the first check fails because the
             * snapshot file was never completed
             */
final CRC32 secondCRC = new CRC32();
/*
* Get the header with the save restore specific information
*/
final ByteBuffer lengthBuffer = ByteBuffer.allocate(8);
while (lengthBuffer.hasRemaining()) {
final int read = m_saveFile.read(lengthBuffer);
if (read == -1) {
throw new EOFException();
}
}
lengthBuffer.flip();
final int originalCRC = lengthBuffer.getInt();
int length = lengthBuffer.getInt();
crc.update(lengthBuffer.array(), 4, 4);
secondCRC.update(lengthBuffer.array(), 4, 4);
if (length < 0) {
throw new IOException("Corrupted save file has negative header length");
}
if (length > 2097152) {
throw new IOException("Corrupted save file has unreasonable header length > 2 megs");
}
final ByteBuffer saveRestoreHeader = ByteBuffer.allocate(length);
while (saveRestoreHeader.hasRemaining()) {
final int read = m_saveFile.read(saveRestoreHeader);
                // a short read is legal here; the enclosing loop keeps
                // reading until the buffer is full
                if (read == -1) {
                    throw new EOFException();
                }
}
saveRestoreHeader.flip();
crc.update(saveRestoreHeader.array());
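            // The second CRC treats the first header byte (the "completed"
            // flag) as if it had been written as 1, so a mismatch caused only
            // by an unfinished snapshot can be recognized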
secondCRC.update(new byte[] { 1 });
secondCRC.update(saveRestoreHeader.array(), 1, saveRestoreHeader.array().length - 1);
            /*
             * Get the template for the VoltTable serialization header. It will
             * have an extra length value prepended to it so that it can be
             * sucked straight into a buffer. This will not contain a row count
             * since that varies from chunk to chunk and is supplied by the
             * chunk
             */
lengthBuffer.clear();
lengthBuffer.limit(4);
            /*
             * A bare block instead of a while loop: using a loop here makes
             * javac complain about an unrelated final variable elsewhere
             */
{
final int read = m_saveFile.read(lengthBuffer);
if (read == -1) {
throw new EOFException();
}
}
crc.update(lengthBuffer.array(), 0, 4);
secondCRC.update(lengthBuffer.array(), 0, 4);
lengthBuffer.flip();
length = lengthBuffer.getInt();
if (length < 4) {
throw new IOException("Corrupted save file has negative length or too small length for VoltTable header");
}
if (length > 2097152) {
throw new IOException("Corrupted save file has unreasonable VoltTable header length > 2 megs");
}
m_tableHeader = ByteBuffer.allocate(length + 4);
m_tableHeader.putInt(length);
while (m_tableHeader.hasRemaining()) {
final int read = m_saveFile.read(m_tableHeader);
if (read == -1) {
throw new EOFException();
}
}
crc.update(m_tableHeader.array(), 4, length);
secondCRC.update(m_tableHeader.array(), 4, length);
boolean failedCRCDueToNotCompleted = false;
final int actualCRC = (int) crc.getValue();
if (originalCRC != actualCRC) {
/*
* Check if the CRC mismatch is due to the snapshot not being
* completed
*/
final int secondCRCValue = (int) secondCRC.getValue();
if (secondCRCValue == originalCRC) {
failedCRCDueToNotCompleted = true;
} else {
throw new IOException("Checksum mismatch");
}
}
FastDeserializer fd = new FastDeserializer(saveRestoreHeader);
byte completedByte = fd.readByte();
            m_completed = !failedCRCDueToNotCompleted && completedByte == 1;
for (int ii = 0; ii < 4; ii++) {
m_versionNum[ii] = fd.readInt();
}
m_createTime = fd.readLong();
m_hostId = fd.readInt();
m_hostname = fd.readString();
m_clusterName = fd.readString();
m_databaseName = fd.readString();
m_tableName = fd.readString();
m_isReplicated = fd.readBoolean();
if (!m_isReplicated) {
m_partitionIds = (int[]) fd.readArray(int.class);
if (!m_completed) {
for (Integer partitionId : m_partitionIds) {
m_corruptedPartitions.add(partitionId);
}
}
m_totalPartitions = fd.readInt();
} else {
m_partitionIds = new int[] { 0 };
m_totalPartitions = 1;
if (!m_completed) {
m_corruptedPartitions.add(0);
}
}
//System.err.println("Tablename :" + m_tableName);
//System.err.println("Replicated :" + m_isReplicated);
//System.err.println("# Partitions :" + m_totalPartitions);
// System.err.println("Completed :"+m_completed);
//System.err.println("File Channel Size :" + m_saveFile.size());
//System.err.println("File Channel Position :" + m_saveFile.position());
//System.err.println("-----");
/*
* Several runtime exceptions can be thrown in valid failure cases
* where a corrupt save file is being detected.
*/
} catch (BufferUnderflowException e) {
throw new IOException(e);
} catch (BufferOverflowException e) {
throw new IOException(e);
} catch (IndexOutOfBoundsException e) {
throw new IOException(e);
}
}
public int[] getVersionNumber() {
return m_versionNum;
}
public int getHostId() {
return m_hostId;
}
public String getHostname() {
return m_hostname;
}
public String getClusterName() {
return m_clusterName;
}
public String getDatabaseName() {
return m_databaseName;
}
public String getTableName() {
return m_tableName;
}
public int[] getPartitionIds() {
return m_partitionIds;
}
public boolean isReplicated() {
return m_isReplicated;
}
public int getTotalPartitions() {
return m_totalPartitions;
}
public boolean getCompleted() {
return m_completed;
}
public long getCreateTime() {
return m_createTime;
}
public FileChannel getFileChannel() {
return m_saveFile;
}
public void setFilePath(String path) {
m_filePath = path;
}
public String getFilePath() {
return m_filePath;
}
public void close() throws IOException {
if (m_chunkReaderThread != null) {
m_chunkReaderThread.interrupt();
try {
m_chunkReaderThread.join();
} catch (InterruptedException e) {
throw new IOException(e);
}
}
synchronized (this) {
while (!m_availableChunks.isEmpty()) {
m_availableChunks.poll().discard();
}
notifyAll();
}
}
public Set<Integer> getCorruptedPartitionIds() {
return m_corruptedPartitions;
}
public ByteBuffer getTableHeader() {
return m_tableHeader;
}
    /**
     * Get the next chunk of the table; the returned buffer will be just over
     * the chunk size
     */
public synchronized BBContainer getNextChunk() throws IOException {
if (m_chunkReaderException != null) {
throw m_chunkReaderException;
}
if (!m_hasMoreChunks) {
return m_availableChunks.poll();
}
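        // The reader thread is started lazily on the first call, so a file
        // opened only to inspect its metadata never spawns a thread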
if (m_chunkReader == null) {
m_chunkReader = new ChunkReader();
m_chunkReaderThread = new Thread(m_chunkReader, "ChunkReader");
m_chunkReaderThread.start();
}
Container c = null;
while (c == null && (m_hasMoreChunks || !m_availableChunks.isEmpty())) {
c = m_availableChunks.poll();
if (c == null) {
try {
wait();
                } catch (InterruptedException e) {
                    throw new IOException(e);
                }
}
}
}
if (c != null) {
m_chunkReads.release();
}
return c;
}
public synchronized boolean hasMoreChunks() throws IOException {
if (m_chunkReaderException != null) {
throw m_chunkReaderException;
}
return m_hasMoreChunks || !m_availableChunks.isEmpty();
}
private final FileChannel m_saveFile;
private final ByteBuffer m_tableHeader;
private final boolean m_completed;
private final int m_versionNum[] = new int[4];
private final int m_hostId;
private final String m_hostname;
private final String m_clusterName;
private final String m_databaseName;
private final String m_tableName;
private final boolean m_isReplicated;
private final int m_partitionIds[];
private final int m_totalPartitions;
private final long m_createTime;
private boolean m_hasMoreChunks = true;
private static ConcurrentLinkedQueue<Container> m_buffers = new ConcurrentLinkedQueue<Container>();
private final ArrayDeque<Container> m_availableChunks = new ArrayDeque<Container>();
private final HashSet<Integer> m_relevantPartitionIds;
private String m_filePath;
/**
* Maintain a list of corrupted partitions. It is possible for uncorrupted
* partitions to be recovered from a save file in the future
*/
private final HashSet<Integer> m_corruptedPartitions = new HashSet<Integer>();
/**
* Ignore corrupted chunks and continue validation of the rest of the
* chunks.
*/
private final boolean m_continueOnCorruptedChunk;
/**
* The thread reading chunks will read at most this number of chunks
*/
private final Semaphore m_chunkReads;
private ChunkReader m_chunkReader = null;
private Thread m_chunkReaderThread = null;
private IOException m_chunkReaderException = null;
/**
* Thread to read chunks from the disk
*/
private class ChunkReader implements Runnable {
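        /*
         * On-disk chunk layout, as reconstructed from the reads below (an
         * informal sketch, not an authoritative format spec):
         *
         *   4 octets - chunk length, covering everything after this field
         *   4 octets - partition id
         *   4 octets - CRC32 of the partition id bytes
         *   4 octets - CRC32 of the tuple data
         *   (chunk length - 12) octets - tuple data, ending with a 4 octet
         *   row count
         */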
private void readChunks() {
int chunksRead = 0;
while (m_hasMoreChunks) {
/*
* Limit the number of chunk reads at any one time.
*/
try {
m_chunkReads.acquire();
} catch (InterruptedException e) {
return;
}
boolean expectedAnotherChunk = false;
try {
                    /*
                     * Read the 16 byte chunk header: the chunk length, the
                     * partition id, the CRC of the partition id, and the CRC
                     * of the chunk data
                     */
ByteBuffer chunkLengthB = ByteBuffer.allocate(16);
while (chunkLengthB.hasRemaining()) {
final int read = m_saveFile.read(chunkLengthB);
if (read == -1) {
throw new EOFException();
}
}
chunkLengthB.flip();
final int nextChunkLength = chunkLengthB.getInt();
expectedAnotherChunk = true;
/*
* Get the partition id and its CRC and validate it.
* Validating the partition ID for the chunk separately
* makes it possible to continue processing chunks from
* other partitions if only one partition has corrupt chunks
* in the file.
*/
final CRC32 partitionIdCRC = new CRC32();
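                    // mark/reset lets the same four bytes be consumed twice:
                    // once as an int, once as raw bytes for the CRC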
chunkLengthB.mark();
final int nextChunkPartitionId = chunkLengthB.getInt();
final int nextChunkPartitionIdCRC = chunkLengthB.getInt();
chunkLengthB.reset();
byte partitionIdBytes[] = new byte[4];
chunkLengthB.get(partitionIdBytes);
partitionIdCRC.update(partitionIdBytes);
int generatedValue = (int) partitionIdCRC.getValue();
if (generatedValue != nextChunkPartitionIdCRC) {
chunkLengthB.position(0);
for (int partitionId : m_partitionIds) {
m_corruptedPartitions.add(partitionId);
}
throw new IOException("Chunk partition ID CRC check failed. " + "This corrupts all partitions in this file");
}
//System.err.println("nextChunkPartitionId :"+nextChunkPartitionId);
//System.err.println("nextChunkLength :"+nextChunkLength);
/*
* CRC for the data portion of the chunk
*/
chunkLengthB.position(chunkLengthB.position() + 4);
final int nextChunkCRC = chunkLengthB.getInt();
/*
* Sanity check the length value to ensure there isn't a
* runtime exception or OOM.
*/
if (nextChunkLength < 0) {
throw new IOException("Corrupted TableSaveFile chunk has negative chunk length");
}
if (nextChunkLength > DEFAULT_CHUNKSIZE) {
throw new IOException("Corrupted TableSaveFile chunk has unreasonable length " + "> DEFAULT_CHUNKSIZE bytes");
}
                    /*
                     * Now allocate space to store the chunk using the VoltTable
                     * serialization representation. The chunk will have an
                     * integer row count prepended to the tuple data so it can
                     * be sucked straight in. There is a little funny business
                     * to overwrite the partition id that is not part of the
                     * serialization format
                     */
Container c = m_buffers.poll();
if (c == null) {
final BBContainer originContainer = DBBPool.allocateDirect(DEFAULT_CHUNKSIZE);
final ByteBuffer b = originContainer.b;
final long pointer = org.voltdb.utils.DBBPool.getBufferAddress(b);
c = new Container(b, pointer, originContainer);
}
/*
* If the length value is wrong or not all data made it to
* disk this read will not complete correctly. There could
* be overflow, underflow etc. so use a try finally block to
* indicate that all partitions are now corrupt. The
* enclosing exception handlers will do the right thing WRT
* to propagating the error and closing the file.
*/
boolean completedRead = false;
int checksumStartPosition = 0;
int rowCount = 0;
try {
/*
* Assemble a VoltTable out of the chunk of tuples. Put
* in the header that was cached in the constructor,
* then copy the tuple data. The row count is at the end
* because it isn't known until serialization is
* complete. It will have to be moved back to the
* beginning of the tuple data after the header once the
* CRC has been calculated.
*/
c.b.clear();
c.b.limit((nextChunkLength - 8) + m_tableHeader.capacity());
m_tableHeader.position(0);
c.b.put(m_tableHeader);
                        // leave space for the row count to be moved into place later
                        c.b.position(c.b.position() + 4);
checksumStartPosition = c.b.position();
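                        // the chunk CRC covers everything from here to the
                        // limit, including the trailing row count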
while (c.b.hasRemaining()) {
final int read = m_saveFile.read(c.b);
if (read == -1) {
throw new EOFException();
}
}
c.b.position(c.b.position() - 4);
rowCount = c.b.getInt();
c.b.position(checksumStartPosition);
completedRead = true;
} finally {
if (!completedRead) {
for (int partitionId : m_partitionIds) {
m_corruptedPartitions.add(partitionId);
}
}
}
/*
* Validate the rest of the chunk. This can fail if the data
* is corrupted or the length value was corrupted.
*/
final int calculatedCRC = DBBPool.getBufferCRC32(c.b, c.b.position(), c.b.remaining());
if (calculatedCRC != nextChunkCRC) {
m_corruptedPartitions.add(nextChunkPartitionId);
if (m_continueOnCorruptedChunk) {
c.discard();
m_chunkReads.release();
continue;
} else {
throw new IOException("CRC mismatch in saved table chunk");
}
}
                    /*
                     * Skip irrelevant chunks after the CRC is calculated. Always
                     * calculate the CRC in case it is the length value that is
                     * corrupted
                     */
if (m_relevantPartitionIds != null) {
if (!m_relevantPartitionIds.contains(nextChunkPartitionId)) {
c.discard();
m_chunkReads.release();
continue;
}
}
/*
* The row count which was stored on disk at the end (and
* for the CRC calc) is now moved to the appropriate place
* for the table serialization format. Update the limit to
* reflect that. Surrounded in a try finally just in case
* there is overflow/underflow. Shouldn't happen but I could
* be wrong.
*/
boolean success = false;
try {
c.b.limit(c.b.limit() - 4);
c.b.position(checksumStartPosition - 4);
c.b.putInt(rowCount);
c.b.position(0);
success = true;
} finally {
if (!success) {
for (int partitionId : m_partitionIds) {
m_corruptedPartitions.add(partitionId);
}
}
}
++chunksRead;
synchronized (TableSaveFile.this) {
m_availableChunks.offer(c);
TableSaveFile.this.notifyAll();
}
} catch (EOFException eof) {
synchronized (TableSaveFile.this) {
m_hasMoreChunks = false;
if (expectedAnotherChunk) {
m_chunkReaderException = new IOException("Expected to find another chunk but reached end of file instead");
}
TableSaveFile.this.notifyAll();
}
} catch (IOException e) {
synchronized (TableSaveFile.this) {
m_hasMoreChunks = false;
m_chunkReaderException = e;
TableSaveFile.this.notifyAll();
}
} catch (BufferUnderflowException e) {
synchronized (TableSaveFile.this) {
m_hasMoreChunks = false;
m_chunkReaderException = new IOException(e);
TableSaveFile.this.notifyAll();
}
} catch (BufferOverflowException e) {
synchronized (TableSaveFile.this) {
m_hasMoreChunks = false;
m_chunkReaderException = new IOException(e);
TableSaveFile.this.notifyAll();
}
} catch (IndexOutOfBoundsException e) {
synchronized (TableSaveFile.this) {
m_hasMoreChunks = false;
m_chunkReaderException = new IOException(e);
TableSaveFile.this.notifyAll();
}
}
}
}
@Override
public void run() {
try {
readChunks();
} finally {
synchronized (TableSaveFile.this) {
m_hasMoreChunks = false;
TableSaveFile.this.notifyAll();
try {
m_saveFile.close();
} catch (IOException e) {
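                        // ignore: the file is being torn down and there is
                        // nothing useful to do with a close failure here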
}
}
}
}
}
}