/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.fusesource.hawtjournal.api;
import java.util.Queue;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.Iterator;
import java.util.Set;
import java.io.File;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.concurrent.ConcurrentNavigableMap;
import java.util.concurrent.ConcurrentSkipListMap;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.AtomicReference;
import java.util.zip.Adler32;
import java.util.zip.Checksum;
import org.fusesource.hawtbuf.Buffer;
import org.fusesource.hawtbuf.DataByteArrayOutputStream;
import org.fusesource.hawtjournal.util.IOHelper;
import static org.fusesource.hawtjournal.util.LogHelper.*;
/**
* Journal implementation based on append-only rotating logs and checksummed records, with fully concurrent writes and reads,
* dynamic batching and log compaction.<br/>
* Journal records can be written, read and deleted by providing a {@link Location} object.<br/>
* The whole journal can be replayed by simply iterating over it in a for-each loop.<br/>
*
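* <p>
* A minimal usage sketch (the directory name and record contents are illustrative):
* <pre>
* Journal journal = new Journal();
* journal.setDirectory(new File("journal-directory"));
* journal.open();
* Location written = journal.write(ByteBuffer.wrap("some record".getBytes(Charset.forName("UTF-8"))), true);
* ByteBuffer record = journal.read(written);
* for (Location replayed : journal) {
*     ByteBuffer data = journal.read(replayed);
* }
* journal.delete(written);
* journal.compact();
* journal.close();
* </pre>
*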
* @author <a href="http://hiramchirino.com">Hiram Chirino</a>
* @author Sergio Bossa
*/
public class Journal implements Iterable<Location> {
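// Each record starts with a header made of a 4-byte size field and a 1-byte type field (see WriteBatch.perform below).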
static final int RECORD_SIZE = 4;
static final int TYPE_SIZE = 1;
static final int HEADER_SIZE = RECORD_SIZE + TYPE_SIZE;
//
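// Batch control record layout: record header, 4-byte batch length, "WRITE BATCH" magic bytes, and an 8-byte Adler-32 checksum of the batched records.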
static final int BATCH_SIZE = 4;
static final int CHECKSUM_SIZE = 8;
static final byte[] BATCH_CONTROL_RECORD_MAGIC = "WRITE BATCH".getBytes(Charset.forName("UTF-8"));
static final int BATCH_CONTROL_RECORD_SIZE = HEADER_SIZE + BATCH_SIZE + BATCH_CONTROL_RECORD_MAGIC.length + CHECKSUM_SIZE;
//
static final String DEFAULT_DIRECTORY = ".";
static final String DEFAULT_ARCHIVE_DIRECTORY = "data-archive";
static final String DEFAULT_FILE_PREFIX = "db-";
static final String DEFAULT_FILE_SUFFIX = ".log";
static final int DEFAULT_MAX_FILE_LENGTH = 1024 * 1024 * 32;
static final int DEFAULT_DISPOSE_INTERVAL = 1000 * 60;
static final int MIN_FILE_LENGTH = 1024;
static final int DEFAULT_MAX_BATCH_SIZE = DEFAULT_MAX_FILE_LENGTH;
//
private final ConcurrentNavigableMap<Integer, DataFile> dataFiles = new ConcurrentSkipListMap<Integer, DataFile>();
private final ConcurrentNavigableMap<Location, WriteCommand> inflightWrites = new ConcurrentSkipListMap<Location, WriteCommand>();
private final AtomicReference<Location> lastAppendLocation = new AtomicReference<Location>();
private final AtomicLong totalLength = new AtomicLong();
//
private File directory = new File(DEFAULT_DIRECTORY);
private File directoryArchive = new File(DEFAULT_ARCHIVE_DIRECTORY);
//
private String filePrefix = DEFAULT_FILE_PREFIX;
private String fileSuffix = DEFAULT_FILE_SUFFIX;
private int maxWriteBatchSize = DEFAULT_MAX_BATCH_SIZE;
private int maxFileLength = DEFAULT_MAX_FILE_LENGTH;
private long disposeInterval = DEFAULT_DISPOSE_INTERVAL;
private boolean checksum = true;
//
private DataFileAppender appender;
private DataFileAccessor accessor;
//
private boolean opened;
//
private boolean archiveFiles;
//
private JournalListener listener;
//
private ReplicationTarget replicationTarget;
/**
* Open the journal, recovering it if it already exists.
*
* @throws IOException
*/
public synchronized void open() throws IOException {
if (opened) {
return;
}
if (maxFileLength < MIN_FILE_LENGTH) {
throw new IllegalStateException("Max file length must be equal or greater than: " + MIN_FILE_LENGTH);
}
if (maxWriteBatchSize > maxFileLength) {
throw new IllegalStateException("Max batch size must be equal or less than: " + maxFileLength);
}
long start = System.currentTimeMillis();
opened = true;
accessor = new DataFileAccessor(this);
accessor.open();
appender = new DataFileAppender(this);
appender.open();
File[] files = directory.listFiles(new FilenameFilter() {
public boolean accept(File dir, String n) {
return dir.equals(directory) && n.startsWith(filePrefix) && n.endsWith(fileSuffix);
}
});
if (files != null && files.length > 0) {
for (int i = 0; i < files.length; i++) {
try {
File file = files[i];
String n = file.getName();
String numStr = n.substring(filePrefix.length(), n.length() - fileSuffix.length());
int num = Integer.parseInt(numStr);
DataFile dataFile = new DataFile(file, num);
dataFiles.put(dataFile.getDataFileId(), dataFile);
totalLength.addAndGet(dataFile.getLength());
} catch (NumberFormatException e) {
// Ignore files that do not match the naming pattern.
}
}
try {
Location recovered = recoveryCheck();
lastAppendLocation.set(recovered);
} catch (IOException e) {
warn(e, "Recovery check failed!");
}
}
long end = System.currentTimeMillis();
trace("Startup took: %d ms", (end - start));
}
/**
* Close the journal.
*
* @throws IOException
*/
public synchronized void close() throws IOException {
if (!opened) {
return;
}
accessor.close();
appender.close();
dataFiles.clear();
inflightWrites.clear();
opened = false;
}
/**
* Compact the journal, reducing the size of logs that contain deleted entries and removing logs that contain only deleted entries.
*
* @throws IOException
*/
public synchronized void compact() throws IOException {
if (!opened) {
return;
} else {
accessor.pause();
try {
for (DataFile file : dataFiles.values()) {
// Can't compact the data file currently being written to, or any file after it:
if (file.getDataFileId() >= lastAppendLocation.get().getDataFileId()) {
continue;
} else {
Location firstUserLocation = goToFirstLocation(file, Location.USER_RECORD_TYPE, false);
if (firstUserLocation == null) {
removeDataFile(file);
} else {
Location firstDeletedLocation = goToFirstLocation(file, Location.DELETED_RECORD_TYPE, false);
if (firstDeletedLocation != null) {
compactDataFile(file, firstUserLocation);
}
}
}
}
} finally {
accessor.resume();
}
}
}
/**
* Read the record stored at the given {@link Location}.
*
* @param location
* @return
* @throws IOException
* @throws IllegalStateException
*/
public ByteBuffer read(Location location) throws IOException, IllegalStateException {
Buffer buffer = accessor.readLocation(location);
return buffer.toByteBuffer();
}
/**
* Write the given byte buffer record, either sync or async, and return the stored {@link Location}.<br/>
* A sync write causes all previously batched async writes to be synced too.
*
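* A short sketch, assuming {@code journal} is an open journal and the two buffers are caller-provided:
* <pre>
* Location batched = journal.write(firstRecord, false); // queued in the current write batch
* Location synced = journal.write(secondRecord, true);  // forces this and all batched writes to disk
* </pre>
*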
* @param data
* @param sync True if sync, false if async.
* @return
* @throws IOException
* @throws IllegalStateException
*/
public Location write(ByteBuffer data, boolean sync) throws IOException, IllegalStateException {
Location loc = appender.storeItem(new Buffer(data), Location.USER_RECORD_TYPE, sync);
return loc;
}
/**
* Delete the record at the given {@link Location}.<br/>
* A delete first forces a batch sync and is always logical: records are physically removed only when the journal is compacted.
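* <p>
* A short sketch, assuming {@code location} was returned by a previous write on an open {@code journal}:
* <pre>
* journal.delete(location); // logical delete, forces a batch sync first
* journal.compact();        // physically reclaims space from fully deleted logs
* </pre>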
* @param location
* @throws IOException
* @throws IllegalStateException
*/
public void delete(Location location) throws IOException, IllegalStateException {
accessor.updateLocation(location, Location.DELETED_RECORD_TYPE, true);
}
/**
* Return an iterator to replay the journal by going through all record locations.
*
* @return
*/
public Iterator<Location> iterator() {
return new Iterator<Location>() {
private Location next = init();
public boolean hasNext() {
return next != null;
}
public Location next() {
if (next != null) {
try {
Location current = next;
next = goToNextLocation(current, Location.USER_RECORD_TYPE, true);
return current;
} catch (IOException ex) {
throw new IllegalStateException(ex.getMessage(), ex);
}
} else {
throw new IllegalStateException("No next location!");
}
}
public void remove() {
if (next != null) {
try {
delete(next);
} catch (IOException ex) {
throw new IllegalStateException(ex.getMessage(), ex);
}
} else {
throw new IllegalStateException("No location to remove!");
}
}
private Location init() {
try {
return goToFirstLocation(dataFiles.firstEntry().getValue(), Location.USER_RECORD_TYPE, true);
} catch (IOException ex) {
throw new IllegalStateException(ex.getMessage(), ex);
}
}
};
}
/**
* Get the files that make up this journal.
* @return
*/
public Set<File> getFiles() {
Set<File> result = new HashSet<File>();
for (DataFile dataFile : dataFiles.values()) {
result.add(dataFile.getFile());
}
return result;
}
/**
* Get the max length of each log file.
* @return
*/
public int getMaxFileLength() {
return maxFileLength;
}
/**
* Set the max length of each log file.
*/
public void setMaxFileLength(int maxFileLength) {
this.maxFileLength = maxFileLength;
}
/**
* Get the journal directory containing log files.
* @return
*/
public File getDirectory() {
return directory;
}
/**
* Set the journal directory containing log files.
*/
public void setDirectory(File directory) {
this.directory = directory;
}
/**
* Get the prefix for log files.
* @return
*/
public String getFilePrefix() {
return filePrefix;
}
/**
* Set the prefix for log files.
* @param filePrefix
*/
public void setFilePrefix(String filePrefix) {
this.filePrefix = filePrefix;
}
/**
* Get the optional archive directory used to archive cleaned up log files.
* @return
*/
public File getDirectoryArchive() {
return directoryArchive;
}
/**
* Set the optional archive directory used to archive cleaned up log files.
* @param directoryArchive
*/
public void setDirectoryArchive(File directoryArchive) {
this.directoryArchive = directoryArchive;
}
/**
* Return true if cleaned up log files should be archived, false otherwise.
* @return
*/
public boolean isArchiveFiles() {
return archiveFiles;
}
/**
* Set whether cleaned up log files should be archived.
* @param archiveFiles
*/
public void setArchiveFiles(boolean archiveFiles) {
this.archiveFiles = archiveFiles;
}
/**
* Set the {@link ReplicationTarget} to replicate batch writes to.
* @param replicationTarget
*/
public void setReplicationTarget(ReplicationTarget replicationTarget) {
this.replicationTarget = replicationTarget;
}
/**
* Get the {@link ReplicationTarget} to replicate batch writes to.
* @return
*/
public ReplicationTarget getReplicationTarget() {
return replicationTarget;
}
/**
* Get the suffix for log files.
* @return
*/
public String getFileSuffix() {
return fileSuffix;
}
/**
* Set the suffix for log files.
* @param fileSuffix
*/
public void setFileSuffix(String fileSuffix) {
this.fileSuffix = fileSuffix;
}
/**
* Return true if record checksumming is enabled, false otherwise.
* @return
*/
public boolean isChecksum() {
return checksum;
}
/**
* Set whether record checksumming is enabled.
* @param checksumWrites
*/
public void setChecksum(boolean checksumWrites) {
this.checksum = checksumWrites;
}
/**
* Get the max size in bytes of the write batch: must always be equal to or less than the max file length.
* @return
*/
public int getMaxWriteBatchSize() {
return maxWriteBatchSize;
}
/**
* Set the max size in bytes of the write batch: must always be equal to or less than the max file length.
* @param maxWriteBatchSize
*/
public void setMaxWriteBatchSize(int maxWriteBatchSize) {
this.maxWriteBatchSize = maxWriteBatchSize;
}
/**
* Get the {@link JournalListener} to notify when syncing batches.
* @return
*/
public JournalListener getListener() {
return listener;
}
/**
* Set the {@link JournalListener} to notify when syncing batches.
* @param listener
*/
public void setListener(JournalListener listener) {
this.listener = listener;
}
/**
* Set the interval in milliseconds for resource disposal: files that have not been accessed within this interval will be closed.
* @param disposeInterval
*/
public void setDisposeInterval(long disposeInterval) {
this.disposeInterval = disposeInterval;
}
/**
* Get the interval in milliseconds for resource disposal.
* @return
*/
public long getDisposeInterval() {
return disposeInterval;
}
@Override
public String toString() {
return directory.toString();
}
ConcurrentNavigableMap<Integer, DataFile> getDataFiles() {
return dataFiles;
}
ConcurrentNavigableMap<Location, WriteCommand> getInflightWrites() {
return inflightWrites;
}
void sync() throws IOException {
try {
appender.sync().get();
} catch (Exception ex) {
throw new IllegalStateException(ex.getMessage(), ex);
}
}
DataFile getCurrentWriteFile() throws IOException {
if (dataFiles.isEmpty()) {
rotateWriteFile();
}
return dataFiles.lastEntry().getValue();
}
DataFile rotateWriteFile() {
int nextNum = !dataFiles.isEmpty() ? dataFiles.lastEntry().getValue().getDataFileId().intValue() + 1 : 1;
File file = getFile(nextNum);
DataFile nextWriteFile = new DataFile(file, nextNum);
if (!dataFiles.isEmpty()) {
dataFiles.lastEntry().getValue().setNext(nextWriteFile);
}
dataFiles.put(nextWriteFile.getDataFileId(), nextWriteFile);
return nextWriteFile;
}
void setLastAppendLocation(Location location) {
this.lastAppendLocation.set(location);
}
void addToTotalLength(int size) {
totalLength.addAndGet(size);
}
private Location goToFirstLocation(DataFile file, byte type, boolean goToNextFile) throws IOException, IllegalStateException {
Location candidate = new Location();
candidate.setDataFileId(file.getDataFileId());
candidate.setOffset(0);
if (!fillLocationDetails(candidate, false)) {
return null;
} else {
if (candidate.getType() == type) {
return candidate;
} else {
return goToNextLocation(candidate, type, goToNextFile);
}
}
}
private Location goToNextLocation(Location start, byte type, boolean goToNextFile) throws IOException {
if (start.getSize() == Location.NOT_SET && !fillLocationDetails(start, false)) {
return null;
} else {
Location current = start;
Location next = null;
while (next == null) {
Location candidate = new Location(current);
candidate.setOffset(current.getOffset() + current.getSize());
if (!fillLocationDetails(candidate, goToNextFile)) {
break;
} else {
if (candidate.getType() == type) {
next = candidate;
} else {
current = candidate;
}
}
}
return next;
}
}
private boolean fillLocationDetails(Location cur, boolean goToNextFile) throws IOException {
DataFile dataFile = getDataFile(cur);
// Did the location roll over into the next file, and should we follow it?
if (dataFile.getLength() <= cur.getOffset()) {
if (goToNextFile) {
dataFile = getNextDataFile(dataFile);
if (dataFile == null) {
return false;
} else {
cur.setDataFileId(dataFile.getDataFileId().intValue());
cur.setOffset(0);
}
} else {
return false;
}
}
return accessor.fillLocationDetails(cur);
}
private File getFile(int nextNum) {
String fileName = filePrefix + nextNum + fileSuffix;
File file = new File(directory, fileName);
return file;
}
private DataFile getDataFile(Location item) throws IOException {
Integer key = Integer.valueOf(item.getDataFileId());
DataFile dataFile = dataFiles.get(key);
if (dataFile == null) {
error("Looking for key %d but not found among data files %s", key, dataFiles);
throw new IOException("Could not locate data file " + getFile(item.getDataFileId()));
}
return dataFile;
}
private DataFile getNextDataFile(DataFile dataFile) {
return dataFile.getNext();
}
private void removeDataFile(DataFile dataFile) throws IOException {
dataFiles.remove(dataFile.getDataFileId());
totalLength.addAndGet(-dataFile.getLength());
if (archiveFiles) {
dataFile.move(getDirectoryArchive());
debug("moved data file %s to %s", dataFile, getDirectoryArchive());
} else {
if (dataFile.delete()) {
debug("Discarded data file %s", dataFile);
} else {
warn("Failed to discard data file %s", dataFile.getFile());
}
}
}
private void compactDataFile(DataFile currentFile, Location firstUserLocation) throws IOException {
DataFile tmpFile = new DataFile(
new File(currentFile.getFile().getParent(), filePrefix + currentFile.getDataFileId() + ".tmp" + fileSuffix),
currentFile.getDataFileId());
RandomAccessFile raf = tmpFile.openRandomAccessFile();
try {
Location currentUserLocation = firstUserLocation;
WriteBatch batch = new WriteBatch(tmpFile, 0);
batch.prepareBatch();
while (currentUserLocation != null) {
Buffer data = accessor.readLocation(currentUserLocation);
WriteCommand write = new WriteCommand(new Location(currentUserLocation), data, true);
batch.appendBatch(write);
currentUserLocation = goToNextLocation(currentUserLocation, Location.USER_RECORD_TYPE, false);
}
batch.perform(raf, null, true);
} finally {
if (raf != null) {
raf.close();
}
}
if (currentFile.getFile().delete()) {
accessor.dispose(currentFile);
totalLength.addAndGet(-currentFile.getLength());
totalLength.addAndGet(tmpFile.getLength());
if (tmpFile.getFile().renameTo(currentFile.getFile())) {
currentFile.setLength(tmpFile.getLength());
} else {
throw new IOException("Cannot rename file: " + tmpFile.getFile());
}
} else {
throw new IOException("Cannot remove file: " + currentFile.getFile());
}
}
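// Scan the batch control records starting from the first data file, verifying the magic bytes and, when enabled,
// the Adler-32 checksum, and return the location of the last valid batch control record found.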
private Location recoveryCheck() throws IOException {
Location location = goToFirstLocation(dataFiles.firstEntry().getValue(), Location.BATCH_CONTROL_RECORD_TYPE, false);
if (location == null) {
return null;
}
while (true) {
ByteBuffer buffer = accessor.readLocation(location).toByteBuffer();
for (int i = 0; i < BATCH_CONTROL_RECORD_MAGIC.length; i++) {
if (buffer.get() != BATCH_CONTROL_RECORD_MAGIC[i]) {
throw new IOException("Bad control record magic for location: " + location);
}
}
if (isChecksum()) {
long expectedChecksum = buffer.getLong();
byte data[] = new byte[buffer.remaining()];
Checksum checksum = new Adler32();
buffer.get(data);
checksum.update(data, 0, data.length);
if (expectedChecksum != checksum.getValue()) {
throw new IOException("Bad checksum for location: " + location);
}
}
Location next = goToNextLocation(location, Location.BATCH_CONTROL_RECORD_TYPE, true);
if (next != null) {
location = next;
} else {
break;
}
}
return location;
}
static class WriteBatch {
private final DataFile dataFile;
private final Queue<WriteCommand> writes = new ConcurrentLinkedQueue<WriteCommand>();
private final CountDownLatch latch = new CountDownLatch(1);
private final int offset;
private volatile int size;
WriteBatch() {
this.dataFile = null;
this.offset = -1;
}
WriteBatch(DataFile dataFile, int offset) throws IOException {
this.dataFile = dataFile;
this.offset = offset;
this.size = BATCH_CONTROL_RECORD_SIZE;
}
boolean canBatch(WriteCommand write, int maxWriteBatchSize, int maxFileLength) throws IOException {
int thisBatchSize = size + write.location.getSize();
int thisFileLength = offset + thisBatchSize;
if (thisBatchSize > maxWriteBatchSize || thisFileLength > maxFileLength) {
return false;
} else {
return true;
}
}
WriteCommand prepareBatch() throws IOException {
WriteCommand controlRecord = new WriteCommand(new Location(), null, false);
controlRecord.location.setType(Location.BATCH_CONTROL_RECORD_TYPE);
controlRecord.location.setSize(Journal.BATCH_CONTROL_RECORD_SIZE);
controlRecord.location.setDataFileId(dataFile.getDataFileId());
controlRecord.location.setOffset(offset);
size = controlRecord.location.getSize();
dataFile.incrementLength(size);
writes.offer(controlRecord);
return controlRecord;
}
void appendBatch(WriteCommand writeRecord) throws IOException {
writeRecord.location.setDataFileId(dataFile.getDataFileId());
writeRecord.location.setOffset(offset + size);
size += writeRecord.location.getSize();
dataFile.incrementLength(writeRecord.location.getSize());
writes.offer(writeRecord);
}
Location perform(RandomAccessFile file, ReplicationTarget replicator, boolean checksum) throws IOException {
DataByteArrayOutputStream buffer = new DataByteArrayOutputStream(size);
boolean forceToDisk = false;
WriteCommand latest = null;
// Write an empty batch control record.
buffer.reset();
buffer.writeInt(BATCH_CONTROL_RECORD_SIZE);
buffer.writeByte(Location.BATCH_CONTROL_RECORD_TYPE);
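// The 4-byte batch length and the trailing 8-byte checksum are written as zero placeholders here and patched
// once the batched records have been appended below (the checksum only when checksumming is enabled).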
buffer.writeInt(0);
buffer.write(BATCH_CONTROL_RECORD_MAGIC);
buffer.writeLong(0);
WriteCommand control = writes.peek();
Iterator<WriteCommand> commands = writes.iterator();
// Skip the control write:
commands.next();
// Process others:
while (commands.hasNext()) {
WriteCommand current = commands.next();
forceToDisk |= current.sync;
buffer.writeInt(current.location.getSize());
buffer.writeByte(current.location.getType());
buffer.write(current.data.getData(), current.data.getOffset(), current.data.getLength());
latest = current;
}
// Now we can fill in the batch control record properly.
Buffer sequence = buffer.toBuffer();
buffer.reset();
buffer.skip(Journal.HEADER_SIZE);
buffer.writeInt(sequence.getLength() - Journal.HEADER_SIZE - Journal.BATCH_SIZE);
buffer.skip(Journal.BATCH_CONTROL_RECORD_MAGIC.length);
if (checksum) {
Checksum adler32 = new Adler32();
adler32.update(sequence.getData(), sequence.getOffset() + Journal.BATCH_CONTROL_RECORD_SIZE, sequence.getLength() - Journal.BATCH_CONTROL_RECORD_SIZE);
buffer.writeLong(adler32.getValue());
}
// Now do the 1 big write.
file.seek(offset);
file.write(sequence.getData(), sequence.getOffset(), sequence.getLength());
if (forceToDisk) {
IOHelper.sync(file.getFD());
}
if (replicator != null) {
replicator.replicate(control.location, sequence, forceToDisk);
}
return latest.location;
}
DataFile getDataFile() {
return dataFile;
}
int getSize() {
return size;
}
CountDownLatch getLatch() {
return latch;
}
Collection<WriteCommand> getWrites() {
return Collections.unmodifiableCollection(writes);
}
boolean isEmpty() {
return writes.isEmpty();
}
}
static class WriteCommand implements JournalListener.Write {
private final Location location;
private final boolean sync;
private volatile Buffer data;
WriteCommand(Location location, Buffer data, boolean sync) {
this.location = location;
this.data = data;
this.sync = sync;
}
public Location getLocation() {
return location;
}
Buffer getData() {
return data;
}
boolean isSync() {
return sync;
}
}
static class WriteFuture implements Future<Boolean> {
private final CountDownLatch latch;
WriteFuture(CountDownLatch latch) {
this.latch = latch;
}
public boolean cancel(boolean mayInterruptIfRunning) {
throw new UnsupportedOperationException("Cannot cancel this type of future!");
}
public boolean isCancelled() {
throw new UnsupportedOperationException("Cannot cancel this type of future!");
}
public boolean isDone() {
return latch.getCount() == 0;
}
public Boolean get() throws InterruptedException, ExecutionException {
latch.await();
return true;
}
public Boolean get(long timeout, TimeUnit unit) throws InterruptedException, ExecutionException, TimeoutException {
boolean success = latch.await(timeout, unit);
return success;
}
}
}