/**
* Copyright 2011-2012 Akiban Technologies, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.persistit;
import static com.persistit.TransactionStatus.ABORTED;
import static com.persistit.util.SequencerConstants.PAGE_MAP_READ_INVALIDATE_A;
import static com.persistit.util.SequencerConstants.RECOVERY_PRUNING_B;
import static com.persistit.util.ThreadSequencer.sequence;
import static com.persistit.util.Util.NS_PER_MS;
import java.io.File;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicLong;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.persistit.AlertMonitor.AlertLevel;
import com.persistit.AlertMonitor.Event;
import com.persistit.CheckpointManager.Checkpoint;
import com.persistit.JournalRecord.CP;
import com.persistit.JournalRecord.IT;
import com.persistit.JournalRecord.IV;
import com.persistit.JournalRecord.JE;
import com.persistit.JournalRecord.JH;
import com.persistit.JournalRecord.PA;
import com.persistit.JournalRecord.PM;
import com.persistit.JournalRecord.TM;
import com.persistit.JournalRecord.TX;
import com.persistit.Persistit.FatalErrorException;
import com.persistit.TransactionPlayer.TransactionPlayerListener;
import com.persistit.exception.CorruptJournalException;
import com.persistit.exception.PersistitException;
import com.persistit.exception.PersistitIOException;
import com.persistit.exception.PersistitInterruptedException;
import com.persistit.exception.RebalanceException;
import com.persistit.exception.VolumeNotFoundException;
import com.persistit.mxbeans.JournalManagerMXBean;
import com.persistit.util.Debug;
import com.persistit.util.Util;
/**
* Manages the disk-based I/O journal. The journal contains both committed
* transactions and images of updated pages.
*
* @author peter
*
*/
class JournalManager implements JournalManagerMXBean, VolumeHandleLookup {
final static int URGENT = 10;
final static int ALMOST_URGENT = 8;
final static int HALF_URGENT = 5;
final static int URGENT_COMMIT_DELAY_MILLIS = 50;
final static int GENTLE_COMMIT_DELAY_MILLIS = 12;
private final static int IO_MEASUREMENT_CYCLES = 8;
private final static int TOO_MANY_WARN_THRESHOLD = 5;
private final static int TOO_MANY_ERROR_THRESHOLD = 10;
private final static long KILO = 1024;
/**
     * Regular expression that recognizes the name of a journal file.
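     * For example (illustrative name only),
     * {@code /var/data/persistit_journal.000000000017} matches, with
     * group(1) holding the base path and group(2) the twelve-digit
     * generation number.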
*/
final static Pattern PATH_PATTERN = Pattern.compile("(.+)\\.(\\d{12})");
private long _journalCreatedTime;
private final Map<PageNode, PageNode> _pageMap = new HashMap<PageNode, PageNode>();
private final RangeRemovingArrayList<PageNode> _pageList = new RangeRemovingArrayList<PageNode>();
private final Map<PageNode, PageNode> _branchMap = new HashMap<PageNode, PageNode>();
private final Map<Volume, Integer> _volumeToHandleMap = new HashMap<Volume, Integer>();
private final Map<Integer, Volume> _handleToVolumeMap = new HashMap<Integer, Volume>();
private final Map<TreeDescriptor, Integer> _treeToHandleMap = new HashMap<TreeDescriptor, Integer>();
private final Map<Integer, TreeDescriptor> _handleToTreeMap = new HashMap<Integer, TreeDescriptor>();
private final Map<Long, TransactionMapItem> _liveTransactionMap = new HashMap<Long, TransactionMapItem>();
private final Persistit _persistit;
private long _blockSize;
private volatile int _writeBufferSize = DEFAULT_BUFFER_SIZE;
private ByteBuffer _writeBuffer;
private long _writeBufferAddress = Long.MAX_VALUE;
private JournalFlusher _flusher;
private JournalCopier _copier;
private final AtomicBoolean _closed = new AtomicBoolean();
private final AtomicBoolean _copying = new AtomicBoolean();
private final AtomicBoolean _copyFast = new AtomicBoolean();
private final AtomicBoolean _flushing = new AtomicBoolean();
private final AtomicBoolean _appendOnly = new AtomicBoolean();
private final AtomicBoolean _ignoreMissingVolume = new AtomicBoolean();
private String _journalFilePath;
/**
* Address of first available byte in the journal. This is usually the
* address of the next record to be written, but if that next record
* requires more space than is available in the current journal file, it
* will advance to the start of the next journal file.
*/
private volatile long _currentAddress;
/**
     * Smallest journal address at which a still-needed record is located.
* Initially zero, increases as journal files are consumed and deleted.
*/
private volatile long _baseAddress;
private final Map<Long, FileChannel> _journalFileChannels = new HashMap<Long, FileChannel>();
/**
* Counter used to assign internal handle values to Volume and Tree records.
*/
private int _handleCounter = 0;
private Checkpoint _lastValidCheckpoint = new Checkpoint(0, 0);
private long _lastValidCheckpointJournalAddress = 0;
private long _lastValidCheckpointBaseAddress = 0;
private long _deleteBoundaryAddress = 0;
private int _lastReportedJournalFileCount = 0;
private boolean _isNewEpoch = true;
private volatile long _writePageCount = 0;
private volatile long _readPageCount = 0;
private volatile long _copiedPageCount = 0;
private volatile long _droppedPageCount = 0;
private final AtomicLong _totalCommits = new AtomicLong();
private final AtomicLong _totalCommitWaitTime = new AtomicLong();
private final AtomicLong _totalFlushCycles = new AtomicLong();
private final AtomicLong _totalFlushIoTime = new AtomicLong();
private volatile long _flushInterval = DEFAULT_FLUSH_INTERVAL_MS;
private volatile long _slowIoAlertThreshold = DEFAULT_SLOW_IO_ALERT_THRESHOLD_MS;
private final TransactionPlayer _player = new TransactionPlayer(new JournalTransactionPlayerSupport());
private final TransactionPlayerListener _listener = new ProactiveRollbackListener();
private final AtomicBoolean _writePagePruning = new AtomicBoolean(true);
private final AtomicBoolean _rollbackPruning = new AtomicBoolean(true);
/*
* Tunable parameters that determine how vigorously the copyBack thread
* performs I/O. Hopefully we can set good defaults and not expose these as
* knobs.
*/
private volatile long _copierInterval = DEFAULT_COPIER_INTERVAL_MS;
private volatile int _copiesPerCycle = DEFAULT_COPIES_PER_CYCLE;
private volatile long _copierTimestampLimit = Long.MAX_VALUE;
private volatile long _earliestCommittedTimestamp = Long.MAX_VALUE;
private volatile long _earliestAbortedTimestamp = Long.MAX_VALUE;
private boolean _allowHandlesForTempVolumesAndTrees;
private volatile int _urgentFileCountThreshold = DEFAULT_URGENT_FILE_COUNT_THRESHOLD;
private volatile long _throttleSleepInterval;
/**
* <p>
     * Initialize the journal. This method takes its configuration from the
     * supplied RecoveryManager if one is supplied and valid; otherwise it
     * starts a new journal at address 0.
* </p>
* <p>
* If a RecoveryManager is supplied and has a valid keystone address, then
* this method continues the existing journal. A new journal file will be
* created with a generation number one larger than that of the keystone
* file, and the new file is given the same journal create date as the
* recovered journal. New journal files are also required to have the same
* maximumSize and path name (not including generation suffix) as the
* existing journal, so in the event <code>rman</code> is non-null and
* contains a valid keystone, the <code>path</code> and
* <code>maximumSize</code> parameters are ignored.
* </p>
* <p>
* Otherwise, this method creates a new journal starting at journal address
* 0 with the specified path and maximum file size. Journal file names are
* created by appending a period followed by a generation number suffix to
* the supplied path name. For example if the supplied path is
* "/xxx/yyy/zzz" then journal file names will be
* "/xxx/yyy/zzz.000000000000", "/xxx/yyy/zzz.000000000001", and so on. (The
* suffix contains twelve digits.)
* </p>
*
     * @param rman
     *            RecoveryManager from which to continue an existing journal;
     *            ignored if <code>null</code> or if it has no valid keystone
     * @param path
     *            path name of the journal, not including the generation
     *            suffix
     * @param maximumSize
     *            maximum size in bytes of each journal file
* @throws PersistitException
*/
public synchronized void init(final RecoveryManager rman, final String path, final long maximumSize)
throws PersistitException {
_writeBuffer = ByteBuffer.allocate(_writeBufferSize);
if (rman != null && rman.getKeystoneAddress() != -1) {
_journalFilePath = rman.getJournalFilePath();
_blockSize = rman.getBlockSize();
_currentAddress = rman.getKeystoneAddress() + _blockSize;
_baseAddress = rman.getBaseAddress();
_journalCreatedTime = rman.getJournalCreatedTime();
_lastValidCheckpoint = rman.getLastValidCheckpoint();
rman.collectRecoveredPages(_pageMap, _branchMap);
rman.collectRecoveredVolumeMaps(_handleToVolumeMap, _volumeToHandleMap);
rman.collectRecoveredTreeMaps(_handleToTreeMap, _treeToHandleMap);
rman.collectRecoveredTransactionMap(_liveTransactionMap);
/*
         * Set _handleCounter so that newly created handles do not conflict
         * with existing resources.
*/
for (final Integer handle : _handleToTreeMap.keySet()) {
_handleCounter = Math.max(_handleCounter, handle + 1);
}
for (final Integer handle : _handleToVolumeMap.keySet()) {
_handleCounter = Math.max(_handleCounter, handle + 1);
}
/*
* Populate page list in journal address order.
*/
for (final PageNode root : _pageMap.values()) {
for (PageNode pn = root; pn != null; pn = pn.getPrevious()) {
_pageList.add(pn);
}
}
Collections.sort(_pageList, PageNode.READ_COMPARATOR);
} else {
_journalFilePath = journalPath(path).getAbsoluteFile().toString();
_blockSize = maximumSize;
_currentAddress = 0;
_journalCreatedTime = System.currentTimeMillis();
}
_closed.set(false);
}
public void startJournal() throws PersistitException {
synchronized (this) {
prepareWriteBuffer(JH.OVERHEAD);
}
_flusher = new JournalFlusher();
_copier = new JournalCopier();
_copier.start();
_flusher.start();
}
/**
* Copy dynamic variables into a {@link Management.JournalInfo} structure.
*
     * @param info
     *            the JournalInfo structure to populate
*/
public synchronized void populateJournalInfo(final Management.JournalInfo info) {
info.closed = _closed.get();
if (_blockSize == 0) {
return;
}
info.copiedPageCount = _copiedPageCount;
info.droppedPageCount = _droppedPageCount;
info.copying = _copying.get();
info.currentGeneration = _currentAddress;
info.currentJournalAddress = _writeBuffer == null ? 0 : _writeBufferAddress + _writeBuffer.position();
info.currentJournalFile = addressToFile(_currentAddress).getPath();
info.flushing = _flushing.get();
info.journaledPageCount = _writePageCount;
info.readPageCount = _readPageCount;
if (_lastValidCheckpointJournalAddress != 0) {
info.lastValidCheckpointSystemTime = _lastValidCheckpoint.getSystemTimeMillis();
info.lastValidCheckpointTimestamp = _lastValidCheckpoint.getTimestamp();
info.lastValidCheckpointJournalFile = addressToFile(_lastValidCheckpointJournalAddress).getPath();
info.lastValidCheckpointJournalAddress = _lastValidCheckpointJournalAddress;
} else {
info.lastValidCheckpointSystemTime = 0;
info.lastValidCheckpointTimestamp = 0;
info.lastValidCheckpointJournalFile = null;
info.lastValidCheckpointJournalAddress = 0;
}
info.blockSize = _blockSize;
info.pageMapSize = _pageMap.size();
info.baseAddress = _baseAddress;
info.appendOnly = _appendOnly.get();
info.fastCopying = _copyFast.get();
}
@Override
public synchronized int getLiveTransactionMapSize() {
return _liveTransactionMap.size();
}
@Override
public synchronized int getPageMapSize() {
return _pageMap.size();
}
@Override
public synchronized int getPageListSize() {
return _pageList.size();
}
@Override
public synchronized long getBaseAddress() {
return _baseAddress;
}
@Override
public synchronized long getCurrentAddress() {
return _currentAddress;
}
@Override
public long getBlockSize() {
return _blockSize;
}
@Override
public boolean isAppendOnly() {
return _appendOnly.get();
}
@Override
public boolean isIgnoreMissingVolumes() {
return _ignoreMissingVolume.get();
}
@Override
public boolean isCopyingFast() {
return _copyFast.get();
}
@Override
public void setAppendOnly(final boolean appendOnly) {
_appendOnly.set(appendOnly);
}
@Override
public void setIgnoreMissingVolumes(final boolean ignore) {
_ignoreMissingVolume.set(ignore);
}
@Override
public void setCopyingFast(final boolean fast) {
_copyFast.set(fast);
}
@Override
public long getFlushInterval() {
return _flusher.getPollInterval();
}
@Override
public void setFlushInterval(final long flushInterval) {
_flusher.setPollInterval(flushInterval);
}
@Override
public long getCopierInterval() {
return _copier.getPollInterval();
}
@Override
public void setCopierInterval(final long copierInterval) {
_copier.setPollInterval(copierInterval);
}
@Override
public void setRollbackPruningEnabled(final boolean rollbackPruning) {
_rollbackPruning.set(rollbackPruning);
}
@Override
public void setWritePagePruningEnabled(final boolean writePruning) {
_writePagePruning.set(writePruning);
}
public JournalManager(final Persistit persistit) {
_persistit = persistit;
}
@Override
public boolean isClosed() {
return _closed.get();
}
@Override
public boolean isCopying() {
return _copying.get();
}
@Override
public boolean isRollbackPruningEnabled() {
return _rollbackPruning.get();
}
@Override
public boolean isWritePagePruningEnabled() {
return _writePagePruning.get();
}
@Override
public String getJournalFilePath() {
return _journalFilePath;
}
@Override
public long getJournaledPageCount() {
return _writePageCount;
}
@Override
public long getReadPageCount() {
return _readPageCount;
}
@Override
public long getCopiedPageCount() {
return _copiedPageCount;
}
@Override
public long getDroppedPageCount() {
return _droppedPageCount;
}
public long getEarliestCommittedTransactionTimestamp() {
return _earliestCommittedTimestamp;
}
public long getEarliestAbortedTransactionTimestamp() {
return _earliestAbortedTimestamp;
}
@Override
public long getJournalCreatedTime() {
return _journalCreatedTime;
}
public Checkpoint getLastValidCheckpoint() {
return _lastValidCheckpoint;
}
@Override
public long getLastValidCheckpointTimestamp() {
return _lastValidCheckpoint.getTimestamp();
}
@Override
public String getLastCopierException() {
return Util.toString(_copier.getLastException());
}
@Override
public String getLastFlusherException() {
return Util.toString(_flusher.getLastException());
}
@Override
public long getLastValidCheckpointTimeMillis() {
return _lastValidCheckpoint.getSystemTimeMillis();
}
@Override
public long getSlowIoAlertThreshold() {
return _slowIoAlertThreshold;
}
@Override
public long getTotalCompletedCommits() {
return _totalCommits.get();
}
@Override
public long getCommitCompletionWaitTime() {
return _totalCommitWaitTime.get() / NS_PER_MS;
}
@Override
public long getCurrentTimestamp() {
return _persistit.getCurrentTimestamp();
}
@Override
public void setSlowIoAlertThreshold(final long slowIoAlertThreshold) {
Util.rangeCheck(slowIoAlertThreshold, MINIMUM_SLOW_ALERT_THRESHOLD_MS, MAXIMUM_SLOW_ALERT_THRESHOLD_MS);
_slowIoAlertThreshold = slowIoAlertThreshold;
}
@Override
public int getUrgentFileCountThreshold() {
return _urgentFileCountThreshold;
}
@Override
public void setUrgentFileCountThreshold(final int threshold) {
Util.rangeCheck(threshold, MINIMUM_URGENT_FILE_COUNT_THRESHOLD, MAXIMUM_URGENT_FILE_COUNT_THRESHOLD);
_urgentFileCountThreshold = threshold;
}
/**
* Compute an "urgency" factor that determines how vigorously the
* JOURNAL_COPIER thread should perform I/O. This number is computed on a
     * scale of 0 to 10; larger values are intended to make the thread work
     * harder. A value of 10 suggests the copier should run flat-out.
*
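     * Illustrative arithmetic (assuming, hypothetically, a file-count
     * threshold of 15): with 10 journal files present the result is
     * min(10 - (15 - 10), 10) = 5; at 15 or more files it saturates at
     * {@value #URGENT}, and at 5 or fewer files it is 0.
     *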
* @return the JOURNAL_COPIER urgency on a scale of 0 to 10
*/
@Override
public int urgency() {
if (_copyFast.get()) {
return URGENT;
}
final int remainingFiles = _urgentFileCountThreshold - getJournalFileCount();
return Math.max(0, Math.min(URGENT - remainingFiles, URGENT));
}
/**
     * Introduce delay into an application thread when the JOURNAL_COPIER
     * thread is behind. The amount of delay depends on the value returned by
* {@link #urgency()}. When that value is {@value #URGENT} then the delay is
* {@value #URGENT_COMMIT_DELAY_MILLIS} milliseconds.
*
* @throws PersistitInterruptedException
*/
public void throttle() throws PersistitInterruptedException {
final long interval = _throttleSleepInterval;
if (interval > 0) {
Util.sleep(interval);
}
}
int handleForVolume(final Volume volume) throws PersistitException {
if (volume.getHandle() != 0) {
return volume.getHandle();
}
if (!_allowHandlesForTempVolumesAndTrees && volume.isTemporary()) {
throw new IllegalStateException("Creating handle for temporary volume " + volume);
}
if (volume.getHandle() != 0) {
return volume.getHandle();
}
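        /*
         * Double-check under synchronization: another thread may have
         * assigned the handle between the unsynchronized tests above and
         * acquiring this lock.
         */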
synchronized (this) {
if (volume.getHandle() != 0) {
return volume.getHandle();
}
Integer handle = _volumeToHandleMap.get(volume);
if (handle == null) {
handle = Integer.valueOf(++_handleCounter);
Debug.$assert0.t(!_handleToVolumeMap.containsKey(handle));
writeVolumeHandleToJournal(volume, handle.intValue());
_volumeToHandleMap.put(volume, handle);
_handleToVolumeMap.put(handle, volume);
}
return volume.setHandle(handle.intValue());
}
}
synchronized int handleForTree(final TreeDescriptor td, final boolean create) throws PersistitException {
if (td.getVolumeHandle() == -1) {
// Tree in transient volume -- don't journal updates to it
return -1;
}
Integer handle = _treeToHandleMap.get(td);
if (handle == null) {
if (!create) {
return -1;
}
handle = Integer.valueOf(++_handleCounter);
Debug.$assert0.t(!_handleToTreeMap.containsKey(handle));
if (td.getVolumeHandle() != Volume.LOCK_VOLUME_HANDLE) {
writeTreeHandleToJournal(td, handle.intValue());
}
_treeToHandleMap.put(td, handle);
_handleToTreeMap.put(handle, td);
}
return handle.intValue();
}
int handleForTree(final Tree tree) throws PersistitException {
if (!_allowHandlesForTempVolumesAndTrees && tree.getVolume().isTemporary() && !tree.getVolume().isLockVolume()) {
throw new IllegalStateException("Creating handle for temporary tree " + tree);
}
if (tree.getHandle() != 0) {
return tree.getHandle();
}
synchronized (this) {
if (tree.getHandle() != 0) {
return tree.getHandle();
}
final TreeDescriptor td = new TreeDescriptor(handleForVolume(tree.getVolume()), tree.getName());
return tree.setHandle(handleForTree(td, true));
}
}
Tree treeForHandle(final int handle) throws PersistitException {
final TreeDescriptor td = lookupTreeHandle(handle);
if (td == null) {
return null;
}
final Volume volume = volumeForHandle(td.getVolumeHandle());
if (volume == null) {
return null;
}
return volume.getStructure().getTreeInternal(td.getTreeName());
}
Volume volumeForHandle(final int handle) throws PersistitException {
final Volume volume = lookupVolumeHandle(handle);
if (volume == null) {
if (handle == Volume.LOCK_VOLUME_HANDLE) {
return _persistit.getLockVolume();
} else {
return null;
}
}
if (!volume.isOpened()) {
volume.open(_persistit);
}
return volume;
}
synchronized Volume getVolumeByName(final String volumeName) {
for (final Volume v : _handleToVolumeMap.values()) {
if (volumeName.equals(v.getName())) {
return v;
}
}
return null;
}
@Override
public synchronized Volume lookupVolumeHandle(final int handle) {
return _handleToVolumeMap.get(Integer.valueOf(handle));
}
public synchronized TreeDescriptor lookupTreeHandle(final int handle) {
return _handleToTreeMap.get(Integer.valueOf(handle));
}
private void readFully(final ByteBuffer bb, final long address) throws PersistitIOException,
CorruptJournalException {
//
// If necessary read the bytes out of the _writeBuffer
// before they have been written out to the file. This code
// requires the _writeBuffer to be a HeapByteBuffer.
//
final int position = bb.position();
final int length = bb.remaining();
synchronized (this) {
if (address >= _writeBufferAddress && address + length <= _currentAddress) {
assert _writeBufferAddress + _writeBuffer.position() == _currentAddress : String.format(
"writeBufferAddress=%,d position=%,d currentAddress=%,d", _writeBufferAddress,
_writeBuffer.position(), _currentAddress);
final int wbPosition = _writeBuffer.position();
final int wbLimit = _writeBuffer.limit();
_writeBuffer.position((int) (address - _writeBufferAddress));
_writeBuffer.limit((int) (address - _writeBufferAddress) + length);
bb.put(_writeBuffer);
_writeBuffer.limit(wbLimit);
_writeBuffer.position(wbPosition);
bb.position(position);
return;
}
}
final FileChannel fc = getFileChannel(address);
long fileAddr = addressToOffset(address);
while (bb.remaining() > 0) {
int count;
try {
count = fc.read(bb, fileAddr);
} catch (final IOException ioe) {
throw new PersistitIOException(ioe);
}
if (count < 0) {
final File file = addressToFile(address);
throw new CorruptJournalException(String.format("End of file at %s:%d(%,d)", file, fileAddr, address));
}
fileAddr += count;
}
bb.limit(bb.position());
bb.position(position);
}
boolean readPageFromJournal(final Buffer buffer) throws PersistitIOException {
final int bufferSize = buffer.getBufferSize();
final long pageAddress = buffer.getPageAddress();
final ByteBuffer bb = buffer.getByteBuffer();
final Volume volume = buffer.getVolume();
final PageNode pn = lookupUpPageNode(pageAddress, volume);
if (pn == null) {
return false;
}
bb.position(0);
final long recordPageAddress = readPageBufferFromJournal(pn, bb);
_persistit.getIOMeter().chargeReadPageFromJournal(volume, pageAddress, bufferSize, pn.getJournalAddress(),
buffer.getIndex());
if (pageAddress != recordPageAddress) {
throw new CorruptJournalException("Record at " + pn + " is not volume/page " + buffer.toString());
}
if (bb.limit() != bufferSize) {
throw new CorruptJournalException("Record at " + pn + " is wrong size: expected/actual=" + bufferSize + "/"
+ bb.limit());
}
_readPageCount++;
buffer.getVolume().getStatistics().bumpReadCounter();
return true;
}
PageNode lookupUpPageNode(final long pageAddress, final Volume volume) {
PageNode pnLookup = null;
synchronized (this) {
final Integer volumeHandle = _volumeToHandleMap.get(volume);
if (volumeHandle != null) {
pnLookup = _pageMap.get(new PageNode(volumeHandle, pageAddress, -1, -1));
}
}
if (pnLookup == null) {
return null;
}
final PageNode pn = new PageNode(pnLookup.getVolumeHandle(), pnLookup.getPageAddress(),
pnLookup.getJournalAddress(), pnLookup.getTimestamp());
sequence(PAGE_MAP_READ_INVALIDATE_A);
/*
* If the page is still valid, use the values saved in pn so we don't
         * lose them mid-processing. We can use it because it was in the map
         * when we first looked, which means it is still in the journal. The
         * journal won't go away because the claim on the buffer prevents new
         * checkpoints, and that keeps the copier from deleting it.
*/
if (pnLookup.isInvalid()) {
return null;
}
return pn;
}
private long readPageBufferFromJournal(final PageNode pn, final ByteBuffer bb) throws PersistitIOException,
CorruptJournalException {
final int at = bb.position();
bb.limit(at + PA.OVERHEAD);
readFully(bb, pn.getJournalAddress());
if (bb.remaining() < PA.OVERHEAD) {
throw new CorruptJournalException("Record at " + pn.toStringJournalAddress(this) + " is incomplete");
}
final int type = JournalRecord.getType(bb);
final int payloadSize = JournalRecord.getLength(bb) - PA.OVERHEAD;
final int leftSize = PA.getLeftSize(bb);
final int bufferSize = PA.getBufferSize(bb);
final long pageAddress = PA.getPageAddress(bb);
if (type != PA.TYPE) {
throw new CorruptJournalException("Record at " + pn.toStringJournalAddress(this) + " is not a PAGE record");
}
if (leftSize < 0 || payloadSize < leftSize || payloadSize > bufferSize) {
throw new CorruptJournalException("Record at " + pn.toStringJournalAddress(this)
+ " invalid sizes: recordSize= " + payloadSize + " leftSize=" + leftSize + " bufferSize="
+ bufferSize);
}
if (pageAddress != pn.getPageAddress() && pn.getPageAddress() != -1) {
throw new CorruptJournalException("Record at " + pn.toStringJournalAddress(this)
+ " mismatched page address: expected/actual=" + pn.getPageAddress() + "/" + pageAddress);
}
bb.limit(at + payloadSize).position(at);
readFully(bb, pn.getJournalAddress() + PA.OVERHEAD);
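        /*
         * Reassemble the full page image: the PA record contains only the
         * used left and right regions, so shift the right region to the end
         * of the buffer and zero-fill the unused middle.
         */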
final int rightSize = payloadSize - leftSize;
System.arraycopy(bb.array(), leftSize + at, bb.array(), bufferSize - rightSize + at, rightSize);
Arrays.fill(bb.array(), leftSize + at, bufferSize - rightSize + at, (byte) 0);
bb.limit(bb.capacity()).position(at).limit(at + bufferSize);
return pageAddress;
}
/**
     * Method used by diagnostic tools to attempt to read a page from the
     * journal.
     *
     * @param address
     *            journal address
     * @return a Buffer loaded with the page image found at the specified
     *         location, or <code>null</code> if the address does not
     *         reference a valid page record
* @throws PersistitException
*/
Buffer readPageBuffer(final long address) throws PersistitException {
ByteBuffer bb = ByteBuffer.allocate(PA.OVERHEAD);
readFully(bb, address);
if (bb.remaining() < PA.OVERHEAD) {
return null;
}
final int type = JournalRecord.getType(bb);
final int payloadSize = JournalRecord.getLength(bb) - PA.OVERHEAD;
final int leftSize = PA.getLeftSize(bb);
final int bufferSize = PA.getBufferSize(bb);
final long pageAddress = PA.getPageAddress(bb);
final int volumeHandle = PA.getVolumeHandle(bb);
if (type != PA.TYPE || leftSize < 0 || payloadSize < leftSize || payloadSize > bufferSize) {
return null;
}
final BufferPool pool = _persistit.getBufferPool(bufferSize);
final Buffer buffer = new Buffer(bufferSize, -1, pool, _persistit);
buffer.setPageAddressAndVolume(pageAddress, volumeForHandle(volumeHandle));
bb = buffer.getByteBuffer();
bb.limit(payloadSize).position(0);
readFully(bb, address + PA.OVERHEAD);
if (leftSize > 0) {
final int rightSize = payloadSize - leftSize;
System.arraycopy(bb.array(), leftSize, bb.array(), bufferSize - rightSize, rightSize);
Arrays.fill(bb.array(), leftSize, bufferSize - rightSize, (byte) 0);
}
bb.limit(bufferSize).position(0);
final boolean acquired = buffer.claim(true, 0);
assert acquired : "buffer in use";
buffer.load();
buffer.release();
return buffer;
}
private void advance(final int recordSize) {
Debug.$assert1.t(recordSize > 0 && recordSize + _writeBuffer.position() <= _writeBuffer.capacity());
_currentAddress += recordSize;
_writeBuffer.position(_writeBuffer.position() + recordSize);
}
/**
* Write a JH (journal header) record. This record must be written to the
* beginning of the journal file. Note that this method does not call
* {@link #prepareWriteBuffer(int)} - the write buffer needs to be ready to
* receive the JH record.
*
* @throws PersistitException
*/
synchronized void writeJournalHeader() throws PersistitException {
JH.putType(_writeBuffer);
JournalRecord.putTimestamp(_writeBuffer, epochalTimestamp());
JH.putVersion(_writeBuffer, VERSION);
JH.putBlockSize(_writeBuffer, _blockSize);
JH.putBaseJournalAddress(_writeBuffer, _baseAddress);
JH.putCurrentJournalAddress(_writeBuffer, _currentAddress);
JH.putJournalCreatedTime(_writeBuffer, _journalCreatedTime);
JH.putFileCreatedTime(_writeBuffer, System.currentTimeMillis());
JH.putPath(_writeBuffer, addressToFile(_currentAddress).getPath());
final int recordSize = JournalRecord.getLength(_writeBuffer);
_persistit.getIOMeter().chargeWriteOtherToJournal(recordSize, _currentAddress);
advance(recordSize);
}
/**
* Write the JE (journal end) record. This record must be written to the end
* of each complete journal file. Note that this method does not call
* {@link #prepareWriteBuffer(int)} - the write buffer needs to be ready to
* receive the JE record.
*
* @throws PersistitException
*/
synchronized void writeJournalEnd() throws PersistitException {
if (_writeBufferAddress != Long.MAX_VALUE) {
//
// prepareWriteBuffer contract guarantees there's always room in
// the write buffer for this record.
//
JE.putType(_writeBuffer);
JournalRecord.putTimestamp(_writeBuffer, epochalTimestamp());
JournalRecord.putLength(_writeBuffer, JE.OVERHEAD);
JE.putCurrentJournalAddress(_writeBuffer, _currentAddress);
JE.putBaseAddress(_writeBuffer, _baseAddress);
JE.putJournalCreatedTime(_writeBuffer, _journalCreatedTime);
_persistit.getIOMeter().chargeWriteOtherToJournal(JE.OVERHEAD, _currentAddress);
advance(JE.OVERHEAD);
}
}
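    /**
     * Write the PM (page map) record. The record may be larger than the
     * write buffer: entries are emitted in chunks, with the buffer advanced
     * and flushed as it fills. {@link #prepareWriteBuffer(int)} ensures that
     * the entire record fits within the remainder of the current journal
     * file.
     *
     * @throws PersistitException
     */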
synchronized void writePageMap() throws PersistitException {
int count = 0;
for (final PageNode lastPageNode : _pageMap.values()) {
PageNode pageNode = lastPageNode;
while (pageNode != null) {
count++;
pageNode = pageNode.getPrevious();
}
}
for (final PageNode lastPageNode : _branchMap.values()) {
PageNode pageNode = lastPageNode;
while (pageNode != null) {
count++;
pageNode = pageNode.getPrevious();
}
}
final int recordSize = PM.OVERHEAD + PM.ENTRY_SIZE * count;
prepareWriteBuffer(recordSize);
PM.putType(_writeBuffer);
JournalRecord.putLength(_writeBuffer, recordSize);
JournalRecord.putTimestamp(_writeBuffer, epochalTimestamp());
advance(PM.OVERHEAD);
int offset = 0;
for (final PageNode lastPageNode : _pageMap.values()) {
PageNode pageNode = lastPageNode;
while (pageNode != null) {
PM.putEntry(_writeBuffer, offset / PM.ENTRY_SIZE, pageNode.getTimestamp(),
pageNode.getJournalAddress(), pageNode.getVolumeHandle(), pageNode.getPageAddress());
offset += PM.ENTRY_SIZE;
count--;
if (count == 0 || offset + PM.ENTRY_SIZE >= _writeBuffer.remaining()) {
advance(offset);
offset = 0;
}
if (PM.ENTRY_SIZE >= _writeBuffer.remaining()) {
flush();
}
pageNode = pageNode.getPrevious();
}
}
for (final PageNode lastPageNode : _branchMap.values()) {
PageNode pageNode = lastPageNode;
while (pageNode != null) {
PM.putEntry(_writeBuffer, offset / PM.ENTRY_SIZE, pageNode.getTimestamp(),
pageNode.getJournalAddress(), pageNode.getVolumeHandle(), pageNode.getPageAddress());
offset += PM.ENTRY_SIZE;
count--;
if (count == 0 || offset + PM.ENTRY_SIZE >= _writeBuffer.remaining()) {
advance(offset);
offset = 0;
}
if (PM.ENTRY_SIZE >= _writeBuffer.remaining()) {
flush();
}
pageNode = pageNode.getPrevious();
}
}
Debug.$assert0.t(count == 0);
_persistit.getIOMeter().chargeWriteOtherToJournal(recordSize, _currentAddress - recordSize);
}
synchronized void writeTransactionMap() throws PersistitException {
int count = _liveTransactionMap.size();
final int recordSize = TM.OVERHEAD + TM.ENTRY_SIZE * count;
prepareWriteBuffer(recordSize);
TM.putType(_writeBuffer);
JournalRecord.putLength(_writeBuffer, recordSize);
JournalRecord.putTimestamp(_writeBuffer, epochalTimestamp());
advance(TM.OVERHEAD);
int offset = 0;
for (final TransactionMapItem ts : _liveTransactionMap.values()) {
TM.putEntry(_writeBuffer, offset / TM.ENTRY_SIZE, ts.getStartTimestamp(), ts.getCommitTimestamp(),
ts.getStartAddress(), ts.getLastRecordAddress());
offset += TM.ENTRY_SIZE;
count--;
if (count == 0 || offset + TM.ENTRY_SIZE >= _writeBuffer.remaining()) {
advance(offset);
offset = 0;
}
if (TM.ENTRY_SIZE >= _writeBuffer.remaining()) {
flush();
}
}
Debug.$assert0.t(count == 0);
_persistit.getIOMeter().chargeWriteOtherToJournal(recordSize, _currentAddress - recordSize);
}
synchronized void writeCheckpointToJournal(final Checkpoint checkpoint) throws PersistitException {
//
// Make sure all prior journal entries are committed to disk before
// writing this record.
//
force();
//
// Prepare room for CP.OVERHEAD bytes in the journal. If doing so
// started a new journal file then there's no need to write another
// CP record.
//
if (!prepareWriteBuffer(CP.OVERHEAD)) {
final long address = _currentAddress;
JournalRecord.putLength(_writeBuffer, CP.OVERHEAD);
CP.putType(_writeBuffer);
JournalRecord.putTimestamp(_writeBuffer, checkpoint.getTimestamp());
CP.putSystemTimeMillis(_writeBuffer, checkpoint.getSystemTimeMillis());
CP.putBaseAddress(_writeBuffer, _baseAddress);
_persistit.getIOMeter().chargeWriteOtherToJournal(CP.OVERHEAD, _currentAddress);
advance(CP.OVERHEAD);
force();
checkpointWritten(checkpoint);
_persistit.getLogBase().checkpointWritten.log(checkpoint, address);
_persistit.getIOMeter().chargeWriteOtherToJournal(CP.OVERHEAD, address);
}
_lastValidCheckpoint = checkpoint;
_lastValidCheckpointJournalAddress = _currentAddress - CP.OVERHEAD;
_lastValidCheckpointBaseAddress = _baseAddress;
}
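    /**
     * Write a PA (page) record for the supplied <code>buffer</code> and add
     * it to the page map. For data, index and garbage pages only the used
     * regions are journaled: the "left" region up to the key block end and
     * the "right" region from the allocation point to the end of the buffer.
     * The unused middle region is omitted from the record and is restored as
     * zeroes when the page is read back.
     *
     * @param buffer
     *            the Buffer whose page image is to be journaled
     * @throws PersistitException
     */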
void writePageToJournal(final Buffer buffer) throws PersistitException {
final Volume volume;
final int recordSize;
synchronized (this) {
if (!buffer.isTemporary() && buffer.getTimestamp() < _lastValidCheckpoint.getTimestamp()) {
_persistit.getLogBase().lateWrite.log(_lastValidCheckpoint, buffer);
}
volume = buffer.getVolume();
final int handle = handleForVolume(volume);
int leftSize;
int rightSize;
if (buffer.isDataPage() || buffer.isIndexPage() || buffer.isGarbagePage()) {
leftSize = buffer.getKeyBlockEnd();
rightSize = buffer.getBufferSize() - buffer.getAlloc();
} else {
leftSize = 0;
rightSize = buffer.getBufferSize();
}
recordSize = PA.OVERHEAD + leftSize + rightSize;
prepareWriteBuffer(recordSize);
Debug.$assert1.t(_writeBuffer.remaining() >= recordSize);
final long address = _currentAddress;
final int position = _writeBuffer.position();
JournalRecord.putLength(_writeBuffer, recordSize);
PA.putVolumeHandle(_writeBuffer, handle);
PA.putType(_writeBuffer);
JournalRecord.putTimestamp(_writeBuffer, buffer.isTemporary() ? -1 : buffer.getTimestamp());
PA.putLeftSize(_writeBuffer, leftSize);
PA.putBufferSize(_writeBuffer, buffer.getBufferSize());
PA.putPageAddress(_writeBuffer, buffer.getPageAddress());
advance(PA.OVERHEAD);
if (leftSize > 0) {
_writeBuffer.put(buffer.getBytes(), 0, leftSize);
_writeBuffer.put(buffer.getBytes(), buffer.getBufferSize() - rightSize, rightSize);
} else {
_writeBuffer.put(buffer.getBytes());
}
Debug.$assert0.t(_writeBuffer.position() - position == recordSize);
_currentAddress += recordSize - PA.OVERHEAD;
final PageNode pageNode = new PageNode(handle, buffer.getPageAddress(), address, buffer.getTimestamp());
_pageList.add(pageNode);
PageNode oldPageNode = _pageMap.put(pageNode, pageNode);
if (oldPageNode != null) {
assert oldPageNode.getTimestamp() <= pageNode.getTimestamp();
}
final long checkpointTimestamp = _persistit.getTimestampAllocator().getProposedCheckpointTimestamp();
if (oldPageNode != null && oldPageNode.getTimestamp() > checkpointTimestamp
&& buffer.getTimestamp() > checkpointTimestamp) {
oldPageNode.invalidate();
oldPageNode = oldPageNode.getPrevious();
}
pageNode.setPrevious(oldPageNode);
_writePageCount++;
}
_persistit.getIOMeter().chargeWritePageToJournal(volume, buffer.getPageAddress(), buffer.getBufferSize(),
_currentAddress - recordSize, urgency(), buffer.getIndex());
}
/**
     * Package-private; used by unit tests only.
*
* @param volume
* @param handle
* @throws PersistitException
*/
synchronized void writeVolumeHandleToJournal(final Volume volume, final int handle) throws PersistitException {
prepareWriteBuffer(IV.MAX_LENGTH);
IV.putType(_writeBuffer);
IV.putHandle(_writeBuffer, handle);
IV.putVolumeId(_writeBuffer, volume.getId());
JournalRecord.putTimestamp(_writeBuffer, epochalTimestamp());
if (_persistit.getConfiguration().isUseOldVSpec()) {
IV.putVolumeSpecification(_writeBuffer, volume.getName());
} else {
IV.putVolumeSpecification(_writeBuffer, volume.getSpecification().toString());
}
final int recordSize = JournalRecord.getLength(_writeBuffer);
_persistit.getIOMeter().chargeWriteOtherToJournal(recordSize, _currentAddress);
advance(recordSize);
}
synchronized void writeTreeHandleToJournal(final TreeDescriptor td, final int handle) throws PersistitException {
prepareWriteBuffer(IT.MAX_LENGTH);
IT.putType(_writeBuffer);
IT.putHandle(_writeBuffer, handle);
IT.putVolumeHandle(_writeBuffer, td.getVolumeHandle());
JournalRecord.putTimestamp(_writeBuffer, epochalTimestamp());
IT.putTreeName(_writeBuffer, td.getTreeName());
final int recordSize = JournalRecord.getLength(_writeBuffer);
_persistit.getIOMeter().chargeWriteOtherToJournal(recordSize, _currentAddress);
advance(recordSize);
}
/**
* <p>
* Write a transaction or partial transaction to the journal as a TX record
* containing a variable number of variable-length update records. The
* supplied <code>buffer</code> contains the update records.
* </p>
* <p>
* TX records typically represent a complete transaction, but in the case of
* transactions with a large number of updates, there may be multiple TX
* records. In that case each TX record but the last one written specifies a
* commit timestamp value of zero indicating that the transaction has not
* committed yet, and each TX record but the first one written specifies the
     * journal address of the previous one. These pointers allow the recovery
     * process to efficiently find all the updates of a transaction that
     * needs to be rolled back.
* </p>
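     * <p>
     * Schematically (illustrative addresses only), a transaction split
     * across three TX records is chained like this:
     * </p>
     *
     * <pre>
     * TX@1000  commitTimestamp=0    backchainAddress=0
     * TX@5000  commitTimestamp=0    backchainAddress=1000
     * TX@9000  commitTimestamp=tc   backchainAddress=5000
     * </pre>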
*
* @param buffer
* The buffer containing the update records
* @param startTimestamp
* Transaction start timestamp
* @param commitTimestamp
* Transaction commit timestamp, or 0 if the transaction has not
* committed yet
* @param backchainAddress
* Journal address of previous TX record written by this
     *            transaction, or 0 if there is no previous record
*
     * @return the journal address at which the TX record was written
* @throws PersistitException
*/
synchronized long writeTransactionToJournal(final ByteBuffer buffer, final long startTimestamp,
final long commitTimestamp, final long backchainAddress) throws PersistitException {
final int recordSize = TX.OVERHEAD + buffer.position();
prepareWriteBuffer(recordSize);
final long address = _currentAddress;
TX.putLength(_writeBuffer, recordSize);
TX.putType(_writeBuffer);
TX.putTimestamp(_writeBuffer, startTimestamp);
TX.putCommitTimestamp(_writeBuffer, commitTimestamp);
TX.putBackchainAddress(_writeBuffer, backchainAddress);
_persistit.getIOMeter().chargeWriteTXtoJournal(recordSize, _currentAddress);
advance(TX.OVERHEAD);
try {
buffer.flip();
_writeBuffer.put(buffer);
} finally {
buffer.clear();
}
_currentAddress += recordSize - TX.OVERHEAD;
if (commitTimestamp != ABORTED) {
final long key = Long.valueOf(startTimestamp);
TransactionMapItem item = _liveTransactionMap.get(key);
if (item == null) {
if (backchainAddress != 0) {
throw new IllegalStateException("Missing back-chained transaction for start timestamp "
+ startTimestamp);
}
item = new TransactionMapItem(startTimestamp, address);
_liveTransactionMap.put(startTimestamp, item);
} else {
if (backchainAddress == 0) {
throw new IllegalStateException("Duplicate transaction " + item);
}
if (item.isCommitted()) {
throw new IllegalStateException("Transaction already committed " + item);
}
item.setLastRecordAddress(address);
}
item.setCommitTimestamp(commitTimestamp);
}
return address;
}
static File journalPath(final String path) {
final File file = new File(path);
if (file.isDirectory()) {
return new File(file, DEFAULT_JOURNAL_FILE_NAME);
} else {
return file;
}
}
static long fileToGeneration(final File file) {
final Matcher matcher = PATH_PATTERN.matcher(file.getName());
if (matcher.matches()) {
return Long.parseLong(matcher.group(2));
} else {
return -1;
}
}
static String fileToPath(final File file) {
final Matcher matcher = PATH_PATTERN.matcher(file.getPath());
if (matcher.matches()) {
return matcher.group(1);
} else {
return null;
}
}
static File generationToFile(final String path, final long generation) {
return new File(String.format(PATH_FORMAT, path, generation));
}
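    /**
     * Map a journal address to the file containing it: the file's generation
     * number is the address divided by the block size, and the offset within
     * the file is the remainder. Illustrative arithmetic (assuming,
     * hypothetically, a block size of 1,000,000,000 bytes): address
     * 2,500,000,000 lies in generation 2 at offset 500,000,000.
     */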
File addressToFile(final long address) {
return generationToFile(_journalFilePath, address / _blockSize);
}
long addressToOffset(final long address) {
return address % _blockSize;
}
void setWriteBufferSize(final int size) {
if (size < MINIMUM_BUFFER_SIZE || size > MAXIMUM_BUFFER_SIZE) {
throw new IllegalArgumentException("Invalid write buffer size: " + size);
}
_writeBufferSize = size;
}
public void close() throws PersistitException {
_closed.set(true);
rollover();
final JournalCopier copier = _copier;
_copier = null;
if (copier != null) {
_persistit.waitForIOTaskStop(copier);
}
final JournalFlusher flusher = _flusher;
_flusher = null;
if (flusher != null) {
_persistit.waitForIOTaskStop(flusher);
}
synchronized (this) {
try {
closeAllChannels();
} catch (final IOException ioe) {
throw new PersistitIOException(ioe);
} finally {
_handleToTreeMap.clear();
_handleToVolumeMap.clear();
_volumeToHandleMap.clear();
_treeToHandleMap.clear();
_pageMap.clear();
_pageList.clear();
_writeBuffer = null;
}
}
}
private void closeAllChannels() throws IOException {
synchronized (this) {
try {
for (final FileChannel channel : _journalFileChannels.values()) {
if (channel != null) {
channel.close();
}
}
} finally {
_journalFileChannels.clear();
}
}
}
/**
* Abruptly stop (using {@link Thread#stop()}) the copier and flusher
* threads. This method should be used only by tests.
*/
void crash() throws IOException {
IOTaskRunnable.crash(_flusher);
IOTaskRunnable.crash(_copier);
//
// Even when simulating a crash do this to release
// channels and therefore allow disk space to be returned to
// the OS when the files are deleted.
//
closeAllChannels();
}
/**
     * Flushes the write buffer to the current journal file.
     *
     * @return the journal address of the write buffer after the flush, or
     *         Long.MAX_VALUE if there was nothing to flush
     * @throws PersistitException
*/
synchronized long flush() throws PersistitException {
_persistit.checkFatal();
final long address = _writeBufferAddress;
if (address != Long.MAX_VALUE && _writeBuffer != null) {
assert _writeBufferAddress + _writeBuffer.position() == _currentAddress : String.format(
"writeBufferAddress=%,d position=%,d currentAddress=%,d", _writeBufferAddress,
_writeBuffer.position(), _currentAddress);
try {
if (_writeBuffer.position() > 0) {
final FileChannel channel = getFileChannel(address);
final long size = channel.size();
if (size < addressToOffset(address)) {
throw new CorruptJournalException(String.format(
"Journal file %s size %,d does not match current address %,d", addressToFile(address),
size, address));
}
_writeBuffer.flip();
boolean writeComplete = false;
final int written;
try {
/*
* Note: contract for FileChannel requires write to
* return normally only when all bytes have been
* written. (See java.nio.channels.WritableByteChannel
* #write(ByteBuffer), statement
* "Unless otherwise specified...")
*/
channel.write(_writeBuffer, _writeBufferAddress % _blockSize);
/*
* Surprise: FileChannel#write does not throw an
* Exception if it successfully writes some bytes and
* then encounters a disk full condition. (Found this
* out empirically.)
*/
writeComplete = _writeBuffer.remaining() == 0;
} finally {
written = _writeBuffer.position();
_writeBufferAddress += written;
if (writeComplete) {
if (_writeBuffer.capacity() != _writeBufferSize) {
_writeBuffer = ByteBuffer.allocate(_writeBufferSize);
} else {
_writeBuffer.clear();
}
} else {
/*
* If the buffer didn't get written, perhaps due to
* an interrupt or disk-full condition, then compact
* to remove any bytes from the buffer that actually
* did get written and reset other measurements.
*/
_writeBuffer.compact();
}
final long remaining = _blockSize - (_writeBufferAddress % _blockSize);
if (remaining < (_writeBuffer.limit())) {
_writeBuffer.limit((int) remaining);
}
}
assert _writeBufferAddress + _writeBuffer.position() == _currentAddress : String.format(
"writeBufferAddress=%,d position=%,d currentAddress=%,d", _writeBufferAddress,
_writeBuffer.position(), _currentAddress);
_persistit.getIOMeter().chargeFlushJournal(written, address);
return _writeBufferAddress;
}
} catch (final IOException e) {
throw new PersistitIOException("Writing to file " + addressToFile(address), e);
}
}
return Long.MAX_VALUE;
}
/**
* Force all data written to the journal file to disk.
*/
@Override
public void force() throws PersistitException {
long address = Long.MAX_VALUE;
try {
address = flush();
if (address != Long.MAX_VALUE) {
final FileChannel channel = getFileChannel(address);
channel.force(false);
}
} catch (final IOException e) {
throw new PersistitIOException("Writing to file " + addressToFile(address), e);
}
}
/**
* Map a ByteBuffer to a file address, as needed to ensure client methods
* can write their records. This method modifies the values of _writeBuffer,
* _writeBufferAddress, and in case a new journal file is prepared (a
* "roll-over" event), it also modifies _currentAddress to reflect the
* current address in the new file.
*
* @param size
* Size of record to be written
* @return <code>true</code> if and only if a new journal file was started
* @throws PersistitException
*/
private boolean prepareWriteBuffer(final int size) throws PersistitException {
_persistit.checkFatal();
boolean newJournalFile = false;
if (getCurrentJournalSize() == 0) {
flush();
_writeBufferAddress = _currentAddress;
startJournalFile();
newJournalFile = true;
}
assert _writeBufferAddress + _writeBuffer.position() == _currentAddress : String.format(
"writeBufferAddress=%,d position=%,d currentAddress=%,d", _writeBufferAddress, _writeBuffer.position(),
_currentAddress);
//
// If the current journal file has room for the record, then return.
//
if (_writeBuffer.remaining() > size + JE.OVERHEAD) {
return newJournalFile;
}
//
        // Otherwise, flush the write buffer and try again.
        //
flush();
if (_writeBuffer.remaining() > size + JE.OVERHEAD) {
return newJournalFile;
}
//
        // In the special case of a record that may be longer than the
        // capacity of the buffer (e.g., the PageMap), check whether there is
        // enough room in the file to hold the entire record. If so, the
        // buffer is considered prepared because the PM and TM writers know
        // how to fill the buffer multiple times.
//
if (_writeBuffer.remaining() == _writeBuffer.capacity()) {
final long remaining = _blockSize - getCurrentJournalSize();
if (remaining > size + JE.OVERHEAD) {
return newJournalFile;
}
}
//
// Finally if there's still not enough room we're committed to
// rolling the journal.
//
rolloverWithNewFile();
return true;
}
void rollover() throws PersistitException {
rollover(false, false);
}
void rolloverWithNewFile() throws PersistitException {
rollover(false, true);
}
void rolloverWithNewBaseAndFile() throws PersistitException {
rollover(true, true);
}
private synchronized void rollover(final boolean setBaseAddress, final boolean startNewFile)
throws PersistitException {
if (_writeBufferAddress != Long.MAX_VALUE) {
writeJournalEnd();
flush();
try {
final long length = getCurrentJournalSize();
final boolean matches = length == (_writeBuffer.position() + _writeBufferAddress) % _blockSize;
final FileChannel channel = getFileChannel(_currentAddress);
Debug.$assert1.t(matches);
if (matches) {
channel.truncate(length);
}
channel.force(true);
} catch (final IOException ioe) {
throw new PersistitIOException(ioe);
}
_currentAddress = ((_currentAddress / _blockSize) + 1) * _blockSize;
_writeBuffer.clear();
_writeBufferAddress = _currentAddress;
_isNewEpoch = false;
if (setBaseAddress) {
_baseAddress = _currentAddress;
}
if (startNewFile) {
prepareWriteBuffer(JH.OVERHEAD);
}
}
}
/**
* Timestamp marking the Page Map, Transaction Map and other records in the
* journal header. This timestamp is used to discriminate between pages in a
* "branch" history and the live history. See comments in
* {@link RecoveryManager#scanLoadPageMap(long, long, int)} for details.
*
* @return either the current timestamp or the timestamp of the last valid
* checkpoint, depending on whether this journal file starts a new
* epoch.
*/
private long epochalTimestamp() {
return _isNewEpoch ? getLastValidCheckpointTimestamp() : _persistit.getCurrentTimestamp();
}
private void startJournalFile() throws PersistitException {
//
// Write the beginning of a new journal file.
//
// The information written here is designed to accelerate recovery.
// The recovery process can simply read the JournalHeader and
// subsequent records from the last journal file to load the page
// map and live transaction map. The journal file is valid for
// recovery only if the CP (checkpoint) record is present in the
// recovered file.
//
writeJournalHeader();
//
// Write IV (identify volume) records for each volume in the handle
// map
//
for (final Map.Entry<Integer, Volume> entry : _handleToVolumeMap.entrySet()) {
writeVolumeHandleToJournal(entry.getValue(), entry.getKey().intValue());
}
//
// Write IT (identify tree) records for each tree in the handle
// map
//
for (final Map.Entry<Integer, TreeDescriptor> entry : _handleToTreeMap.entrySet()) {
if (entry.getValue().getVolumeHandle() != Volume.LOCK_VOLUME_HANDLE) {
writeTreeHandleToJournal(entry.getValue(), entry.getKey().intValue());
}
}
//
// Write the PM (Page Map) record
//
writePageMap();
//
// Write the TM (Transaction Map) record
//
writeTransactionMap();
//
// Finally, write the current CP (checkpoint) record.
//
writeCheckpointToJournal(_lastValidCheckpoint);
}
/**
* Return the <code>FileChannel</code> for the journal file containing the
* supplied <code>address</code>. If necessary, create a new
* {@link MediatedFileChannel}.
*
* @param address
* the journal address of a record in the journal for which the
* corresponding channel will be returned
     * @throws PersistitIOException
* if the <code>MediatedFileChannel</code> cannot be created
*/
synchronized FileChannel getFileChannel(final long address) throws PersistitIOException {
if (address < _deleteBoundaryAddress || address > _currentAddress + _blockSize) {
throw new IllegalArgumentException("Invalid journal address " + address + " outside of range ("
+ _deleteBoundaryAddress + ":" + (_currentAddress + _blockSize) + ")");
}
final long generation = address / _blockSize;
FileChannel channel = _journalFileChannels.get(generation);
if (channel == null) {
try {
channel = new MediatedFileChannel(addressToFile(address), "rw");
_journalFileChannels.put(generation, channel);
} catch (final IOException ioe) {
throw new PersistitIOException(ioe);
}
}
return channel;
}
/**
* Set the copyFast flag and then wait until all checkpointed pages have
* been copied to their respective volumes, allowing the journal files to be
* deleted. Pages modified after the last valid checkpoint cannot be copied.
* <p>
     * Does nothing if <code>appendOnly</code> is set.
*
* @throws PersistitException
*/
@Override
public void copyBack() throws Exception {
if (!_appendOnly.get()) {
_copyFast.set(true);
final int exceptionCount = _copier.getExceptionCount();
while (_copyFast.get()) {
_copier.kick();
Util.sleep(Persistit.SHORT_DELAY);
if (_copier.getExceptionCount() != exceptionCount) {
throw _copier.getLastException();
}
}
}
}
/**
* Remove transactions and PageNode entries when possible due to completion
* of a new checkpoint.
*
     * @param checkpoint
     *            the newly written checkpoint
*/
private void checkpointWritten(final Checkpoint checkpoint) {
//
        // Will become the earliest timestamp of any record that must be
        // retained for recovery. For transactions containing LONG_RECORD
// pages, those pages may be written to the journal with timestamps
// earlier than the commitTimestamp of the transaction but they are
// guaranteed to be written with timestamp values later than the
// transaction's startTimestamp. Therefore we can't cull PageMap entries
// later than this recoveryTimestamp because the pages they refer to may
// be needed for recovery.
//
long recoveryTimestamp = checkpoint.getTimestamp();
recoveryTimestamp = Math.min(Math.min(recoveryTimestamp, _earliestCommittedTimestamp),
_earliestAbortedTimestamp);
//
// Remove all but the most recent PageNode version before the
// checkpoint.
//
for (final PageNode pageNode : _pageMap.values()) {
for (PageNode pn = pageNode; pn != null; pn = pn.getPrevious()) {
if (pn.getTimestamp() < recoveryTimestamp) {
pn.removeHistory();
break;
}
}
}
//
// Remove any PageNode from the branchMap having a timestamp less
// than the checkpoint. Generally all such entries are removed after
// the first checkpoint that has been established after recovery.
//
for (final Iterator<PageNode> iterator = _branchMap.values().iterator(); iterator.hasNext();) {
final PageNode pageNode = iterator.next();
if (pageNode.getTimestamp() < recoveryTimestamp) {
iterator.remove();
}
}
checkpoint.completed();
}
/**
* Remove obsolete TransactionMapItem instances from the live transaction
* map. An instance is obsolete if it refers to a transaction that committed
     * earlier than the last valid checkpoint (because all of the effects of
* that transaction are now check-pointed into the B-Trees themselves) or if
* it is from an aborted transaction that has no remaining MVV values.
*/
void pruneObsoleteTransactions() {
pruneObsoleteTransactions(isRollbackPruningEnabled());
}
void pruneObsoleteTransactions(final boolean rollbackPruningEnabled) {
final long timestamp = _lastValidCheckpoint.getTimestamp();
long earliestCommitted = Long.MAX_VALUE;
long earliestAborted = Long.MAX_VALUE;
final List<TransactionMapItem> toPrune = new ArrayList<TransactionMapItem>();
/*
* Remove any committed transactions that committed before the
* checkpoint. No need to keep a record of such a transaction since its
* updates are now fully written to the journal in modified page images.
*/
synchronized (this) {
for (final Iterator<TransactionMapItem> iterator = _liveTransactionMap.values().iterator(); iterator
.hasNext();) {
final TransactionMapItem item = iterator.next();
if (item.isCommitted()) {
if (item.getCommitTimestamp() < timestamp) {
iterator.remove();
} else if (item.getStartTimestamp() < earliestCommitted) {
earliestCommitted = item.getStartTimestamp();
}
} else {
final TransactionStatus status;
status = _persistit.getTransactionIndex().getStatus(item.getStartTimestamp());
if (status == null || status.getTs() != item.getStartTimestamp()) {
iterator.remove();
} else if (status.getTc() == ABORTED && status.isNotified()) {
if (status.getMvvCount() == 0) {
iterator.remove();
sequence(RECOVERY_PRUNING_B);
} else {
if (item.getStartTimestamp() < earliestAborted) {
earliestAborted = item.getStartTimestamp();
}
if (rollbackPruningEnabled) {
toPrune.add(item);
}
}
}
}
}
_earliestCommittedTimestamp = earliestCommitted;
_earliestAbortedTimestamp = earliestAborted;
}
/*
* Sort the toPrune list - since all members are aborted, the comparison
         * will be by startTimestamp, which is a good approximation of journal
* address order.
*/
Collections.sort(toPrune, TransactionMapItem.TRANSACTION_MAP_ITEM_COMPARATOR);
for (final TransactionMapItem item : toPrune) {
try {
synchronized (_player) {
final TransactionStatus status;
status = _persistit.getTransactionIndex().getStatus(item.getStartTimestamp());
if (status != null && status.getTs() == item.getStartTimestamp() && status.getTc() == ABORTED
&& status.isNotified() && status.getMvvCount() > 0) {
_player.applyTransaction(item, _listener);
}
}
} catch (final PersistitException e) {
_persistit.getLogBase().pruneException.log(e, item);
}
}
}
/**
* General method used to wait for durability. This method is used by all
* three commit modes: SOFT, HARD and GROUP. The two parameters represent
* time intervals in milliseconds.
*
* @param flushedTimestamp
* a timestamp taken after the transaction buffer belonging to
* the current transaction has been flushed.
* @param leadTime
* time interval in milliseconds by which to anticipate I/O
* completion; the method will return as soon as the I/O
* operation that will flush the current generation of data is
* expected to complete within that time interval
* @param stallTime
* time interval in milliseconds that this thread is willing to
     *            wait for I/O completion. If the JOURNAL_FLUSHER is
* currently pausing, the pause time may be shortened to try to
* complete the I/O when requested. In particular, a value of
* zero indicates the I/O should start immediately.
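     *            For example (illustrative values only), a caller requiring
     *            strict durability might pass <code>leadTime = 0</code> and
     *            <code>stallTime = 0</code>, asking the flusher to start the
     *            I/O immediately and waiting for it to complete.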
* @throws PersistitInterruptedException
*/
void waitForDurability(final long flushedTimestamp, final long leadTime, final long stallTime)
throws PersistitException {
final JournalFlusher flusher = _flusher;
if (flusher != null) {
flusher.waitForDurability(flushedTimestamp, leadTime, stallTime);
} else {
throw new IllegalStateException("JOURNAL_FLUSHER is not running");
}
}
public static class TreeDescriptor {
final int _volumeHandle;
final String _treeName;
TreeDescriptor(final int volumeHandle, final String treeName) {
_volumeHandle = volumeHandle;
_treeName = treeName;
}
public int getVolumeHandle() {
return _volumeHandle;
}
public String getTreeName() {
return _treeName;
}
@Override
public boolean equals(final Object obj) {
if (obj == null || !(obj instanceof TreeDescriptor)) {
return false;
}
final TreeDescriptor td = (TreeDescriptor) obj;
return td._treeName.equals(_treeName) && td._volumeHandle == _volumeHandle;
}
@Override
public int hashCode() {
return _treeName.hashCode() ^ _volumeHandle;
}
@Override
public String toString() {
return "{" + _volumeHandle + "}" + _treeName;
}
}
/**
* A PageNode represents the existence of a copy of a page in the journal.
* It links to previously created PageNode objects which refer to earlier
* versions of the same page. These earlier instances are truncated whenever
* a later version of the same page has been checkpointed.
*
* PageNode instances are designed to serve as both Key and Value fields of
* the _pageMap. The general rubric when adding a page to the journal is
* to construct a PageNode representing the page image, and then use it to
* perform a lookup in the _pageMap. If there is no matching PageNode
* already in the map then simply add the new one. If there is a matching
* PageNode, link it to the new one then replace the entry in the map.
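*
* A minimal sketch of that rubric (illustrative only; the map is
* _pageMap, but the surrounding method and local names are
* hypothetical):
*
* <pre>
* PageNode node = new PageNode(handle, pageAddress, journalAddress, timestamp);
* PageNode existing = _pageMap.get(node);
* if (existing != null) {
*     node.setPrevious(existing);
* }
* _pageMap.put(node, node);
* </pre>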
*
* Static comparators (READ_COMPARATOR and WRITE_COMPARATOR) order
* PageNodes by journal address and by volume and page address so that
* pages can be read from the journal and copied back to each Volume file
* in roughly sequential order.
*/
public static class PageNode {
final int _volumeHandle;
final long _pageAddress;
final long _timestamp;
long _journalAddress;
int _offset;
PageNode _previous;
PageNode(final int volumeHandle, final long pageAddress) {
this(volumeHandle, pageAddress, Long.MIN_VALUE, -1);
}
PageNode(final int volumeHandle, final long pageAddress, final long journalAddress, final long timestamp) {
this._volumeHandle = volumeHandle;
this._pageAddress = pageAddress;
this._journalAddress = journalAddress;
this._timestamp = timestamp;
}
/**
* Construct a copy, also copying members of the linked list. Used by
* #queryPageNode and #queryBranchNode.
*/
PageNode(final PageNode pageNode) {
_volumeHandle = pageNode._volumeHandle;
_pageAddress = pageNode._pageAddress;
_journalAddress = pageNode._journalAddress;
_timestamp = pageNode._timestamp;
_offset = pageNode._offset;
final PageNode previous = pageNode._previous;
if (previous != null) {
_previous = new PageNode(previous);
}
}
/**
* @return the previous
*/
public PageNode getPrevious() {
return _previous;
}
/**
* @param previous
* the previous to set
*/
public void setPrevious(final PageNode previous) {
if (previous != null) {
assert _timestamp >= previous._timestamp;
}
this._previous = previous;
}
/**
* @return the volumeHandle
*/
public int getVolumeHandle() {
return _volumeHandle;
}
/**
* @return the pageAddress
*/
public long getPageAddress() {
return _pageAddress;
}
/**
* @return the journalAddress
*/
public long getJournalAddress() {
return _journalAddress;
}
/**
* @return the timestamp
*/
public long getTimestamp() {
return _timestamp;
}
public void setOffset(final int offset) {
_offset = offset;
}
public int getOffset() {
return _offset;
}
@Override
public int hashCode() {
return _volumeHandle ^ (int) _pageAddress ^ (int) (_pageAddress >>> 32);
}
@Override
public boolean equals(final Object obj) {
if (!(obj instanceof PageNode)) {
return false;
}
final PageNode pn = (PageNode) obj;
return _pageAddress == pn._pageAddress && _volumeHandle == pn._volumeHandle;
}
@Override
public String toString() {
return String.format("[%d]%d@%d{%d}%s", _volumeHandle, _pageAddress, _journalAddress, _timestamp,
_previous == null ? "" : "+");
}
public String toString(final JournalManager jman) {
final Volume volume = jman._handleToVolumeMap.get(_volumeHandle);
if (volume == null) {
return toString();
}
return String.format("%s:%d@%d{%d}%s", volume, _pageAddress, _journalAddress, _timestamp,
_previous == null ? "" : "+");
}
public String toStringPageAddress(final VolumeHandleLookup lvh) {
final Volume volume = lvh.lookupVolumeHandle(_volumeHandle);
return String.format("%s:%d", volume == null ? String.valueOf(_volumeHandle) : volume.toString(),
_pageAddress);
}
public String toStringJournalAddress(final VolumeHandleLookup lvn) {
return String.format("%d{%d}%s", _journalAddress, _timestamp, _previous == null ? "" : "+");
}
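/**
* Orders PageNodes for reading during copy-back: valid nodes sort by
* journal address so that reads proceed roughly sequentially through the
* journal files; invalid nodes sort first so they can be skipped cheaply.
*/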
final static Comparator<PageNode> READ_COMPARATOR = new Comparator<PageNode>() {
@Override
public int compare(final PageNode a, final PageNode b) {
if (!a.isInvalid() && !b.isInvalid()) {
return a.getJournalAddress() > b.getJournalAddress() ? 1 : a.getJournalAddress() < b
.getJournalAddress() ? -1 : 0;
}
if (a.isInvalid() && !b.isInvalid()) {
return -1;
}
if (!a.isInvalid() && b.isInvalid()) {
return 1;
}
if (a._volumeHandle != b._volumeHandle) {
return a._volumeHandle - b._volumeHandle;
}
return a._pageAddress > b._pageAddress ? 1 : a._pageAddress < b._pageAddress ? -1 : 0;
}
};
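/**
* Orders PageNodes for writing during copy-back: by volume handle, then
* by page address, so that writes to each Volume proceed in roughly
* sequential page order.
*/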
final static Comparator<PageNode> WRITE_COMPARATOR = new Comparator<PageNode>() {
@Override
public int compare(final PageNode a, final PageNode b) {
if (a.getVolumeHandle() != b.getVolumeHandle()) {
return a.getVolumeHandle() < b.getVolumeHandle() ? -1 : 1;
}
return a.getPageAddress() < b.getPageAddress() ? -1 : a.getPageAddress() > b.getPageAddress() ? 1 : 0;
}
};
boolean isInvalid() {
return _journalAddress == Long.MIN_VALUE;
}
void invalidate() {
_journalAddress = Long.MIN_VALUE;
}
void removeHistory() {
PageNode pn = getPrevious();
setPrevious(null);
while (pn != null) {
final PageNode previous = pn.getPrevious();
pn.invalidate();
pn.setPrevious(null);
pn = previous;
}
}
}
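/**
* An entry in the live transaction map. Records the journal address and
* start timestamp of a transaction, the address of its most recent
* journal record, and its commit timestamp once the outcome is known.
*/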
public static class TransactionMapItem implements Comparable<TransactionMapItem> {
private final long _startAddress;
private final long _startTimestamp;
private long _commitTimestamp;
private long _lastRecordAddress;
TransactionMapItem(final long startTimestamp, final long address) {
_startTimestamp = startTimestamp;
_commitTimestamp = 0;
_startAddress = address;
_lastRecordAddress = address;
}
TransactionMapItem(final TransactionMapItem item) {
_startAddress = item._startAddress;
_startTimestamp = item._startTimestamp;
_commitTimestamp = item._commitTimestamp;
_lastRecordAddress = item._lastRecordAddress;
}
public long getStartAddress() {
return _startAddress;
}
public long getStartTimestamp() {
return _startTimestamp;
}
public long getCommitTimestamp() {
return _commitTimestamp;
}
public long getLastRecordAddress() {
return _lastRecordAddress;
}
void setCommitTimestamp(final long commitTimestamp) {
_commitTimestamp = commitTimestamp;
}
void setLastRecordAddress(final long address) {
_lastRecordAddress = address;
}
public boolean isCommitted() {
return _commitTimestamp > 0;
}
public boolean isAborted() {
return _commitTimestamp == ABORTED;
}
@Override
public String toString() {
return String.format("TStatus %,d{%,d}%s", _startAddress, _commitTimestamp, isCommitted() ? "c" : "u");
}
@Override
public int compareTo(final TransactionMapItem ts) {
if (isCommitted()) {
return ts.getCommitTimestamp() < _commitTimestamp ? 1 : ts.getCommitTimestamp() > _commitTimestamp ? -1
: 0;
} else {
return ts.isCommitted() ? -1 : ts.getStartTimestamp() < _startTimestamp ? 1
: ts.getStartTimestamp() > _startTimestamp ? -1 : 0;
}
}
final static Comparator<TransactionMapItem> TRANSACTION_MAP_ITEM_COMPARATOR = new Comparator<TransactionMapItem>() {
@Override
public int compare(final TransactionMapItem a, final TransactionMapItem b) {
return a.getLastRecordAddress() > b.getLastRecordAddress() ? 1 : a.getLastRecordAddress() < b
.getLastRecordAddress() ? -1 : 0;
}
};
}
private class JournalCopier extends IOTaskRunnable {
private volatile boolean _shouldStop = false;
private final ByteBuffer _bb = ByteBuffer.allocate(DEFAULT_COPY_BUFFER_SIZE);
private final List<PageNode> _copyList = new ArrayList<PageNode>(_copiesPerCycle);
int _lastCyclePagesWritten;
JournalCopier() {
super(JournalManager.this._persistit);
}
void start() {
start("JOURNAL_COPIER", _copierInterval);
}
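/**
* One copier cycle: select up to _copiesPerCycle eligible pages, read
* their images from the journal, write them back to their Volumes, then
* clean up the page map and, when possible, delete obsolete journal
* files. The cycle also adjusts the throttle interval applied to
* application threads according to the current urgency.
*/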
@Override
public void runTask() throws Exception {
_copying.set(true);
try {
_copyList.clear();
if (!_appendOnly.get()) {
selectForCopy(_copyList);
if (!_copyList.isEmpty()) {
readForCopy(_copyList, _bb);
}
if (!_copyList.isEmpty()) {
writeForCopy(_copyList, _bb);
}
}
cleanupForCopy(_copyList);
_lastCyclePagesWritten = _copyList.size();
if (_copyList.isEmpty()) {
_copyFast.set(false);
}
} finally {
_copying.set(false);
}
long throttleInterval = 0;
if (!_appendOnly.get()) {
final int urgency = urgency();
if (urgency == URGENT) {
throttleInterval = URGENT_COMMIT_DELAY_MILLIS;
} else if (urgency > ALMOST_URGENT) {
throttleInterval = GENTLE_COMMIT_DELAY_MILLIS;
}
}
if (throttleInterval != _throttleSleepInterval) {
_throttleSleepInterval = throttleInterval;
}
}
@Override
protected boolean shouldStop() {
return _closed.get() || _shouldStop;
}
/**
* Return a nice interval, in milliseconds, to wait between copier
* cycles. The interval decreases as the urgency goes up, and becomes
* zero when the urgency is greater than or equal to ALMOST_URGENT. The
* interval is also reduced when there has been little recent I/O
* activity caused by other operations.
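*
* For example (hypothetical numbers): with a base poll interval of
* 10,000ms, recent I/O activity above the quiescent threshold, and
* urgency 7, the divisor is urgency - HALF_URGENT = 2, so the copier
* polls every 5,000ms.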
*/
@Override
public long pollInterval() {
final IOMeter iom = _persistit.getIOMeter();
final long pollInterval = super.getPollInterval();
final int urgency = urgency();
if (_lastCyclePagesWritten == 0) {
return pollInterval;
}
if (urgency >= ALMOST_URGENT) {
return 0;
}
int divisor = 1;
if (iom.recentCharge() < iom.getQuiescentIOthreshold() * KILO) {
divisor = HALF_URGENT;
} else if (urgency > HALF_URGENT) {
divisor = urgency - HALF_URGENT;
}
return super.getPollInterval() / divisor;
}
}
private class JournalFlusher extends IOTaskRunnable {
volatile long _lastExceptionTimestamp = 0;
volatile Exception _lastException = null;
long[] _ioTimes = new long[IO_MEASUREMENT_CYCLES];
int _ioCycle;
volatile long _expectedIoTime;
volatile long _startTime;
volatile long _endTime;
volatile long _startTimestamp;
volatile long _endTimestamp;
JournalFlusher() {
super(JournalManager.this._persistit);
}
void start() {
start("JOURNAL_FLUSHER", _flushInterval);
}
/**
* General method used to wait for durability. See
* {@link JournalManager#waitForDurability(long, long, long)}.
*
* @throws PersistitInterruptedException
*/
private void waitForDurability(final long flushedTimestamp, final long leadTime, final long stallTime)
throws PersistitException {
/*
* Commit is known durable once the JOURNAL_FLUSHER thread has completed
* a force cycle that began after flushedTimestamp, i.e., has posted
* _startTimestamp and _endTimestamp values both larger than
* flushedTimestamp.
*/
final long now = System.nanoTime();
long remainingStallTime = stallTime;
while (true) {
/*
* Detect whether an I/O cycle is in progress; if so estimate
* how much more time (in nanoseconds) it will require to
* complete.
*/
long estimatedRemainingIoNanos = -1;
long startTime;
long endTime;
long startTimestamp;
long endTimestamp;
/*
* Spin until values are stable
*/
while (true) {
startTimestamp = _startTimestamp;
endTimestamp = _endTimestamp;
startTime = _startTime;
endTime = _endTime;
if (startTimestamp == _startTimestamp && endTimestamp == _endTimestamp) {
if (flushedTimestamp > startTimestamp && startTimestamp > endTimestamp) {
estimatedRemainingIoNanos = Math.max(startTime + _expectedIoTime - now, 0);
}
break;
}
Util.spinSleep();
}
if (endTimestamp > flushedTimestamp && startTimestamp > flushedTimestamp) {
/*
* Done - commit is durable
*/
break;
}
long remainingSleepNanos;
if (estimatedRemainingIoNanos == -1) {
remainingSleepNanos = Math.max(0, _flushInterval - (now - endTime));
} else {
remainingSleepNanos = _flushInterval;
}
long estimatedNanosToFinish;
if (startTimestamp < flushedTimestamp) {
estimatedNanosToFinish = remainingSleepNanos + _expectedIoTime;
} else {
estimatedNanosToFinish = estimatedRemainingIoNanos;
}
if (leadTime > 0 && leadTime * NS_PER_MS >= estimatedNanosToFinish) {
/*
* If the caller specified a leadTime interval larger than
* the estimated time remaining in the cycle, then return
* immediately. This handles the "soft" commit case.
*/
break;
} else if (estimatedRemainingIoNanos == -1) {
/*
* If there is no I/O in progress, then wait as long as
* possible (determined by stallTime) before kicking the
* JOURNAL_FLUSHER to write the caller's transaction.
*/
if (remainingStallTime > 0) {
Util.sleep(remainingStallTime);
remainingStallTime = 0;
} else {
kick();
Util.spinSleep();
}
} else {
/*
* Otherwise wait for concurrent I/O operation to finish. Do
* this by polling because our experiments with using locks
* here showed significant excess CPU consumption.
*/
Util.spinSleep();
}
}
if (_lastExceptionTimestamp > flushedTimestamp) {
final Exception e = _lastException;
if (e instanceof PersistitException) {
throw (PersistitException) e;
} else {
throw new PersistitException(e);
}
}
_totalCommits.incrementAndGet();
_totalCommitWaitTime.addAndGet(System.nanoTime() - now);
}
@Override
protected void runTask() {
_flushing.set(true);
try {
try {
/*
* The _startTimestamp/_endTimestamp pair posted here is intended
* only to help other threads in waitForDurability know when the
* I/O operation has finished.
*/
try {
_startTimestamp = _persistit.getTimestampAllocator().updateTimestamp();
_startTime = System.nanoTime();
/*
* Flush the write buffer and call FileChannel.force().
*/
force();
} finally {
_endTime = System.nanoTime();
_endTimestamp = _persistit.getTimestampAllocator().updateTimestamp();
}
final long elapsed = _endTime - _startTime;
_totalFlushCycles.incrementAndGet();
_totalFlushIoTime.addAndGet(elapsed);
_ioTimes[_ioCycle] = elapsed;
_ioCycle = (_ioCycle + 1) % IO_MEASUREMENT_CYCLES;
long avg = 0;
for (int index = 0; index < IO_MEASUREMENT_CYCLES; index++) {
avg += _ioTimes[index];
}
avg /= IO_MEASUREMENT_CYCLES;
_expectedIoTime = avg;
if (elapsed > _slowIoAlertThreshold * NS_PER_MS) {
_persistit.getLogBase().longJournalIO.log(elapsed / NS_PER_MS, IO_MEASUREMENT_CYCLES, avg
/ NS_PER_MS);
}
} catch (final Exception e) {
if (e instanceof InterruptedException || e instanceof FatalErrorException) {
_closed.set(true);
} else if (e instanceof PersistitException) {
_persistit.getAlertMonitor().post(
new Event(AlertLevel.ERROR, _persistit.getLogBase().journalWriteError, e,
addressToFile(_writeBufferAddress), addressToOffset(_writeBufferAddress)),
AlertMonitor.JOURNAL_CATEGORY);
} else {
_persistit.getLogBase().journalWriteError.log(e, addressToFile(_writeBufferAddress),
addressToOffset(_writeBufferAddress));
}
}
} finally {
_flushing.set(false);
}
}
@Override
protected boolean shouldStop() {
return _closed.get();
}
}
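/**
* Select up to _copiesPerCycle PageNodes whose page images are eligible
* to be copied back to their Volumes, i.e., whose timestamps precede
* both the last valid checkpoint and the copier timestamp limit.
*/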
synchronized void selectForCopy(final List<PageNode> list) {
list.clear();
if (!_appendOnly.get()) {
final long timeStampUpperBound = Math.min(getLastValidCheckpointTimestamp(), _copierTimestampLimit);
for (final Iterator<PageNode> iterator = _pageList.iterator(); iterator.hasNext();) {
final PageNode pageNode = iterator.next();
for (PageNode pn = pageNode; pn != null && !pn.isInvalid(); pn = pn.getPrevious()) {
if (pn.getTimestamp() < timeStampUpperBound) {
list.add(pn);
break;
}
}
if (list.size() >= _copiesPerCycle) {
break;
}
}
}
}
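/**
* Read the page images identified by the supplied PageNodes from the
* journal into the supplied buffer, recording each node's offset within
* the buffer. The list is sorted by READ_COMPARATOR first so that reads
* proceed in roughly sequential journal-address order.
*/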
void readForCopy(final List<PageNode> list, final ByteBuffer bb) throws PersistitException {
Collections.sort(list, PageNode.READ_COMPARATOR);
bb.clear();
Volume volume = null;
int handle = -1;
for (final Iterator<PageNode> iterator = list.iterator(); iterator.hasNext();) {
final PageNode pageNode = iterator.next();
if (pageNode.isInvalid()) {
iterator.remove();
continue;
}
pageNode.setOffset(-1);
if (pageNode.getVolumeHandle() != handle) {
handle = -1;
try {
volume = volumeForHandle(pageNode.getVolumeHandle());
handle = volume.getHandle();
} catch (final VolumeNotFoundException vnfe) {
// Deal with this in writeForCopy
continue;
}
}
if (volume == null) {
// Deal with this in writeForCopy
continue;
}
final int at = bb.position();
final long pageAddress;
try {
final PageNode stablePageNode = new PageNode(pageNode);
if (pageNode.isInvalid()) {
iterator.remove();
continue;
}
pageAddress = readPageBufferFromJournal(stablePageNode, bb);
_persistit.getIOMeter().chargeCopyPageFromJournal(volume, pageAddress, volume.getPageSize(),
stablePageNode.getJournalAddress(), urgency());
} catch (final PersistitException ioe) {
_persistit
.getAlertMonitor()
.post(new Event(AlertLevel.ERROR, _persistit.getLogBase().copyException, ioe, volume,
pageNode.getPageAddress(), pageNode.getJournalAddress()), AlertMonitor.JOURNAL_CATEGORY);
throw ioe;
}
Debug.$assert0.t(pageAddress == pageNode.getPageAddress());
pageNode.setOffset(at);
if (bb.limit() - at != volume.getStructure().getPageSize()) {
throw new CorruptJournalException(pageNode.toStringPageAddress(this) + " bufferSize " + bb.limit()
+ " does not match " + volume + " bufferSize " + volume.getPageSize() + " at "
+ pageNode.toStringJournalAddress(this));
}
bb.position(bb.limit());
}
}
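/**
* Write the page images read by readForCopy back to their Volumes. The
* list is sorted by WRITE_COMPARATOR so that writes to each Volume
* proceed in roughly sequential page order; each affected Volume is
* forced afterward to make the copied pages durable.
*/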
void writeForCopy(final List<PageNode> list, final ByteBuffer bb) throws PersistitException {
Collections.sort(list, PageNode.WRITE_COMPARATOR);
Volume volume = null;
int handle = -1;
final Set<Volume> volumes = new HashSet<Volume>();
for (final Iterator<PageNode> iterator = list.iterator(); iterator.hasNext();) {
final PageNode pageNode = iterator.next();
if (pageNode.getVolumeHandle() != handle) {
handle = -1;
volume = null;
Volume candidate = null;
try {
candidate = lookupVolumeHandle(pageNode.getVolumeHandle());
if (candidate != null) {
if (!candidate.isOpened()) {
candidate.open(_persistit);
}
handle = pageNode.getVolumeHandle();
volume = candidate;
}
} catch (final VolumeNotFoundException vnfe) {
_persistit.getAlertMonitor().post(
new Event(AlertLevel.WARN, _persistit.getLogBase().missingVolume, candidate,
pageNode.getJournalAddress()), AlertMonitor.MISSING_VOLUME_CATEGORY);
if (_ignoreMissingVolume.get()) {
_persistit.getLogBase().lostPageFromMissingVolume.log(pageNode.getPageAddress(), candidate,
pageNode.getJournalAddress());
// Leaving the page in the List here causes cleanupForCopy
// to remove it from the page map.
continue;
}
}
}
if (volume == null || volume.isClosed()) {
// Remove from the List so that below we won't remove it
// from the pageMap.
iterator.remove();
continue;
}
final long pageAddress = pageNode.getPageAddress();
volume.getStorage().extend(pageAddress);
final int pageSize = volume.getPageSize();
final int at = pageNode.getOffset();
bb.limit(bb.capacity()).position(at).limit(at + pageSize);
try {
volume.getStorage().writePage(bb, pageAddress);
volumes.add(volume);
} catch (final PersistitException ioe) {
_persistit.getLogBase().copyException.log(ioe, volume, pageNode.getPageAddress(),
pageNode.getJournalAddress());
throw ioe;
}
_copiedPageCount++;
_persistit.getIOMeter().chargeCopyPageToVolume(volume, pageAddress, volume.getPageSize(),
pageNode.getJournalAddress(), urgency());
}
for (final Volume vol : volumes) {
vol.getStorage().force();
}
}
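/**
* After a copy cycle: remove successfully copied PageNodes from the
* page map, advance the journal base address past records no longer
* needed for recovery, and close and delete journal files that lie
* entirely before the delete boundary.
*/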
private void cleanupForCopy(final List<PageNode> list) throws PersistitException {
//
// Files and FileChannels no longer needed for recovery.
//
final List<FileChannel> obsoleteFileChannels = new ArrayList<FileChannel>();
final List<File> obsoleteFiles = new ArrayList<File>();
// Address of the first file needed for recovery
long deleteBoundary = 0;
synchronized (this) {
for (final PageNode copiedPageNode : list) {
PageNode pageNode = _pageMap.get(copiedPageNode);
if (pageNode.getJournalAddress() == copiedPageNode.getJournalAddress()) {
pageNode.removeHistory();
pageNode.invalidate();
final PageNode pn = _pageMap.remove(pageNode);
assert pn == copiedPageNode;
} else {
PageNode previous = pageNode.getPrevious();
while (previous != null) {
if (previous.getJournalAddress() == copiedPageNode.getJournalAddress()) {
// No need to keep the previous entry, or any of
// its predecessors
pageNode.removeHistory();
break;
} else {
pageNode = previous;
previous = pageNode.getPrevious();
}
}
}
}
_droppedPageCount += cleanupPageList() - list.size();
//
// Will hold the address of the first record containing information
// not yet copied back into a Volume, and therefore required for
// recovery.
//
long recoveryBoundary = _currentAddress;
//
// Detect first journal address holding a mapped page
// required for recovery
//
for (final PageNode pageNode : _pageMap.values()) {
//
// If there are multiple versions, we need to keep
// the most recent one that has been checkpointed.
//
for (PageNode pn = pageNode; pn != null; pn = pn.getPrevious()) {
if (!pn.isInvalid() && pn.getJournalAddress() < recoveryBoundary) {
recoveryBoundary = pn.getJournalAddress();
}
}
}
//
// Detect first journal address still holding an uncheckpointed
// Transaction required for recovery.
//
for (final Iterator<TransactionMapItem> iterator = _liveTransactionMap.values().iterator(); iterator
.hasNext();) {
final TransactionMapItem item = iterator.next();
if (item.getStartAddress() < recoveryBoundary) {
recoveryBoundary = item.getStartAddress();
}
}
if (recoveryBoundary < _baseAddress) {
throw new IllegalStateException(String.format("Retrograde base address %,d is less than current %,d",
recoveryBoundary, _baseAddress));
}
_baseAddress = recoveryBoundary;
for (deleteBoundary = _deleteBoundaryAddress; deleteBoundary + _blockSize <= _lastValidCheckpointBaseAddress; deleteBoundary += _blockSize) {
final long generation = deleteBoundary / _blockSize;
final FileChannel channel = _journalFileChannels.remove(generation);
if (channel != null) {
obsoleteFileChannels.add(channel);
}
obsoleteFiles.add(addressToFile(deleteBoundary));
}
//
// These conditions mean that there is no active content in the
// journal and that the current journal file is larger than the
// rolloverThreshold. When both conditions are met we force a
// rollover and delete the current journal file. This behavior
// keeps the journal small when there are no un-checkpointed
// pages or transactions.
//
if (_baseAddress == _currentAddress && _lastValidCheckpointBaseAddress >= _currentAddress - CP.OVERHEAD
&& (getCurrentJournalSize() > rolloverThreshold())) {
final FileChannel channel = _journalFileChannels.remove(_currentAddress / _blockSize);
if (channel != null) {
obsoleteFileChannels.add(channel);
}
obsoleteFiles.add(addressToFile(_currentAddress));
rolloverWithNewBaseAndFile();
}
}
for (final FileChannel channel : obsoleteFileChannels) {
if (channel != null) {
try {
channel.close();
} catch (final IOException e) {
// TODO - log this?
// Ignored for now - this simply means we can't close
// a file we don't need any more.
}
}
}
boolean deleted = true;
for (final File file : obsoleteFiles) {
if (!file.delete()) {
deleted = false;
// TODO - log this.
// Ignored for now - this simply means we can't delete
// a file we don't need any more.
}
}
if (deleted) {
_deleteBoundaryAddress = deleteBoundary;
}
reportJournalFileCount();
}
/**
* Remove obsolete PageNodes from the page list.
*
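* For example, a list of [valid, invalid, valid, invalid] is compacted
* in place to [valid, valid] and 2 is returned.
*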
* @return Count of removed PageNode instances.
*/
int cleanupPageList() {
final int size = _pageList.size();
int from;
for (from = 0; from < size && !_pageList.get(from).isInvalid(); from++)
;
int to = from;
for (from = from + 1; from < size; from++) {
final PageNode pn = _pageList.get(from);
if (!pn.isInvalid()) {
_pageList.set(to++, pn);
}
}
if (size > to) {
_pageList.removeRange(to, size);
}
return size - to;
}
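/**
* Invalidate every journaled copy of a page belonging to the supplied
* Volume whose timestamp is earlier than the supplied timestamp.
* Invalid PageNodes are skipped by the copier and removed by
* cleanupPageList.
*/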
synchronized void truncate(final Volume volume, final long timestamp) {
for (final PageNode lastPageNode : _pageMap.values()) {
PageNode pageNode = lastPageNode;
while (pageNode != null) {
if (volume.getHandle() == pageNode.getVolumeHandle() && pageNode.getTimestamp() < timestamp) {
pageNode.invalidate();
}
pageNode = pageNode.getPrevious();
}
}
}
private void reportJournalFileCount() {
/*
* Does not need synchronization since only the JOURNAL_COPIER thread
* calls this
*/
final int journalFileCount = getJournalFileCount();
if (journalFileCount != _lastReportedJournalFileCount) {
if (journalFileCount > TOO_MANY_ERROR_THRESHOLD + _urgentFileCountThreshold) {
_persistit.getAlertMonitor()
.post(new Event(AlertLevel.ERROR, _persistit.getLogBase().tooManyJournalFilesError,
journalFileCount), AlertMonitor.MANY_JOURNAL_FILES);
} else if (journalFileCount > TOO_MANY_WARN_THRESHOLD + _urgentFileCountThreshold) {
_persistit.getAlertMonitor()
.post(new Event(AlertLevel.WARN, _persistit.getLogBase().tooManyJournalFilesWarning,
journalFileCount), AlertMonitor.MANY_JOURNAL_FILES);
} else {
_persistit.getAlertMonitor().post(
new Event(AlertLevel.NORMAL, _persistit.getLogBase().normalJournalFileCount, journalFileCount),
AlertMonitor.MANY_JOURNAL_FILES);
}
_lastReportedJournalFileCount = journalFileCount;
}
}
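/**
* TransactionPlayerSupport that reads transaction records directly from
* the journal into a private buffer. Long-record conversion is a no-op
* because records replayed for pruning do not need their long record
* values recovered.
*/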
private class JournalTransactionPlayerSupport implements TransactionPlayerSupport {
final ByteBuffer _readBuffer = ByteBuffer.allocate(Transaction.TRANSACTION_BUFFER_SIZE
+ JournalRecord.TX.OVERHEAD);
@Override
public void read(final long address, final int size) throws PersistitIOException {
_readBuffer.clear().limit(size);
readFully(_readBuffer, address);
}
@Override
public ByteBuffer getReadBuffer() {
return _readBuffer;
}
@Override
public void convertToLongRecord(final Value value, final int treeHandle, final long address,
final long commitTimestamp) throws PersistitException {
// Do nothing - long record value does not need to be recovered for
// pruning
}
@Override
public Persistit getPersistit() {
return _persistit;
}
}
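/**
* TransactionPlayerListener that replays an aborted transaction's
* records as prune operations: store and removeKeyRange records cause
* the affected pages to be pruned rather than re-applied.
*/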
class ProactiveRollbackListener implements TransactionPlayerListener {
TransactionStatus status;
@Override
public void store(final long address, final long timestamp, final Exchange exchange) throws PersistitException {
exchange.prune();
}
@Override
public void removeKeyRange(final long address, final long timestamp, final Exchange exchange, final Key from,
final Key to) throws PersistitException {
try {
exchange.prune(from, to);
} catch (final RebalanceException e) {
// ignore
}
}
@Override
public void removeTree(final long address, final long timestamp, final Exchange exchange)
throws PersistitException {
// TODO
}
@Override
public void delta(final long address, final long timestamp, final Tree tree, final int index,
final int accumulatorType, final long value) throws PersistitException {
// Nothing to undo.
}
@Override
public void startRecovery(final long address, final long timestamp) throws PersistitException {
// Default: do nothing
}
@Override
public void startTransaction(final long address, final long startTimestamp, final long commitTimestamp)
throws PersistitException {
status = _persistit.getTransactionIndex().getStatus(startTimestamp);
}
@Override
public void endTransaction(final long address, final long timestamp) throws PersistitException {
final TransactionStatus ts = _persistit.getTransactionIndex().getStatus(timestamp);
/*
* Can be null because the MVV count became zero and
* TransactionIndex already removed it.
*/
if (ts != null) {
if (ts.getMvvCount() > 0 && _persistit.isInitialized()) {
_persistit.getLogBase().pruningIncomplete.log(ts,
TransactionPlayer.addressToString(address, timestamp));
}
}
}
@Override
public void endRecovery(final long address, final long timestamp) throws PersistitException {
// Default: do nothing
}
@Override
public boolean requiresLongRecordConversion() {
return false;
}
@Override
public boolean createTree(final long timestamp) throws PersistitException {
return false;
}
}
/**
* Extends ArrayList to expose the protected removeRange method.
*/
@SuppressWarnings("serial")
static class RangeRemovingArrayList<T> extends ArrayList<T> {
@Override
public void removeRange(final int fromIndex, final int toIndex) {
super.removeRange(fromIndex, toIndex);
}
}
private long rolloverThreshold() {
return _closed.get() ? 0 : ROLLOVER_THRESHOLD;
}
/**
* @return number of internal handle values that have been assigned so far
*/
public int getHandleCount() {
return _handleCounter;
}
long getLastValidCheckpointBaseAddress() {
return _lastValidCheckpointBaseAddress;
}
/**
* For use only by unit tests that test page maps, etc.
*
* @param handleToVolumeMap
*/
synchronized void unitTestInjectVolumes(final Map<Integer, Volume> handleToVolumeMap) {
_handleToVolumeMap.putAll(handleToVolumeMap);
}
/**
* For use only by unit tests that test page maps, etc.
*
* @param pageMap
*/
void unitTestInjectPageMap(final Map<PageNode, PageNode> pageMap) {
_pageMap.putAll(pageMap);
}
void unitTestInjectTransactionMap(final Map<Long, TransactionMapItem> transactionMap) {
_liveTransactionMap.putAll(transactionMap);
}
void unitTestClearTransactionMap() {
_liveTransactionMap.clear();
}
long getCurrentJournalSize() {
return _currentAddress % _blockSize;
}
long getWriteBufferAddress() {
return _writeBufferAddress;
}
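/**
* Number of journal files spanned by live content. For example, with a
* hypothetical 1 GB block size, a base address in file N and a current
* address in file N + 2 yield a count of 3.
*/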
int getJournalFileCount() {
return (int) (_currentAddress / _blockSize - _baseAddress / _blockSize) + 1;
}
synchronized boolean unitTestTxnExistsInLiveMap(final Long startTimestamp) {
return _liveTransactionMap.containsKey(startTimestamp);
}
void unitTestInjectPageList(final List<PageNode> list) {
_pageList.addAll(list);
}
boolean unitTestPageListEquals(final List<PageNode> list) {
return list.equals(_pageList);
}
synchronized List<File> unitTestGetAllJournalFiles() {
final List<File> files = new ArrayList<File>();
for (final Long address : _journalFileChannels.keySet()) {
files.add(addressToFile(address));
}
return files;
}
void unitTestAllowHandlesForTemporaryVolumesAndTrees() {
_allowHandlesForTempVolumesAndTrees = true;
}
public PageNode queryPageNode(final int volumeHandle, final long pageAddress) {
final PageNode pn = _pageMap.get(new PageNode(volumeHandle, pageAddress, -1, -1));
if (pn != null) {
return new PageNode(pn);
} else {
return null;
}
}
public PageNode queryBranchNode(final int volumeHandle, final long pageAddress) {
final PageNode pn = _branchMap.get(new PageNode(volumeHandle, pageAddress, -1, -1));
if (pn != null) {
return new PageNode(pn);
} else {
return null;
}
}
public TransactionMapItem queryTransactionMap(final long timestamp) {
final TransactionMapItem item = _liveTransactionMap.get(timestamp);
if (item != null) {
return new TransactionMapItem(item);
} else {
return null;
}
}
public SortedMap<Integer, Volume> queryVolumeMap() {
return new TreeMap<Integer, Volume>(_handleToVolumeMap);
}
public SortedMap<Integer, TreeDescriptor> queryTreeMap() {
return new TreeMap<Integer, TreeDescriptor>(_handleToTreeMap);
}
}