Package org.voltdb.network

Source Code of org.voltdb.network.NIOWriteStream

/* This file is part of VoltDB.
* Copyright (C) 2008-2010 VoltDB Inc.
*
* VoltDB is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* VoltDB is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with VoltDB.  If not, see <http://www.gnu.org/licenses/>.
*/

package org.voltdb.network;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.GatheringByteChannel;
import java.nio.channels.SelectionKey;
import java.util.ArrayDeque;

import org.apache.log4j.Logger;
import org.voltdb.messaging.FastSerializable;
import org.voltdb.messaging.FastSerializer;
import org.voltdb.utils.DBBPool;
import org.voltdb.utils.DBBPool.BBContainer;
import org.voltdb.utils.DeferredSerialization;
import org.voltdb.utils.EstTime;
import org.voltdb.utils.VoltLoggerFactory;

/**
*
*  Provides a queue for ByteBuffers and DeferredSerializations and drains them to a gathering ByteChannel.
*  Uses a thread local memory pool for serializing messages that are smaller than MAX_GATHERING_WRITE and
*  HeapByteBuffers otherwise. Jumps through serious hoops to avoid ever writing large HeapByteBuffers to the
*  channel, because Java will allocate a DirectByteBuffer and copy ALL the data into it even if only a small
*  fraction can reasonably be written to the channel. That wastes time copying data that can never make it
*  into the channel in non-blocking mode, and wastes space because the DirectByteBuffer is never released,
*  unlike the pool, which shrinks after a long period without usage.
*
*  The value m_port.m_expectedOutgoingMessageSize is used to set the initial storage a FastSerializer will
*  allocate when doing deferred serialization of FastSerializables. FastSerializable + enqueue is the
*  best way to serialize data unless you can't pick a good value for m_port.m_expectedOutgoingMessageSize.
*  In most cases you are optimizing for the bulk of your messages, and it is fine to guess a little high
*  since the memory allocation handles that well.
*/
public class NIOWriteStream implements WriteStream {
    private static final Logger LOG = Logger.getLogger(NIOWriteStream.class);
   
    /**
     * Reference to the port for changing interest ops
     */
    private final VoltPort m_port;

    private static final Logger networkLog = Logger.getLogger("NETWORK", VoltLoggerFactory.instance());

    private boolean m_isShutdown = false;

    /**
     * Contains serialized buffers ready to write to the socket
     */
    private final ArrayDeque<BBContainer> m_queuedBuffers = new ArrayDeque<BBContainer>();

    /**
     * Contains messages waiting to be serialized and written to the socket
     */
    private final ArrayDeque<DeferredSerialization> m_queuedWrites1 =
        new ArrayDeque<DeferredSerialization>();

    private final ArrayDeque<DeferredSerialization> m_queuedWrites2 =
        new ArrayDeque<DeferredSerialization>();

    private ArrayDeque<DeferredSerialization> m_queuedWrites = m_queuedWrites1;

    private final int m_maxQueuedWritesBeforeBackpressure = 100;

    private final Runnable m_offBackPressureCallback;
    private final Runnable m_onBackPressureCallback;

    private final QueueMonitor m_monitor;

    private long m_bytesWritten = 0;
    private long m_messagesWritten = 0;

    /*
     * Used to provide incremental reads of the amount of
     * data written
     */
    private long m_lastBytesWritten = 0;
    private long m_lastMessagesWritten = 0;

    synchronized long[] getBytesAndMessagesWritten(boolean interval) {
        if (interval) {
            final long bytesWrittenThisTime = m_bytesWritten - m_lastBytesWritten;
            m_lastBytesWritten = m_bytesWritten;

            final long messagesWrittenThisTime = m_messagesWritten - m_lastMessagesWritten;
            m_lastMessagesWritten = m_messagesWritten;
            return new long[] { bytesWrittenThisTime, messagesWrittenThisTime };
        } else {
            return new long[] {m_bytesWritten, m_messagesWritten};
        }
    }

    /**
     * Set to -1 when there are no pending writes. If there is a pending write it is set to the time
     * of the last successful write or the time the oldest pending write was queued.
     */
    private long m_lastPendingWriteTime = -1;

    NIOWriteStream(VoltPort port) {
        this(port, null, null, null);
    }

    NIOWriteStream (
            VoltPort port,
            Runnable offBackPressureCallback,
            Runnable onBackPressureCallback,
            QueueMonitor monitor)
    {
        m_port = port;
        m_offBackPressureCallback = offBackPressureCallback;
        m_onBackPressureCallback = onBackPressureCallback;
        m_monitor = monitor;

    }

    synchronized public boolean isEmpty()
    {
        return m_queuedBuffers.isEmpty() && m_queuedWrites.isEmpty();
    }

    /**
     * Returns true when a drainTo invocation was unable to completely drain all queued bytes, or when
     * more than m_maxQueuedWritesBeforeBackpressure writes are queued
     */
    @Override
    public boolean hadBackPressure() {
        return m_hadBackPressure;
    }

    @Override
    public void setBackPressure(boolean enable) {
        if (enable) {
            this.backpressureStarted();
        } else {
            this.backpressureEnded();
        }
    }

    /**
     * Called when not all queued data could be flushed to the channel
     */
    public final void backpressureStarted() {
        if (networkLog.isTraceEnabled()) {
            networkLog.trace("Backpressure started for client " + m_port);
        }
        if (m_hadBackPressure == false) {
            m_hadBackPressure = true;
            if (m_onBackPressureCallback != null) {
                m_onBackPressureCallback.run();
            }
        }
    }

    /**
     * Called when all queued data is flushed to the channel
     */
    public final void backpressureEnded() {
        if (networkLog.isTraceEnabled()) {
            networkLog.trace("Backpressure ended for client " + m_port);
        }
        if (m_hadBackPressure == true) {
            m_hadBackPressure = false;
            if (m_offBackPressureCallback != null) {
                m_offBackPressureCallback.run();
            }
        }
    }
    /**
     * Boolean used to store the latest back pressure state
     * Does this need to be volatile?
     */
    private volatile boolean m_hadBackPressure = false;

    /**
     * The maximum amount that the stream will attempt to write to the channel under any circumstances
     */
    static final int MAX_GATHERING_WRITE = 262144;

    /**
     * Does the work of queueing additional buffers that have been serialized
     * and choosing between gathering and regular writes to the channel. Also splits up very large
     * writes of HeapByteBuffers into many smaller writes so Java doesn't allocate a monster DirectByteBuffer
     * that will never be freed
     * @param channel Channel the queued buffers are drained to
     * @param additional Containers serialized by swapAndSerializeQueuedWrites to append to the queue
     * @return Number of bytes written to the channel
     * @throws IOException
     */
    int drainTo (final GatheringByteChannel channel, final BBContainer additional[]) throws IOException {

        /*
         * Add the containers that were serialized when swapAndSerializeQueuedWrites was
         * invoked to the end of the queue so they will eventually be written
         */
        if (additional != null) {
            for (final BBContainer c : additional) {
                m_queuedBuffers.offer(c);
            }
        }

        int bytesWritten = 0;
        long rc = 0;
        do {
            //For gathering write (many small writes) store the list of buffers here
            ByteBuffer buffers[] = null;
            //For regular writes store the buffer to write here
            ByteBuffer buffer = null;

            /*
             * Nothing to write
             */
            if (m_queuedBuffers.isEmpty()) {
                if (m_hadBackPressure && m_queuedWrites.size() <= m_maxQueuedWritesBeforeBackpressure) {
                    backpressureEnded();
                }
                m_lastPendingWriteTime = -1;
                updateQueued(-bytesWritten, false);
                m_bytesWritten += bytesWritten;
                return bytesWritten;
            }

            /*
             * Peek at the first buffer and inspect it to see if it needs any special handling.
             * If it is too large to use as part of a gathering write then branch and just focus on
             * writing that one buffer
             */
            int queuedForWrite = 0;
            final BBContainer peekedBuffer = m_queuedBuffers.peek();
            if (peekedBuffer.b.remaining() > MAX_GATHERING_WRITE) {
                /*
                 * If the buffer is not direct and it is this large it should be split up in separate writes
                 */
                if (!peekedBuffer.b.isDirect()) {
                    /**
                     * Split a big heap byte buffer into many smaller slices
                     * so Java doesn't allocate a bunch of direct ByteBuffers
                     */
                    //The peeked source container that holds the memory for the slices stays in the
                    //queue behind them, so it is discarded only after all the slices have been read.
                    final int originalPosition = peekedBuffer.b.position();
                    do {
                        final int amountToSplit = Math.min(peekedBuffer.b.remaining(), MAX_GATHERING_WRITE);
                        peekedBuffer.b.position(peekedBuffer.b.limit() - amountToSplit);
                        final BBContainer splice = DBBPool.wrapBB(peekedBuffer.b.slice());
                        m_queuedBuffers.push(splice);
                        m_messagesWritten--;//corrects message count
                        peekedBuffer.b.limit(peekedBuffer.b.position());
                        peekedBuffer.b.position(originalPosition);
                    }
                    while(peekedBuffer.b.hasRemaining());
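                    // Worked example with assumed sizes (not from this file): a 600,000 byte heap
                    // buffer ends up queued in message order as slices of 75,712 + 262,144 + 262,144
                    // bytes, followed by the now-empty source container, which is discarded after
                    // all of its slices have been written.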
                    buffer = m_queuedBuffers.peek().b;
                } else {
                    //Elect to do a single regular write instead of all the gathering write work
                    //It is okay not to split it into many smaller buffers because it is direct
                    buffer = peekedBuffer.b;
                }
            } else {
                /*
                 * Iterate over the queued buffers until we have 10, there are none left, or
                 * adding another would make the write too big
                 */
                final int queuedBuffersSize = m_queuedBuffers.size();
                buffers = new ByteBuffer[queuedBuffersSize < 10 ? queuedBuffersSize : 10];
                int ii = 0;
                for (final BBContainer c : m_queuedBuffers) {

                    /*
                     * Don't queue insanely large gathering writes. It does bad things
                     * This catches a series of writes that would be too large.
                     * This doesn't skip the buffer that is too large and look
                     * for a smaller one in order to ensure that a large buffer
                     * doesn't get neglected
                     */
                    final int potentialQueuedForWrite = queuedForWrite + c.b.remaining();
                    if (potentialQueuedForWrite > MAX_GATHERING_WRITE) {
                        //Create a new correctly sized array
                        //to pass in for the gathering write
                        final ByteBuffer oldArray[] = buffers;
                        buffers = new ByteBuffer[ii];
                        for(int zz = 0; zz < ii; zz++) {
                            buffers[zz] = oldArray[zz];
                        }
                        break;
                    }

                    /*
                     * Never ever ever try to do a gathering write with a buffer that is
                     * not direct. Java will allocate a DirectByteBuffer every time and not
                     * pool them at all. If the buffer is not direct it will be skipped and
                     * the gathering write will go ahead with what has already come off of the queue.
                     * @TODO A potential optimization might be to continue looking ahead in the queue for
                     * more direct buffers. Probably more complex than it is worth.
                     */
                    if (!c.b.isDirect()) {
                        if (ii == 0) {
                            buffers = null;
                            buffer = c.b;
                            break;
                        } else {
                            final ByteBuffer oldArray[] = buffers;
                            buffers = new ByteBuffer[ii];
                            for(int zz = 0; zz < ii; zz++) {
                                buffers[zz] = oldArray[zz];
                            }
                            break;
                        }
                    }

                    /*
                     * The regular case where there is nothing wrong
                     * and the buffer can be added to the list for the gathering write
                     */
                    buffers[ii++] = c.b;
                    queuedForWrite = potentialQueuedForWrite;
                    if (ii == 10) {
                        break;
                    }
                }
            }

            /*
             * Choose between a gathering write vs. a single buffer write based
             * on the presence of the buffers array
             */
            rc = 0;
            if (buffers != null) {
                assert(checkAllDirect(buffers));
                rc = channel.write (buffers);

                //Discard the buffer back to a pool if no data remains
                for (final ByteBuffer b : buffers) {
                    if (!b.hasRemaining()) {
                        m_queuedBuffers.poll().discard();
                        m_messagesWritten++;
                    } else {
                        if (!m_hadBackPressure) {
                            backpressureStarted();
                        }
                        break;
                    }
                }
            } else {
                rc = channel.write(buffer);

                //Discard the buffer back to a pool if no data remains
                if (buffer.hasRemaining()) {
                    if (!m_hadBackPressure) {
                        backpressureStarted();
                    }
                } else {
                    m_queuedBuffers.poll().discard();
                    m_messagesWritten++;
                }
            }
            bytesWritten += rc;

        } while (rc > 0);

        //This extra check is necessary because sometimes a buffer with nothing remaining
        //has to be queued in the above loop resulting in rc == 0. Since rc == 0
        //it won't loop around a last time and see that there are no more queued buffers
        //and thus no backpressure
        if (m_queuedBuffers.isEmpty() && m_hadBackPressure && m_queuedWrites.size() <= m_maxQueuedWritesBeforeBackpressure) {
            backpressureEnded();
        }

        if (!isEmpty()) {
            if (bytesWritten > 0) {
                m_lastPendingWriteTime = EstTime.currentTimeMillis();
            }
        } else {
            m_lastPendingWriteTime = -1;
        }
        updateQueued(-bytesWritten, false);
        m_bytesWritten += bytesWritten;
        return bytesWritten;
    }

    /**
     * Used for assertions. Returns false if one of the buffers is not direct
     * @param buffers Buffers about to be handed to a gathering write
     * @return true if every buffer is direct, false if any is not
     */
    private final boolean checkAllDirect(final ByteBuffer buffers[]) {
        for (final ByteBuffer b : buffers ) {
           if (!b.isDirect()) {
               return false;
           }
        }
        return true;
    }

    /**
     * Queue a container for writing. This isn't the ideal API to use since the serialization has been done
     * outside of a network thread
     * @param c
     */
    @Override
    public boolean enqueue(final BBContainer c) {
        assert(c != null);
        assert(c.b != null);
        if (c.b.remaining() == 0) {
            c.discard();
            return false;
        }
        synchronized (this) {
            if (m_isShutdown) {
                c.discard();
                return false;
            }
            updateLastPendingWriteTimeAndQueueBackpressure();
            updateQueued(c.b.remaining(), false);
            m_queuedBuffers.offer(c);
            m_port.setInterests( SelectionKey.OP_WRITE, 0);
        }
        return true;
    }

    /**
     * Queue a FastSerializable object for writing. This is the 3rd best way to serialize and queue messages.
     * Since no expected message size is provided, the default one for this port is used, which may not be
     * accurate for this particular message. Because FastSerializer is used to serialize the object, there is
     * some overhead incurred if the FastSerializer has to resize, plus the cost of the FastSerializer
     * checking whether it needs to grow on every write.
     * @param f
     */
    @Override
    public boolean enqueue(final FastSerializable f) {
        synchronized (this) {
            if (m_isShutdown) {
                return false;
            }
            updateLastPendingWriteTimeAndQueueBackpressure();
            m_queuedWrites.offer(new DeferredSerialization() {
                @Override
                public BBContainer serialize(final DBBPool pool) throws IOException {
                    final FastSerializer fs = new FastSerializer(pool, m_port.m_expectedOutgoingMessageSize);
                    return fs.writeObjectForMessaging(f);
                }

                @Override
                public void cancel() {}
            });
            m_port.setInterests( SelectionKey.OP_WRITE, 0);
        }
        return true;
    }

    /**
     * Queue a FastSerializable object for writing. This is the 2nd best way to serialize and queue messages.
     * The expected message size is used to size the initial allocation for the FastSerializer.
     * Because FastSerializer is used to serialize the object there is some overhead incurred
     * when the FastSerializer has to check if it needs to grow, but the cost is pretty minor compared
     * to the cost of actually growing the FastSerializer.
     * @param f
     */
    @Override
    public boolean enqueue(final FastSerializable f, final int expectedSize) {
        synchronized (this) {
            if (m_isShutdown) {
                return false;
            }
            updateLastPendingWriteTimeAndQueueBackpressure();
            m_queuedWrites.offer(new DeferredSerialization() {
                @Override
                public BBContainer serialize(final DBBPool pool) throws IOException {
                    final FastSerializer fs = new FastSerializer(pool, expectedSize);
                    return fs.writeObjectForMessaging(f);
                }

                @Override
                public void cancel() {}
            });
            m_port.setInterests( SelectionKey.OP_WRITE, 0);
        }
        return true;
    }

    /**
     * Queue a message and defer the serialization of the message until later. This is the ideal mechanism
     * for serializing and queueing network writes. It allows the sender to define an efficient serialization
     * mechanism that performs a single allocation of the correct size without the overhead of FastSerializer
     * which has to constantly check if it needs to grow.
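     *
     * A hypothetical sketch of such a task (the payload and its 8 byte size are assumptions for
     * illustration, not part of this class):
     * <pre>{@code
     * writeStream.enqueue(new DeferredSerialization() {
     *     public BBContainer serialize(final DBBPool pool) throws IOException {
     *         final BBContainer c = pool.acquire(8);   // exact message size is known up front
     *         c.b.putLong(exampleSequenceNumber);      // illustrative payload
     *         c.b.flip();
     *         return c;
     *     }
     *     public void cancel() {}
     * });
     * }</pre>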
     * @param ds A deferred serialization task that will generate the message
     */
    @Override
    public boolean enqueue(final DeferredSerialization ds) {
        synchronized (this) {
            if (m_isShutdown) {
                ds.cancel();
                return false;
            }
            updateLastPendingWriteTimeAndQueueBackpressure();
            m_queuedWrites.offer(ds);
            m_port.setInterests( SelectionKey.OP_WRITE, 0);
        }
        return true;
    }

    /**
     * Queue a ByteBuffer for writing to the network. If the ByteBuffer is not direct then it will
     * be copied to a DirectByteBuffer if it is smaller than DBBPool.MAX_ALLOCATION_SIZE. This method
     * is a backup for code that isn't able to defer its serialization to a network thread
     * for whatever reason. It is reasonably efficient if a DirectByteBuffer is passed in,
     * but it would be better to keep allocations of DirectByteBuffers inside the network pools.
     * @param b
     */
    @Override
    public boolean enqueue(final ByteBuffer b) {
        assert(b != null);
        assert(!b.isDirect());//Don't queue direct buffers, they leak memory without a container
        if (b.remaining() == 0) {
            return false;
        }

        synchronized (this) {
            if (m_isShutdown) {
                return false;
            }

            updateLastPendingWriteTimeAndQueueBackpressure();

            /*
             * Attempt to use one of our own pooled direct byte buffers
             * so that a gathering write can be done later. Java gathering
             * writes spam direct byte buffers if many regular BBs are used
             */
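            // Illustrative behavior (exact thresholds depend on DBBPool.MAX_ALLOCATION_SIZE): a small
            // heap buffer is copied into a pooled direct buffer by the deferred task below, while a
            // buffer at or above DBBPool.MAX_ALLOCATION_SIZE is wrapped as-is and, if it also exceeds
            // MAX_GATHERING_WRITE, later split into slices by drainTo().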
            if (b.remaining() < DBBPool.MAX_ALLOCATION_SIZE){
                m_queuedWrites.offer(new DeferredSerialization() {
                    @Override
                    public BBContainer serialize(final DBBPool pool) {
                        final BBContainer c = pool.acquire(b.remaining());
                        assert(c.b.isDirect());
                        c.b.put(b);
                        c.b.flip();
                        return c;
                    }

                    @Override
                    public void cancel() {}
                });
            } else {
                updateQueued(b.remaining(), false);
                m_queuedBuffers.offer(DBBPool.wrapBB(b));
            }
            m_port.setInterests( SelectionKey.OP_WRITE, 0);
        }
        return true;
    }

    /**
     * Swap the two queues of DeferredSerializations, serialize everything that was queued,
     * and return the resulting buffers as an array of BBContainers.
     * @return The serialized containers, or null if nothing was queued
     * @throws IOException
     */
    final BBContainer[] swapAndSerializeQueuedWrites(final DBBPool pool) throws IOException {
        ArrayDeque<DeferredSerialization> oldlist;
        synchronized (this) {
            if (m_queuedWrites.isEmpty()) {
                return null;
            } else {
                if (m_queuedWrites == m_queuedWrites1) {
                    oldlist = m_queuedWrites1;
                    m_queuedWrites = m_queuedWrites2;
                }
                else {
                    oldlist = m_queuedWrites2;
                    m_queuedWrites = m_queuedWrites1;
                }
            }
        }
        final BBContainer results[] = new BBContainer[oldlist.size()];
        int ii = 0;
        DeferredSerialization ds = null;
        int bytesQueued = 0;
        while ((ds = oldlist.poll()) != null) {
            results[ii] = ds.serialize(pool);
            bytesQueued += results[ii].b.remaining();
            assert(results[ii] != null);
            assert(results[ii].b != null);
            ii++;
        }
        updateQueued(bytesQueued, true);
        return results;
    }

    /**
     * Free the pool resources that are held by this WriteStream. The pool itself is thread local
     * and will be freed when the thread terminates.
     */
    synchronized void shutdown() {
        int bytesReleased = 0;
        m_isShutdown = true;
        BBContainer c = null;
        while ((c = m_queuedBuffers.poll()) != null) {
            bytesReleased += c.b.remaining();
            c.discard();
        }
        updateQueued(-bytesReleased, false);
        DeferredSerialization ds = null;
        while ((ds = m_queuedWrites.poll()) != null) {
            ds.cancel();
        }
    }

    @Override
    public synchronized int calculatePendingWriteDelta(final long now) {
        if (m_lastPendingWriteTime == -1) {
            return 0;
        }
        return (int)(now - m_lastPendingWriteTime);
    }

    private void updateLastPendingWriteTimeAndQueueBackpressure() {
        if (m_lastPendingWriteTime == -1) {
            m_lastPendingWriteTime = EstTime.currentTimeMillis();
        }
        if (m_queuedWrites.size() > m_maxQueuedWritesBeforeBackpressure && !m_hadBackPressure) {
            backpressureStarted();
        }
    }

    private void updateQueued(int queued, boolean noBackpressureSignal) {
        if (m_monitor != null) {
            boolean shouldSignalBackpressure = m_monitor.queue(queued);
            if (!noBackpressureSignal && shouldSignalBackpressure) {
                if (!m_hadBackPressure) {
                    backpressureStarted();
                }
            }
        }
    }
}