/*
* Copyright (c) 2005, The Regents of the University of California, through
* Lawrence Berkeley National Laboratory (subject to receipt of any required
* approvals from the U.S. Dept. of Energy). All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* (1) Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* (2) Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* (3) Neither the name of the University of California, Lawrence Berkeley
* National Laboratory, U.S. Dept. of Energy nor the names of its contributors
* may be used to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* You are under no obligation whatsoever to provide any bug fixes, patches, or
* upgrades to the features, functionality or performance of the source code
* ("Enhancements") to anyone; however, if you choose to make your Enhancements
* available either publicly, or directly to Lawrence Berkeley National
* Laboratory, without imposing a separate written license agreement for such
* Enhancements, then you hereby grant the following license: a non-exclusive,
* royalty-free perpetual license to install, use, modify, prepare derivative
* works, incorporate into other computer software, distribute, and sublicense
* such enhancements or derivative works thereof, in binary and source code
* form.
*/
package nux.xom.binary;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.zip.DataFormatException;
import java.util.zip.Deflater;
import java.util.zip.Inflater;
import nu.xom.Attribute;
import nu.xom.Comment;
import nu.xom.DocType;
import nu.xom.Document;
import nu.xom.Element;
import nu.xom.IllegalAddException;
import nu.xom.Node;
import nu.xom.NodeFactory;
import nu.xom.Nodes;
import nu.xom.ParentNode;
import nu.xom.ProcessingInstruction;
import nu.xom.Text;
import nu.xom.WellformednessException;
import nu.xom.XMLException;
import nux.xom.io.StreamingSerializer;
/**
* Serializes (encodes) and deserializes (decodes) XOM XML documents to and from
 * an efficient and compact custom binary XML data format (termed <i>bnux</i>
* format), without loss or change of any information. Serialization and
* deserialization is much faster than with the standard textual XML format, and
* the resulting binary data is more compressed than textual XML.
*
* <h4>Applicability</h4>
*
* The overall goal of the <i>bnux algorithm</i> is to maximize serialization
* and deserialization (parsing) performance without requiring any schema
* description. Serialization and deserialization speed are roughly balanced
* against each other; neither side is particularly favoured over the other.
 * Another beneficial effect of the algorithm is that a considerable degree of
* XML data redundancy is eliminated, but compression is more a welcome
* side-effect than a primary goal in itself. The algorithm is primarily
* intended for tightly coupled high-performance systems exchanging large
* volumes of XML data over networks, as well as for compact main memory caches
* and for <i>short-term </i> storage as BLOBs in backend databases or files
* (e.g. "session" data with limited duration). In the case of BLOB storage,
* selecting matching BLOBs can be sped up by maintaining a simple metaindex
* side table for the most frequent access patterns. See the <a
* href="#performance">performance results</a> below.
* <p>
* While the Java API is considered stable, the bnux data format should be
* considered a black box: Its internals are under-documented and may change
 * without notice from release to release in backwards-incompatible ways. It
* is unlikely that support for reading data written with older Nux versions
* will ever be available. bnux is an exchange format but not an
* interoperability format. Having said that, the data format is machine
 * architecture/platform independent. For example, a bnux file can be moved
 * back and forth between a 32 bit Intel little-endian machine and a 64 bit
 * PowerPC big-endian machine; it remains parseable in either environment.
* <p>
* This approach is expressly <b>not intended </b>as a replacement for standard
* textual XML in loosely coupled systems where maximum long-term
* interoperability is the overarching concern. It is also expressly <b>not
* intended </b>for long-term data storage. If you store data in bnux format
* there's every chance you won't be able to read it back a year or two from
 * now, or even sooner. Finally, it is probably unwise to use this class if
* your application's performance requirements are not particularly stringent,
* or profiling indicates that the bottleneck is not related to XML
* serialization/deserialization anyway.
* <p>
* The bnux serialization algorithm is a fully streaming block-oriented
* algorithm, ideal for large numbers of very small to arbitrarily large
* XML documents.
* <p>
* The bnux deserialization algorithm is a fully streaming algorithm and can
* optionally be pushed through a {@link nu.xom.NodeFactory}. This enables
* efficient filtering and can avoid the need to build a main memory tree, which
* is particularly useful for arbitrarily large documents. For example, streaming
* XQueries over binary XML can be expressed via the NodeFactory generated by a
* {@link nux.xom.xquery.StreamingPathFilter}. In streaming mode, the binary
* codec exactly mimics the NodeFactory based behaviour of the XOM
* {@link nu.xom.Builder}.
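 * <p>
 * For example, a minimal streaming sketch (the factory and file name are
 * illustrative assumptions; any {@link nu.xom.NodeFactory} works):
 *
 * <pre>
 * BinaryXMLCodec codec = new BinaryXMLCodec();
 * NodeFactory factory = ...; // e.g. generated by a StreamingPathFilter
 * InputStream in = new BufferedInputStream(new FileInputStream("data.bnux"));
 * Document doc = codec.deserialize(in, factory);
 * in.close();
 * </pre>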
*
 * <h4>Faithfully Preserving XML</h4>
*
* Any and all arbitrary XOM XML documents are supported, and no schema is
 * required. An XOM document that is serialized and subsequently deserialized by
* this class is <i>exactly the same </i> as the original document, preserving
* "as is" all names and data for elements, namespaces, additional namespace
* declarations, attributes, texts, document type, comments, processing
* instructions, whitespace, Unicode characters including surrogates, etc. As a
 * result, the W3C XML Infoset and the W3C Canonical XML representation are
 * guaranteed to be preserved. In particular, the following always holds:
*
* <pre>
 * java.util.Arrays.equals(
 *     XOMUtil.toCanonicalXML(doc),
 *     XOMUtil.toCanonicalXML(deserialize(serialize(doc))));
* </pre>
*
* <h4>Optional ZLIB Compression</h4>
*
* The bnux algorithm considerably compresses XML data with little CPU
* consumption, by its very design. However, bnux also has an option to further
* compress/decompress its output/input with the ZLIB compression algorithm.
 * ZLIB is based on LZ77 and Huffman coding (the DEFLATE algorithm) and is
 * also used by the popular <code>gzip</code> (e.g.
 * {@link java.util.zip.Deflater}). ZLIB compression
* is rather CPU intensive, but it typically yields strong compression factors,
* in particular for documents containing mostly narrative text (e.g. the
 * bible). For example, strong compression may be desirable over low-bandwidth
* networks or when bnux data is known to be accessed rather infrequently. On
* the other hand, ZLIB compression probably kills performance in the presence
* of high-bandwidth networks such as ESnet, Internet2/Abilene or 10 Gigabit
* Ethernet/InfiniBand LANs, even with high-end CPUs. CPU drain is also a
 * scalability problem in the presence of large numbers of concurrent
* connections. An option ranging from 0 (no ZLIB compression; best performance)
* to 1 (little ZLIB compression; reduced performance) to 9 (strongest ZLIB
* compression; worst performance) allows one to configure the CPU/memory
* consumption trade-off.
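 * <p>
 * For example, a minimal sketch contrasting the two extremes (the
 * <code>doc</code> variable is an illustrative assumption):
 *
 * <pre>
 * BinaryXMLCodec codec = new BinaryXMLCodec();
 * byte[] fast = codec.serialize(doc, 0); // no ZLIB; best performance
 * byte[] small = codec.serialize(doc, 9); // strongest ZLIB; smallest output
 * </pre>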
*
* <h4>Reliability</h4>
*
* This class has been successfully tested against some 50000 extremely
* weird and unique test documents, including the W3C XML conformance test
* suite, and no bugs are known.
* <p>
* Serialization employs no error checking at all, since malformed XOM input
* documents are impossible to produce given XOM's design: XOM strictly enforces
* wellformedness anyway. Deserialization employs some limited error checking,
* throwing exceptions for any improper API usage, non-bnux input data, data
* format version mismatch, or general binary data corruption. Beyond this,
* deserialization relies on XOM's hard-wired wellformedness checks, just like
* serialization does. Barring one of the above catastrophic situations, the
* bnux algorithm will always correctly and faithfully reconstruct the exact
* same well-formed XOM document.
*
* <h4>Example Usage:</h4>
*
* <pre>
* // parse standard textual XML, convert to binary format, round-trip it and compare results
* Document doc = new Builder().build(new File("samples/data/periodic.xml"));
* BinaryXMLCodec codec = new BinaryXMLCodec();
* byte[] bnuxDoc = codec.serialize(doc, 0);
* Document doc2 = codec.deserialize(bnuxDoc);
* boolean isEqual = java.util.Arrays.equals(
* XOMUtil.toCanonicalXML(doc), XOMUtil.toCanonicalXML(doc2));
* System.out.println("isEqual = " + isEqual);
* System.out.println(doc2.toXML());
*
* // write binary XML document to file
* OutputStream out = new FileOutputStream("/tmp/periodic.xml.bnux");
* out.write(bnuxDoc);
* out.close();
*
* // read binary XML document from file
* bnuxDoc = FileUtil.toByteArray(new FileInputStream("/tmp/periodic.xml.bnux"));
* Document doc3 = codec.deserialize(bnuxDoc);
* System.out.println(doc3.toXML());
* </pre>
*
 * <a name="performance"></a>
* <h4>Performance</h4>
*
* This class has been carefully profiled and optimized. Preliminary performance
* results over a wide range of real-world documents are given below. A more
* detailed presentation can be found at the Global Grid Forum <a
* target="_blank"
* href="http://www.ggf.org/GGF15/ggf_events_schedule_WSPerform.htm">Web
* Services Performance Workshop</a>.
* <p>
* Contrasting bnux BinaryXMLCodec with the XOM Builder and Serializer:
* <ul>
* <li>Tree Deserialization speedup: 40-100 MB/s vs. 3-30 MB/s</li>
* <li>Streaming Deserialization speedup: 60-500 MB/s vs. 3-30 MB/s</li>
* <li>Tree Serialization speedup: 50-150 MB/s vs. 5-20 MB/s</li>
* <li>Data compression factor: 1.0 - 4</li>
* </ul>
* For meaningful comparison, MB/s and compression factors are always given
* normalized in relation to the original standard textual XML file size.
* <ul>
 * <li>Benchmark test data: A wide variety of small to medium-sized XML
 * documents is used, including SOAP documents heavily using namespaces ( <a
* target="_blank" href="http://xbis.sourceforge.net/">XBIS </a>), simple XML
* formatted web server logs using no namespaces, RDF documents with lots of
* attributes and namespaces, the periodic table, documents consisting of large
* narrative text ( <a target="_blank"
* href="http://www.oasis-open.org/cover/bosakShakespeare200.html">Shakespeare
* </a>), publication citations ( <a target="_blank"
* href="http://dblp.uni-trier.de/xml/">DBLP </a>), music title databases ( <a
* target="_blank" href="http://www.freedb.org/">FreeDB </a>), Japanese
* documents (XML conformance test suite), SVG image files, etc.</li>
*
* <li>Benchmark configuration: no ZLIB compression, xom-1.2, non-validating
* XOM Builder using xerces-2.8.0, no DTD or schema, Sun JDK 1.5.0 server VM,
* commodity PC 2004, Dual Pentium 4, 3.4 GHz, Redhat 9</li>
* </ul>
*
* Example Interpretation:
* <ul>
* <li>Small documents: Results translate, for example, to ping-pong
* round-tripping of typical 500 byte SOAP request/response message documents at
* a rate of 25000 msg/s, compared to 2500 msg/s with XOM (excluding network
 * latency). Even more pronounced: 500 (150) byte documents with few namespaces
* translate to 35000 (120000) msg/s, compared to 3500 (5000) msg/s with XOM.
 * Consequently, XML serialization and deserialization are probably no longer your
* application's bottleneck, leaving, say, 95% CPU headroom free for other
* application modules.</li>
*
 * <li>Medium documents: With a main-memory cache of several thousand 1
* MB documents, each containing highly structured complex data, one can
* deserialize, XQuery and serve from the cache at a rate of 50 documents/s,
* compared to 5 documents/s with XOM.</li>
*
* </ul>
* Note that in contrast to other algorithms, these measurements include XOM
 * tree building and walking, hence measure delivering data to and from actual
* XML applications, rather than merely to and from a low-level SAX event stream
* (which is considerably cheaper and deemed less useful).
* <p>
* The deserialization speedup is further multiplied when DTDs or schema
* validation is used while parsing standard textual XML.
* <p>
* This class relies on advanced Java compiler optimizations, which take
* considerable time to warm up. Hence, for comparative benchmarks, use a
* server-class VM and make sure to repeat runs for at least 30 seconds.
* <p>
* Further, you will probably want to eliminate drastic XOM hotspots by
* compiling XOM with "ant -Dfat=true jar" to maintain an internal String
 * instead of a UTF-8 encoded byte array in {@link nu.xom.Text}, which
* eliminates the expensive character conversions implied for each access to a
* Text object. This increases performance at the expense of memory footprint.
* The measurements above report numbers using these patches, both for xom and
* bnux. If you're curious about the whereabouts of bottlenecks, run java with
* the non-perturbing '-server -agentlib:hprof=cpu=samples,depth=10' flags, then
* study the trace log and correlate its hotspot trailer with its call stack
* headers (see <a target="_blank"
* href="http://java.sun.com/developer/technicalArticles/Programming/HPROF.html">
* hprof tracing </a>).
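 * <p>
 * For example (the benchmark class name is an illustrative placeholder):
 *
 * <pre>
 * java -server -agentlib:hprof=cpu=samples,depth=10 MyBenchmark
 * </pre>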
* <p>
* Use class {@link nux.xom.sandbox.BinaryXMLTest} to reproduce results, verify
* correctness or to evaluate performance for your own datasets.
*
* @author whoschek.AT.lbl.DOT.gov
* @author $Author: hoschek $
* @version $Revision: 1.179 $, $Date: 2006/06/18 21:25:02 $
*/
public class BinaryXMLCodec {
/*
* TODO: add a StAX interface on top of bnux?
* e.g. createXMLStreamReader(byte[]), similar for XMLStreamWriter
* TODO: add coalescing of adjacent Text nodes on deserialization?
*/
/*
* TODO: My impression is that there is remaining potential for some speedup,
* both for serialization and deserialization performance. Ideas towards this
* end include:
*
* - Add option to always use UTF16 (level=-1)
* - Micro caches may become obsolete once XOM has its own internal QName LRU cache
* - Use a better low level namespace iteration
 * - Split unified symbolTable into several smaller symbolTables for Text, Attributes, and others.
* - other?
*/
// for deserialization: factory to stream (push) into
private NodeFactory factory;
// for deserialization: unique symbols from deserialized symbolTable
private String[] symbols;
// for deserialization: are pages ZLIB compressed?
private boolean isCompressed;
// for deserialization and serialization: page buffer
private ArrayByteList page; // multi-byte integers are ALWAYS in big-endian
// for serialization:
// holds unique strings found in document (qnames, texts, uris, etc.)
private SymbolTable symbolTable;
// for serialization:
// byte-level node type tokens in XML document order; also length of next index(es)
private ArrayByteList nodeTokens;
// for serialization: indexes into symbolTable/entries
private ArrayIntList indexData;
// for serialization: has first page of current document already been written?
private boolean isFirstPage = true;
// ZLIB
private Inflater decompressor; // ZLIB
private Deflater compressor; // ZLIB
private int compressionLevel = -1; // initialize to "undefined"
// for deserialization: avoids reverification of PCDATA
private Text[] textCache;
// for deserialization: avoids reverification of namespace URIs
private String[] nameCache;
private LRUHashMap1 internedNames;
// for deserialization:
// avoids reverification of qname and URI, as well as indexOf() calls, and saves string memory
private NodeBuilder nodeBuilder;
// for streaming serialization
private OutputStream out;
/**
* For serialization: (approximate) maximum number of bytes per page.
* <p>
* To enable true streaming, a serialized document consists of one or more
* independent consecutive pages. Each page contains a portion of the XML
* document, in document order. More specifically, each page consists of a
* tokenized byte array and corresponding symbols. Once a page has been
* read/written related (heavy) state can be discarded, freeing memory. No
* more than one page needs to be held in memory at any given time. For very
* large documents this page design reduces memory consumption, increases
* throughput and reduces latency. For small to medium sized documents it
* makes next to no difference.
* <p>
* A small page capacity (e.g. 128 bytes) leads to lower latency per page
* but also lower throughput and lower compression overall. Conversely, a
* large page capacity (e.g. 1 MB) leads to higher throughput and higher
* compression, at the expense of higher latency. However, a very large page
* capacity (e.g. 10 MB) leads to memory subsystem pressure on streaming.
* Thus, here we use a happy medium, small enough to generate little memory
* subsystem pressure, and large enough to gain high throughput, retain
* almost all compression stemming from redundancy eliminating tokenization,
* with near zero overhead for small to medium sized documents, and
* outstanding performance for very large documents.
*/
private static final int MAX_PAGE_CAPACITY = 64 * 1024;
// private static final int MAX_PAGE_CAPACITY = 1; // DEBUG only
// low 3 bits for node token types
// high 4 bits typically hold number of bytes of the next two indexes
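// Illustrative decoding sketch (see readSymbol below): for a token byte t,
// (t & 0x07) selects one of the node types below, while the two-bit fields
// ((t >>> 4) & 0x03) and ((t >>> 6) & 0x03) indicate whether the following
// symbol index(es) are stored as 1, 2 or 4 bytes, or are absent/inlined.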
private static final int TEXT = 0;
private static final int ATTRIBUTE = 1;
private static final int BEGIN_ELEMENT = 2;
private static final int END_ELEMENT = 3;
private static final int COMMENT = 4;
private static final int NAMESPACE_DECLARATION = 5;
private static final int PROCESSING_INSTRUCTION = 6;
private static final int DOC_TYPE = 7;
private static final int BNUX_MAGIC = createMagicNumber(); // for sanity checks
private static final byte VERSION = 7; // version of bnux data format
private static final int DOCUMENT_HEADER_SIZE = 4 + 1; // in bytes
private static final int PAGE_HEADER_SIZE = 4 + 4 + 4 + 4 + 4; // in bytes
private static final boolean IS_EXTENDED_XOM = hasXOMExtensions();
/**
 * Marker for a non-existent systemID or publicID in DocType. XML and the
 * nu.xom.Verifier regard " " as an illegal ID, so it can never occur in
 * practice. Thus we can use it as an unambiguous marker identifying a
* serialized null value.
*/
private static final String DOCTYPE_NULL_ID = " ";
private static final boolean DEBUG = false; // VM does dead code elimination
/** Reinitializes instance variables to virgin state. */
private void reset() {
// deserialization state:
internedNames = null;
nodeBuilder = null;
factory = null;
// serialization state:
symbolTable = null;
page = null;
nodeTokens = null;
indexData = null;
isFirstPage = true;
out = null;
// better safe than sorry:
try {
if (decompressor != null) decompressor.end(); // free resources
} finally {
decompressor = null;
try {
if (compressor != null) compressor.end(); // free resources
} finally {
compressor = null;
}
}
}
/**
 * Constructs an instance; an instance can be reused serially, but it is not
 * thread-safe, just like a {@link nu.xom.Builder}.
*/
public BinaryXMLCodec() {
}
/**
* Constructs a new streaming serializer that serializes bnux binary XML to
* the given underlying output stream, using the given ZLIB compression
* level.
* <p>
* An optional zlib compression level ranging from 0 (no ZLIB compression;
* best performance) to 1 (little ZLIB compression; reduced performance) to
* 9 (strongest ZLIB compression; worst performance) allows one to configure
* the CPU/memory consumption trade-off.
* <p>
* Unless there is a good reason to the contrary, you should always use
* level 0: the bnux algorithm typically already precompresses considerably.
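 * <p>
 * A minimal usage sketch (the <code>doc</code> variable, file name and the
 * whole-document <code>write(doc)</code> call are assumptions about the
 * {@link StreamingSerializer} interface made for illustration):
 *
 * <pre>
 * StreamingSerializer ser = codec.createStreamingSerializer(
 *     new FileOutputStream("/tmp/test.bnux"), 0);
 * ser.write(doc);
 * </pre>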
*
* @param out
* the underlying output stream to write to
* @param zlibCompressionLevel
* a number in the range 0..9
* @return a streaming serializer
*/
public StreamingSerializer createStreamingSerializer(OutputStream out, int zlibCompressionLevel) {
return new StreamingBinaryXMLSerializer(this, out, zlibCompressionLevel);
}
/**
* Returns whether or not the given input stream contains a bnux document.
* <p>
 * A peek into the first 4 bytes is sufficient for unambiguous detection, as
* standard textual XML cannot start with any arbitrary four byte
* combination.
* <p>
 * Afterwards, the read bytes are put back onto the stream, so they can be
* reread as part of subsequent parsing attempts. Therefore, the input
* stream must support <code>input.mark()</code> and
* <code>input.reset()</code>. For example, a
* {@link java.io.BufferedInputStream} is a good choice.
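 * <p>
 * For example, a sketch that sniffs the format and dispatches accordingly
 * (the <code>file</code> variable is an illustrative assumption):
 *
 * <pre>
 * InputStream in = new BufferedInputStream(new FileInputStream(file));
 * BinaryXMLCodec codec = new BinaryXMLCodec();
 * Document doc = codec.isBnuxDocument(in) ?
 *     codec.deserialize(in, null) : new Builder().build(in);
 * </pre>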
*
* @param input
* the stream to read from
* @return true if the stream contains a bnux document
* @throws IllegalArgumentException
* if the underlying stream does not support
* <code>input.mark()</code> and <code>input.reset()</code>.
* @throws IOException
* if the underlying input stream encounters an I/O error
* @see InputStream#mark(int)
*/
public boolean isBnuxDocument(InputStream input) throws IOException {
if (input == null)
throw new IllegalArgumentException("input stream must not be null");
if (!input.markSupported())
throw new IllegalArgumentException("markSupported() must be true");
int magicBytes = 4;
input.mark(magicBytes);
try {
ArrayByteList list = new ArrayByteList(magicBytes);
if (!list.ensureRemaining(input, magicBytes)) {
return false; // stream contains less than 4 bytes
}
return list.getInt() == BNUX_MAGIC;
} finally {
input.reset(); // unread the header
}
}
/**
* Equivalent to
 * <code>deserialize(new ByteArrayInputStream(bnuxDocument), new NodeFactory())</code>.
*
* @param bnuxDocument
* the bnux document to deserialize.
* @return the new XOM document obtained from deserialization.
* @throws BinaryParsingException
* if the bnux document is unreadable or corrupt for some reason
*/
public Document deserialize(byte[] bnuxDocument) throws BinaryParsingException {
if (bnuxDocument == null)
throw new IllegalArgumentException("bnuxDocument must not be null");
try {
return deserialize(new ByteArrayInputStream(bnuxDocument), null);
} catch (IOException e) {
throw new BinaryParsingException(e); // can never happen
}
}
/**
* Returns the XOM document obtained by deserializing the next binary XML
* document from the given input stream.
* <p>
* If the document is in ZLIB compressed bnux format, it will be
* auto-detected and auto-decompressed as part of deserialization.
* <p>
* This method exactly mimics the NodeFactory based behaviour of the XOM
* {@link nu.xom.Builder}. A NodeFactory enables efficient filtering and
* can avoid the need to build a main memory tree, which is particularly
* useful for large documents. For example, streaming XQueries over binary
* XML can be expressed via the NodeFactory generated by a
* {@link nux.xom.xquery.StreamingPathFilter}. Binary XML files can be
* converted to and from standard textual XML files via a
* {@link nux.xom.pool.XOMUtil#getRedirectingNodeFactory(StreamingSerializer)}. For
* other example factories, see {@link nux.xom.pool.XOMUtil}.
* <p>
* Bnux is a self-framing data format: It knows where the end of a document
* occurs. An input stream can contain any number of independent documents,
* one after another. Thus, this method reads from the stream as many bytes
* as required for the current document, but no more than that. Unlike SAX
* XML parsers and unlike a {@link nu.xom.Builder}, it does not read until
* end-of-stream (EOS), and it does not auto-close the input stream. If this
* method returns successfully, the input stream has been positioned one
* byte past the current bnux document, ready to deserialize the following
* document, if any. It is the responsibility of the caller to ensure the
* input stream gets properly closed when deemed appropriate.
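 * <p>
 * For example, a sketch deserializing all documents contained in a stream,
 * one after another (the file name is an illustrative assumption):
 *
 * <pre>
 * InputStream in = new BufferedInputStream(new FileInputStream("docs.bnux"));
 * BinaryXMLCodec codec = new BinaryXMLCodec();
 * while (codec.isBnuxDocument(in)) {
 *     Document doc = codec.deserialize(in, null);
 *     // process doc ...
 * }
 * in.close();
 * </pre>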
*
* @param input
* the stream to read and deserialize from
* @param factory
* the node factory to stream into. May be <code>null</code> in
* which case the default XOM NodeFactory is used, building the
* complete XML document tree.
* @return the new XOM document obtained from deserialization.
* @throws BinaryParsingException
* if the bnux document is unreadable or corrupt for some reason
* @throws IOException
* if the underlying input stream encounters an I/O error
*/
public Document deserialize(InputStream input, NodeFactory factory)
throws BinaryParsingException, IOException {
if (input == null)
throw new IllegalArgumentException("input stream must not be null");
if (factory == null) factory = new NodeFactory();
// read document header
if (page == null) page = new ArrayByteList(256);
page.clear();
if (!page.ensureRemaining(input, 4 + 1 + 1 + 4))
throw new BinaryParsingException("Missing bnux document header");
int magic = page.getInt();
if (magic != BNUX_MAGIC) throw new BinaryParsingException(
"Bnux magic number mismatch: " + magic + ", must be: " + BNUX_MAGIC);
int version = page.get();
isCompressed = version < 0;
if (isCompressed) version = -version;
if (version != VERSION) throw new BinaryParsingException(
"Bnux data format version mismatch: " + version + ", must be: " + VERSION);
if (isCompressed) {
if (decompressor == null) decompressor = new Inflater();
}
if (page.get() != DOC_TYPE) // surrogate hack to identify BEGIN_PAGE
throw new BinaryParsingException("Illegal bnux page header marker");
// prepare
if (internedNames == null) internedNames = new LRUHashMap1(128);
if (nodeBuilder == null) nodeBuilder = new NodeBuilder();
// parse node token data and packed indexes, building the XOM tree
this.factory = factory;
try {
return readDocument(page, input);
} catch (Throwable t) {
reset(); // better safe than sorry
if (t instanceof Error) {
throw (Error) t;
} else if (t instanceof BinaryParsingException) {
throw (BinaryParsingException) t;
} else if (t instanceof IOException) {
throw (IOException) t;
} else {
throw new BinaryParsingException(t);
}
} finally {
this.symbols = null; // help gc
this.textCache = null; // help gc
this.nameCache = null; // help gc
this.factory = null; // help gc
// if (decompressor != null) decompressor.end();
// decompressor = null;
}
}
/**
* Returns the bnux binary XML document obtained by serializing the given
* XOM document.
* <p>
* An optional zlib compression level ranging from 0 (no ZLIB compression;
* best performance) to 1 (little ZLIB compression; reduced performance) to
* 9 (strongest ZLIB compression; worst performance) allows one to configure
* the CPU/memory consumption trade-off.
* <p>
* Unless there is a good reason to the contrary, you should always use
* level 0: the bnux algorithm typically already precompresses considerably.
*
* @param document
* the XOM document to serialize
* @param zlibCompressionLevel
* a number in the range 0..9
* @return the bnux document obtained from serialization.
* @throws IllegalArgumentException
* if the compression level is out of range.
*/
public byte[] serialize(Document document, int zlibCompressionLevel)
throws IllegalArgumentException {
ByteArrayOutputStream result = new ByteArrayOutputStream(256);
try {
serialize(document, zlibCompressionLevel, result);
} catch (IOException e) {
throw new RuntimeException(e); // can never happen
}
return result.toByteArray();
}
/**
* Serializes the given XOM document as a bnux binary XML document onto
* the given output stream.
* <p>
* An optional zlib compression level ranging from 0 (no ZLIB compression;
* best performance) to 1 (little ZLIB compression; reduced performance) to
* 9 (strongest ZLIB compression; worst performance) allows one to configure
* the CPU/memory consumption trade-off.
* <p>
* Unless there is a good reason to the contrary, you should always use
* level 0: the bnux algorithm typically already precompresses considerably.
*
* @param document
* the XOM document to serialize
* @param zlibCompressionLevel
* a number in the range 0..9
* @param out
* the output stream to write to
* @throws IllegalArgumentException
* if the compression level is out of range.
* @throws IOException
* if the underlying output stream encounters an I/O error
*/
public void serialize(Document document, int zlibCompressionLevel,
OutputStream out) throws IllegalArgumentException, IOException {
if (document == null)
throw new IllegalArgumentException("XOM document must not be null");
if (zlibCompressionLevel < 0 || zlibCompressionLevel > 9)
throw new IllegalArgumentException("Compression level must be 0..9");
if (out == null)
throw new IllegalArgumentException("Output stream must not be null");
try {
setOutputStream(zlibCompressionLevel, out);
writeDocument(document); // generate output
} catch (Throwable t) {
reset(); // better safe than sorry
if (t instanceof Error) {
throw (Error) t;
} else if (t instanceof RuntimeException) {
throw (RuntimeException) t;
} else if (t instanceof IOException) {
throw (IOException) t;
} else {
throw new RuntimeException(t);
}
} finally {
this.symbolTable = null; // help gc
this.out = null;
}
}
final void setOutputStream(int zlibCompressionLevel, OutputStream out) {
if (zlibCompressionLevel > 0) {
if (compressor == null || zlibCompressionLevel != compressionLevel) {
if (compressor != null) compressor.end(); // free resources
compressor = new Deflater(zlibCompressionLevel);
}
}
compressionLevel = zlibCompressionLevel;
this.out = out;
}
/** Prepares reading from the next page. */
private void readPage(ArrayByteList src, InputStream input)
throws BinaryParsingException, IOException {
if (DEBUG) System.err.println("reading page");
if (!src.ensureRemaining(input, 4))
throw new BinaryParsingException("Missing remaining bnux page size");
int pageSize = src.getInt();
if (src.remaining() != 0)
throw new IllegalStateException("Internal codec bug");
boolean isLastPage = pageSize < 0;
if (isLastPage) pageSize = -pageSize;
// if (DEBUG) System.err.println("pageSize = " + pageSize);
if (!isLastPage) {
pageSize++; // read one byte past page, fetching BEGIN_PAGE marker
}
if (!src.ensureRemaining(input, pageSize))
throw new BinaryParsingException("Missing remaining bnux page body");
if (isCompressed) decompress(src);
int symbolTableSize = src.getInt();
if (symbolTableSize < 0)
throw new BinaryParsingException("Negative symbol table size");
int decodedSize = src.getInt();
if (decodedSize < 0)
throw new BinaryParsingException("Negative decodedSize");
int encodedSize = src.getInt();
if (encodedSize < 0)
throw new BinaryParsingException("Negative encodedSize");
// read symbolTable
this.symbols = null; // help gc
if (decodedSize == encodedSize) { // safe trick, faster
// Note that 7 bit ASCII is a proper subset of UTF-8
this.symbols = src.getASCIIStrings(symbolTableSize);
} else {
this.symbols = src.getUTF8Strings(symbolTableSize);
}
// this.symbols = src.getUTF16Strings(symbolTableSize);
if (DEBUG) System.err.println("read symbols = " + Arrays.asList(symbols));
int magic = src.getInt();
if (magic != BNUX_MAGIC) throw new BinaryParsingException(
"Bnux magic number mismatch: " + magic + ", must be: " + BNUX_MAGIC);
// reset caches in preparation for XML token decoding
if (this.nameCache == null) {
nameCache = new String[Math.min(64, symbolTableSize)];
} else {
for (int i=nameCache.length; --i >= 0; ) nameCache[i] = null;
}
if (factory.getClass() == NodeFactory.class) { // fast path
if (this.textCache == null) {
textCache = new Text[Math.min(256, symbolTableSize)];
} else {
for (int i=textCache.length; --i >= 0; ) textCache[i] = null;
}
}
// this.nameCache = null; // help gc
// this.nameCache = new String[Math.min(64, symbolTableSize)];
// this.textCache = null; // help gc
// if (factory.getClass() == NodeFactory.class) { // fast path
// this.textCache = new Text[Math.min(128, symbolTableSize)];
// }
}
private void decompress(ArrayByteList src) throws BinaryParsingException {
if (nodeTokens == null) nodeTokens = new ArrayByteList();
nodeTokens.clear();
try {
nodeTokens.add(decompressor, src);
} catch (DataFormatException e) {
String s = e.getMessage();
throw new BinaryParsingException(
s != null ? s : "Invalid ZLIB data format", e);
}
src.swap(nodeTokens); // replace src with nodeTokens
nodeTokens.clear();
}
/** Parses document from encoded src buffer; tokens appear in document order. */
private Document readDocument(ArrayByteList src, InputStream input)
throws BinaryParsingException, IOException {
if (DEBUG) System.err.println("reading document");
readPage(src, input);
Document doc = factory.startMakingDocument();
// doc.setBaseURI(symbols[src.getInt()]);
doc.setBaseURI(getInternedName(src.getInt()));
boolean hasRootElement = false;
int i = 0;
// add children of document, retaining the exact same order found in input
while (src.remaining() > 0) {
Nodes nodes;
int type = src.get(); // look ahead
if (DEBUG) System.err.println("reading type = " + toString(type));
switch (type & 0x07) { // three low bits indicate node type
case TEXT: {
throw new BinaryParsingException("Unreachable text");
}
case ATTRIBUTE: {
throw new BinaryParsingException("Unreachable attribute");
}
case BEGIN_ELEMENT: {
if (factory.getClass() == NodeFactory.class) { // fast path
Element root = readStartTag(src, type);
readElement(src, root, input); // reads entire subtree
nodes = new Nodes(root);
} else { // slow path
Element root = readStartTagF(src, type, true);
if (root == null) {
throw new NullPointerException("Factory failed to create root element.");
}
doc.setRootElement(root);
readElementF(src, root, input);
nodes = factory.finishMakingElement(root);
}
break;
}
case END_ELEMENT: {
throw new BinaryParsingException("Unreachable end of element");
}
case COMMENT: {
nodes = readCommentF(src, type);
break;
}
case NAMESPACE_DECLARATION: {
throw new BinaryParsingException("Unreachable namespace declaration");
}
case PROCESSING_INSTRUCTION: {
nodes = readProcessingInstructionF(src);
break;
}
case DOC_TYPE: {
nodes = readDocTypeF(src);
break;
}
default: {
throw new BinaryParsingException("Illegal node type code=" + type);
}
}
// append nodes:
for (int j=0; j < nodes.size(); j++) {
Node node = nodes.get(j);
if (node instanceof Element) { // replace fake root with real root
if (hasRootElement) {
throw new IllegalAddException(
"Factory returned multiple root elements");
}
doc.setRootElement((Element) node);
hasRootElement = true;
} else {
doc.insertChild(node, i);
}
i++;
}
}
if (!hasRootElement) throw new WellformednessException(
"Factory attempted to remove the root element");
factory.finishMakingDocument(doc);
if (DEBUG) System.err.println("finished reading document");
return doc;
}
/** Reads start tag and returns a corresponding empty element */
private Element readStartTag(ArrayByteList src, int type) {
String qname = readString(src, 4, type);
String namespaceURI = readName(src, 6, type);
return this.nodeBuilder.createElement(qname, namespaceURI);
// return new Element(qname, namespaceURI);
}
private Element readStartTagF(ArrayByteList src, int type, boolean isRoot) {
String qname = readString(src, 4, type);
String namespaceURI = readName(src, 6, type);
return isRoot ?
factory.makeRootElement(qname, namespaceURI) :
factory.startMakingElement(qname, namespaceURI);
}
/** Iterative pull parser reading an entire element subtree. */
private void readElement(ArrayByteList src, Element current, InputStream input)
throws BinaryParsingException, IOException {
while (true) {
Node node = null;
Element down = null;
int type = src.get(); // look ahead
// if (DEBUG) System.err.println("reading type = " + toString(type));
switch (type & 0x07) { // three low bits indicate node type
case TEXT: {
node = readText(src, type);
break;
}
case ATTRIBUTE: {
readAttribute(src, current, type);
continue;
}
case BEGIN_ELEMENT: {
down = readStartTag(src, type);
node = down;
break;
}
case END_ELEMENT: {
current = (Element) current.getParent();
if (current == null) return; // we're done with the root element
continue;
}
case COMMENT: {
node = readComment(src, type);
break;
}
case NAMESPACE_DECLARATION: {
readNamespaceDeclaration(src, current, type);
continue;
}
case PROCESSING_INSTRUCTION: {
node = readProcessingInstruction(src);
break;
}
case DOC_TYPE: { // surrogate hack to identify BEGIN_PAGE
readPage(src, input);
continue;
}
}
// if (DEBUG) System.err.println("read node=" + node.toXML());
// assert node != null
if (IS_EXTENDED_XOM) { // xom-1.1 + patch
current.fastInsertChild(node, current.getChildCount());
} else {
current.insertChild(node, current.getChildCount());
}
if (down != null) current = down; // recurse down
}
}
/** Iterative pull parser reading an entire element subtree (using NodeFactory). */
private void readElementF(ArrayByteList src, Element current, InputStream input)
throws BinaryParsingException, IOException {
// final ArrayList stack = new ArrayList();
final FastStack stack = new FastStack();
stack.push(current);
boolean addAttributesAndNamespaces = true;
while (true) {
Nodes nodes = null;
int type = src.get(); // look ahead
// if (DEBUG) System.err.println("reading type = " + toString(type));
switch (type & 0x07) { // three low bits indicate node type
case TEXT: {
nodes = readTextF(src, type);
break;
}
case ATTRIBUTE: {
Element elem = addAttributesAndNamespaces ? current : null;
nodes = readAttributeF(src, elem, type);
break;
}
case BEGIN_ELEMENT: {
Element elem = readStartTagF(src, type, false);
stack.push(elem); // even if it's null
if (elem != null) {
current.insertChild(elem, current.getChildCount());
current = elem; // recurse down
}
addAttributesAndNamespaces = elem != null;
continue;
}
case END_ELEMENT: {
Element elem = stack.pop();
if (elem == null) {
continue; // skip element
}
ParentNode parent = elem.getParent();
if (parent == null) throwTamperedWithParent();
if (parent instanceof Document) {
return; // we're done with the root element
}
current = (Element) parent; // recurse up
nodes = factory.finishMakingElement(elem);
if (nodes.size()==1 && nodes.get(0)==elem) { // same node? (common case)
continue; // optimization: no need to remove and then re-add the same element
}
if (current.getChildCount()-1 < 0) throwTamperedWithParent();
current.removeChild(current.getChildCount()-1);
break;
}
case COMMENT: {
nodes = readCommentF(src, type);
break;
}
case NAMESPACE_DECLARATION: {
Element elem = addAttributesAndNamespaces ? current : null;
readNamespaceDeclaration(src, elem, type);
continue;
}
case PROCESSING_INSTRUCTION: {
nodes = readProcessingInstructionF(src);
break;
}
case DOC_TYPE: { // surrogate hack for BEGIN_PAGE
readPage(src, input);
continue;
}
}
appendNodes(current, nodes);
}
}
private static void appendNodes(Element elem, Nodes nodes) {
if (nodes != null) {
int size = nodes.size();
for (int i=0; i < size; i++) {
Node node = nodes.get(i);
if (node instanceof Attribute) {
elem.addAttribute((Attribute) node);
} else {
elem.insertChild(node, elem.getChildCount());
}
}
}
}
private static void throwTamperedWithParent() {
throw new XMLException("Factory has tampered with a parent pointer " +
"of ancestor-or-self in finishMakingElement()");
}
private void readAttribute(ArrayByteList src, Element dst, int type) throws BinaryParsingException {
String qname = readString(src, 4, type);
String namespaceURI = readName(src, 6, type);
String value = readString(src, 4, src.get());
Attribute.Type attrType = Util.getAttributeType(src.get());
Attribute attr = this.nodeBuilder.createAttribute(qname, namespaceURI, value, attrType);
// Attribute attr = new Attribute(qname, namespaceURI, value, attrType);
dst.addAttribute(attr);
}
private Nodes readAttributeF(ArrayByteList src, Element dst, int type) throws BinaryParsingException {
String qname = readString(src, 4, type);
String namespaceURI = readName(src, 6, type);
String value = readString(src, 4, src.get());
Attribute.Type attrType = Util.getAttributeType(src.get());
if (dst == null) return null; // NONE;
return factory.makeAttribute(qname, namespaceURI, value, attrType);
}
private Comment readComment(ArrayByteList src, int type) {
return new Comment(readString(src, 4, type));
}
private Nodes readCommentF(ArrayByteList src, int type) {
return factory.makeComment(readString(src, 4, type));
}
private void readNamespaceDeclaration(ArrayByteList src, Element dst, int type) {
String prefix = readString(src, 4, type);
String uri = readName(src, 6, type);
if (dst != null) dst.addNamespaceDeclaration(prefix, uri);
}
private ProcessingInstruction readProcessingInstruction(ArrayByteList src) {
int type = src.get(src.position() - 1);
String target = readString(src, 4, type);
String value = readString(src, 6, type);
return new ProcessingInstruction(target, value);
}
private Nodes readProcessingInstructionF(ArrayByteList src) {
int type = src.get(src.position() - 1);
String target = readString(src, 4, type);
String value = readString(src, 6, type);
return factory.makeProcessingInstruction(target, value);
}
/** Does not pack indexes of doctype (infrequent anyway) */
private Nodes readDocTypeF(ArrayByteList src) {
String rootElementName = symbols[src.getInt()];
String publicID = symbols[src.getInt()];
if (DOCTYPE_NULL_ID.equals(publicID)) publicID = null;
String systemID = symbols[src.getInt()];
if (DOCTYPE_NULL_ID.equals(systemID)) systemID = null;
String internalDTDSubset = symbols[src.getInt()];
if (internalDTDSubset.length() == 0) internalDTDSubset = null;
Nodes nodes = factory.makeDocType(rootElementName, publicID, systemID);
for (int i=0; i < nodes.size(); i++) {
if (nodes.get(i) instanceof DocType) {
DocType docType = (DocType) nodes.get(i);
if (docType.getInternalDTDSubset().length() == 0) {
try {
docType.setInternalDTDSubset(internalDTDSubset);
} catch (IllegalAccessError e) {
; // ignore; setInternalDTDSubset() is private in xom < 1.1
}
}
}
}
return nodes;
}
// try to avoid XML reverification by caching repetitive Texts
private Text readText(ArrayByteList src, int type) {
int i = readSymbol(src, 4, type);
Text text;
if (i < textCache.length) {
text = textCache[i];
if (text != null) return new Text(text);
}
text = new Text(symbols[i]);
if (i < textCache.length) textCache[i] = text;
return text;
}
private Nodes readTextF(ArrayByteList src, int type) {
return factory.makeText(readString(src, 4, type));
}
/** Reads string via packed index from symbolTable */
private String readString(ArrayByteList src, int shift, int type) {
int i = readSymbol(src, shift, type);
if (i < 0) return "";
return symbols[i];
}
/** Reads string via packed index from symbolTable */
private static int readSymbol(ArrayByteList src, int shift, int type) {
// assert shift == 4 || shift == 6
if (Util.isInlinedIndex(type)) {
if (shift == 6) return -1;
return Util.getInlinedIndex(type);
}
switch ((type >>> shift) & 0x03) { // look at two bits indicating index length
case 0 : return Util.getUnsignedByte(src.get());
case 1 : return Util.getUnsignedShort(src.getShort());
case 2 : return -1;
default: return src.getInt();
}
}
private String readName(ArrayByteList src, int shift, int type) {
int i = readSymbol(src, shift, type);
if (i < 0) return "";
if (i < nameCache.length) {
String name = nameCache[i];
if (name == null) { // cache miss
name = getInternedName(i);
nameCache[i] = name;
}
return name;
}
return symbols[i];
}
private String getInternedName(int i) {
String name = symbols[i];
if (name.length() == 0) {
name = "";
} else {
name = (String) internedNames.get(name);
if (name == null) {
name = symbols[i];
internedNames.put(name, name);
}
}
return name;
}
/** Writes nodeTokens, indexData, symbolTable to output stream */
private void writePage(boolean isLastPage) throws IOException {
if (DEBUG) System.err.println("writing page");
Entry[] entries = symbolTable.getEntries();
int numChars = symbolTable.numCharacters();
// reorder entries and update indexData accordingly.
packSort(entries, indexData);
// if (DEBUG) printStatistics(entries);
// add header to page
page.ensureCapacity(page.size() + 1 + 4 + 4 + 4 + 4 + 4 + numChars*4 + entries.length +
nodeTokens.size() + indexData.size() + numChars/100); // an educated guess
page.add((byte) DOC_TYPE); // BEGIN_PAGE marker surrogate
page.addInt(0); // pageSize dummy placeholder
int pageOffset = page.size();
page.addInt(entries.length);
page.addInt(numChars + entries.length); // decodedSize (+zero terminator)
page.addInt(0); // encodedSize dummy placeholder
int encodedOffset = page.size();
// add symbolTable to page
encodeSymbols(entries, page); // assert: no need to expand underlying array
int encodedSize = page.size() - encodedOffset;
page.setInt(encodedOffset-4, encodedSize); // replace dummy placeholder
entries = null; // help gc
// add node tokens and packed symbol indexes to page
page.addInt(BNUX_MAGIC);
encodeTokens(nodeTokens, indexData.asArray(), page);
int pageSize;
nodeTokens.clear();
if (compressionLevel > 0) { // compress page body
page.position(pageOffset);
nodeTokens.add(compressor, page);
page.remove(pageOffset, page.size());
pageSize = nodeTokens.size();
} else {
pageSize = page.size() - pageOffset;
}
if (isLastPage) pageSize = -pageSize;
page.setInt(pageOffset-4, pageSize); // replace pageSize dummy placeholder
// having filled the buffers, flush them onto underlying output stream
page.write(out);
nodeTokens.write(out);
// reset
if (!isLastPage) symbolTable.clear();
page.clear();
nodeTokens.clear();
indexData.clear();
if (DEBUG) System.err.println("finished writing page");
}
private void writeDocument(Document doc) throws IOException {
if (DEBUG) System.err.println("writing document");
writeXMLDeclaration(doc.getBaseURI());
for (int i = 0; i < doc.getChildCount(); i++) {
Node node = doc.getChild(i);
if (node instanceof Element) {
writeElement((Element) node);
} else if (node instanceof Comment) {
writeComment((Comment) node);
} else if (node instanceof ProcessingInstruction) {
writeProcessingInstruction((ProcessingInstruction) node);
} else if (node instanceof DocType) {
writeDocType((DocType) node);
} else {
throw new IllegalAddException("Cannot write node type: " + node);
}
}
writeEndDocument();
if (DEBUG) System.err.println("finished writing document");
}
final void writeXMLDeclaration(String baseURI) {
if (baseURI == null) baseURI = "";
// setup
symbolTable = new SymbolTable();
if (nodeTokens == null) nodeTokens = new ArrayByteList();
nodeTokens.clear();
if (indexData == null) indexData = new ArrayIntList();
indexData.clear();
// write bnux document header
if (page == null) page = new ArrayByteList(256);
page.clear();
page.ensureCapacity(DOCUMENT_HEADER_SIZE + PAGE_HEADER_SIZE + 1);
page.addInt(BNUX_MAGIC);
int version = VERSION;
if (compressionLevel > 0) version = -version;
page.add((byte)version);
isFirstPage = true;
writeIndex(baseURI);
}
final void writeEndDocument() throws IOException {
flush(true);
}
// assertion: must not be called with isLastPage == false
// unless we're at a safe point, i.e. at nesting depth == 0
final void flush(boolean isLastPage) throws IOException {
try {
if (nodeTokens.size() > 0) { // anything remaining to be written?
writePage(isLastPage);
}
out.flush();
} finally {
if (isLastPage) {
this.symbolTable = null; // help gc
this.out = null;
}
nodeTokens.clear();
}
}
/** Encodes a node into the intermediate unpacked binary form */
private void writeChild(Node node) throws IOException {
if (node instanceof Element) {
writeElement((Element) node);
} else if (node instanceof Text) {
writeText((Text) node);
} else if (node instanceof Comment) {
writeComment((Comment) node);
} else if (node instanceof ProcessingInstruction) {
writeProcessingInstruction((ProcessingInstruction) node);
} else {
throw new IllegalAddException("Cannot write node type: " + node);
}
}
final void writeElement(Element elem) throws IOException {
writeStartTag(elem);
for (int i = 0; i < elem.getChildCount(); i++) {
writeChild(elem.getChild(i));
}
writeEndTag();
}
final void writeStartTag(Element elem) {
writeIndex(elem.getNamespacePrefix(), elem.getLocalName());
int type = BEGIN_ELEMENT;
if (elem.getNamespaceURI().length() == 0) {
type = Util.noNamespace(type);
} else {
writeIndex(elem.getNamespaceURI());
}
nodeTokens.add((byte)type);
for (int i = 0; i < elem.getAttributeCount(); i++) {
writeAttribute(elem.getAttribute(i));
}
if (IS_EXTENDED_XOM) {
writeNamespaceDeclarationsFast(elem);
} else {
writeNamespaceDeclarations(elem);
}
}
final void writeEndTag() throws IOException {
if (nodeTokens.size() + indexData.size() + symbolTable.numCharacters()
+ symbolTable.size() >= MAX_PAGE_CAPACITY) {
writePage(false); // write nodeTokens, indexData, symbolTable to output stream
}
nodeTokens.add((byte)END_ELEMENT);
}
private void writeAttribute(Attribute attr) {
writeIndex(attr.getNamespacePrefix(), attr.getLocalName());
int type = ATTRIBUTE;
if (attr.getNamespaceURI().length() == 0) {
type = Util.noNamespace(type);
} else {
writeIndex(attr.getNamespaceURI());
}
writeIndex(attr.getValue());
nodeTokens.add((byte)type);
nodeTokens.add(Util.getAttributeTypeCode(attr));
}
final void writeComment(Comment comment) {
nodeTokens.add((byte)COMMENT);
writeIndex(comment.getValue());
}
/** Does not pack indexes of doctype (infrequent anyway) */
final void writeDocType(DocType docType) {
nodeTokens.add((byte)DOC_TYPE);
writeIndex(docType.getRootElementName());
writeIndex(docType.getPublicID() == null ? DOCTYPE_NULL_ID : docType.getPublicID());
writeIndex(docType.getSystemID() == null ? DOCTYPE_NULL_ID : docType.getSystemID());
writeIndex(docType.getInternalDTDSubset() == null ? "" : docType.getInternalDTDSubset());
}
// requires xom-1.1 + patches
private void writeNamespaceDeclarationsFast(Element elem) {
String[] decls = elem.getAdditionalNamespaceDeclarations();
for (int i=0; i < decls.length; ) {
nodeTokens.add((byte)NAMESPACE_DECLARATION);
writeIndex(decls[i++]); // prefix
writeIndex(decls[i++]); // URI
}
}
private void writeNamespaceDeclarations(Element elem) {
int count = elem.getNamespaceDeclarationCount();
if (count == 1)
return; // elem.getNamespaceURI() has already been written
for (int i = 0; i < count; i++) {
String prefix = elem.getNamespacePrefix(i);
String uri = elem.getNamespaceURI(prefix);
if (prefix.equals(elem.getNamespacePrefix()) && uri.equals(elem.getNamespaceURI())) {
// if (DEBUG) System.err.println("********** NAMESPACE IGNORED ON WRITE ***************\n");
continue;
}
nodeTokens.add((byte)NAMESPACE_DECLARATION);
writeIndex(prefix);
writeIndex(uri);
}
}
final void writeProcessingInstruction(ProcessingInstruction pi) {
nodeTokens.add((byte)PROCESSING_INSTRUCTION);
writeIndex(pi.getTarget());
writeIndex(pi.getValue());
}
final void writeText(Text text) {
nodeTokens.add((byte)TEXT);
writeIndex(text.getValue());
}
/** Puts symbol into symbolTable and appends the string's index to indexData */
private final void writeIndex(String symbol) {
writeIndex("", symbol);
}
/** Puts symbol into symbolTable and appends the string's index to indexData */
private final void writeIndex(String prefix, String localName) {
int index = symbolTable.addSymbol(prefix, localName);
indexData.add(index);
}
/** Converts strings of symbolTable into UTF-8 bytes */
private void encodeSymbols(Entry[] entries, ArrayByteList dst) {
/*
* As an optimization, one could use dst.ensureCapacity(dst.size + 4 *
* sum(entries.key.length) + entries.length) then use plain array
* accesses via dst.addUTF8Entries(entries) instead of many small
* dst.add(byte) calls. Update: I benchmarked various variants of this
* idea; none of them turned out to be worthwhile, so, for the time
* being, we'll keep the simple version below.
*/
// if (DEBUG) System.err.println("encoding symbols = " + toString(entries));
int len = entries.length;
for (int i=0; i < len; i++) {
Entry entry = entries[i];
dst.addUTF8String(entry.getKey1(), entry.getKey2());
//dst.addUTF16String(entry.getKey1(), entry.getKey2());
}
}
/**
 * Sorts entries descending by symbol frequency (# of occurrences) and
 * updates indexData accordingly. This allows compressing frequent 4 byte
 * indexes into a single byte. FAQ: this sort is *not* the bottleneck.
*/
private void packSort(Entry[] entries, ArrayIntList indexData) {
if (!DEBUG && entries.length <= 256) { // 0, 1, ... , 255
return; // no need to sort indexes - all fit into one unsigned byte anyway
}
// Swap entries with frequency == 1 to end of array (typically Text nodes).
// There's no need to sort those with O(N log N).
int head = entries.length;
for (int i=entries.length; --i >= 0; ) {
Entry e = entries[i];
if (e.getFrequency() == 1) {
head--;
entries[i] = entries[head];
entries[head] = e;
}
}
// if (DEBUG) System.err.println("len=" + entries.length + ", #f>1=" + (100.0f * head / entries.length));
// sort remaining entries descending by frequency.
Arrays.sort(entries, 0, head,
new Comparator() {
public final int compare(Object e1, Object e2) {
int f1 = ((Entry) e1).getFrequency();
int f2 = ((Entry) e2).getFrequency();
return f2 - f1;
}
}
);
// reorder indexData with sorted indexes
// since sort has moved entries[indexData[k]] to entries[i]
int[] indexes = new int[entries.length];
for (int i=entries.length; --i >= 0; ) {
indexes[entries[i].getIndex()] = i;
}
int[] ix = indexData.asArray();
for (int i=indexData.size(); --i >= 0; ) {
ix[i] = indexes[ix[i]];
}
// post-condition: entries[indexData[k]] corresponds to entries[i]
}
/**
* Writes nodeTokens in document order; in the process stitches in packed
* indexes referring to symbols in the symbolTable.
*/
private void encodeTokens(ArrayByteList tokenList, int[] indexes, ArrayByteList dst) {
byte[] tokens = tokenList.asArray();
int size = tokenList.size();
int i = 0;
int j = 0;
if (isFirstPage) dst.addInt(indexes[i++]); // document baseURI
isFirstPage = false;
while (j < size) {
int type = tokens[j++];
dst.add((byte)type);
// if (DEBUG) System.err.println("encoding type = " + toString(type));
switch (type & 0x07) {
case TEXT: {
Util.packOneIndex(dst, indexes[i++], type); // value
break;
}
case ATTRIBUTE: {
if (Util.hasNoNamespace(type)) { // qname
Util.packOneIndex(dst, indexes[i++], type);
} else { // qname, URI
Util.packTwoIndexes(dst, indexes[i++], indexes[i++], type);
}
dst.add((byte)0);
Util.packOneIndex(dst, indexes[i++], 0); // value
dst.add(tokens[j++]); // attrType
break;
}
case BEGIN_ELEMENT: {
if (Util.hasNoNamespace(type)) {
Util.packOneIndex(dst, indexes[i++], type); // qname
} else {
Util.packTwoIndexes(dst, indexes[i++], indexes[i++], type); // qname, URI
}
break;
}
case END_ELEMENT: {
break; // nothing to do
}
case COMMENT: {
Util.packOneIndex(dst, indexes[i++], type); // value
break;
}
case NAMESPACE_DECLARATION: { // prefix, URI
Util.packTwoIndexes(dst, indexes[i++], indexes[i++], type);
break;
}
case PROCESSING_INSTRUCTION: { // target, value
Util.packTwoIndexes(dst, indexes[i++], indexes[i++], type);
break;
}
case DOC_TYPE: { // infrequent; no need to pack indexes
dst.addInt(indexes[i++]); // rootElementName
dst.addInt(indexes[i++]); // publicID
dst.addInt(indexes[i++]); // systemID
dst.addInt(indexes[i++]); // internalDTDSubset
break;
}
default: {
throw new IllegalArgumentException("illegal node type");
}
}
}
}
private static int createMagicNumber() {
// 0xE0, 0x01, 0xDF, 0xFE = -32, 1, -33, -2 = -536748034
// Thanks to Paul.Sandoz@Sun.COM
ArrayByteList magic = new ArrayByteList(4);
magic.add((byte)0xE0);
magic.add((byte)0x01);
magic.add((byte)0xDF);
magic.add((byte)0xFE);
return magic.getInt();
}
/** does xom.jar contain our performance extensions? */
private static boolean hasXOMExtensions() {
try {
ParentNode.class.getMethod("fastInsertChild", new Class[] {Node.class, Integer.TYPE});
Element.class.getMethod("getAdditionalNamespaceDeclarations", null);
return true;
} catch (Throwable t) {
return false;
}
}
private static String toString(int type) { // DEBUG only
switch (type & 0x07) {
case TEXT: return "TEXT";
case ATTRIBUTE: return "ATTRIBUTE";
case BEGIN_ELEMENT: return "BEGIN_ELEMENT";
case END_ELEMENT: return "END_ELEMENT";
case COMMENT: return "COMMENT";
case NAMESPACE_DECLARATION: return "NAMESPACE_DECLARATION";
case PROCESSING_INSTRUCTION: return "PROCESSING_INSTRUCTION";
case DOC_TYPE: return "DOC_TYPE";
default: {
throw new IllegalArgumentException(
"Illegal node type code=" + (type & 0x07));
}
}
}
private static String toString(Entry[] entries) { // DEBUG only
ArrayList list = new ArrayList();
for (int i=0; i < entries.length; i++) {
list.add(entries[i].getQualifiedName());
}
return list.toString();
}
///////////////////////////////////////////////////////////////////////////////
// Nested classes:
///////////////////////////////////////////////////////////////////////////////
/**
* Table of unique symbols for tokenization on XML serialization. This is a
* classic textbook hash algorithm, adapted to meet our specific performance
* needs; it is close to the one used by the JDK HashMap.
* Maintains a map of (String, String) ==> (index, frequency) associations.
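* <p>
* Illustrative usage (a sketch; the real call sites live in this codec):
* <pre>
* SymbolTable symbols = new SymbolTable();
* int a = symbols.addSymbol("",    "foo"); // returns 0 (first distinct symbol)
* int b = symbols.addSymbol("xsd", "int"); // returns 1
* int c = symbols.addSymbol("",    "foo"); // returns 0 again; frequency of "foo" is now 2
* </pre>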
*/
private static final class SymbolTable { // not a public class!
private static final float LOAD_FACTOR = 0.75f;
private static final int INITIAL_CAPACITY = 16;
private Entry[] entries = new Entry[INITIAL_CAPACITY];
private int threshold = (int) (INITIAL_CAPACITY * LOAD_FACTOR);
private int size = 0;
private int numChars = 0;
/** Constructs and returns a table with default parameters. */
public SymbolTable() {
}
/** Removes all entries from this table, retaining the current capacity. */
public void clear() {
size = 0;
numChars = 0;
Entry[] src = entries;
for (int i=src.length; --i >= 0; ) src[i] = null;
}
/** Returns the total number of characters occupied by all symbol strings. */
public int numCharacters() {
return numChars;
}
/** Returns the number of symbols. */
public int size() {
return size;
}
/**
* Adds the given symbol to the table if not already present; otherwise
* increments its frequency counter.
* <p>
* A symbol is structured like a lexical XML QName:
* <pre>
* symbol = key1 + ":" + key2   (if key1 is a non-empty string)
* symbol = key2                (if key1 is the empty string)
* </pre>
*
* @return the sequence number N >= 0 under which the symbol was first
*         added to this table (i.e. its position in insertion order),
*         whether the symbol is new or was already present.
*/
public int addSymbol(String key1, String key2) {
// assert: key1 and key2 are non-null
int hash = hash(key1, key2);
int i = hash & (entries.length - 1);
Entry entry = findEntry(key1, key2, entries[i], hash);
if (entry != null) {
entry.frequency++;
return entry.index;
}
// not found; add entry for key --> (index=size, freq=1) mapping
// new entry is inserted at head of chain
// if (DEBUG) checkNULChar(key1);
// if (DEBUG) checkNULChar(key2);
numChars += key1.length() + key2.length();
if (key1.length() != 0) numChars++;
entries[i] = new Entry(key1, key2, hash, entries[i], size);
if (size >= threshold) rehash();
return size++;
}
private static Entry findEntry(String key1, String key2, Entry cursor, int hash) {
while (cursor != null) { // scan collision chain
if (hash == cursor.hash && eq(key2, cursor.key2) && eq(key1, cursor.key1)) {
cursor.key1 = key1; // speeds up future lookups: equals() vs. ==
cursor.key2 = key2; // speeds up future lookups: equals() vs. ==
return cursor;
}
cursor = cursor.next;
}
return null;
}
/**
* Expands the capacity of this table, rehashing all entries into
* corresponding new slots.
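* Capacity is always a power of two, so <code>hash &amp; (capacity - 1)</code>
* selects a slot just like <code>hash % capacity</code> would, only faster.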
*/
private void rehash() {
Entry[] src = entries;
int capacity = 2 * src.length;
Entry[] dst = new Entry[capacity];
for (int i = src.length; --i >= 0; ) {
Entry e = src[i];
while (e != null) { // walk collision chain
int j = e.hash & (capacity - 1);
Entry next = e.next;
e.next = dst[j];
dst[j] = e; // insert e at head of chain
e = next;
}
}
entries = dst;
threshold = (int) (capacity * LOAD_FACTOR);
}
/**
* Returns all table entries, sorted ascending by entry.index. The
* result can subsequently be used to sort by symbol frequency, or
* similar. Much faster than an entrySet().iterator().next() loop would
* be.
*/
public Entry[] getEntries() {
Entry[] dst = new Entry[size];
Entry[] src = entries;
for (int i = src.length; --i >= 0; ) {
Entry e = src[i];
while (e != null) { // walk collision chain
dst[e.index] = e;
e = e.next;
}
}
return dst;
}
private static int hash(String key1, String key2) {
int h = key2.hashCode();
if (key1.length() != 0) h = key1.hashCode() ^ h;
return auxiliaryHash(h);
// return auxiliaryHash(key1.hashCode() ^ key2.hashCode());
}
/**
* Auxiliary hash function that defends against poor base hash
* functions. Ensures more uniform hash distribution, hence reducing the
* probability of pathologically long collision chains, in particular
* for short key symbols that are quite similar to each other, or XML
* boundary whitespace (worst case scenario).
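* (This appears to be the same supplemental hash function used by the
* JDK 1.4 HashMap; noted here as an observation, not a requirement.)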
*/
private static int auxiliaryHash(int h) {
h += ~(h << 9);
h ^= (h >>> 14);
h += (h << 4);
h ^= (h >>> 10);
return h;
}
private static boolean eq(String x, String y) {
return x == y || x.equals(y);
}
/** Sanity check; unnecessary since NULs have already been checked by nu.xom.Verifier. */
private static void checkNULChar(String key) {
int i = key.indexOf((char)0);
if (i >= 0) {
throw new IllegalArgumentException(
"Symbol must not contain C0 control character NUL (char 0x00) [index:" + i
+ " within '" + key + "']");
}
}
}
/**
* An entry in the SymbolTable, mapping a symbol (key1, key2) to its
* insertion index and occurrence frequency.
*/
private static final class Entry {
String key1; // prefix or ""
String key2; // localName, or a Text/Comment value, or similar
final int hash; // cached hash code of the symbol
final int index; // correlates this Entry with indexList on XML serialization
int frequency = 1; // number of occurrences of the symbol within the current XML document
Entry next; // successor in collision chain, mapping to the same hash slot
public Entry(String key1, String key2, int hash, Entry next, int index) {
this.key1 = key1;
this.key2 = key2;
this.hash = hash;
this.next = next;
this.index = index;
}
public String getKey1() { return key1; }
public String getKey2() { return key2; }
public int getIndex() { return index; }
public int getFrequency() { return frequency; }
public String getQualifiedName() { // DEBUG only
if (key1.length() == 0) return key2;
return key1 + ':' + key2;
}
public String toString() { // DEBUG only
return "[key1=" + key1 + ", key2=" + key2 + ", freq=" + frequency + "]";
}
}
/**
* Fast replacement for ArrayList and java.util.Stack, avoiding
* synchronization and casting overhead. (Possibly a premature
* optimization.)
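* <p>
* Illustrative usage (a sketch of how a decoder might track the current
* element; names are hypothetical):
* <pre>
* FastStack stack = new FastStack();
* stack.push(parent);            // on BEGIN_ELEMENT
* Element current = stack.pop(); // on the matching END_ELEMENT
* </pre>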
*/
private static final class FastStack {
private Element[] elements = new Element[10];
private int size = 0;
public Element pop() { // note: no empty check; pop() on an empty stack throws ArrayIndexOutOfBoundsException
Element elem = elements[size-1];
elements[--size] = null; // help gc
return elem;
}
public void push(Element elem) {
if (size == elements.length) ensureCapacity(size + 1);
elements[size++] = elem;
}
private void ensureCapacity(int minCapacity) {
if (minCapacity > elements.length) {
int newCapacity = Math.max(minCapacity, 2 * elements.length + 1);
elements = subArray(0, size, newCapacity);
}
}
/** Small helper method eliminating redundancy. */
private Element[] subArray(int from, int length, int capacity) {
Element[] subArray = new Element[capacity];
System.arraycopy(elements, from, subArray, 0, length);
return subArray;
}
}
}