// Source Code of org.pdf4j.saxon.tinytree.LargeStringBuffer

package org.pdf4j.saxon.tinytree;


import org.pdf4j.saxon.om.FastStringBuffer;


import java.io.Writer;
import java.io.Serializable;
import java.util.List;
import java.util.ArrayList;
import java.util.Arrays;


/**
 * This is an implementation of the JDK 1.4 CharSequence interface: it implements
 * a CharSequence as a list of arrays of characters (the individual arrays are known
 * as segments). When characters are appended, a new segment is started if the previous
 * array would otherwise overflow a threshold size (the maxAllocation size).
 * <p/>
 * This is more efficient than a buffer backed by a contiguous array of characters
 * in cases where the size is likely to grow very large, and where substring operations
 * are rare. As used within the TinyTree, the value of each text node is contiguous within
 * one segment, so extraction of the value of a text node is efficient.
 */


public final class LargeStringBuffer implements CharSequence, Serializable {


    // TODO:PERF with large documents the Arrays.binarySearch() cost can be noticeable.
    // It would be better if TinyTree addressed into this structure using segment+offset addressing.
    // 16 bits for each would do fine.


    private int minAllocation;
    private int maxAllocation;
    private List segments;      // each segment is a FastStringBuffer
    private int[] startOffsets; // if startOffsets[23] is 123456, then the first
                                // character in segment 23 is the 123456'th character
                                // of the CharSequence value.
    private int length;         // total length of the CharSequence


   /**
    * Create an empty LargeStringBuffer with default space allocation
    */


    public LargeStringBuffer() {
        this(4096, 65536);
    }


   /**
    * Create an empty LargeStringBuffer
    * @param minAllocation initial allocation size for each segment (including the first). If minAllocation
    * exceeds maxAllocation, it is rounded down to the value of maxAllocation
    * @param maxAllocation maximum allocation size for each segment. When a segment reaches this
    * size, a new segment is created rather than appending more characters to the existing segment.
    * However, a segment may have size greater than maxAllocation if the data is appended in a single chunk
    * of size maxAllocation.
    */
    public LargeStringBuffer(int minAllocation, int maxAllocation) {
        this.minAllocation = Math.min(minAllocation, maxAllocation);
        this.maxAllocation = maxAllocation;
        FastStringBuffer initial = new FastStringBuffer(minAllocation);
        segments = new ArrayList(4);
        segments.add(initial);
        startOffsets = new int[1];
        startOffsets[0] = 0;
        length = 0;
    }


    /**
     * Append a CharSequence to this LargeStringBuffer
     * @param data the data to be appended
     */


    public void append(CharSequence data) {
        final int increment = data.length();
        if (increment == 0) {
            return;
        }
        FastStringBuffer last = ((FastStringBuffer)segments.get(segments.size()-1));
        if (last.length() + increment <= maxAllocation) {
            last.append(data);
        } else {
            int[] s2 = new int[startOffsets.length+1];
            System.arraycopy(startOffsets, 0, s2, 0, startOffsets.length);
            s2[startOffsets.length] = length;
            startOffsets = s2;
            last = new FastStringBuffer(Math.max(minAllocation, increment));
            segments.add(last);
            last.append(data);
        }
        length += increment;
    }


    /**
     * Returns the length of this character sequence.  The length is the number
     * of 16-bit UTF-16 characters in the sequence. </p>
     *
     * @return  the number of characters in this sequence
     */
    public int length() {
        return length;
    }


    /**
     * Returns the character at the specified index.  An index ranges from zero
     * to <tt>length() - 1</tt>.  The first character of the sequence is at
     * index zero, the next at index one, and so on, as for array
     * indexing. </p>
     *
     * @param   index   the index of the character to be returned
     *
     * @return  the specified character
     *
     * @throws  IndexOutOfBoundsException
     *          if the <tt>index</tt> argument is negative or not less than
     *          <tt>length()</tt>
     */
    public char charAt(int index) {
        if (startOffsets.length == 1) {
            // optimize for small documents
            return ((FastStringBuffer)segments.get(0)).charAt(index);
        }
        if (index < 0 || index >= length) {
            throw new IndexOutOfBoundsException(index+"");
        }
        int seg = Arrays.binarySearch(startOffsets, index);
        if (seg >= 0) {
            return ((FastStringBuffer)segments.get(seg)).charAt(0);
        }
        seg = -seg - 2;
        final int offset = index - startOffsets[seg];
        return ((FastStringBuffer)segments.get(seg)).charAt(offset);
    }


    /**
     * Returns a new character sequence that is a subsequence of this sequence.
     * The subsequence starts with the character at the specified index and
     * ends with the character at index <tt>end - 1</tt>.  The length of the
     * returned sequence is <tt>end - start</tt>, so if <tt>start == end</tt>
     * then an empty sequence is returned. </p>
     *
     * @param   start   the start index, inclusive
     * @param   end     the end index, exclusive
     *
     * @return  the specified subsequence
     *
     * @throws  IndexOutOfBoundsException
     *          if <tt>start</tt> or <tt>end</tt> are negative,
     *          if <tt>end</tt> is greater than <tt>length()</tt>,
     *          or if <tt>start</tt> is greater than <tt>end</tt>
     */
    public CharSequence subSequence(int start, int end) {
        if (startOffsets.length == 1) {
            // optimize for small documents
            return ((FastStringBuffer)segments.get(0)).subSequence(start, end);
        }
        if (start < 0 || end < 0 || end > length || start > end) {
            throw new IndexOutOfBoundsException("[" + start + ',' + end + ']');
        }
        int seg0 = Arrays.binarySearch(startOffsets, start);
        int offset0;
        if (seg0 >= 0) {
            offset0 = 0;
        } else {
            seg0 = -seg0 - 2;
            offset0 = start - startOffsets[seg0];
        }
        int seg1 = Arrays.binarySearch(startOffsets, end);
        int offset1;
        if (seg1 >= 0) {
            offset1 = 0;
        } else {
            seg1 = -seg1 - 2;
            offset1 = end - startOffsets[seg1];
        }
        FastStringBuffer startSegment = (FastStringBuffer)segments.get(seg0);
        // We've had reports (28 Feb 2007) of an NPE here, which we couldn't reproduce.
        // The following code is designed to produce diagnostics if it ever happens again
        if (startSegment == null) {
            dumpDataStructure();
            throw new NullPointerException("startSegment: subSequence(" + start + ", " + end + ")");
        }
        if (seg0 == seg1) {
            // the required substring is all in one segment
            return startSegment.subSequence(offset0, offset1);
        } else {
            // copy the data into a new FastStringBuffer. This case should be exceptional
            FastStringBuffer sb = new FastStringBuffer(end - start);
            sb.append(startSegment.subSequence(offset0, startSegment.length()));
            for (int i=seg0+1; i<seg1; i++) {
                sb.append(((FastStringBuffer)segments.get(i)));
            }
            if (offset1 > 0) {
                sb.append(((FastStringBuffer)segments.get(seg1)).subSequence(0, offset1));
            }
            return sb;
        }
    }


    /**
     * Convert to a string
     */


    public String toString() {
        if (startOffsets.length == 1) {
            // optimize for small documents
            return segments.get(0).toString();
        }
        FastStringBuffer sb = new FastStringBuffer(length);
        for (int i=0; i<segments.size(); i++) {
            sb.append(((FastStringBuffer)segments.get(i)));
        }
        return sb.toString();
    }


    /**
     * Compare equality
     */


    public boolean equals(Object other) {
        return toString().equals(other.toString());
    }


    /**
     * Generate a hash code
     */


    public int hashCode() {
        // Same algorithm as String#hashCode(), but not cached
        int h = 0;
        for (int s=0; s<segments.size(); s++) {
            FastStringBuffer sb = ((FastStringBuffer)segments.get(s));
            for (int i=0; i<sb.length(); i++) {
                h = 31 * h + charAt(i);
            }
        }
        return h;
    }


    /**
     * Returns a new character sequence that is a subsequence of this sequence.
     * Unlike subSequence, this is guaranteed to return a String.
     * @param start index of the first character to be included
     * @param end index of the character after the last one to be included
     * @return the substring at the given position
     */


    public String substring(int start, int end) {
        return subSequence(start, end).toString();
    }


    /**
     * Write the value to a writer
     * @param writer the writer to which the value is to be written
     */


    public void write(Writer writer) throws java.io.IOException {
        for (int s=0; s<segments.size(); s++) {
            FastStringBuffer sb = ((FastStringBuffer)segments.get(s));
            sb.write(writer);
        }
    }


    /**
     * Produce diagnostic dump
     */


    public void dumpDataStructure() {
        System.err.println("** Segments:");
        for (int s=0; s<segments.size(); s++) {
            System.err.println("   SEG " + s + " start offset " + startOffsets[s] + " length "
                    + ((FastStringBuffer)segments.get(s)).length());
        }
    }


//    public static void main(String[] args) {
//        LargeStringBuffer lsb = new LargeStringBuffer();
//        for (int i=0; i<30; i++)  {
//            char[] chars = new char[i*5000];
//            Arrays.fill(chars, 'x');
//            lsb.append(new String(chars));
//            lsb.append("");
//        }
//        for (int i=0; i<lsb.length()-10000; i+=10000) {
//            System.out.println(i + ":" + lsb.subSequence(i, i+9999).length());
//        }
//        lsb.dumpDataStructure();
//    }


}


//
// The contents of this file are subject to the Mozilla Public License Version 1.0 (the "License");
// you may not use this file except in compliance with the License. You may obtain a copy of the
// License at http://www.mozilla.org/MPL/
//
// Software distributed under the License is distributed on an "AS IS" basis,
// WITHOUT WARRANTY OF ANY KIND, either express or implied.
// See the License for the specific language governing rights and limitations under the License.
//
// The Original Code is: all this file.
//
// The Initial Developer of the Original Code is Michael H. Kay
//
// Portions created by (your name) are Copyright (C) (your legal entity). All Rights Reserved.
//
// Contributor(s): none
//
// Source Code of org.pdf4j.saxon.tinytree.LargeStringBuffer

// Related Classes of org.pdf4j.saxon.tinytree.LargeStringBuffer