package com.fasterxml.aalto.in;
import java.io.*;
import javax.xml.stream.XMLStreamException;
import org.codehaus.stax2.XMLStreamLocation2;
import com.fasterxml.aalto.impl.ErrorConsts;
import com.fasterxml.aalto.impl.IoStreamException;
import com.fasterxml.aalto.impl.LocationImpl;
import com.fasterxml.aalto.util.DataUtil;
import com.fasterxml.aalto.util.TextBuilder;
import com.fasterxml.aalto.util.XmlCharTypes;
import com.fasterxml.aalto.util.XmlChars;
import com.fasterxml.aalto.util.XmlConsts;
/**
* This is the concrete scanner implementation used when input comes
* as a {@link java.io.Reader}. In general using this scanner is quite
* a bit less optimal than that of {@link java.io.InputStream} based
* scanner. Nonetheless, it is included for completeness, since Stax
* interface allows passing Readers as input sources.
*/
public final class ReaderScanner
extends XmlScanner
{
/**
* Character type lookup tables used by the parsing loops of this scanner.
* Although java chars are basically UTF-16 in memory, the closest
* match for char types is Latin1: the loops only consult these tables
* for chars up to 0xFF and check higher chars separately.
*/
private final static XmlCharTypes sCharTypes = InputCharTypes.getLatin1CharTypes();
/*
/**********************************************************************
/* Configuration
/**********************************************************************
*/
/**
* Underlying {@link Reader} to use for reading content. Set to null
* by {@link #_closeSource} after the reader has been closed.
*/
protected Reader _in;
/*
/**********************************************************************
/* Input buffering
/**********************************************************************
*/
/**
* Buffer from which input characters are consumed.
*/
protected char[] _inputBuffer;
/**
* Index of the next character to consume from {@link #_inputBuffer}.
*/
protected int _inputPtr;
/**
* End marker for valid content in {@link #_inputBuffer}: more input
* is loaded whenever {@link #_inputPtr} reaches this value.
*/
protected int _inputEnd;
/**
* Storage location for the first character of a CHARACTERS event, as
* recorded by {@link #nextFromTree}: either the character itself (the
* input pointer has not been advanced past it), or the negated code
* point when the character was produced by a character entity.
*/
protected int mTmpChar = INT_NULL;
/*
/**********************************************************************
/* Symbol handling
/**********************************************************************
*/
/**
* Symbol table, obtained from the reader configuration on construction.
* For now, symbol table contains prefixed names. In future it is
* possible that they may be split into prefixes and local names.
*/
protected final CharBasedPNameTable _symbols;
/*
/**********************************************************************
/* Life-cycle
/**********************************************************************
*/
/**
 * Constructs a scanner that reads from the given Reader, starting out
 * with an input buffer supplied by the caller.
 *
 * @param cfg Configuration to use; also provides the symbol table
 * @param r Reader to read content from
 * @param buffer Input buffer to use
 * @param ptr Initial value of the input pointer
 * @param last Initial value of the input end marker
 */
public ReaderScanner(ReaderConfig cfg, Reader r,
        char[] buffer, int ptr, int last)
{
    super(cfg);
    // Symbol table comes from the configuration
    _symbols = cfg.getCBSymbols();
    // Input source, and the buffer window handed in by the caller
    _in = r;
    _inputBuffer = buffer;
    _inputEnd = last;
    _inputPtr = ptr;
    // Location bookkeeping starts from zero (should probably be passed by caller...)
    _rowStartOffset = 0;
    _pastBytesOrChars = 0;
}
/**
 * Constructs a scanner that reads from the given Reader, using an
 * (initially empty) input buffer allocated through the configuration.
 *
 * @param cfg Configuration to use; provides the buffer and the symbol table
 * @param r Reader to read content from
 */
public ReaderScanner(ReaderConfig cfg, Reader r)
{
    super(cfg);
    _in = r;
    // Buffer is allocated via the config; nothing has been read into it yet
    _inputBuffer = cfg.allocFullCBuffer(ReaderConfig.DEFAULT_CHAR_BUFFER_LEN);
    _inputEnd = 0;
    _inputPtr = 0;
    _symbols = cfg.getCBSymbols();
    // Location bookkeeping starts from zero (should probably be passed by caller...)
    _rowStartOffset = 0;
    _pastBytesOrChars = 0;
}
/**
 * Releases buffers held by this scanner: lets the superclass release its
 * own, hands the symbol table back to the configuration if it may have
 * changed, and returns the input buffer for recycling.
 */
@Override
protected void _releaseBuffers()
{
    super._releaseBuffers();
    if (_symbols.maybeDirty()) {
        _config.updateCBSymbols(_symbols);
    }
    // Note: with block input (_in == null) the buffer is not owned by the
    // scanner and can not be recycled. This method always gets called
    // before _closeSource(), so _in has not been cleared yet at this point.
    final boolean ownsBuffer = (_in != null);
    if (ownsBuffer && _inputBuffer != null) {
        _config.freeFullCBuffer(_inputBuffer);
        _inputBuffer = null;
    }
}
/**
 * Closes the underlying Reader, if there is one, and clears the
 * reference to it.
 *
 * @throws IOException if closing the Reader fails
 */
@Override
protected void _closeSource()
    throws IOException
{
    if (_in == null) { // nothing to close (or already closed)
        return;
    }
    _in.close();
    _in = null;
}
/*
/**********************************************************************
/* Public scanner interface (1st level parsing)
/**********************************************************************
*/
// // // First, main iteration methods
/**
 * Main iteration method for content outside of the root element (prolog
 * or epilog): skips white space, and then dispatches on the character
 * that follows the opening '&lt;'.
 *
 * @param isProlog True when scanning the prolog (before the root element),
 *   false when scanning the epilog
 *
 * @return Type of the event found, or TOKEN_EOI if input ended before
 *   another event was found
 *
 * @throws XMLStreamException for read and well-formedness problems
 */
@Override
public final int nextFromProlog(boolean isProlog) throws XMLStreamException
{
    if (_tokenIncomplete) { // left-overs from last thingy?
        skipToken();
    }
    // First: keep track of where event started
    setStartLocation();
    // Ok: we should get a WS or '<'. So, let's skip through WS
    while (true) {
        // Any more data? Just need a single char
        if (_inputPtr >= _inputEnd) {
            if (!loadMore()) {
                setStartLocation();
                return TOKEN_EOI;
            }
        }
        /* Note: input consists of full 16-bit chars, so the value must NOT
         * be masked with 0xFF (as a byte-based scanner would do): masking
         * made, for example, U+013C look like '<' and U+0120 like a space.
         */
        int c = _inputBuffer[_inputPtr++];
        // Really should get white space or '<'...
        if (c == '<') {
            break;
        }
        if (c != ' ') {
            if (c == '\n') {
                markLF();
            } else if (c == '\r') {
                if (_inputPtr >= _inputEnd) {
                    if (!loadMore()) {
                        markLF();
                        setStartLocation();
                        return TOKEN_EOI;
                    }
                }
                // CR+LF counts as a single linefeed
                if (_inputBuffer[_inputPtr] == '\n') {
                    ++_inputPtr;
                }
                markLF();
            } else if (c != '\t') {
                reportPrologUnexpChar(isProlog, c, null);
            }
        }
    }
    // Ok, got LT:
    if (_inputPtr >= _inputEnd) {
        loadMoreGuaranteed(COMMENT);
    }
    char c = _inputBuffer[_inputPtr++];
    if (c == '!') { // comment/DOCTYPE? (CDATA not legal)
        return handlePrologDeclStart(isProlog);
    }
    if (c == '?') {
        return handlePIStart();
    }
    /* End tag not allowed if no open tree; and only one root
     * element (one root-level start tag)
     */
    if (c == '/' || !isProlog) {
        reportPrologUnexpChar(isProlog, c, " (unbalanced start/end tags?)");
    }
    return handleStartElement(c);
}
/**
 * Main iteration method for content within the element tree: first deals
 * with whatever the previous event left pending, and then determines the
 * type of the next event from the first character(s) of it.
 *
 * @return Type of the event found, or TOKEN_EOI if there is no more input
 *
 * @throws XMLStreamException for read and well-formedness problems
 */
@Override
public final int nextFromTree() throws XMLStreamException
{
    if (_tokenIncomplete) { // left-overs?
        if (skipToken()) { // Figured out next event (ENTITY_REFERENCE)?
            // !!! We don't yet parse DTD, don't know real contents
            return _nextEntity();
        }
    } else if (_currToken == START_ELEMENT) { // note: START_ELEMENT/END_ELEMENT never incomplete
        if (_isEmptyTag) {
            // Important: retain same start location as with START_ELEMENT, don't overwrite
            --_depth;
            return (_currToken = END_ELEMENT);
        }
    } else if (_currToken == END_ELEMENT) {
        _currElem = _currElem.getParent();
        // Any namespace declarations that need to be unbound?
        while (_lastNsDecl != null && _lastNsDecl.getLevel() >= _depth) {
            _lastNsDecl = _lastNsDecl.unbind();
        }
    } else if (_entityPending) {
        // It's possible CHARACTERS event was followed by an entity ref:
        _entityPending = false;
        return _nextEntity();
    }

    // and except for special cases, mark down actual start location of the event
    setStartLocation();

    // Any more data? Although it'd be an error not to get any,
    // let's leave error reporting up to caller
    if (_inputPtr >= _inputEnd && !loadMore()) {
        setStartLocation();
        return TOKEN_EOI;
    }

    // Can get pretty much any type; start/end element, comment/PI,
    // CDATA, text, entity reference...
    final char first = _inputBuffer[_inputPtr];
    if (first == '<') { // element, comment, CDATA, proc instr?
        ++_inputPtr;
        final char marker = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne(COMMENT);
        switch (marker) {
        case '!': // comment or CDATA
            return handleCommentOrCdataStart();
        case '?':
            return handlePIStart();
        case '/':
            return handleEndElement();
        default:
            return handleStartElement(marker);
        }
    }

    if (first == '&') { // entity reference
        ++_inputPtr;
        // Need to expand; should indicate either text, or an unexpanded
        // entity reference
        final int expanded = handleEntityInText(false);
        if (expanded == 0) { // general entity
            return (_currToken = ENTITY_REFERENCE);
        }
        // Nope, a char entity; need to indicate it came from an entity.
        // Since we may want to store the char as is, too, let's negate
        // entity-based char
        mTmpChar = -expanded;
    } else {
        // Let's store it for future reference. May or may not be used --
        // so let's not advance input ptr quite yet.
        mTmpChar = first;
    }

    // text, possibly/probably ok
    if (_cfgLazyParsing) {
        _tokenIncomplete = true;
    } else {
        finishCharacters();
    }
    return (_currToken = CHARACTERS);
}
/**
 * Helper method used to isolate things that need to be (re)set in
 * cases where the next event to report is an ENTITY_REFERENCE: text
 * builder is reset to empty contents, and the current token is updated.
 *
 * @return Always ENTITY_REFERENCE
 */
protected int _nextEntity() {
    // !!! Also, have to assume start location has been set or such
    // !!! TODO: handle start location?
    _textBuilder.resetWithEmpty();
    _currToken = ENTITY_REFERENCE;
    return _currToken;
}
/*
/**********************************************************************
/* 2nd level parsing
/**********************************************************************
*/
/**
 * Method called after "&lt;!" has been seen in prolog or epilog; what
 * follows has to be either a comment, or (in prolog only) a DOCTYPE
 * declaration.
 *
 * @param isProlog True when in prolog; false when in epilog, where
 *   DOCTYPE declaration is not accepted
 *
 * @return COMMENT or DTD
 *
 * @throws XMLStreamException for read and well-formedness problems
 */
protected final int handlePrologDeclStart(boolean isProlog)
    throws XMLStreamException
{
    if (_inputPtr >= _inputEnd) {
        loadMoreGuaranteed();
    }
    char ch = _inputBuffer[_inputPtr++];

    if (ch == 'D') { // DOCTYPE?
        if (isProlog) { // no DOCTYPE in epilog
            handleDtdStart();
            // incomplete flag is set by handleDtdStart
            if (!_cfgLazyParsing && _tokenIncomplete) {
                finishDTD(true); // must copy contents, may be needed
                _tokenIncomplete = false;
            }
            return DTD;
        }
    } else if (ch == '-') { // Comment?
        if (_inputPtr >= _inputEnd) {
            loadMoreGuaranteed();
        }
        ch = _inputBuffer[_inputPtr++];
        if (ch == '-') {
            if (_cfgLazyParsing) {
                _tokenIncomplete = true;
            } else {
                finishComment();
            }
            return (_currToken = COMMENT);
        }
    }

    // error... for error recovery purposes, let's just pretend
    // like it was unfinished CHARACTERS, though.
    _currToken = CHARACTERS;
    _tokenIncomplete = true;
    reportPrologUnexpChar(isProlog, ch, " (expected '-' for COMMENT)");
    return _currToken; // never gets here
}
/**
 * Parses the beginning of a DOCTYPE declaration: the keyword, root name
 * and optional public/system identifiers. Internal subset, if there is
 * one, is not parsed here; token is marked as incomplete instead.
 *
 * @return Always DTD
 *
 * @throws XMLStreamException for read and well-formedness problems
 */
private final int handleDtdStart()
    throws XMLStreamException
{
    matchAsciiKeyword("DOCTYPE");
    // And then some white space and root name
    char next = skipInternalWs(true, "after DOCTYPE keyword, before root name");
    _tokenName = parsePName(next);
    next = skipInternalWs(false, null);

    switch (next) {
    case 'P': // PUBLIC
        matchAsciiKeyword("PUBLIC");
        next = skipInternalWs(true, null);
        _publicId = parsePublicId(next);
        next = skipInternalWs(true, null);
        _systemId = parseSystemId(next);
        next = skipInternalWs(false, null);
        break;
    case 'S': // SYSTEM
        matchAsciiKeyword("SYSTEM");
        next = skipInternalWs(true, null);
        _publicId = null;
        _systemId = parseSystemId(next);
        next = skipInternalWs(false, null);
        break;
    default: // no external identifiers
        _systemId = null;
        _publicId = null;
        break;
    }

    // Ok; so, need to get either an internal subset, or the end:
    if (next == '>') { // fine, we are done
        _tokenIncomplete = false;
    } else {
        if (next != '[') { // If not end, must have int. subset
            final String msg;
            if (_systemId != null) {
                msg = " (expected '[' for the internal subset, or '>' to end DOCTYPE declaration)";
            } else {
                msg = " (expected a 'PUBLIC' or 'SYSTEM' keyword, '[' for the internal subset, or '>' to end DOCTYPE declaration)";
            }
            reportTreeUnexpChar(next, msg);
        }
        // Need not parse the int. subset yet, can leave as is, and then
        // either skip or parse later on
        _tokenIncomplete = true;
    }
    return (_currToken = DTD);
}
/**
 * Method called after "&lt;!" has been seen within the element tree;
 * what follows has to be either a comment or a CDATA section.
 *
 * @return COMMENT or CDATA
 *
 * @throws XMLStreamException for read and well-formedness problems
 */
protected final int handleCommentOrCdataStart()
    throws XMLStreamException
{
    if (_inputPtr >= _inputEnd) {
        loadMoreGuaranteed();
    }
    char ch = _inputBuffer[_inputPtr++];

    if (ch == '[') { // CDATA: rest of the marker must match
        _currToken = CDATA;
        for (int i = 0; i < 6; ++i) {
            if (_inputPtr >= _inputEnd) {
                loadMoreGuaranteed();
            }
            ch = _inputBuffer[_inputPtr++];
            final char expected = CDATA_STR.charAt(i);
            if (ch != expected) {
                reportTreeUnexpChar(ch, " (expected '"+expected+"' for CDATA section)");
            }
        }
        if (_cfgLazyParsing) {
            _tokenIncomplete = true;
        } else {
            finishCData();
        }
        return CDATA;
    }

    if (ch != '-') { // neither CDATA nor comment
        reportTreeUnexpChar(ch, " (expected either '-' for COMMENT or '[CDATA[' for CDATA section)");
        return TOKEN_EOI; // never gets here
    }

    // Comment: need the second hyphen too
    if (_inputPtr >= _inputEnd) {
        loadMoreGuaranteed();
    }
    ch = _inputBuffer[_inputPtr++];
    if (ch != '-') {
        reportTreeUnexpChar(ch, " (expected '-' for COMMENT)");
    }
    if (_cfgLazyParsing) {
        _tokenIncomplete = true;
    } else {
        finishComment();
    }
    return (_currToken = COMMENT);
}
/**
 * Method called after "&lt;?" has been seen: parses the target name of
 * the processing instruction, and verifies that it is followed either
 * by white space or by the closing "?&gt;".
 *
 * @return Always PROCESSING_INSTRUCTION
 *
 * @throws XMLStreamException for read and well-formedness problems
 */
protected final int handlePIStart()
    throws XMLStreamException
{
    _currToken = PROCESSING_INSTRUCTION;

    // Ok, first, need a name
    if (_inputPtr >= _inputEnd) {
        loadMoreGuaranteed();
    }
    _tokenName = parsePName(_inputBuffer[_inputPtr++]);

    // but is it "xml" (case insensitive)?
    final String target = _tokenName.getLocalName();
    if (target.length() == 3 && target.equalsIgnoreCase("xml")
            && _tokenName.getPrefix() == null) {
        reportInputProblem(ErrorConsts.ERR_WF_PI_XML_TARGET);
    }

    // Let's then verify that we either get a space, or closing
    // '?>': this way we'll catch some problems right away, and also
    // simplify actual processing of contents.
    if (_inputPtr >= _inputEnd) {
        loadMoreGuaranteed();
    }
    char ch = _inputBuffer[_inputPtr++];

    if (ch > INT_SPACE) { // no white space: only "?>" will do
        if (ch != INT_QMARK) {
            reportMissingPISpace(ch);
        }
        if (_inputPtr >= _inputEnd) {
            loadMoreGuaranteed();
        }
        ch = _inputBuffer[_inputPtr++];
        if (ch != '>') {
            reportMissingPISpace(ch);
        }
        _textBuilder.resetWithEmpty();
        _tokenIncomplete = false;
        return PROCESSING_INSTRUCTION;
    }

    // Ok, let's skip the white space...
    for (;;) {
        if (ch == '\n') {
            markLF();
        } else if (ch == '\r') {
            if (_inputPtr >= _inputEnd) {
                loadMoreGuaranteed();
            }
            if (_inputBuffer[_inputPtr] == '\n') {
                ++_inputPtr;
            }
            markLF();
        } else if (ch != ' ' && ch != '\t') {
            throwInvalidSpace(ch);
        }
        if (_inputPtr >= _inputEnd) {
            loadMoreGuaranteed();
        }
        // Only peek: first non-space char is left in the buffer
        ch = _inputBuffer[_inputPtr];
        if (ch > 0x0020) {
            break;
        }
        ++_inputPtr;
    }

    if (_cfgLazyParsing) {
        _tokenIncomplete = true;
    } else {
        finishPI();
    }
    return PROCESSING_INSTRUCTION;
}
/**
 * Parses a numeric character entity, starting from the character
 * right after "&amp;#", up to and including the terminating semicolon.
 *
 * @return Code point for the entity that expands to a valid XML
 *   content character.
 *
 * @throws XMLStreamException for read problems, malformed numbers, and
 *   code points that are not valid XML characters
 */
protected final int handleCharEntity()
    throws XMLStreamException
{
    // Hex or decimal?
    if (_inputPtr >= _inputEnd) {
        loadMoreGuaranteed();
    }
    char ch = _inputBuffer[_inputPtr++];
    int code = 0;

    if (ch != 'x') { // numeric (decimal); 'ch' is the first digit
        while (ch != ';') {
            final boolean isDigit = (ch >= '0' && ch <= '9');
            if (isDigit) {
                code = (code * 10) + (ch - '0');
                if (code > MAX_UNICODE_CHAR) { // Overflow?
                    reportEntityOverflow();
                }
            } else {
                throwUnexpectedChar(ch, "; expected a decimal number");
            }
            if (_inputPtr >= _inputEnd) {
                loadMoreGuaranteed();
            }
            ch = _inputBuffer[_inputPtr++];
        }
    } else { // hex; digits follow the 'x'
        for (;;) {
            if (_inputPtr >= _inputEnd) {
                loadMoreGuaranteed();
            }
            ch = _inputBuffer[_inputPtr++];
            if (ch == ';') {
                break;
            }
            code <<= 4;
            if (ch >= '0' && ch <= '9') {
                code += (ch - '0');
            } else if (ch >= 'a' && ch <= 'f') {
                code += 10 + (ch - 'a');
            } else if (ch >= 'A' && ch <= 'F') {
                code += 10 + (ch - 'A');
            } else {
                throwUnexpectedChar(ch, "; expected a hex digit (0-9a-fA-F)");
            }
            if (code > MAX_UNICODE_CHAR) { // Overflow?
                reportEntityOverflow();
            }
        }
    }

    // Ok, and then need to check result is a valid XML content char:
    if (code < 32) {
        // XML 1.1 allows most other chars; 1.0 does not:
        final boolean basicWs = (code == INT_LF || code == INT_CR || code == INT_TAB);
        if (!basicWs && (!_xml11 || code == 0)) {
            reportInvalidXmlChar(code);
        }
    } else if (code >= 0xD800) { // note: checked for overflow earlier
        if (code < 0xE000) { // no surrogates via entity expansion
            reportInvalidXmlChar(code);
        }
        if (code == 0xFFFE || code == 0xFFFF) {
            reportInvalidXmlChar(code);
        }
    }
    return code;
}
/**
 * Parses a start tag: element name, attributes and namespace
 * declarations, up to and including the closing "&gt;" (or "/&gt;" of an
 * empty element). Updates current element scope, attribute and
 * namespace declaration counts, and nesting depth.
 *
 * @param c First character of the element name
 *
 * @return Always START_ELEMENT
 *
 * @throws XMLStreamException for read and well-formedness problems,
 *   including duplicate attributes and unbound namespace prefixes
 */
protected final int handleStartElement(char c)
    throws XMLStreamException
{
    _currToken = START_ELEMENT;
    _currNsCount = 0;
    PName elemName = parsePName(c);
    /* Ok. Need to create a qualified name. Simplest for element
     * in default ns (no extra work -- expressed as null binding);
     * otherwise need to find binding
     */
    String prefix = elemName.getPrefix();
    boolean allBound; // flag to check 'late' bindings
    if (prefix == null) { // element in default ns
        allBound = true; // which need not be bound
    } else {
        elemName = bindName(elemName, prefix);
        allBound = elemName.isBound();
    }
    _tokenName = elemName;
    _currElem = new ElementScope(elemName, _currElem);
    // And then attribute parsing loop:
    int attrPtr = 0;
    while (true) {
        if (_inputPtr >= _inputEnd) {
            loadMoreGuaranteed();
        }
        c = _inputBuffer[_inputPtr++];
        // Intervening space to skip?
        if (c <= INT_SPACE) {
            do {
                if (c == INT_LF) {
                    markLF();
                } else if (c == INT_CR) {
                    if (_inputPtr >= _inputEnd) {
                        loadMoreGuaranteed();
                    }
                    if (_inputBuffer[_inputPtr] == '\n') {
                        ++_inputPtr;
                    }
                    markLF();
                } else if (c != ' ' && c != '\t') {
                    throwInvalidSpace(c);
                }
                if (_inputPtr >= _inputEnd) {
                    loadMoreGuaranteed();
                }
                c = _inputBuffer[_inputPtr++];
            } while (c <= INT_SPACE);
        } else if (c != INT_SLASH && c != INT_GT) {
            throwUnexpectedChar(c, " expected space, or '>' or \"/>\"");
        }
        // Ok; either need to get an attribute name, or end marker:
        if (c == INT_SLASH) {
            if (_inputPtr >= _inputEnd) {
                loadMoreGuaranteed();
            }
            c = _inputBuffer[_inputPtr++];
            if (c != '>') {
                throwUnexpectedChar(c, " expected '>'");
            }
            _isEmptyTag = true;
            break;
        } else if (c == '>') {
            _isEmptyTag = false;
            break;
        } else if (c == '<') {
            reportInputProblem("Unexpected '<' character in element (missing closing '>'?)");
        }
        // Ok, an attr name:
        PName attrName = parsePName(c);
        prefix = attrName.getPrefix();
        boolean isNsDecl;
        /* Note: names are compared with equals() instead of identity
         * comparison, so that declarations are recognized whether or not
         * name Strings happen to be interned.
         */
        if (prefix == null) { // can be default ns decl:
            isNsDecl = "xmlns".equals(attrName.getLocalName());
        } else {
            // May be a namespace decl though?
            if ("xmlns".equals(prefix)) {
                isNsDecl = true;
            } else {
                attrName = bindName(attrName, prefix);
                if (allBound) {
                    allBound = attrName.isBound();
                }
                isNsDecl = false;
            }
        }
        // Optional space to skip again
        while (true) {
            if (_inputPtr >= _inputEnd) {
                loadMoreGuaranteed();
            }
            c = _inputBuffer[_inputPtr++];
            if (c > INT_SPACE) {
                break;
            }
            if (c == '\n') {
                markLF();
            } else if (c == '\r') {
                if (_inputPtr >= _inputEnd) {
                    loadMoreGuaranteed();
                }
                if (_inputBuffer[_inputPtr] == '\n') {
                    ++_inputPtr;
                }
                markLF();
            } else if (c != ' ' && c != '\t') {
                throwInvalidSpace(c);
            }
        }
        if (c != '=') {
            throwUnexpectedChar(c, " expected '='");
        }
        // Optional space to skip again
        while (true) {
            if (_inputPtr >= _inputEnd) {
                loadMoreGuaranteed();
            }
            c = _inputBuffer[_inputPtr++];
            if (c > INT_SPACE) {
                break;
            }
            if (c == '\n') {
                markLF();
            } else if (c == '\r') {
                if (_inputPtr >= _inputEnd) {
                    loadMoreGuaranteed();
                }
                if (_inputBuffer[_inputPtr] == '\n') {
                    ++_inputPtr;
                }
                markLF();
            } else if (c != ' ' && c != '\t') {
                throwInvalidSpace(c);
            }
        }
        if (c != '"' && c != '\'') {
            throwUnexpectedChar(c, " Expected a quote");
        }
        /* Ok, finally: value parsing. However, ns URIs are to be handled
         * different from attribute values... let's offline URIs, since
         * they should be less common than attribute values.
         */
        if (isNsDecl) { // default ns, or explicit?
            handleNsDeclaration(attrName, c);
            ++_currNsCount;
        } else { // nope, a 'real' attribute:
            attrPtr = collectValue(attrPtr, c, attrName);
        }
    }
    {
        // Note: this call also checks attribute uniqueness
        int act = _attrCollector.finishLastValue(attrPtr);
        if (act < 0) { // error, dup attr indicated by -1
            act = _attrCollector.getCount(); // let's get correct count
            reportInputProblem(_attrCollector.getErrorMsg());
        }
        _attrCount = act;
    }
    ++_depth;
    /* Was there any prefix that wasn't bound prior to use?
     * That's legal, assuming declaration was found later on...
     * let's check
     */
    if (!allBound) {
        if (!elemName.isBound()) { // element itself unbound
            reportUnboundPrefix(_tokenName, false);
        }
        for (int i = 0, len = _attrCount; i < len; ++i) {
            PName attrName = _attrCollector.getName(i);
            if (!attrName.isBound()) {
                reportUnboundPrefix(attrName, true);
            }
        }
    }
    return START_ELEMENT;
}
/**
* This method implements the tight loop for parsing attribute
* values. It's off-lined from the main start element method to
* simplify main method, which makes code more maintainable
* and possibly easier for JIT/HotSpot to optimize.
*<p>
* Value characters are appended to the buffer of the attribute collector;
* linefeeds and tabs are converted to spaces, and character entities are
* expanded, along the way.
*
* @param attrPtr Offset in the attribute value buffer at which to start
*   appending
* @param quoteChar Quote character that started the value, and that
*   terminates it
* @param attrName Name of the attribute the value belongs to
*
* @return Offset in the attribute value buffer after the last character
*   of the value
*/
private final int collectValue(int attrPtr, char quoteChar, PName attrName)
throws XMLStreamException
{
char[] attrBuffer = _attrCollector.startNewValue(attrName, attrPtr);
final int[] TYPES = sCharTypes.ATTR_CHARS;
value_loop:
while (true) {
char c;
// Tight loop that copies chars needing no special handling
ascii_loop:
while (true) {
int ptr = _inputPtr;
if (ptr >= _inputEnd) {
loadMoreGuaranteed();
ptr = _inputPtr;
}
if (attrPtr >= attrBuffer.length) {
attrBuffer = _attrCollector.valueBufferFull();
}
// Limit the run by both available input and remaining output space
int max = _inputEnd;
{
int max2 = ptr + (attrBuffer.length - attrPtr);
if (max2 < max) {
max = max2;
}
}
while (ptr < max) {
c = _inputBuffer[ptr++];
if (c <= 0xFF) {
// Type table only covers chars up to 0xFF; non-zero type
// means the char needs handling outside of the tight loop
if (TYPES[c] != 0) {
_inputPtr = ptr;
break ascii_loop;
}
} else if (c >= 0xD800) { // surrogates and 0xFFFE/0xFFFF
_inputPtr = ptr;
break ascii_loop;
}
attrBuffer[attrPtr++] = c;
}
_inputPtr = ptr;
}
// Here 'c' is a char that was read, but has not been appended yet
if (c <= 0xFF) {
switch (TYPES[c]) {
case XmlCharTypes.CT_INVALID:
c = handleInvalidXmlChar(c);
// NOTE(review): no break here, so control falls through to CR handling
// if handleInvalidXmlChar() returns normally -- confirm this is intended
case XmlCharTypes.CT_WS_CR:
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
// CR+LF counts as a single linefeed
if (_inputBuffer[_inputPtr] == '\n') {
++_inputPtr;
}
// fall through
case XmlCharTypes.CT_WS_LF:
markLF();
// fall through
case XmlCharTypes.CT_WS_TAB:
// Plus, need to convert these all to simple space
c = ' ';
break;
case XmlCharTypes.CT_LT:
throwUnexpectedChar(c, "'<' not allowed in attribute value");
// NOTE(review): no break; presumably throwUnexpectedChar() never returns -- confirm
case XmlCharTypes.CT_AMP:
{
int d = handleEntityInText(false);
if (d == 0) { // unexpanded general entity... not good
reportUnexpandedEntityInAttr(attrName, false);
}
// Ok; does it need a surrogate though? (over 16 bits)
if ((d >> 16) != 0) {
// First surrogate is appended here, second one below as 'c'
d -= 0x10000;
attrBuffer[attrPtr++] = (char) (0xD800 | (d >> 10));
d = 0xDC00 | (d & 0x3FF);
if (attrPtr >= attrBuffer.length) {
attrBuffer = _attrCollector.valueBufferFull();
}
}
c = (char) d;
}
break;
case XmlCharTypes.CT_ATTR_QUOTE:
// Only the quote char that started the value ends it; the other
// kind of quote is appended as regular content
if (c == quoteChar) {
break value_loop;
}
// default:
// Other chars are not important here...
}
} else if (c >= 0xD800) {
if (c < 0xE000) {
// if ok, returns second surrogate; otherwise exception
char d = checkSurrogate(c);
attrBuffer[attrPtr++] = c;
// Need to ensure room for one more
if (attrPtr >= attrBuffer.length) {
attrBuffer = _attrCollector.valueBufferFull();
}
c = d;
} else if (c >= 0xFFFE) {
c = handleInvalidXmlChar(c);
}
}
// We know there's room for at least one more char
attrBuffer[attrPtr++] = c;
}
return attrPtr;
}
/**
 * Method called from the main START_ELEMENT handling loop, to
 * parse namespace URI values: collects the quoted value (expanding
 * character entities and normalizing linefeeds), and binds the result
 * to the prefix indicated by the name.
 *
 * @param name Name of the namespace declaration attribute
 * @param quoteChar Quote character that started the value, and that
 *   terminates it
 *
 * @throws XMLStreamException for read and well-formedness problems
 */
private void handleNsDeclaration(PName name, char quoteChar)
    throws XMLStreamException
{
    char[] uriBuf = _nameBuffer;
    int uriLen = 0;

    for (;;) {
        if (_inputPtr >= _inputEnd) {
            loadMoreGuaranteed();
        }
        char ch = _inputBuffer[_inputPtr++];
        if (ch == quoteChar) { // end of value
            break;
        }
        switch (ch) {
        case '&': // entity
            {
                int code = handleEntityInText(false);
                if (code == 0) { // general entity; should never happen
                    reportUnexpandedEntityInAttr(name, true);
                }
                // Ok; does it need a surrogate though? (over 16 bits)
                if ((code >> 16) != 0) {
                    if (uriLen >= uriBuf.length) {
                        _nameBuffer = uriBuf = DataUtil.growArrayBy(uriBuf, uriBuf.length);
                    }
                    code -= 0x10000;
                    uriBuf[uriLen++] = (char) (0xD800 | (code >> 10));
                    code = 0xDC00 | (code & 0x3FF);
                }
                ch = (char) code;
            }
            break;
        case '<': // error
            throwUnexpectedChar(ch, "'<' not allowed in attribute value");
            break;
        case '\n':
            markLF();
            break;
        case '\r':
            if (_inputPtr >= _inputEnd) {
                loadMoreGuaranteed();
            }
            if (_inputBuffer[_inputPtr] == '\n') {
                ++_inputPtr;
            }
            markLF();
            ch = '\n';
            break;
        case '\t':
            break;
        default:
            // Other control characters are not acceptable
            if (ch < INT_SPACE) {
                throwInvalidSpace(ch);
            }
            break;
        }
        if (uriLen >= uriBuf.length) {
            _nameBuffer = uriBuf = DataUtil.growArrayBy(uriBuf, uriBuf.length);
        }
        uriBuf[uriLen++] = ch;
    }

    // Simple optimization: for default ns removal (or, with
    // ns 1.1, any other as well), will use empty value... no
    // need to try to intern:
    final String uri = (uriLen == 0) ? "" : _config.canonicalizeURI(uriBuf, uriLen);
    bindNs(name, uri);
}
/**
 * Method called after "&lt;/" has been seen: verifies that the name in
 * the end tag matches that of the currently open element, and that the
 * tag is properly closed.
 *
 * @return Always END_ELEMENT
 *
 * @throws XMLStreamException for read problems, and if the end tag does
 *   not match the open element or is malformed
 */
protected final int handleEndElement()
    throws XMLStreamException
{
    --_depth;
    _currToken = END_ELEMENT;
    // Ok, at this point we have seen '/', need the name
    _tokenName = _currElem.getName();

    final String expName = _tokenName.getPrefixedName();
    final int expLen = expName.length();
    int ix = 0;
    char ch;
    // Name has to match that of the open element, char by char
    do {
        if (_inputPtr >= _inputEnd) {
            loadMoreGuaranteed();
        }
        ch = _inputBuffer[_inputPtr++];
        if (ch != expName.charAt(ix)) {
            reportUnexpectedEndTag(expName);
        }
        ++ix;
    } while (ix < expLen);

    // Can still have a problem, if name didn't end there...
    if (_inputPtr >= _inputEnd) {
        loadMoreGuaranteed();
    }
    ch = _inputBuffer[_inputPtr++];
    if (ch != '>') {
        if (ch <= ' ') {
            ch = skipInternalWs(false, null);
        } else if (ch == ':' || XmlChars.is10NameChar(ch)) {
            // name continues: can not match
            reportUnexpectedEndTag(expName);
        }
        if (ch != '>') {
            throwUnexpectedChar(ch, " expected space or closing '>'");
        }
    }
    return END_ELEMENT;
}
/**
 * Method called after an ampersand has been seen, to handle the entity
 * reference: character entities and the five pre-defined entities
 * (amp, apos, lt, gt, quot) are expanded; for any other (general)
 * entity, name is parsed and stored as the current token name.
 *
 * @param inAttr Whether the reference is within an attribute value, in
 *   which case general entities are reported as a problem
 *
 * @return Code point of the expanded character; or 0 to indicate
 *   an unexpanded general entity
 *
 * @throws XMLStreamException for read and well-formedness problems
 */
protected final int handleEntityInText(boolean inAttr)
    throws XMLStreamException
{
    if (_inputPtr >= _inputEnd) {
        loadMoreGuaranteed();
    }
    char c = _inputBuffer[_inputPtr++];
    if (c == '#') {
        return handleCharEntity();
    }
    // Part of the name matched so far (when it looked like a pre-defined entity)
    String start;
    if (c == 'a') { // amp or apos?
        if (_inputPtr >= _inputEnd) {
            loadMoreGuaranteed();
        }
        c = _inputBuffer[_inputPtr++];
        if (c == 'm') { // amp?
            if (_inputPtr >= _inputEnd) {
                loadMoreGuaranteed();
            }
            c = _inputBuffer[_inputPtr++];
            if (c == 'p') {
                if (_inputPtr >= _inputEnd) {
                    loadMoreGuaranteed();
                }
                c = _inputBuffer[_inputPtr++];
                if (c == ';') {
                    return INT_AMP;
                }
                start = "amp";
            } else {
                start = "am";
            }
        } else if (c == 'p') { // apos?
            if (_inputPtr >= _inputEnd) {
                loadMoreGuaranteed();
            }
            c = _inputBuffer[_inputPtr++];
            if (c == 'o') {
                if (_inputPtr >= _inputEnd) {
                    loadMoreGuaranteed();
                }
                c = _inputBuffer[_inputPtr++];
                if (c == 's') {
                    if (_inputPtr >= _inputEnd) {
                        loadMoreGuaranteed();
                    }
                    c = _inputBuffer[_inputPtr++];
                    if (c == ';') {
                        return INT_APOS;
                    }
                    start = "apos";
                } else {
                    start = "apo";
                }
            } else {
                start = "ap";
            }
        } else {
            start = "a";
        }
    } else if (c == 'l') { // lt?
        if (_inputPtr >= _inputEnd) {
            loadMoreGuaranteed();
        }
        c = _inputBuffer[_inputPtr++];
        if (c == 't') {
            if (_inputPtr >= _inputEnd) {
                loadMoreGuaranteed();
            }
            c = _inputBuffer[_inputPtr++];
            if (c == ';') {
                return INT_LT;
            }
            start = "lt";
        } else {
            start = "l";
        }
    } else if (c == 'g') { // gt?
        if (_inputPtr >= _inputEnd) {
            loadMoreGuaranteed();
        }
        c = _inputBuffer[_inputPtr++];
        if (c == 't') {
            if (_inputPtr >= _inputEnd) {
                loadMoreGuaranteed();
            }
            c = _inputBuffer[_inputPtr++];
            if (c == ';') {
                return INT_GT;
            }
            start = "gt";
        } else {
            start = "g";
        }
    } else if (c == 'q') { // quot?
        if (_inputPtr >= _inputEnd) {
            loadMoreGuaranteed();
        }
        c = _inputBuffer[_inputPtr++];
        if (c == 'u') {
            if (_inputPtr >= _inputEnd) {
                loadMoreGuaranteed();
            }
            c = _inputBuffer[_inputPtr++];
            if (c == 'o') {
                if (_inputPtr >= _inputEnd) {
                    loadMoreGuaranteed();
                }
                c = _inputBuffer[_inputPtr++];
                if (c == 't') {
                    if (_inputPtr >= _inputEnd) {
                        loadMoreGuaranteed();
                    }
                    c = _inputBuffer[_inputPtr++];
                    if (c == ';') {
                        return INT_QUOTE;
                    }
                    start = "quot";
                } else {
                    start = "quo";
                }
            } else {
                start = "qu";
            }
        } else {
            start = "q";
        }
    } else {
        start = "";
    }
    final int[] TYPES = sCharTypes.NAME_CHARS;
    /* All righty: we have the beginning of the name, plus the first
     * char too. So let's see what we can do with it.
     */
    char[] cbuf = _nameBuffer;
    int cix = 0;
    for (int len = start.length(); cix < len; ++cix) {
        cbuf[cix] = start.charAt(cix);
    }
    while (c != ';') {
        boolean ok;
        // Has to be a valid name start char though:
        if (c <= 0xFF) {
            switch (TYPES[c]) {
            case XmlCharTypes.CT_NAME_NONE:
            case XmlCharTypes.CT_NAME_COLON: // not ok for entities?
            case XmlCharTypes.CT_NAME_NONFIRST:
                ok = (cix > 0);
                break;
            case XmlCharTypes.CT_NAME_ANY:
                ok = true;
                break;
            default:
                ok = false;
                break;
            }
        } else if (c >= 0xD800 && c < 0xE000) {
            /* Only actual surrogates are to be decoded as such: other
             * chars above 0xFF must not be passed to the decoder, as it
             * reads the following char as the second half of a pair.
             */
            // Need to know this before the first surrogate gets appended
            final boolean isFirst = (cix == 0);
            // if ok, returns the decoded code point; otherwise exception
            int value = decodeSurrogate(c);
            if (cix >= cbuf.length) {
                _nameBuffer = cbuf = DataUtil.growArrayBy(cbuf, cbuf.length);
            }
            cbuf[cix++] = c;
            c = _inputBuffer[_inputPtr-1]; // was read by decode func
            ok = isFirst ? XmlChars.is10NameStartChar(value)
                    : XmlChars.is10NameChar(value);
        } else if (c >= 0xFFFE) {
            c = handleInvalidXmlChar(c);
            ok = false; // never gets here
        } else {
            // Other chars above 0xFF are accepted as is
            ok = true;
        }
        if (!ok) {
            reportInvalidNameChar(c, cix);
        }
        if (cix >= cbuf.length) {
            _nameBuffer = cbuf = DataUtil.growArrayBy(cbuf, cbuf.length);
        }
        cbuf[cix++] = c;
        if (_inputPtr >= _inputEnd) {
            loadMoreGuaranteed();
        }
        c = _inputBuffer[_inputPtr++];
    }
    // Ok, let's construct a (temporary) entity name, then:
    String pname = new String(cbuf, 0, cix);
    // (note: hash is dummy... not to be compared to anything etc)
    _tokenName = new PNameC(pname, null, pname, 0);
    /* One more thing: do we actually allow entities in this mode
     * and with this event?
     */
    if (_config.willExpandEntities()) {
        reportInputProblem("General entity reference (&"+pname+";) encountered in entity expanding mode: operation not (yet) implemented");
    }
    if (inAttr) {
        reportInputProblem("General entity reference (&"+pname+";) encountered in attribute value, in non-entity-expanding mode: no way to handle it");
    }
    return 0;
}
/**
* Reads the rest of a comment (everything after the opening marker) into
* the text builder, up to and including the closing marker. Linefeeds
* are normalized to '\n' along the way.
*/
@Override
protected final void finishComment() throws XMLStreamException
{
final int[] TYPES = sCharTypes.OTHER_CHARS;
final char[] inputBuffer = _inputBuffer;
char[] outputBuffer = _textBuilder.resetWithEmpty();
int outPtr = 0;
main_loop:
while (true) {
char c;
// Then the tight ascii non-funny-char loop:
ascii_loop:
while (true) {
int ptr = _inputPtr;
if (ptr >= _inputEnd) {
loadMoreGuaranteed();
ptr = _inputPtr;
}
if (outPtr >= outputBuffer.length) {
outputBuffer = _textBuilder.finishCurrentSegment();
outPtr = 0;
}
// Limit the run by both available input and remaining output space
int max = _inputEnd;
{
int max2 = ptr + (outputBuffer.length - outPtr);
if (max2 < max) {
max = max2;
}
}
while (ptr < max) {
c = inputBuffer[ptr++];
if (c <= 0xFF) {
// Type table only covers chars up to 0xFF; non-zero type
// means the char needs handling outside of the tight loop
if (TYPES[c] != 0) {
_inputPtr = ptr;
break ascii_loop;
}
} else if (c >= 0xD800) { // surrogates and 0xFFFE/0xFFFF
_inputPtr = ptr;
break ascii_loop;
}
outputBuffer[outPtr++] = c;
}
_inputPtr = ptr;
}
// Here 'c' is a char that was read, but has not been appended yet
if (c <= 0xFF) {
switch (TYPES[c]) {
case XmlCharTypes.CT_INVALID:
c = handleInvalidXmlChar(c);
// NOTE(review): no break here, so control falls through to CR handling
// if handleInvalidXmlChar() returns normally -- confirm this is intended
case XmlCharTypes.CT_WS_CR:
{
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
// CR+LF counts as a single linefeed
if (inputBuffer[_inputPtr] == '\n') {
++_inputPtr;
}
markLF();
}
c = '\n';
break;
case XmlCharTypes.CT_WS_LF:
markLF();
break;
case XmlCharTypes.CT_HYPHEN: // '-->'?
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
if (_inputBuffer[_inputPtr] == '-') { // ok, must be end then
++_inputPtr;
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
// Double hyphen is only allowed as part of the closing marker
if (_inputBuffer[_inputPtr++] != '>') {
reportDoubleHyphenInComments();
}
break main_loop;
}
// Single hyphen: regular content, appended below
break;
// default:
// Other types are not important here..
}
} else if (c >= 0xD800) { // high-range, surrogates etc
if (c < 0xE000) {
// if ok, returns second surrogate; otherwise exception
char d = checkSurrogate(c);
outputBuffer[outPtr++] = c;
if (outPtr >= outputBuffer.length) {
outputBuffer = _textBuilder.finishCurrentSegment();
outPtr = 0;
}
c = d;
} else if (c >= 0xFFFE) {
c = handleInvalidXmlChar(c);
}
}
// We know there's room for one more:
outputBuffer[outPtr++] = c;
}
_textBuilder.setCurrentLength(outPtr);
}
/**
 * Reads the data of a processing instruction into the text builder, up
 * to and including the closing "?&gt;" marker. Linefeeds are normalized
 * to '\n' along the way, and characters that the type table marks as
 * invalid are passed to the invalid character handler, same as is done
 * for comments.
 *
 * @throws XMLStreamException for read and well-formedness problems
 */
@Override
protected final void finishPI() throws XMLStreamException
{
    final int[] TYPES = sCharTypes.OTHER_CHARS;
    final char[] inputBuffer = _inputBuffer;
    char[] outputBuffer = _textBuilder.resetWithEmpty();
    int outPtr = 0;
    main_loop:
    while (true) {
        char c;
        // Then the tight ascii non-funny-char loop:
        ascii_loop:
        while (true) {
            int ptr = _inputPtr;
            if (ptr >= _inputEnd) {
                loadMoreGuaranteed();
                ptr = _inputPtr;
            }
            if (outPtr >= outputBuffer.length) {
                outputBuffer = _textBuilder.finishCurrentSegment();
                outPtr = 0;
            }
            // Limit the run by both available input and remaining output space
            int max = _inputEnd;
            {
                int max2 = ptr + (outputBuffer.length - outPtr);
                if (max2 < max) {
                    max = max2;
                }
            }
            while (ptr < max) {
                c = inputBuffer[ptr++];
                if (c <= 0xFF) {
                    if (TYPES[c] != 0) {
                        _inputPtr = ptr;
                        break ascii_loop;
                    }
                } else if (c >= 0xD800) { // surrogates and 0xFFFE/0xFFFF
                    _inputPtr = ptr;
                    break ascii_loop;
                }
                outputBuffer[outPtr++] = c;
            }
            _inputPtr = ptr;
        }
        // Here 'c' is a char that was read, but has not been appended yet
        if (c <= 0xFF) {
            switch (TYPES[c]) {
            case XmlCharTypes.CT_INVALID:
                // Must not let invalid chars through silently; whatever
                // the handler returns (if it returns) is appended as is
                c = handleInvalidXmlChar(c);
                break;
            case XmlCharTypes.CT_WS_CR:
                {
                    if (_inputPtr >= _inputEnd) {
                        loadMoreGuaranteed();
                    }
                    // CR+LF counts as a single linefeed
                    if (inputBuffer[_inputPtr] == CHAR_LF) {
                        ++_inputPtr;
                    }
                    markLF();
                    c = '\n';
                }
                break;
            case XmlCharTypes.CT_WS_LF:
                markLF();
                break;
            case XmlCharTypes.CT_QMARK: // '?>'?
                if (_inputPtr >= _inputEnd) {
                    loadMoreGuaranteed();
                }
                if (_inputBuffer[_inputPtr] == '>') {
                    ++_inputPtr;
                    break main_loop;
                }
                // Lone question mark: regular content, appended below
                break;
            // default:
            // Other types are not important here...
            }
        } else if (c >= 0xD800) { // high-range, surrogates etc
            if (c < 0xE000) {
                // if ok, returns second surrogate; otherwise exception
                char d = checkSurrogate(c);
                outputBuffer[outPtr++] = c;
                if (outPtr >= outputBuffer.length) {
                    outputBuffer = _textBuilder.finishCurrentSegment();
                    outPtr = 0;
                }
                c = d;
            } else if (c >= 0xFFFE) {
                c = handleInvalidXmlChar(c);
            }
        }
        // We know there's room for one more:
        outputBuffer[outPtr++] = c;
    }
    _textBuilder.setCurrentLength(outPtr);
}
/**
 * Reads the remainder of the DOCTYPE internal subset, stopping at the
 * first ']' that is outside of both a declaration and a quoted string,
 * and then requires (after optional white space) the closing '&gt;'.
 *<p>
 * Line ends are normalized to a single '\n' in the copied contents.
 *
 * @param copyContents If true, the subset text (excluding the closing
 *   ']') is collected into the text builder; if false it is only scanned
 *
 * @throws XMLStreamException for invalid characters, invalid surrogates,
 *   or if something other than '&gt;' follows the internal subset
 */
@Override
protected final void finishDTD(boolean copyContents) throws XMLStreamException
{
    // A null output buffer means "scan only, do not collect"
    char[] outputBuffer = copyContents ?
        _textBuilder.resetWithEmpty() : null;
    int outPtr = 0;

    final int[] TYPES = sCharTypes.DTD_CHARS;
    boolean inDecl = false; // in declaration/directive?
    int quoteChar = 0; // inside quoted string?

    main_loop:
    while (true) {
        char c;

        /* First we'll have a quickie loop for speeding through
         * uneventful chars...
         */
        ascii_loop:
        while (true) {
            int ptr = _inputPtr;
            if (ptr >= _inputEnd) {
                loadMoreGuaranteed();
                ptr = _inputPtr;
            }
            int max = _inputEnd;
            if (outputBuffer != null) {
                if (outPtr >= outputBuffer.length) {
                    outputBuffer = _textBuilder.finishCurrentSegment();
                    outPtr = 0;
                }
                // Never read more than what fits in the output segment
                {
                    int max2 = ptr + (outputBuffer.length - outPtr);
                    if (max2 < max) {
                        max = max2;
                    }
                }
            }
            while (ptr < max) {
                c = _inputBuffer[ptr++];
                if (c <= 0xFF) {
                    if (TYPES[c] != 0) {
                        _inputPtr = ptr;
                        break ascii_loop;
                    }
                } else if (c >= 0xD800) { // surrogates and 0xFFFE/0xFFFF
                    _inputPtr = ptr;
                    break ascii_loop;
                }
                if (outputBuffer != null) {
                    outputBuffer[outPtr++] = c;
                }
            }
            _inputPtr = ptr;
        }

        if (c <= 0xFF) {
            switch (TYPES[c]) {
            case XmlCharTypes.CT_INVALID:
                c = handleInvalidXmlChar(c);
                // Must not fall through: the returned character is used
                // as-is, it is not a line break
                break;
            case XmlCharTypes.CT_WS_CR:
                {
                    if (_inputPtr >= _inputEnd) {
                        loadMoreGuaranteed();
                    }
                    // \r\n counts as a single line end
                    if (_inputBuffer[_inputPtr] == '\n') {
                        ++_inputPtr;
                    }
                    markLF();
                }
                c = '\n';
                break;
            case XmlCharTypes.CT_WS_LF:
                markLF();
                break;
            case XmlCharTypes.CT_DTD_QUOTE: // apos or quot
                if (quoteChar == 0) {
                    quoteChar = c;
                } else {
                    // Only the matching quote character closes the literal
                    if (quoteChar == c) {
                        quoteChar = 0;
                    }
                }
                break;
            case XmlCharTypes.CT_DTD_LT:
                if (!inDecl) {
                    inDecl = true;
                }
                break;
            case XmlCharTypes.CT_DTD_GT:
                if (quoteChar == 0) {
                    inDecl = false;
                }
                break;
            case XmlCharTypes.CT_DTD_RBRACKET:
                // End of subset only if not nested in declaration or literal
                if (!inDecl && quoteChar == 0) {
                    break main_loop;
                }
                break;
            // default:
                // Other types are not important here...
            }
        } else if (c >= 0xD800) { // high-range, surrogates etc
            if (c < 0xE000) {
                // if ok, returns second surrogate; otherwise exception
                char d = checkSurrogate(c);
                if (outputBuffer != null) {
                    outputBuffer[outPtr++] = c;
                    if (outPtr >= outputBuffer.length) {
                        outputBuffer = _textBuilder.finishCurrentSegment();
                        outPtr = 0;
                    }
                }
                c = d;
            } else if (c >= 0xFFFE) {
                c = handleInvalidXmlChar(c);
            }
        }
        if (outputBuffer != null) { // has room for one more
            outputBuffer[outPtr++] = c;
        }
    }
    if (outputBuffer != null) {
        _textBuilder.setCurrentLength(outPtr);
    }

    // but still need to match the '>'...
    char c = skipInternalWs(false, null);
    if (c != '>') {
        throwUnexpectedChar(c, " expected '>' after the internal subset");
    }
}
/**
 * Reads the contents of a CDATA section into the text builder, up to
 * and including the closing "]]&gt;" marker (which is consumed but not
 * included in the text). Line ends are normalized to '\n'.
 *<p>
 * If coalescing is enabled and no entity is pending, any directly
 * following text segments are appended as well.
 *
 * @throws XMLStreamException for invalid characters or surrogates
 */
@Override
protected final void finishCData() throws XMLStreamException
{
    final int[] TYPES = sCharTypes.OTHER_CHARS;
    final char[] inputBuffer = _inputBuffer;
    char[] outputBuffer = _textBuilder.resetWithEmpty();
    int outPtr = 0;

    /* At this point, space (if any) has been skipped, and we are
     * to parse and store the contents
     */
    main_loop:
    while (true) {
        char c;
        // Then the tight ascii non-funny-char loop:
        ascii_loop:
        while (true) {
            int ptr = _inputPtr;
            if (ptr >= _inputEnd) {
                loadMoreGuaranteed();
                ptr = _inputPtr;
            }
            if (outPtr >= outputBuffer.length) {
                outputBuffer = _textBuilder.finishCurrentSegment();
                outPtr = 0;
            }
            // Never read more than what fits in the output segment
            int max = _inputEnd;
            {
                int max2 = ptr + (outputBuffer.length - outPtr);
                if (max2 < max) {
                    max = max2;
                }
            }
            while (ptr < max) {
                c = inputBuffer[ptr++];
                if (c <= 0xFF) {
                    if (TYPES[c] != 0) {
                        _inputPtr = ptr;
                        break ascii_loop;
                    }
                } else if (c >= 0xD800) { // surrogates and 0xFFFE/0xFFFF
                    _inputPtr = ptr;
                    break ascii_loop;
                }
                outputBuffer[outPtr++] = c;
            }
            _inputPtr = ptr;
        }
        // And then exceptions:
        if (c <= 0xFF) {
            switch (TYPES[c]) {
            case XmlCharTypes.CT_INVALID:
                c = handleInvalidXmlChar(c);
                // Must not fall through: the returned character is used
                // as-is, it is not a line break
                break;
            case XmlCharTypes.CT_WS_CR:
                {
                    if (_inputPtr >= _inputEnd) {
                        loadMoreGuaranteed();
                    }
                    // \r\n counts as a single line end
                    if (inputBuffer[_inputPtr] == '\n') {
                        ++_inputPtr;
                    }
                    markLF();
                }
                c = '\n';
                break;
            case XmlCharTypes.CT_WS_LF:
                markLF();
                break;
            case XmlCharTypes.CT_RBRACKET: // close ']]>' marker?
                {
                    /* Ok: let's just parse all consecutive right brackets,
                     * and see if followed by greater-than char. This because
                     * we can only push back at most one char at a time, and
                     * thus can't easily just check a subset
                     */
                    int count = 0; // ignore first bracket
                    char d;

                    do {
                        if (_inputPtr >= _inputEnd) {
                            loadMoreGuaranteed();
                        }
                        d = _inputBuffer[_inputPtr];
                        if (d != ']') {
                            break;
                        }
                        ++_inputPtr;
                        ++count;
                    } while (true);

                    // Was the marker found?
                    boolean ok = (d == '>' && count >= 1);
                    if (ok) {
                        // last bracket counted belongs to the marker
                        --count;
                    }
                    // Brackets to copy to output?
                    for (; count > 0; --count) {
                        outputBuffer[outPtr++] = ']';
                        if (outPtr >= outputBuffer.length) {
                            outputBuffer = _textBuilder.finishCurrentSegment();
                            outPtr = 0;
                        }
                    }
                    if (ok) {
                        ++_inputPtr; // to consume '>'
                        break main_loop;
                    }
                }
                // Not the end marker: first ']' is output below as 'c'
                break;
            // default:
                // Other types are not important here...
            }
        } else if (c >= 0xD800) { // high-range, surrogates etc
            if (c < 0xE000) {
                // if ok, returns second surrogate; otherwise exception
                char d = checkSurrogate(c);
                outputBuffer[outPtr++] = c;
                if (outPtr >= outputBuffer.length) {
                    outputBuffer = _textBuilder.finishCurrentSegment();
                    outPtr = 0;
                }
                c = d;
            } else if (c >= 0xFFFE) {
                c = handleInvalidXmlChar(c);
            }
        }
        // Ok, can output the char; there's room for one char at least
        outputBuffer[outPtr++] = c;
    }
    _textBuilder.setCurrentLength(outPtr);
    /* 03-Feb-2009, tatu: To support coalescing mode, may need to
     *   do some extra work
     */
    if (_cfgCoalescing && !_entityPending) {
        finishCoalescedText();
    }
}
/**
 * Reads a character data (text) segment into the text builder. The
 * first character (or expanded character entity) is taken from
 * {@code mTmpChar}; reading then continues until a '&lt;' (left
 * unconsumed) or an unexpandable general entity is reached, in which
 * case {@code _entityPending} is set.
 *<p>
 * Line ends are normalized to '\n'; a "]]&gt;" sequence in text is
 * reported as an error. If coalescing is enabled and no entity is
 * pending, directly following text segments are appended as well.
 *
 * @throws XMLStreamException for invalid characters, invalid surrogates
 *   or an illegal "]]&gt;" sequence
 */
@Override
protected final void finishCharacters() throws XMLStreamException
{
    int outPtr;
    char[] outputBuffer;

    // Ok, so what was the first char / entity?
    {
        int c = mTmpChar;
        if (c < 0) { // from entity; can just copy as is
            c = -c;
            outputBuffer = _textBuilder.resetWithEmpty();
            outPtr = 0;
            if ((c >> 16) != 0) { // surrogate pair?
                c -= 0x10000;
                /* Note: after resetting the buffer, it's known to have
                 * space for more than 2 chars we need to add
                 */
                outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10));
                c = 0xDC00 | (c & 0x3FF);
            }
            outputBuffer[outPtr++] = (char) c;
        } else { // white space that we are interested in?
            if (c == INT_CR || c == INT_LF) {
                ++_inputPtr; // wasn't advanced yet, in this case
                outPtr = checkInTreeIndentation((char) c);
                if (outPtr < 0) {
                    // Segment was pure indentation; already fully handled
                    return;
                }
                // Above call also initializes the text builder appropriately
                outputBuffer = _textBuilder.getBufferWithoutReset();
            } else {
                outputBuffer = _textBuilder.resetWithEmpty();
                outPtr = 0;
            }
        }
    }

    final int[] TYPES = sCharTypes.TEXT_CHARS;
    final char[] inputBuffer = _inputBuffer;

    main_loop:
    while (true) {
        char c;
        // Then the tight ascii non-funny-char loop:
        ascii_loop:
        while (true) {
            int ptr = _inputPtr;
            if (ptr >= _inputEnd) {
                loadMoreGuaranteed();
                ptr = _inputPtr;
            }
            if (outPtr >= outputBuffer.length) {
                outputBuffer = _textBuilder.finishCurrentSegment();
                outPtr = 0;
            }
            // Never read more than what fits in the output segment
            int max = _inputEnd;
            {
                int max2 = ptr + (outputBuffer.length - outPtr);
                if (max2 < max) {
                    max = max2;
                }
            }
            while (ptr < max) {
                c = inputBuffer[ptr++];
                if (c <= 0xFF) {
                    if (TYPES[c] != 0) {
                        _inputPtr = ptr;
                        break ascii_loop;
                    }
                } else if (c >= 0xD800) { // surrogates and 0xFFFE/0xFFFF
                    _inputPtr = ptr;
                    break ascii_loop;
                }
                outputBuffer[outPtr++] = c;
            }
            _inputPtr = ptr;
        }
        // And then exceptions:
        if (c <= 0xFF) {
            switch (TYPES[c]) {
            case XmlCharTypes.CT_INVALID:
                c = handleInvalidXmlChar(c);
                // Must not fall through: the returned character is used
                // as-is, it is not a line break
                break;
            case XmlCharTypes.CT_WS_CR:
                {
                    int ptr = _inputPtr;
                    if (ptr >= _inputEnd) {
                        loadMoreGuaranteed();
                        ptr = _inputPtr;
                    }
                    // \r\n counts as a single line end
                    if (inputBuffer[ptr] == '\n') {
                        ++_inputPtr;
                    }
                    markLF();
                }
                c = '\n';
                break;
            case XmlCharTypes.CT_WS_LF:
                markLF();
                break;
            case XmlCharTypes.CT_LT:
                // Markup starts: leave the '<' for the next event
                --_inputPtr;
                break main_loop;
            case XmlCharTypes.CT_AMP:
                {
                    int d = handleEntityInText(false);
                    if (d == 0) { // unexpandable general parsed entity
                        // _inputPtr set by entity expansion method
                        _entityPending = true;
                        break main_loop;
                    }
                    // Ok; does it need a surrogate though? (over 16 bits)
                    if ((d >> 16) != 0) {
                        d -= 0x10000;
                        outputBuffer[outPtr++] = (char) (0xD800 | (d >> 10));
                        // Need to ensure room for one more char
                        if (outPtr >= outputBuffer.length) {
                            outputBuffer = _textBuilder.finishCurrentSegment();
                            outPtr = 0;
                        }
                        d = (0xDC00 | (d & 0x3FF));
                    }
                    c = (char) d;
                }
                break;
            case XmlCharTypes.CT_RBRACKET: // ']]>'?
                {
                    // Let's then just count number of brackets --
                    // in case they are not followed by '>'
                    int count = 1;
                    while (true) {
                        if (_inputPtr >= _inputEnd) {
                            loadMoreGuaranteed();
                        }
                        c = inputBuffer[_inputPtr];
                        if (c != ']') {
                            break;
                        }
                        ++_inputPtr; // to skip past bracket
                        ++count;
                    }
                    if (c == '>' && count > 1) {
                        reportIllegalCDataEnd();
                    }
                    // Nope. Need to output all brackets, then; except
                    // for one that can be left for normal output
                    while (count > 1) {
                        outputBuffer[outPtr++] = ']';
                        // Need to ensure room for one more char
                        if (outPtr >= outputBuffer.length) {
                            outputBuffer = _textBuilder.finishCurrentSegment();
                            outPtr = 0;
                        }
                        --count;
                    }
                }
                // Can just output the first ']' along normal output
                c = ']';
                break;
            // default:
                // Other types are not important here...
            }
        } else if (c >= 0xD800) { // high-range, surrogates etc
            if (c < 0xE000) {
                // if ok, returns second surrogate; otherwise exception
                char d = checkSurrogate(c);
                outputBuffer[outPtr++] = c;
                if (outPtr >= outputBuffer.length) {
                    outputBuffer = _textBuilder.finishCurrentSegment();
                    outPtr = 0;
                }
                c = d;
            } else if (c >= 0xFFFE) {
                c = handleInvalidXmlChar(c);
            }
        }
        outputBuffer[outPtr++] = c;
    }
    _textBuilder.setCurrentLength(outPtr);

    // 03-Feb-2009, tatu: Need to support coalescing mode too:
    if (_cfgCoalescing && !_entityPending) {
        finishCoalescedText();
    }
}
/**
 * Collects a run of white space into the text builder. The first white
 * space character has already been read and is found in
 * {@code mTmpChar}; if it is a line end, the indentation shortcut is
 * tried first. Line ends are normalized to '\n'.
 *
 * @throws XMLStreamException if a control character other than space,
 *   tab, CR or LF is encountered
 */
@Override
protected final void finishSpace() throws XMLStreamException
{
    final char first = (char) mTmpChar;
    char[] out;
    int outLen;

    if (first != '\r' && first != '\n') {
        // Plain space or tab: start a fresh segment with it
        out = _textBuilder.resetWithEmpty();
        out[0] = first;
        outLen = 1;
    } else {
        // Line end: may turn out to be shareable indentation
        outLen = checkPrologIndentation(first);
        if (outLen < 0) {
            return;
        }
        // The call above has initialized the text builder for us
        out = _textBuilder.getBufferWithoutReset();
    }

    int pos = _inputPtr;

    for (;;) {
        if (pos >= _inputEnd) {
            if (!loadMore()) {
                break;
            }
            pos = _inputPtr;
        }
        char ch = _inputBuffer[pos];
        if (ch > INT_SPACE) {
            // First non-space character ends the run; not consumed
            break;
        }
        ++pos;

        if (ch == INT_CR) {
            if (pos >= _inputEnd) {
                if (!loadMore()) {
                    // Input ended right after \r: still owe a line feed
                    if (outLen >= out.length) {
                        out = _textBuilder.finishCurrentSegment();
                        outLen = 0;
                    }
                    out[outLen++] = '\n';
                    break;
                }
                pos = _inputPtr;
            }
            if (_inputBuffer[pos] == '\n') {
                ++pos;
            }
            markLF(pos);
            ch = '\n'; // canonical line feed
        } else if (ch == INT_LF) {
            markLF(pos);
        } else if (ch != ' ' && ch != '\t') {
            _inputPtr = pos;
            throwInvalidSpace(ch);
        }

        // Append, starting a new segment if the current one is full
        if (outLen >= out.length) {
            out = _textBuilder.finishCurrentSegment();
            outLen = 0;
        }
        out[outLen++] = ch;
    }

    _inputPtr = pos;
    _textBuilder.setCurrentLength(outLen);
}
/*
/**********************************************************************
/* 2nd level parsing for coalesced text
/**********************************************************************
*/
/**
 * Called once a primary text segment (CHARACTERS or CDATA; never SPACE)
 * has been read into the text buffer. Keeps appending whatever textual
 * segments follow directly: CDATA sections and further character data.
 * Stops when input ends, when non-CDATA markup is seen, or when an
 * unexpandable entity is left pending.
 */
protected final void finishCoalescedText()
    throws XMLStreamException
{
    for (;;) {
        // One character is always needed to decide what follows
        if (_inputPtr >= _inputEnd && !loadMore()) {
            // most likely an error, but it gets reported later on
            return;
        }

        if (_inputBuffer[_inputPtr] != '<') {
            // textual (or entity, error etc)
            finishCoalescedCharacters();
            if (_entityPending) {
                return;
            }
            continue;
        }

        /* Markup of some kind. In worst case, need 3 chars ("<![")
         * all in all to know if we are getting a CDATA section
         */
        if ((_inputPtr + 3) >= _inputEnd && !loadAndRetain(3)) {
            // probably an error, but will be handled later
            return;
        }
        final boolean looksLikeCData = _inputBuffer[_inputPtr+1] == '!'
                && _inputBuffer[_inputPtr+2] == '[';
        if (!looksLikeCData) {
            // some other markup: nothing more to coalesce
            return;
        }

        // Still have to verify the rest of the marker
        _inputPtr += 3;
        for (int i = 0; i < 6; ++i) {
            if (_inputPtr >= _inputEnd) {
                loadMoreGuaranteed();
            }
            final char actual = _inputBuffer[_inputPtr++];
            final char expected = CDATA_STR.charAt(i);
            if (actual != expected) {
                reportTreeUnexpChar(actual, " (expected '"+expected+"' for CDATA section)");
            }
        }
        finishCoalescedCData();
    }
}
/**
 * Appends the contents of a CDATA section to the text already held by
 * the text builder (coalescing mode), consuming the closing "]]&gt;"
 * marker. Line ends are normalized to '\n'.
 *<p>
 * Note: code mostly copied from 'finishCData'; the difference is that
 * the text builder is continued rather than reset, and no further
 * coalescing is triggered from here.
 *
 * @throws XMLStreamException for invalid characters or surrogates
 */
protected final void finishCoalescedCData()
    throws XMLStreamException
{
    final int[] TYPES = sCharTypes.OTHER_CHARS;
    final char[] inputBuffer = _inputBuffer;

    // Continue where the previous segment left off
    char[] outputBuffer = _textBuilder.getBufferWithoutReset();
    int outPtr = _textBuilder.getCurrentLength();

    /* At this point, space (if any) has been skipped, and we are
     * to parse and store the contents
     */
    main_loop:
    while (true) {
        char c;
        // Then the tight ascii non-funny-char loop:
        ascii_loop:
        while (true) {
            int ptr = _inputPtr;
            if (ptr >= _inputEnd) {
                loadMoreGuaranteed();
                ptr = _inputPtr;
            }
            if (outPtr >= outputBuffer.length) {
                outputBuffer = _textBuilder.finishCurrentSegment();
                outPtr = 0;
            }
            // Never read more than what fits in the output segment
            int max = _inputEnd;
            {
                int max2 = ptr + (outputBuffer.length - outPtr);
                if (max2 < max) {
                    max = max2;
                }
            }
            while (ptr < max) {
                c = inputBuffer[ptr++];
                if (c <= 0xFF) {
                    if (TYPES[c] != 0) {
                        _inputPtr = ptr;
                        break ascii_loop;
                    }
                } else if (c >= 0xD800) { // surrogates and 0xFFFE/0xFFFF
                    _inputPtr = ptr;
                    break ascii_loop;
                }
                outputBuffer[outPtr++] = c;
            }
            _inputPtr = ptr;
        }
        // And then exceptions:
        if (c <= 0xFF) {
            switch (TYPES[c]) {
            case XmlCharTypes.CT_INVALID:
                c = handleInvalidXmlChar(c);
                // Must not fall through: the returned character is used
                // as-is, it is not a line break
                break;
            case XmlCharTypes.CT_WS_CR:
                {
                    if (_inputPtr >= _inputEnd) {
                        loadMoreGuaranteed();
                    }
                    // \r\n counts as a single line end
                    if (inputBuffer[_inputPtr] == '\n') {
                        ++_inputPtr;
                    }
                    markLF();
                }
                c = '\n';
                break;
            case XmlCharTypes.CT_WS_LF:
                markLF();
                break;
            case XmlCharTypes.CT_RBRACKET: // close ']]>' marker?
                {
                    /* Ok: let's just parse all consecutive right brackets,
                     * and see if followed by greater-than char. This because
                     * we can only push back at most one char at a time, and
                     * thus can't easily just check a subset
                     */
                    int count = 0; // ignore first bracket
                    char d;

                    do {
                        if (_inputPtr >= _inputEnd) {
                            loadMoreGuaranteed();
                        }
                        d = _inputBuffer[_inputPtr];
                        if (d != ']') {
                            break;
                        }
                        ++_inputPtr;
                        ++count;
                    } while (true);

                    // Was the marker found?
                    boolean ok = (d == '>' && count >= 1);
                    if (ok) {
                        // last bracket counted belongs to the marker
                        --count;
                    }
                    // Brackets to copy to output?
                    for (; count > 0; --count) {
                        outputBuffer[outPtr++] = ']';
                        if (outPtr >= outputBuffer.length) {
                            outputBuffer = _textBuilder.finishCurrentSegment();
                            outPtr = 0;
                        }
                    }
                    if (ok) {
                        ++_inputPtr; // to consume '>'
                        break main_loop;
                    }
                }
                // Not the end marker: first ']' is output below as 'c'
                break;
            // default:
                // Other types are not important here...
            }
        } else if (c >= 0xD800) { // high-range, surrogates etc
            if (c < 0xE000) {
                // if ok, returns second surrogate; otherwise exception
                char d = checkSurrogate(c);
                outputBuffer[outPtr++] = c;
                if (outPtr >= outputBuffer.length) {
                    outputBuffer = _textBuilder.finishCurrentSegment();
                    outPtr = 0;
                }
                c = d;
            } else if (c >= 0xFFFE) {
                c = handleInvalidXmlChar(c);
            }
        }
        // Ok, can output the char; there's room for one char at least
        outputBuffer[outPtr++] = c;
    }
    _textBuilder.setCurrentLength(outPtr);
}
/**
 * Appends a character data segment to the text already held by the
 * text builder (coalescing mode). Reading stops at a '&lt;' (left
 * unconsumed) or at an unexpandable general entity, in which case
 * {@code _entityPending} is set.
 *<p>
 * Note: code mostly copied from 'finishCharacters', just simplified
 * in some places: the first char can't be from a (char) entity here,
 * and the text builder is continued rather than reset.
 *
 * @throws XMLStreamException for invalid characters, invalid surrogates
 *   or an illegal "]]&gt;" sequence
 */
protected final void finishCoalescedCharacters()
    throws XMLStreamException
{
    final int[] TYPES = sCharTypes.TEXT_CHARS;
    final char[] inputBuffer = _inputBuffer;

    // Continue where the previous segment left off
    char[] outputBuffer = _textBuilder.getBufferWithoutReset();
    int outPtr = _textBuilder.getCurrentLength();

    main_loop:
    while (true) {
        char c;
        ascii_loop:
        while (true) { // tight loop for ascii chars
            int ptr = _inputPtr;
            if (ptr >= _inputEnd) {
                loadMoreGuaranteed();
                ptr = _inputPtr;
            }
            if (outPtr >= outputBuffer.length) {
                outputBuffer = _textBuilder.finishCurrentSegment();
                outPtr = 0;
            }
            // Never read more than what fits in the output segment
            int max = _inputEnd;
            {
                int max2 = ptr + (outputBuffer.length - outPtr);
                if (max2 < max) {
                    max = max2;
                }
            }
            while (ptr < max) {
                c = inputBuffer[ptr++];
                if (c <= 0xFF) {
                    if (TYPES[c] != 0) {
                        _inputPtr = ptr;
                        break ascii_loop;
                    }
                } else if (c >= 0xD800) { // surrogates and 0xFFFE/0xFFFF
                    _inputPtr = ptr;
                    break ascii_loop;
                }
                outputBuffer[outPtr++] = c;
            }
            _inputPtr = ptr;
        }
        // And then exceptions:
        if (c <= 0xFF) {
            switch (TYPES[c]) {
            case XmlCharTypes.CT_INVALID:
                c = handleInvalidXmlChar(c);
                // Must not fall through: the returned character is used
                // as-is, it is not a line break
                break;
            case XmlCharTypes.CT_WS_CR:
                {
                    int ptr = _inputPtr;
                    if (ptr >= _inputEnd) {
                        loadMoreGuaranteed();
                        ptr = _inputPtr;
                    }
                    // \r\n counts as a single line end
                    if (inputBuffer[ptr] == '\n') {
                        ++_inputPtr;
                    }
                    markLF();
                }
                c = '\n';
                break;
            case XmlCharTypes.CT_WS_LF:
                markLF();
                break;
            case XmlCharTypes.CT_LT:
                // Markup starts: leave the '<' for the caller
                --_inputPtr;
                break main_loop;
            case XmlCharTypes.CT_AMP:
                {
                    int d = handleEntityInText(false);
                    if (d == 0) { // unexpandable general parsed entity
                        // _inputPtr set by entity expansion method
                        _entityPending = true;
                        break main_loop;
                    }
                    // Ok; does it need a surrogate though? (over 16 bits)
                    if ((d >> 16) != 0) {
                        d -= 0x10000;
                        outputBuffer[outPtr++] = (char) (0xD800 | (d >> 10));
                        // Need to ensure room for one more char
                        if (outPtr >= outputBuffer.length) {
                            outputBuffer = _textBuilder.finishCurrentSegment();
                            outPtr = 0;
                        }
                        d = (0xDC00 | (d & 0x3FF));
                    }
                    c = (char) d;
                }
                break;
            case XmlCharTypes.CT_RBRACKET: // ']]>'?
                {
                    // Let's then just count number of brackets --
                    // in case they are not followed by '>'
                    int count = 1;
                    while (true) {
                        if (_inputPtr >= _inputEnd) {
                            loadMoreGuaranteed();
                        }
                        c = inputBuffer[_inputPtr];
                        if (c != ']') {
                            break;
                        }
                        ++_inputPtr; // to skip past bracket
                        ++count;
                    }
                    if (c == '>' && count > 1) {
                        reportIllegalCDataEnd();
                    }
                    // Nope. Need to output all brackets, then; except
                    // for one that can be left for normal output
                    while (count > 1) {
                        outputBuffer[outPtr++] = ']';
                        // Need to ensure room for one more char
                        if (outPtr >= outputBuffer.length) {
                            outputBuffer = _textBuilder.finishCurrentSegment();
                            outPtr = 0;
                        }
                        --count;
                    }
                }
                // Can just output the first ']' along normal output
                c = ']';
                break;
            // default:
                // Other types are not important here...
            }
        } else if (c >= 0xD800) { // high-range, surrogates etc
            if (c < 0xE000) {
                // if ok, returns second surrogate; otherwise exception
                char d = checkSurrogate(c);
                outputBuffer[outPtr++] = c;
                if (outPtr >= outputBuffer.length) {
                    outputBuffer = _textBuilder.finishCurrentSegment();
                    outPtr = 0;
                }
                c = d;
            } else if (c >= 0xFFFE) {
                c = handleInvalidXmlChar(c);
            }
        }
        outputBuffer[outPtr++] = c;
    }
    _textBuilder.setCurrentLength(outPtr);
}
/**
 * Called once a primary text segment (CHARACTERS or CDATA; never SPACE)
 * has been skipped. Keeps skipping whatever textual segments follow
 * directly: CDATA sections and further character data.
 *
 * @return True if we encountered an unexpandable entity; false if
 *   skipping ended because input ran out or non-CDATA markup was seen
 */
@Override
protected final boolean skipCoalescedText()
    throws XMLStreamException
{
    for (;;) {
        // One character is always needed to decide what follows
        if (_inputPtr >= _inputEnd && !loadMore()) {
            // most likely an error, but it gets reported later on
            return false;
        }

        if (_inputBuffer[_inputPtr] != '<') {
            // textual (or entity, error etc)
            if (skipCharacters()) {
                return true;
            }
            continue;
        }

        /* Markup of some kind. In worst case, need 3 chars ("<![")
         * all in all to know if we are getting a CDATA section
         */
        if ((_inputPtr + 3) >= _inputEnd && !loadAndRetain(3)) {
            // probably an error, but will be handled later
            return false;
        }
        final boolean looksLikeCData = _inputBuffer[_inputPtr+1] == '!'
                && _inputBuffer[_inputPtr+2] == '[';
        if (!looksLikeCData) {
            // some other markup: nothing more to skip
            return false;
        }

        // Still have to verify the rest of the marker
        _inputPtr += 3;
        for (int i = 0; i < 6; ++i) {
            if (_inputPtr >= _inputEnd) {
                loadMoreGuaranteed();
            }
            final char actual = _inputBuffer[_inputPtr++];
            final char expected = CDATA_STR.charAt(i);
            if (actual != expected) {
                reportTreeUnexpChar(actual, " (expected '"+expected+"' for CDATA section)");
            }
        }
        skipCData();
    }
}
/*
/**********************************************************************
/* 2nd level parsing for skipping content
/**********************************************************************
*/
/**
 * Skips the contents of a comment, up to and including the closing
 * "--&gt;" marker, without collecting any text. Line breaks are still
 * recorded, and characters are still validated.
 *
 * @throws XMLStreamException for invalid characters, invalid surrogates
 *   or a "--" that is not followed by '&gt;'
 */
@Override
protected final void skipComment()
    throws XMLStreamException
{
    final int[] TYPES = sCharTypes.OTHER_CHARS;
    final char[] inputBuffer = _inputBuffer;

    while (true) {
        char c;

        // Then the tight ascii non-funny-char loop:
        ascii_loop:
        while (true) {
            int ptr = _inputPtr;
            int max = _inputEnd;
            if (ptr >= max) {
                loadMoreGuaranteed();
                ptr = _inputPtr;
                max = _inputEnd;
            }
            while (ptr < max) {
                c = inputBuffer[ptr++];
                if (c <= 0xFF) {
                    if (TYPES[c] != 0) {
                        _inputPtr = ptr;
                        break ascii_loop;
                    }
                } else if (c >= 0xD800) { // surrogates and 0xFFFE/0xFFFF
                    _inputPtr = ptr;
                    break ascii_loop;
                }
            }
            _inputPtr = ptr;
        }

        if (c <= 0xFF) {
            switch (TYPES[c]) {
            case XmlCharTypes.CT_INVALID:
                c = handleInvalidXmlChar(c);
                // Must not fall through: an invalid character is not a
                // line break and must not consume a following LF
                break;
            case XmlCharTypes.CT_WS_CR:
                {
                    if (_inputPtr >= _inputEnd) {
                        loadMoreGuaranteed();
                    }
                    // \r\n counts as a single line end
                    if (inputBuffer[_inputPtr] == '\n') {
                        ++_inputPtr;
                    }
                    markLF();
                }
                break;
            case XmlCharTypes.CT_WS_LF:
                markLF();
                break;
            case XmlCharTypes.CT_HYPHEN: // '-->'?
                if (_inputPtr >= _inputEnd) {
                    loadMoreGuaranteed();
                }
                if (_inputBuffer[_inputPtr] == '-') { // ok, must be end then
                    ++_inputPtr;
                    if (_inputPtr >= _inputEnd) {
                        loadMoreGuaranteed();
                    }
                    if (_inputBuffer[_inputPtr++] != '>') {
                        reportDoubleHyphenInComments();
                    }
                    return;
                }
                break;
            // default:
                // Other types are not important here...
            }
        } else if (c >= 0xD800) { // high-range, surrogates etc
            // Same validation the other skip methods perform
            if (c < 0xE000) {
                // if ok, returns second surrogate; otherwise exception
                /*char d =*/ checkSurrogate(c);
            } else if (c >= 0xFFFE) {
                c = handleInvalidXmlChar(c);
            }
        }
        // skipping, no need to output
    }
}
/**
 * Skips the contents of a processing instruction, up to and including
 * the closing "?&gt;" marker, without collecting any text. Line breaks
 * are still recorded, and characters are still validated.
 *
 * @throws XMLStreamException for invalid characters or surrogates
 */
@Override
protected final void skipPI() throws XMLStreamException
{
    final int[] TYPES = sCharTypes.OTHER_CHARS;
    final char[] inputBuffer = _inputBuffer;

    while (true) {
        char c;

        // Then the tight ascii non-funny-char loop:
        ascii_loop:
        while (true) {
            int ptr = _inputPtr;
            int max = _inputEnd;
            if (ptr >= max) {
                loadMoreGuaranteed();
                ptr = _inputPtr;
                max = _inputEnd;
            }
            while (ptr < max) {
                c = inputBuffer[ptr++];
                if (c <= 0xFF) {
                    if (TYPES[c] != 0) {
                        _inputPtr = ptr;
                        break ascii_loop;
                    }
                } else if (c >= 0xD800) { // surrogates and 0xFFFE/0xFFFF
                    _inputPtr = ptr;
                    break ascii_loop;
                }
            }
            _inputPtr = ptr;
        }

        if (c <= 0xFF) {
            switch (TYPES[c]) {
            case XmlCharTypes.CT_INVALID:
                // Same table as used for CDATA/comments flags invalid
                // characters; they must be reported here as well
                c = handleInvalidXmlChar(c);
                break;
            case XmlCharTypes.CT_WS_CR:
                {
                    if (_inputPtr >= _inputEnd) {
                        loadMoreGuaranteed();
                    }
                    // \r\n counts as a single line end
                    if (inputBuffer[_inputPtr] == CHAR_LF) {
                        ++_inputPtr;
                    }
                    markLF();
                }
                break;
            case XmlCharTypes.CT_WS_LF:
                markLF();
                break;
            case XmlCharTypes.CT_QMARK: // '?>'?
                if (_inputPtr >= _inputEnd) {
                    loadMoreGuaranteed();
                }
                if (_inputBuffer[_inputPtr] == '>') {
                    ++_inputPtr;
                    return;
                }
                break;
            // default:
                // Other types are not important here...
            }
        } else if (c >= 0xD800) { // high-range, surrogates etc
            if (c < 0xE000) {
                // if ok, returns second surrogate; otherwise exception
                /*char d =*/ checkSurrogate(c);
            } else if (c >= 0xFFFE) {
                c = handleInvalidXmlChar(c);
            }
        }
        // skipping, no need to output
    }
}
/**
 * Skips a character data (text) segment without collecting any text.
 * Skipping stops at a '&lt;' (left unconsumed) or at an unexpandable
 * general entity. Line breaks are still recorded, and characters are
 * still validated.
 *
 * @return True if an unexpandable general entity was encountered;
 *   false if the segment ended at markup
 *
 * @throws XMLStreamException for invalid characters, invalid surrogates
 *   or an illegal "]]&gt;" sequence
 */
@Override
protected final boolean skipCharacters() throws XMLStreamException
{
    final int[] TYPES = sCharTypes.TEXT_CHARS;
    final char[] inputBuffer = _inputBuffer;

    while (true) {
        char c;

        // Then the tight ascii non-funny-char loop:
        ascii_loop:
        while (true) {
            int ptr = _inputPtr;
            int max = _inputEnd;
            if (ptr >= max) {
                loadMoreGuaranteed();
                ptr = _inputPtr;
                max = _inputEnd;
            }
            while (ptr < max) {
                c = inputBuffer[ptr++];
                if (c <= 0xFF) {
                    if (TYPES[c] != 0) {
                        _inputPtr = ptr;
                        break ascii_loop;
                    }
                } else if (c >= 0xD800) { // surrogates and 0xFFFE/0xFFFF
                    _inputPtr = ptr;
                    break ascii_loop;
                }
            }
            _inputPtr = ptr;
        }

        if (c <= 0xFF) {
            switch (TYPES[c]) {
            case XmlCharTypes.CT_INVALID:
                c = handleInvalidXmlChar(c);
                // Must not fall through: an invalid character is not a
                // line break and must not consume a following LF
                break;
            case XmlCharTypes.CT_WS_CR:
                {
                    if (_inputPtr >= _inputEnd) {
                        loadMoreGuaranteed();
                    }
                    // \r\n counts as a single line end
                    if (inputBuffer[_inputPtr] == CHAR_LF) {
                        ++_inputPtr;
                    }
                    markLF();
                }
                break;
            case XmlCharTypes.CT_WS_LF:
                markLF();
                break;
            case XmlCharTypes.CT_LT:
                // Markup starts: leave the '<' for the caller
                --_inputPtr;
                return false;
            case XmlCharTypes.CT_AMP:
                {
                    int d = handleEntityInText(false);
                    if (d == 0) { // unexpandable general parsed entity
                        return true;
                    }
                }
                break;
            case XmlCharTypes.CT_RBRACKET: // ']]>'?
                {
                    // Let's then just count number of brackets --
                    // in case they are not followed by '>'
                    int count = 1;
                    while (true) {
                        if (_inputPtr >= _inputEnd) {
                            loadMoreGuaranteed();
                        }
                        c = inputBuffer[_inputPtr];
                        if (c != ']') {
                            break;
                        }
                        ++_inputPtr; // to skip past bracket
                        ++count;
                    }
                    if (c == '>' && count > 1) {
                        reportIllegalCDataEnd();
                    }
                }
                // Brackets are simply skipped; the following char is
                // left in input for the next round
                break;
            // default:
                // Other types are not important here...
            }
        } else if (c >= 0xD800) { // high-range, surrogates etc
            if (c < 0xE000) {
                // if ok, returns second surrogate; otherwise exception
                /*char d =*/ checkSurrogate(c);
            } else if (c >= 0xFFFE) {
                c = handleInvalidXmlChar(c);
            }
        }
        // skipping, no need to output
    }
}
/**
 * Skips the contents of a CDATA section, up to and including the
 * closing "]]&gt;" marker, without collecting any text. Line breaks
 * are still recorded, and characters are still validated.
 *
 * @throws XMLStreamException for invalid characters or surrogates
 */
@Override
protected final void skipCData() throws XMLStreamException
{
    final int[] TYPES = sCharTypes.OTHER_CHARS;
    final char[] inputBuffer = _inputBuffer;

    while (true) {
        char c;

        // Then the tight ascii non-funny-char loop:
        ascii_loop:
        while (true) {
            int ptr = _inputPtr;
            int max = _inputEnd;
            if (ptr >= max) {
                loadMoreGuaranteed();
                ptr = _inputPtr;
                max = _inputEnd;
            }
            while (ptr < max) {
                c = inputBuffer[ptr++];
                if (c <= 0xFF) {
                    if (TYPES[c] != 0) {
                        _inputPtr = ptr;
                        break ascii_loop;
                    }
                } else if (c >= 0xD800) { // surrogates and 0xFFFE/0xFFFF
                    _inputPtr = ptr;
                    break ascii_loop;
                }
            }
            _inputPtr = ptr;
        }

        if (c <= 0xFF) {
            switch (TYPES[c]) {
            case XmlCharTypes.CT_INVALID:
                c = handleInvalidXmlChar(c);
                // Must not fall through: an invalid character is not a
                // line break and must not consume a following LF
                break;
            case XmlCharTypes.CT_WS_CR:
                {
                    int ptr = _inputPtr;
                    if (ptr >= _inputEnd) {
                        loadMoreGuaranteed();
                        ptr = _inputPtr;
                    }
                    // \r\n counts as a single line end
                    if (inputBuffer[ptr] == CHAR_LF) {
                        ++ptr;
                        ++_inputPtr;
                    }
                    markLF(ptr);
                }
                break;
            case XmlCharTypes.CT_WS_LF:
                markLF();
                break;
            case XmlCharTypes.CT_RBRACKET: // ']]>'?
                {
                    // end is nigh?
                    // 'count' is the number of chars read here, including
                    // the one that ends the run of brackets
                    int count = 0;
                    do {
                        if (_inputPtr >= _inputEnd) {
                            loadMoreGuaranteed();
                        }
                        ++count;
                        c = _inputBuffer[_inputPtr++];
                    } while (c == ']');
                    if (c == '>') {
                        if (count > 1) { // gotcha
                            return;
                        }
                        // can still skip plain ']>'...
                    } else {
                        --_inputPtr; // need to push back last char
                    }
                }
                break;
            // default:
                // Other types are not important here...
            }
        } else if (c >= 0xD800) { // high-range, surrogates etc
            if (c < 0xE000) {
                // if ok, returns second surrogate; otherwise exception
                /*char d =*/ checkSurrogate(c);
            } else if (c >= 0xFFFE) {
                c = handleInvalidXmlChar(c);
            }
        }
        // skipping, no need to output
    }
}
/**
 * Skips a run of white space without collecting it. The first white
 * space character (held in {@code mTmpChar}) has already been checked
 * by the caller and is not looked at here. Line breaks are recorded.
 *
 * @throws XMLStreamException if a control character other than space,
 *   tab, CR or LF is encountered
 */
@Override
protected final void skipSpace() throws XMLStreamException
{
    int pos = _inputPtr;

    for (;;) {
        if (pos >= _inputEnd) {
            if (!loadMore()) {
                break;
            }
            pos = _inputPtr;
        }
        final char ch = _inputBuffer[pos];
        if (ch > ' ') { // !!! TODO: xml 1.1 ws
            // First non-space character ends the run; not consumed
            break;
        }
        ++pos;

        if (ch == '\r') {
            if (pos >= _inputEnd) {
                if (!loadMore()) {
                    break;
                }
                pos = _inputPtr;
            }
            // \r\n counts as a single line end
            if (_inputBuffer[pos] == '\n') {
                ++pos;
            }
            markLF(pos);
        } else if (ch == '\n') {
            markLF(pos);
        } else if (ch != ' ' && ch != '\t') {
            _inputPtr = pos;
            throwInvalidSpace(ch);
        }
    }

    _inputPtr = pos;
}
/*
/**********************************************************************
/* Entity/name handling
/**********************************************************************
*/
/**
 * Skips white space inside markup, recording any line breaks.
 *
 * @param reqd Whether at least one white space character must be present
 * @param msg Text appended to the error message when required white
 *   space is missing
 *
 * @return First character following skipped white space (already
 *   consumed from input)
 */
protected char skipInternalWs(boolean reqd, String msg)
    throws XMLStreamException
{
    if (_inputPtr >= _inputEnd) {
        loadMoreGuaranteed();
    }
    char ch = _inputBuffer[_inputPtr++];

    if (ch > INT_SPACE) {
        // No white space at all: fine, unless caller insists on it
        if (!reqd) {
            return ch;
        }
        reportTreeUnexpChar(ch, " (expected white space "+msg+")");
    }

    while (true) {
        // Handle the character we already have...
        if (ch == '\r') {
            if (_inputPtr >= _inputEnd) {
                loadMoreGuaranteed();
            }
            // \r\n counts as a single line end
            if (_inputBuffer[_inputPtr] == '\n') {
                ++_inputPtr;
            }
            markLF();
        } else if (ch == '\n') {
            markLF();
        } else if (ch != ' ' && ch != '\t') {
            throwInvalidSpace(ch);
        }

        // ... and then fetch the next one
        if (_inputPtr >= _inputEnd) {
            loadMoreGuaranteed();
        }
        ch = _inputBuffer[_inputPtr++];
        if (ch > INT_SPACE) {
            return ch;
        }
    }
}
/**
 * Verifies that input continues with the given keyword, starting from
 * the keyword's second character (the first one is not compared here).
 * Matched characters are consumed; a mismatch is reported as an
 * unexpected character.
 *
 * @param keyw Full keyword, including the first character
 */
private final void matchAsciiKeyword(String keyw)
    throws XMLStreamException
{
    final int end = keyw.length();
    int ix = 1;

    while (ix < end) {
        if (_inputPtr >= _inputEnd) {
            loadMoreGuaranteed();
        }
        final char actual = _inputBuffer[_inputPtr++];
        final char expected = keyw.charAt(ix);
        if (actual != expected) {
            reportTreeUnexpChar(actual, " (expected '"+expected+"' for "+keyw+" keyword)");
        }
        ++ix;
    }
}
/**
 * Checks whether a line end within element content starts a run of
 * plain indentation (line feed followed by only spaces, or only tabs).
 *<p>
 * Note: consecutive white space is only considered indentation,
 * if the following token seems like a tag (start/end). This so
 * that if a CDATA section follows, it can be coalesced in
 * coalescing mode. Although we could check if coalescing mode is
 * enabled, this should seldom have significant effect either way,
 * so it removes one possible source of problems in coalescing mode.
 *
 * @param c Line end character (CR or LF) that was just consumed
 *
 * @return -1, if indentation was handled; offset in the output
 *    buffer, if not
 */
protected final int checkInTreeIndentation(char c)
    throws XMLStreamException
{
    if (c == '\r') {
        // Degenerate case first: a lone \r with no more input
        if (_inputPtr >= _inputEnd && !loadMore()) {
            _textBuilder.resetWithIndentation(0, CHAR_SPACE);
            return -1;
        }
        if (_inputBuffer[_inputPtr] == '\n') {
            ++_inputPtr;
        }
    }
    markLF();

    // Next must come an indentation char (or a start/end tag)
    if (_inputPtr >= _inputEnd) {
        loadMoreGuaranteed();
    }
    final char indentChar = _inputBuffer[_inputPtr];
    final boolean isIndentChar = (indentChar == ' ' || indentChar == '\t');

    if (!isIndentChar) {
        // Still indentation (of zero width) if a tag -- '<' followed
        // by something other than '!' -- comes right after
        final boolean tagFollows = (indentChar == '<')
                && (_inputPtr+1) < _inputEnd
                && _inputBuffer[_inputPtr+1] != '!';
        if (tagFollows) {
            _textBuilder.resetWithIndentation(0, ' ');
            return -1;
        }
        char[] lfOnly = _textBuilder.resetWithEmpty();
        lfOnly[0] = '\n';
        _textBuilder.setCurrentLength(1);
        return 1;
    }

    // Count how many identical indentation chars there are
    ++_inputPtr;
    int run = 1;
    final int limit = (indentChar == ' ')
            ? TextBuilder.MAX_INDENT_SPACES : TextBuilder.MAX_INDENT_TABS;

    while (run <= limit) {
        if (_inputPtr >= _inputEnd) {
            loadMoreGuaranteed();
        }
        final char next = _inputBuffer[_inputPtr];
        if (next == indentChar) {
            ++_inputPtr;
            ++run;
            continue;
        }
        // Run ended: it is indentation only if a tag follows
        if (next == '<' && (_inputPtr+1) < _inputEnd
                && _inputBuffer[_inputPtr+1] != '!') {
            _textBuilder.resetWithIndentation(run, indentChar);
            return -1;
        }
        break;
    }

    // Hit something else, or run is too long: copy what was consumed.
    // The buffer is known to have enough room either way
    char[] out = _textBuilder.resetWithEmpty();
    out[0] = '\n';
    for (int i = 1; i <= run; ++i) {
        out[i] = indentChar;
    }
    final int total = run + 1; // +1 for the leading lf
    _textBuilder.setCurrentLength(total);
    return total;
}
/**
 * Checks whether a line end in prolog/epilog starts a run of plain
 * indentation (line feed followed by only spaces, or only tabs).
 *
 * @param c Line end character (CR or LF) that was read
 *
 * @return -1, if indentation was handled; offset in the output
 *    buffer, if not
 */
protected final int checkPrologIndentation(char c)
    throws XMLStreamException
{
    if (c == '\r') {
        // Degenerate case first: a lone \r with no more input
        if (_inputPtr >= _inputEnd && !loadMore()) {
            _textBuilder.resetWithIndentation(0, CHAR_SPACE);
            return -1;
        }
        if (_inputBuffer[_inputPtr] == '\n') {
            ++_inputPtr;
        }
    }
    markLF();

    // Running out of input here also counts as (empty) indentation
    if (_inputPtr >= _inputEnd && !loadMore()) {
        _textBuilder.resetWithIndentation(0, CHAR_SPACE);
        return -1;
    }

    final char indentChar = _inputBuffer[_inputPtr]; // not consumed yet
    final boolean isIndentChar = (indentChar == ' ' || indentChar == '\t');

    if (!isIndentChar) {
        if (indentChar == '<') {
            // Markup right after the line end: zero-width indentation
            _textBuilder.resetWithIndentation(0, CHAR_SPACE);
            return -1;
        }
        // Something else: caller continues after the line feed
        char[] lfOnly = _textBuilder.resetWithEmpty();
        lfOnly[0] = '\n';
        _textBuilder.setCurrentLength(1);
        return 1;
    }

    // Count how many identical indentation chars there are
    ++_inputPtr;
    int run = 1;
    final int limit = (indentChar == ' ')
            ? TextBuilder.MAX_INDENT_SPACES : TextBuilder.MAX_INDENT_TABS;

    while ((_inputPtr < _inputEnd || loadMore())
            && _inputBuffer[_inputPtr] == indentChar) {
        ++_inputPtr;
        ++run;
        if (run >= limit) {
            // Too long to share; build it explicitly instead.
            // The buffer is known to have enough room
            char[] out = _textBuilder.resetWithEmpty();
            out[0] = '\n';
            for (int i = 1; i <= run; ++i) {
                out[i] = indentChar;
            }
            final int total = run + 1; // +1 for the leading lf
            _textBuilder.setCurrentLength(total);
            return total;
        }
    }

    // Run ended (or input did) within limits
    _textBuilder.resetWithIndentation(run, indentChar);
    return -1;
}
/**
 * Parses a (possibly prefixed) name whose first character has already
 * been read, and resolves it through the symbol table. Only a cheap
 * sanity check is done while scanning; full validation happens in
 * {@code addPName} when the name is not yet in the symbol table.
 *
 * @param c First character of the name
 *
 * @return Symbol for the name; the character that ended the name is
 *   left unconsumed
 */
protected PName parsePName(char c)
    throws XMLStreamException
{
    char[] buf = _nameBuffer;

    /* Just a quick sanity check up front, to catch extra spaces and
     * such; thorough checking is deferred until actually needed.
     */
    if (c < INT_A) { // lowest acceptable start char, except for ':' that would be allowed in non-ns mode
        throwUnexpectedChar(c, "; expected a name start character");
    }
    buf[0] = c;
    int hash = (int) c;
    int len = 1;

    for (;;) {
        if (_inputPtr >= _inputEnd) {
            loadMoreGuaranteed();
        }
        c = _inputBuffer[_inputPtr];
        final int code = (int) c;

        // Below 'A' (65), only '-' (45), '.' (46) and '0'-'9'/':' (48 - 57/58)
        // can still be part of a name; note that '/' (47) can not
        final boolean endsName = (code < 'A')
                && (code < '-' || code > ':' || code == '/');
        if (endsName) {
            PName known = _symbols.findSymbol(buf, 0, len, hash);
            return (known != null) ? known : addPName(buf, len, hash);
        }

        ++_inputPtr;
        if (len >= buf.length) {
            // Out of room: double the name buffer
            _nameBuffer = buf = DataUtil.growArrayBy(buf, buf.length);
        }
        buf[len++] = c;
        hash = (hash * 31) + code;
    }
}
/**
 * Validates a name character by character and then adds it to the
 * symbol table.
 *<p>
 * Checks done: first character must be a valid XML 1.0 name start
 * character, the rest valid name characters; at most one colon is
 * accepted; surrogate characters must come in pairs, which are passed
 * to <code>checkSurrogateNameChar</code>. Problems are signalled via
 * the <code>reportXxx</code> methods.
 *
 * @param nameBuffer Buffer that contains the name, starting at offset 0
 * @param nameLen Length of the name, in chars
 * @param hash Hash code calculated for the name
 *
 * @return Symbol added to the symbol table
 */
protected final PName addPName(char[] nameBuffer, int nameLen, int hash)
    throws XMLStreamException
{
    // Let's validate completely, now:
    char c = nameBuffer[0];
    int namePtr = 1;
    int lastColon = -1; // index of the colon seen so far, if any

    if (c < 0xD800 || c >= 0xE000) {
        if (!XmlChars.is10NameStartChar(c)) {
            reportInvalidNameChar(c, 0);
        }
    } else {
        if (nameLen == 1) { // unpaired surrogate
            reportInvalidFirstSurrogate(c);
        }
        checkSurrogateNameChar(c, nameBuffer[1], 0);
        ++namePtr; // second half of the pair was checked above
    }

    for (; namePtr < nameLen; ++namePtr) {
        c = nameBuffer[namePtr];
        if (c < 0xD800 || c >= 0xE000) {
            if (c == ':') {
                if (lastColon >= 0) {
                    reportMultipleColonsInName();
                }
                lastColon = namePtr;
            } else {
                if (!XmlChars.is10NameChar(c)) {
                    reportInvalidNameChar(c, namePtr);
                }
            }
        } else {
            if ((namePtr + 1) >= nameLen) { // unpaired surrogate
                reportInvalidFirstSurrogate(c);
            }
            checkSurrogateNameChar(c, nameBuffer[namePtr + 1], namePtr);
            /* Second half of the pair was validated together with the
             * first one; skip it (as is done for the first character),
             * so that it is not mistaken for the start of a new pair.
             */
            ++namePtr;
        }
    }
    return _symbols.addSymbol(nameBuffer, 0, nameLen, hash);
}
/**
 * Reads a public identifier up to (and including) the closing quote,
 * and returns its contents.
 *<p>
 * Runs of white space are coalesced into a single space character;
 * white space right before the closing quote is dropped. Characters
 * that the public id character table does not accept are reported via
 * <code>throwUnexpectedChar</code>.
 *
 * @param quoteChar Quote character that ends the identifier
 *
 * @return Contents of the public identifier
 */
protected String parsePublicId(char quoteChar)
    throws XMLStreamException
{
    char[] outputBuffer = _nameBuffer;
    int outPtr = 0;
    final int[] TYPES = XmlCharTypes.PUBID_CHARS;
    boolean addSpace = false;

    while (true) {
        if (_inputPtr >= _inputEnd) {
            loadMoreGuaranteed();
        }
        // Easier to check without char type table, first:
        char c = _inputBuffer[_inputPtr++];
        if (c == quoteChar) {
            break;
        }
        if ((c > 0xFF) || TYPES[c] != XmlCharTypes.PUBID_OK) {
            throwUnexpectedChar(c, " in public identifier");
        }
        // White space? Needs to be coalesced
        if (c <= INT_SPACE) {
            addSpace = true;
            continue;
        }
        if (addSpace) {
            /* Output goes to the name buffer (not the text builder), so
             * that is what needs to be grown; and the output pointer must
             * be retained so that content collected so far is not lost.
             */
            if (outPtr >= outputBuffer.length) {
                _nameBuffer = outputBuffer = DataUtil.growArrayBy(outputBuffer, outputBuffer.length);
            }
            outputBuffer[outPtr++] = ' ';
            addSpace = false;
        }
        if (outPtr >= outputBuffer.length) {
            // as above: grow, but keep appending where we left off
            _nameBuffer = outputBuffer = DataUtil.growArrayBy(outputBuffer, outputBuffer.length);
        }
        outputBuffer[outPtr++] = c;
    }
    return new String(outputBuffer, 0, outPtr);
}
/**
 * Reads a system identifier up to (and including) the closing quote,
 * and returns its contents.
 *<p>
 * Linefeeds are normalized: both a lone CR and a CR+LF pair are added
 * as a single '\n', and row information is updated for each linefeed.
 * Characters the attribute value table marks as invalid are passed to
 * <code>handleInvalidXmlChar</code>, and whatever it returns is added
 * to the result.
 *
 * @param quoteChar Quote character that ends the identifier
 *
 * @return Contents of the system identifier
 */
protected String parseSystemId(char quoteChar)
    throws XMLStreamException
{
    char[] outputBuffer = _nameBuffer;
    int outPtr = 0;
    // attribute types are closest matches, so let's use them
    final int[] TYPES = sCharTypes.ATTR_CHARS;

    main_loop:
    while (true) {
        if (_inputPtr >= _inputEnd) {
            loadMoreGuaranteed();
        }
        char c = _inputBuffer[_inputPtr++];
        /* Only consult the table for chars in Latin-1 range (same guard
         * as used when parsing public ids); the type table is presumably
         * only 256 entries long, so anything above would be out of bounds.
         */
        if (c <= 0xFF) {
            switch (TYPES[c]) {
            case XmlCharTypes.CT_INVALID:
                c = handleInvalidXmlChar(c);
                // must not fall through to linefeed handling; the
                // returned replacement char is what gets added
                break;
            case XmlCharTypes.CT_WS_CR:
                if (_inputPtr >= _inputEnd) {
                    loadMoreGuaranteed();
                }
                // CR+LF is handled as a single linefeed
                if (_inputBuffer[_inputPtr] == '\n') {
                    ++_inputPtr;
                }
                markLF();
                c = '\n';
                break;
            case XmlCharTypes.CT_WS_LF:
                markLF();
                break;
            case XmlCharTypes.CT_ATTR_QUOTE:
                if (c == quoteChar) {
                    break main_loop;
                }
                break;
            default: // any other type: added as is
            }
        }
        if (outPtr >= outputBuffer.length) {
            // grow, but keep appending where we left off: content
            // collected so far must not be overwritten
            _nameBuffer = outputBuffer = DataUtil.growArrayBy(outputBuffer, outputBuffer.length);
        }
        outputBuffer[outPtr++] = c;
    }
    return new String(outputBuffer, 0, outPtr);
}
/*
/**********************************************************************
/* Other parsing helper methods
/**********************************************************************
*/
/**
 * This method is called to verify that a surrogate
 * pair found describes a legal surrogate pair (ie. expands
 * to a legal XML char). The second character of the pair is read
 * (and consumed) from the input buffer.
 *
 * @param firstChar First character of the pair; already known to be
 *   in the surrogate range
 *
 * @return Second character of the surrogate pair
 */
private char checkSurrogate(char firstChar)
    throws XMLStreamException
{
    // Chars from 0xDC00 up can only be the second part of a pair
    if (firstChar >= 0xDC00) {
        reportInvalidFirstSurrogate(firstChar);
    }
    if (_inputPtr >= _inputEnd) {
        loadMoreGuaranteed();
    }
    char sec = _inputBuffer[_inputPtr++];
    if (sec < 0xDC00 || sec >= 0xE000) {
        reportInvalidSecondSurrogate(sec);
    }
    // And the composite, is it ok? Need both halves for the actual
    // code point, not just the bits from the first one
    int val = Character.toCodePoint(firstChar, sec);
    if (val > XmlConsts.MAX_UNICODE_CHAR) {
        reportInvalidXmlChar(val);
    }
    return sec;
}
/**
 * Method called to check a surrogate pair that was encountered within
 * a name. Both characters are given by the caller; nothing is read
 * from the input buffer.
 *<p>
 * Note that for now every pair gets reported as an invalid name
 * character (see the TODO below).
 *
 * @param firstChar First character of the pair
 * @param sec Second character of the pair
 * @param index Index of the first character within the name; passed
 *   along when reporting an invalid name character
 *
 * @return Code point that the surrogate pair encodes
 */
private int checkSurrogateNameChar(char firstChar, char sec, int index)
    throws XMLStreamException
{
    // Chars from 0xDC00 up can only be the second part of a pair
    if (firstChar >= 0xDC00) {
        reportInvalidFirstSurrogate(firstChar);
    }
    if (sec < 0xDC00 || sec >= 0xE000) {
        reportInvalidSecondSurrogate(sec);
    }
    // And the composite, is it ok? Both halves are needed, so that
    // the code point reported (and returned) is the actual one
    int val = Character.toCodePoint(firstChar, sec);
    if (val > XmlConsts.MAX_UNICODE_CHAR) {
        reportInvalidXmlChar(val);
    }
    // !!! TODO: xml 1.1 vs 1.0 rules: none valid for 1.0, many for 1.1
    if (true) {
        reportInvalidNameChar(val, index);
    }
    return val;
}
/**
 * This method is similar to <code>checkSurrogate</code>, but
 * returns the actual character code encoded by the surrogate
 * pair. This is needed if further validation rules (such as name
 * character checks) are to be done. The second character of the
 * pair is read (and consumed) from the input buffer.
 *
 * @param firstChar First character of the pair; already known to be
 *   in the surrogate range
 *
 * @return Code point that the surrogate pair encodes
 */
private int decodeSurrogate(char firstChar)
    throws XMLStreamException
{
    // Chars from 0xDC00 up can only be the second part of a pair
    if (firstChar >= 0xDC00) {
        reportInvalidFirstSurrogate(firstChar);
    }
    if (_inputPtr >= _inputEnd) {
        loadMoreGuaranteed();
    }
    char sec = _inputBuffer[_inputPtr++];
    if (sec < 0xDC00 || sec >= 0xE000) {
        reportInvalidSecondSurrogate(sec);
    }
    // And the composite, is it ok? Must combine both halves: the second
    // character carries the low 10 bits of the code point
    int val = Character.toCodePoint(firstChar, sec);
    if (val > XmlConsts.MAX_UNICODE_CHAR) {
        reportInvalidXmlChar(val);
    }
    return val;
}
/**
 * Reports (via <code>reportInputProblem</code>) a surrogate character
 * that can not be the first character of a surrogate pair.
 *
 * @param ch Offending character; its code is included in hex
 */
private void reportInvalidFirstSurrogate(char ch)
    throws XMLStreamException
{
    final String hexCode = Integer.toHexString((int) ch);
    reportInputProblem("Invalid surrogate character (code 0x"+hexCode+"): can not start a surrogate pair");
}
/**
 * Reports (via <code>reportInputProblem</code>) a character that is
 * not legal as the second character of a surrogate pair.
 *
 * @param ch Offending character; its code is included in hex
 */
private void reportInvalidSecondSurrogate(char ch)
    throws XMLStreamException
{
    // Code is in hex, so mark it as such ("0x"), same as is done when
    // reporting an invalid first surrogate
    reportInputProblem("Invalid surrogate character (code 0x"+Integer.toHexString((int) ch)+"): is not legal as the second part of a surrogate pair");
}
/*
/**********************************************************************
/* Location handling
/**********************************************************************
*/
/**
 * Constructs a location object for the current input position, using
 * public and system ids from the configuration, and zero-based offset,
 * row and column values of this scanner.
 */
@Override
public XMLStreamLocation2 getCurrentLocation()
{
    // column is the distance from the start of the current row
    final int column = _inputPtr - _rowStartOffset;
    return LocationImpl.fromZeroBased(_config.getPublicId(), _config.getSystemId(),
            _pastBytesOrChars + _inputPtr, _currRow, column);
}
/**
 * @return Distance of the current input pointer from the offset at
 *   which the current row starts
 */
@Override
public int getCurrentColumnNr() {
    final int rowStart = _rowStartOffset;
    return _inputPtr - rowStart;
}
/**
 * @return Always -1: byte offsets are not available for this
 *   scanner type
 */
@Override
public long getStartingByteOffset() {
    final long notAvailable = -1L; // N/A for this type
    return notAvailable;
}
/**
 * @return Raw offset recorded by the latest call to
 *   <code>setStartLocation</code>
 */
@Override
public long getStartingCharOffset()
{
    return this._startRawOffset;
}
/**
 * @return Always -1: byte offsets are not available for this
 *   scanner type
 */
@Override
public long getEndingByteOffset() throws XMLStreamException {
    final long notAvailable = -1L; // N/A for this type
    return notAvailable;
}
/**
 * Returns the character offset of the current input position, after
 * first finishing the current token if it is still incomplete.
 *
 * @return Sum of the characters in earlier buffers and the current
 *   input pointer
 */
@Override
public long getEndingCharOffset() throws XMLStreamException
{
    if (!_tokenIncomplete) {
        return _pastBytesOrChars + _inputPtr;
    }
    // Have to complete the token to know the ending location...
    finishToken();
    return _pastBytesOrChars + _inputPtr;
}
/**
 * Advances the row counter by one, and records given offset as the
 * start of the new row.
 *
 * @param offset Offset to use as the start of the new row
 */
protected final void markLF(int offset)
{
    ++_currRow;
    _rowStartOffset = offset;
}
/**
 * Advances the row counter by one, and records the current input
 * pointer as the start of the new row.
 */
protected final void markLF()
{
    markLF(_inputPtr);
}
/**
 * Records the current input position (raw offset, row and column)
 * as the start location.
 */
protected final void setStartLocation()
{
    final int ptr = _inputPtr;

    _startRow = _currRow;
    _startColumn = ptr - _rowStartOffset;
    _startRawOffset = _pastBytesOrChars + ptr;
}
/*
/**********************************************************************
/* Input loading
/**********************************************************************
*/
/**
 * Tries to replace the contents of the input buffer with the next
 * chunk of characters from the underlying Reader. Contents of the
 * buffer are discarded; offsets are adjusted by the old buffer length.
 *
 * @return True if at least one character was read; false if there is
 *   no Reader, or the Reader has no more content
 *
 * @throws XMLStreamException for read problems (an
 *   <code>IOException</code> gets wrapped), or if the Reader
 *   returns zero characters
 */
@Override
protected final boolean loadMore() throws XMLStreamException
{
    // If it's a block source, there's no Reader, or any more data:
    if (_in == null) {
        _inputEnd = 0;
        return false;
    }
    // Whole old buffer is now in the past: adjust offsets accordingly
    final int oldEnd = _inputEnd;
    _pastBytesOrChars += oldEnd;
    _rowStartOffset -= oldEnd;
    _inputPtr = 0;

    final int count;
    try {
        count = _in.read(_inputBuffer, 0, _inputBuffer.length);
    } catch (IOException ioe) {
        throw new IoStreamException(ioe);
    }
    if (count >= 1) {
        _inputEnd = count;
        return true;
    }
    _inputEnd = 0;
    if (count == 0) {
        // Sanity check; should never happen with correctly written Readers
        reportInputProblem("Reader returned 0 bytes, even when asked to read up to "+_inputBuffer.length);
    }
    return false;
}
/**
 * Loads more input and returns (and consumes) its first character;
 * end of input is reported as a problem, using description of the
 * current token type.
 *
 * @return First character of the newly loaded input
 */
protected final char loadOne() throws XMLStreamException
{
    return loadOne(_currToken);
}
/**
 * Loads more input and returns (and consumes) its first character;
 * end of input is reported as a problem, using description of the
 * given token type.
 *
 * @param type Token type to describe in the problem message
 *
 * @return First character of the newly loaded input
 */
protected final char loadOne(int type)
    throws XMLStreamException
{
    final boolean gotMore = loadMore();
    if (!gotMore) {
        reportInputProblem("Unexpected end-of-input when trying to parse "+ErrorConsts.tokenTypeDesc(type));
    }
    return _inputBuffer[_inputPtr++];
}
/**
 * Loads more input while retaining the not-yet-consumed contents of
 * the input buffer: remaining content is moved to the start of the
 * buffer, and new content is read after it, until the buffer holds
 * at least the requested number of characters.
 *
 * @param nrOfChars Minimum number of characters that the buffer
 *   should contain when this method returns true
 *
 * @return True if the buffer now has at least <code>nrOfChars</code>
 *   characters; false if there is no Reader, or the Reader ran out
 *   of content before that
 *
 * @throws XMLStreamException for read problems (an
 *   <code>IOException</code> gets wrapped), or if the Reader
 *   returns zero characters
 */
protected final boolean loadAndRetain(int nrOfChars)
    throws XMLStreamException
{
    /* first: can't move, if we were handed an immutable block
     * (alternative to handing Reader as _in)
     */
    if (_in == null) {
        return false;
    }
    // otherwise, need to use cut'n pasted code from loadMore()...
    _pastBytesOrChars += _inputPtr;
    _rowStartOffset -= _inputPtr;

    int remaining = (_inputEnd - _inputPtr); // must be > 0
    System.arraycopy(_inputBuffer, _inputPtr, _inputBuffer, 0, remaining);
    _inputPtr = 0;
    _inputEnd = remaining; // temporarily set to cover copied stuff

    try {
        do {
            /* Each read has to append after everything that the buffer
             * holds at that point (_inputEnd), not after the originally
             * retained part: otherwise a second read would overwrite
             * what the previous one returned.
             */
            int max = _inputBuffer.length - _inputEnd;
            int count = _in.read(_inputBuffer, _inputEnd, max);
            if (count < 1) {
                if (count == 0) {
                    // Sanity check, should never happen with non-buggy readers/stream
                    reportInputProblem("Reader returned 0 bytes, even when asked to read up to "+max);
                }
                return false;
            }
            _inputEnd += count;
        } while (_inputEnd < nrOfChars);
        return true;
    } catch (IOException ioe) {
        throw new IoStreamException(ioe);
    }
}
}