/*
* XmlParser.java
*
* Copyright (C) 2005-2006 Tommi Laukkanen
* http://www.substanceofcode.com
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
*/
// Expand to define testing define
//#define DNOTEST
// Expand to define logging define
//#define DNOLOGGING
package com.substanceofcode.utils;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.Vector;
import com.substanceofcode.utils.CauseException;
import com.substanceofcode.utils.CauseMemoryException;
//#ifdef DLOGGING
//@import net.sf.jlogmicro.util.logging.Logger;
//@import net.sf.jlogmicro.util.logging.Level;
//#endif
/**
* Simple and lightweight XML parser without complete error handling.
*
* @author Tommi Laukkanen
*/
public class XmlParser {
/**
 * Current XML element name (eg. <title> = title).
 * Cleared by parseStream each time a '<' is read and filled while the
 * tag name is being parsed.
 */
final protected StringBuffer m_currentElementName = new StringBuffer();
/**
 * Raw text of the tag currently being parsed, starting at its '<'
 * (e.g. <title lang="en">). Cleared by parseStream each time a '<' is
 * read. getAttributeValue and parseNamespaces search this text.
 */
final protected StringBuffer m_currentElementData = new StringBuffer();
/**
 * Set by parseStream: false when the tag text ends with "/>", true
 * otherwise. getTextStream returns "" without reading when this is false.
 */
protected boolean m_currentElementContainsText = false;
//#ifdef DTEST
//@ boolean m_debugTrace = false; // True to add extra trace
//#endif
// The flag for if encoding set with BOM, prologue with encoding,
// or meta tag for HTMLParser.
protected boolean m_encoding_set = false; // The flag for if encoding set
// Allow some errors for get text and get attribute: when true, those
// methods report an unexpected failure by returning null instead of
// throwing CauseException.
protected boolean m_acceptErrors = true; // Allow some errors
// Encoding name used to turn parsed strings back into bytes before they
// are re-decoded with m_docEncoding. Replaced in the constructor by the
// value reported by the encoding stream reader.
protected String m_fileEncoding = "ISO8859_1"; // See EncodingUtil
// Encoding name used to decode text and attribute values; an empty
// string means no re-decoding is done.
protected String m_docEncoding = ""; // See EncodingUtil
// Encoding requested when neither a BOM nor a prologue encoding
// attribute supplies one.
protected String m_defEncoding = "UTF-8"; // Default doc encoding
// Helper that supplies the readers and resolves encodings.
protected EncodingUtil m_encodingUtil = null;
// Reader used by parse/getText when isModEncoding() is true.
protected EncodingStreamReader m_encodingStreamReader;
// Reader used by parse/getText when isModEncoding() is false.
protected InputStreamReader m_inputStream;
// Namespace prefixes that parseStream keeps as part of the element name
// (prefix + ':' + local name). Only the first three entries are consulted.
private String [] m_namespaces = null;
// True until parseStream has handled either an "<?xml " prologue or the
// first element; controls the one-time encoding selection.
private boolean m_getPrologue = true;
//#ifdef DLOGGING
//@ private Logger logger = Logger.getLogger("XmlParser");
//@ final private boolean fineLoggable = logger.isLoggable(Level.FINE);
//@ final private boolean finerLoggable = logger.isLoggable(Level.FINER);
//@ final private boolean finestLoggable = logger.isLoggable(Level.FINEST);
//#endif
/** Return codes of parse() / parseStream(). */
// The stream ended before another element was found.
public static final int END_DOCUMENT = 0;
// A tag with a non-empty name was read.
public static final int ELEMENT = 1;
// The "<?xml ...?>" prologue was read and the encoding has been selected.
public static final int PROLOGUE = 2;
/**
 * Creates a new instance of XmlParser that reads from the given stream.
 * The stream is wrapped in a new EncodingUtil and construction is
 * delegated to the EncodingUtil constructor of this class.
 *
 * @param inputStream stream handed to the new EncodingUtil
 */
public XmlParser(InputStream inputStream) {
this(new EncodingUtil(inputStream));
}
/** Creates a new instance of XmlParser */
public XmlParser(EncodingUtil encodingUtil) {
this.m_encodingUtil = encodingUtil;
m_encodingStreamReader =
m_encodingUtil.getEncodingStreamReader();
m_fileEncoding = m_encodingStreamReader.getFileEncoding();
m_inputStream = m_encodingStreamReader.getInputStream();
}
/**
 * Parse forward to the next start tag (or the XML prologue).
 *
 * Characters are read one at a time. Every '<' restarts collection of
 * m_currentElementName and m_currentElementData. Closing tags, comments
 * and processing instructions leave the name empty and are skipped.
 * The first time an element or an "<?xml " prologue is completed, the
 * document encoding is selected through m_encodingUtil.
 *
 * On ELEMENT and END_DOCUMENT, m_currentElementContainsText is set to
 * false when the collected tag text ends with "/>" and to true otherwise.
 *
 * @param is reader to take characters from
 * @return PROLOGUE when an "<?xml ...?>" prologue was completed,
 *         ELEMENT when a tag with a non-empty name was completed,
 *         END_DOCUMENT when the stream ended first
 * @throws IOException if reading fails
 * @throws CauseMemoryException if an OutOfMemoryError occurs while parsing
 * @throws CauseException if any other error occurs while parsing
 */
protected int parseStream(InputStreamReader is)
throws IOException, CauseMemoryException, CauseException {
    boolean parsingElementName = false;
    boolean elementFound = false;
    boolean elementStart = false;
    boolean parsingElementData = false;
    boolean prologueFound = false;

    char c;
    int inputCharacter = is.read();
    try {
        while ((inputCharacter != -1) && !elementFound) {
            c = (char)inputCharacter;

            if (elementStart) {
                switch (c) {
                    case '/':
                        parsingElementName = false;
                        break;
                    // If we get ? or ! after '<' this is not an
                    // element, it's a comment or prologue.
                    case '?':
                    case '!':
                        if (m_currentElementData.charAt(
                                m_currentElementData.length() - 1) == '<') {
                            parsingElementName = false;
                            // If we find <? and we're looking for the
                            // prologue, set flag.
                            if (m_getPrologue && (c == '?')) {
                                prologueFound = true;
                            }
                        }
                        break;
                    default:
                        break;
                }
            }

            if (parsingElementName) {
                // Determine if we have found the end of the element
                // name and thus started element data.
                switch (c) {
                    case ':':
                        // For specified namespace, put it into element name
                        if (m_namespaces != null) {
                            final String prefix =
                                m_currentElementName.toString();
                            // NOTE(review): only the first three configured
                            // prefixes are consulted, as before.
                            final int limit =
                                Math.min(m_namespaces.length, 3);
                            boolean known = false;
                            for (int i = 0; (i < limit) && !known; i++) {
                                known = m_namespaces[i].equals(prefix);
                            }
                            if (known) {
                                m_currentElementName.append(c);
                                break;
                            }
                        }
                        // Don't break after ':' (above) if not a part of
                        // namespace as it is the end of the element
                        // name.
                    case ' ':
                    // Tab and line breaks are XML white space too, so they
                    // end the name just like a blank does.
                    case '\t':
                    case '\n':
                    case '\r':
                    case '/':
                        parsingElementName = false;
                        parsingElementData = true;
                        break;
                    // Finding '>' is the end of an element name,
                    // but we process it below.
                    case '>':
                        break;
                    default:
                        m_currentElementName.append(c);
                        break;
                }
            }

            // We found the beginning of a tag, so we start an element
            // name.
            if (c == '<') {
                elementStart = true;
                parsingElementName = true;
                parsingElementData = true;
                m_currentElementName.setLength(0);
                m_currentElementData.setLength(0);
            }

            // If parsing element data, add to it.
            if (parsingElementData) {
                m_currentElementData.append(c);
            }

            // If we find end tag '>' can also be the
            // end of the prologue so we check.
            if (c == '>') {
                if (m_currentElementName.length() > 0) {
                    elementFound = true;
                    parsingElementName = false;
                    // If we find XML without a prologue, need
                    // to treat as default UTF-8 encoding for XML.
                    if (m_getPrologue) {
                        m_getPrologue = false;
                        m_encodingStreamReader.setGetPrologue(false);
                        // If BOM is present, use its definition for
                        // default
                        if (m_encodingStreamReader.isUtfDoc()) {
                            if (m_encodingStreamReader.isUtf16Doc()) {
                                m_encodingUtil.getEncoding(m_fileEncoding,
                                        "UTF-16");
                            } else {
                                m_encodingUtil.getEncoding(m_fileEncoding,
                                        "UTF-8");
                            }
                            m_encoding_set = true;
                        } else {
                            m_encodingUtil.getEncoding(m_fileEncoding,
                                    m_defEncoding);
                        }
                        m_docEncoding = m_encodingUtil.getDocEncoding();
                    }
                } else if (m_getPrologue && prologueFound) {
                    // If we are looking for the prologue, now
                    // we have read the end of it, so we can
                    // get the encoding specified (or null which
                    // defaults to UTF-8).
                    // Only process actual prologues. <?xmlstylesheet
                    // is not what we want.
                    if (m_currentElementData.toString().
                            startsWith("<?xml ")) {
                        m_getPrologue = false;
                        m_encodingStreamReader.setGetPrologue(false);
//#ifdef DLOGGING
//@ if (finestLoggable) {logger.finest("m_currentElementData.length()=" + m_currentElementData.length());}
//#endif
                        String cencoding = getAttributeValue("encoding");
                        if (cencoding == null) {
//#ifdef DLOGGING
//@ if (finestLoggable) {logger.finest("Prologue cencoding,m_defEncoding=" + cencoding + "," + m_defEncoding);}
//#endif
                            cencoding = m_defEncoding;
                        } else {
                            m_encoding_set = true;
                        }
                        m_encodingUtil.getEncoding(m_fileEncoding,
                                cencoding);
                        // Get doc encoding. The encoding to translate
                        // the bytes into.
                        m_docEncoding = m_encodingUtil.getDocEncoding();
                        return PROLOGUE;
                    }
                }
            }

            // If we have not found an element, keep parsing.
            // Otherwise, we get out of the loop.
            if (!elementFound) {
                inputCharacter = is.read();
            }
//#ifdef DTEST
//#ifdef DLOGGING
//@ if (m_debugTrace) {
//@ logger.finest("c=" + c);
//@ logger.finest("m_currentElementName=" + m_currentElementName);
//@ logger.finest("m_currentElementData=" + m_currentElementData);
//@ logger.finest("m_currentElementContainsText=" + m_currentElementContainsText);
//@ logger.finest("parsingElementName=" + parsingElementName);
//@ logger.finest("parsingElementData=" + parsingElementData);
//@ logger.finest("prologueFound=" + prologueFound);
//@ logger.finest("parsingElementData=" + parsingElementData);
//@ logger.finest("parsingElementData=" + parsingElementData);
//@ logger.finest("elementFound=" + elementFound);
//@ logger.finest("elementStart=" + elementStart);
//@ }
//#endif
//#endif
        }

        // Determine if we actually have element data or a tag
        // that ends without data/text (e.g. <br/> has no text).
        // The length guard matters when the stream ends before any
        // complete tag was read (e.g. an empty document): without it the
        // charAt calls throw and END_DOCUMENT is never reported.
        final int dataLen = m_currentElementData.length();
        final boolean selfClosing = (dataLen >= 2) &&
            (m_currentElementData.charAt(dataLen - 2) == '/') &&
            (m_currentElementData.charAt(dataLen - 1) == '>');
        m_currentElementContainsText = !selfClosing;
//#ifdef DLOGGING
//@ if (finerLoggable) {logger.finer("m_currentElementContainsText,m_currentElementData=" + m_currentElementContainsText + "," + m_currentElementData);}
//#endif
    } catch (IOException e) {
//#ifdef DLOGGING
//@ logger.severe("parse read error ", e);
//#endif
        System.out.println("parse read error " + e + " " + e.getMessage());
        e.printStackTrace();
        throw e;
    } catch (OutOfMemoryError e) {
//#ifdef DLOGGING
//@ logger.severe("Out of memory parse read error ", e);
//#endif
        System.out.println("Out of memory parse read error " + e + " " + e.getMessage());
        e.printStackTrace();
        CauseMemoryException ce = new CauseMemoryException(
                "Parse read error. Out of memory.", e);
        throw ce;
    } catch (Throwable e) {
//#ifdef DLOGGING
//@ logger.severe("Internal error parse read error ", e);
//#endif
        System.out.println("Internal error parse read error " + e + " " + e.getMessage());
        e.printStackTrace();
        CauseException ce = new CauseException(
                "Internal error parse read error. ", e);
        throw ce;
    }
    if (inputCharacter == -1) {
        return END_DOCUMENT;
    } else {
        return ELEMENT;
    }
}
/**
 * Parse next element.
 * Picks the encoding stream reader when it reports a modified encoding,
 * otherwise the plain input reader, and hands it to parseStream.
 *
 * @return the code returned by parseStream (END_DOCUMENT, ELEMENT or
 *         PROLOGUE)
 */
public int parse()
throws IOException, CauseMemoryException, CauseException {
    final InputStreamReader source;
    if (m_encodingStreamReader.isModEncoding()) {
        source = m_encodingStreamReader;
    } else {
        source = m_inputStream;
    }
    return parseStream(source);
}
/**
 * Get element name.
 *
 * @return the current contents of the element name buffer as a String
 */
public String getName() {
//#ifdef DLOGGING
//@ if (finerLoggable) {logger.finer("m_currentElementName=" + m_currentElementName);}
//#endif
    final String elementName = m_currentElementName.toString();
    return elementName;
}
/**
 * Get element text including inner xml.
 *
 * Reads characters until the text read so far ends with "</" followed by
 * the current element name; one further character (normally the closing
 * '>') is then consumed and dropped. The end tag text and CDATA markers
 * are removed from the result, the text is re-decoded from m_fileEncoding
 * to m_docEncoding when a document encoding is set, and entities are
 * replaced through EncodingUtil.
 *
 * The element name buffer is left untouched, so getName() still returns
 * the element name after this call.
 *
 * @param is reader to take characters from
 * @param convXmlEnts passed to EncodingUtil.replaceAlphaEntities; when
 *        true EncodingUtil.replaceXmlEntities is applied as well
 * @return the element text; "" when the current element has no text or
 *         no name; null when an unexpected error (including a read
 *         error) occurs and m_acceptErrors is true
 * @throws CauseMemoryException if an OutOfMemoryError occurs
 * @throws CauseException on any other error when m_acceptErrors is false
 */
private String getTextStream(InputStreamReader is,
        final boolean convXmlEnts)
throws IOException, CauseMemoryException, CauseException {

    if (!m_currentElementContainsText) {
        return "";
    }

    String text = "";
    try {
        if (m_currentElementName.length() == 0) {
            return "";
        }

        // Build the end-tag prefix from a copy of the name. Inserting
        // "</" into m_currentElementName itself would corrupt the value
        // later returned by getName().
        final String endCurrentElement =
            "</" + m_currentElementName.toString();

        // Cheap pre-check: the last three characters of the end tag.
        // "</" plus a non-empty name is always at least three characters
        // long, so this also works for one and two character names such
        // as <p> or <id>.
        final int endLen = endCurrentElement.length();
        final char tail0 = endCurrentElement.charAt(endLen - 3);
        final char tail1 = endCurrentElement.charAt(endLen - 2);
        final char tail2 = endCurrentElement.charAt(endLen - 1);

        StringBuffer textBuffer = new StringBuffer();
        boolean endParsing = false;
        char prev2 = ' ';
        char prev1 = ' ';
        int inputCharacter;
        // The read happens before the endParsing test on purpose: the
        // character following the end tag name is consumed and dropped.
        while (((inputCharacter = is.read()) != -1) && !endParsing) {
            final char c = (char)inputCharacter;
            textBuffer.append(c);
            if ((prev2 == tail0) && (prev1 == tail1) && (c == tail2) &&
                    textBuffer.toString().endsWith(endCurrentElement)) {
                endParsing = true;
            }
            prev2 = prev1;
            prev1 = c;
        }

        if (m_docEncoding.length() == 0) {
            text = textBuffer.toString();
        } else {
            try {
                // We read the bytes in as ISO8859_1, so we must get them
                // out as that and then encode as they should be.
                if (m_fileEncoding.length() == 0) {
                    text = new String(textBuffer.toString().getBytes(),
                            m_docEncoding);
                } else {
                    text = new String(textBuffer.toString().getBytes(
                            m_fileEncoding), m_docEncoding);
                }
            } catch (IOException e) {
//#ifdef DLOGGING
//@ logger.severe("getTextStream Could not convert string from,to" + m_fileEncoding + "," + m_docEncoding, e);
//#endif
                System.out.println("getTextStream Could not convert string " +
                        "from,to=" + m_fileEncoding + "," + m_docEncoding +
                        " " + e + " " + e.getMessage());
                e.printStackTrace();
                // Fall back to the text exactly as it was read.
                text = textBuffer.toString();
            }
        }
        textBuffer = null;
        text = StringUtil.replace(text, endCurrentElement, "");

        /** Handle some entities and encoded characters */
        text = StringUtil.replace(text, "<![CDATA[", "");
        text = StringUtil.replace(text, "]]>", "");
        if (text.indexOf('&') >= 0) {
            text = EncodingUtil.replaceAlphaEntities(convXmlEnts, text);
            if (convXmlEnts) {
                text = EncodingUtil.replaceXmlEntities(text);
            }
            // No need to convert from UTF-8 to Unicode using replace
            // umlauts now because it is done with new String...,encoding.

            // Replace numeric entities (e.g. curly quotes).
            text = EncodingUtil.replaceNumEntity(text);
        }

        // Replace special chars like left quote, etc.
        text = m_encodingUtil.replaceSpChars(text);

    } catch (OutOfMemoryError t) {
        CauseMemoryException ce = new CauseMemoryException(
                "Unable to read text. Out of memory.", t);
//#ifdef DLOGGING
//@ logger.severe(ce.getMessage(), ce);
//#endif
        System.out.println("getTextStream Could not read a char run time." + t +
                " " + t.getMessage());
        t.printStackTrace();
        throw ce;
    } catch (Throwable t) {
        // Best effort by design: read errors end up here as well and are
        // reported as null while m_acceptErrors is true.
        CauseException ce = new CauseException("Unable to read text. " +
                "Internal error.", t);
//#ifdef DLOGGING
//@ logger.severe(ce.getMessage(), ce);
//#endif
        System.out.println("getTextStream Could not read a char run time." + t +
                " " + t.getMessage());
        t.printStackTrace();
        if (m_acceptErrors) {
            return null;
        } else {
            throw ce;
        }
    }

//#ifdef DLOGGING
//@ if (finerLoggable) {logger.finer("text=" + text);}
//#endif
    return text;
}
/**
 * Get element text including inner xml, with XML entity conversion
 * switched on.
 * The encoding stream reader is used only when it reports a modified
 * encoding; otherwise the plain input reader is read directly.
 *
 * @return the value returned by getTextStream for the chosen reader
 */
public String getText()
throws IOException, CauseMemoryException, CauseException {
    final InputStreamReader source;
    if (m_encodingStreamReader.isModEncoding()) {
        source = m_encodingStreamReader;
    } else {
        source = m_inputStream;
    }
    return getTextStream(source, true);
}
/**
 * Get element text including inner xml.
 * The encoding stream reader is used only when it reports a modified
 * encoding; otherwise the plain input reader is read directly.
 *
 * @param convXmlEnts forwarded unchanged to getTextStream
 * @return the value returned by getTextStream for the chosen reader
 */
public String getText(final boolean convXmlEnts)
throws IOException, CauseMemoryException, CauseException {
    final InputStreamReader source;
    if (m_encodingStreamReader.isModEncoding()) {
        source = m_encodingStreamReader;
    } else {
        source = m_inputStream;
    }
    return getTextStream(source, convXmlEnts);
}
/**
 * Get attribute value from current element.
 *
 * The tag text in m_currentElementData is searched for an occurrence of
 * attributeName that is followed by '=' (white space is allowed on both
 * sides of the '='). The character after the '=' is taken as the opening
 * quote and the value runs up to the next occurrence of that same
 * character, so double quotes, single quotes and other quote characters
 * are all handled. Occurrences of the name that are not followed by '='
 * (for example inside another attribute's value) are skipped.
 * The match is not required to start at a name boundary.
 *
 * When a document encoding is set the value is re-decoded from
 * m_fileEncoding to m_docEncoding.
 *
 * @param attributeName name of the attribute to look up
 * @return the attribute value; null when the attribute is not present or
 *         not terminated, or when an unexpected error occurs and
 *         m_acceptErrors is true
 * @throws CauseMemoryException if an OutOfMemoryError occurs
 * @throws CauseException on any other error when m_acceptErrors is false
 */
public String getAttributeValue(String attributeName)
throws IOException, CauseMemoryException, CauseException {

    try {
        final String data = m_currentElementData.toString();
        final int dataLen = data.length();
        final int nameLen = attributeName.length();

        int valueStartIndex = -1;
        int valueEndIndex = -1;
        int nameIndex = data.indexOf(attributeName);
        while ((nameIndex >= 0) && (nameIndex < dataLen) &&
                (valueStartIndex < 0)) {
            int pos = nameIndex + nameLen;
            // Optional white space between the name and '='.
            while ((pos < dataLen) && (data.charAt(pos) <= ' ')) {
                pos++;
            }
            if ((pos < dataLen) && (data.charAt(pos) == '=')) {
                pos++;
                // Optional white space between '=' and the opening quote.
                while ((pos < dataLen) && (data.charAt(pos) <= ' ')) {
                    pos++;
                }
                if (pos < dataLen) {
                    // Whatever character opens the value must close it.
                    final int closing =
                        data.indexOf(data.charAt(pos), pos + 1);
                    if (closing >= 0) {
                        valueStartIndex = pos + 1;
                        valueEndIndex = closing;
                    }
                }
            }
            nameIndex = data.indexOf(attributeName, nameIndex + 1);
        }
        if (valueStartIndex < 0) {
            return null;
        }

        /** Parse value */
        String value = data.substring(valueStartIndex, valueEndIndex);
        if (m_docEncoding.length() != 0) {
            // We read the bytes in as ISO8859_1, so we must get them
            // out as that and then encode as they should be.
            if (m_fileEncoding.length() == 0) {
                value = new String(value.getBytes(),
                        m_docEncoding);
            } else {
                value = new String(value.getBytes(
                        m_fileEncoding), m_docEncoding);
            }
        }
//#ifdef DLOGGING
//@ if (finerLoggable) {logger.finer("attribute value=" + value);}
//#endif
        return value;
    } catch (OutOfMemoryError e) {
//#ifdef DLOGGING
//@ logger.severe("Out of memory parse attribute error ", e);
//#endif
        System.out.println("Out of memory parse attribute error " + e + " " + e.getMessage());
        e.printStackTrace();
        CauseMemoryException ce = new CauseMemoryException(
                "Parse attribute read error. Out of memory.", e);
        throw ce;
    } catch (Throwable t) {
//#ifdef DLOGGING
//@ logger.severe("getAttributeValue error.", t);
//#endif
        System.out.println("getAttributeValue error." + t + " " +
                t.getMessage());
        if (m_acceptErrors) {
            return null;
        } else {
            CauseException ce = new CauseException(
                    "Parse attribute read error. Internal error.", t);
            throw ce;
        }
    }
}
/**
 * Get namespaces declared on the current element.
 *
 * Every "xmlns:" declaration in m_currentElementData whose value is
 * enclosed in double or single quotes is collected; white space around
 * the '=' is tolerated. Declarations that are not terminated or not
 * quoted are skipped.
 *
 * @return a two row array: row 0 holds the namespace prefixes and row 1
 *         the matching URLs at the same index; a 0 x 0 array when no
 *         declaration is found or when an error occurs
 */
public String [][] parseNamespaces() {

    try {
        final String data = m_currentElementData.toString();
        final int dataLen = data.length();
        Vector vnamespaces = new Vector();
        Vector vnamesurls = new Vector();
        int nspos = 0;
        while ((nspos = data.indexOf("xmlns:", nspos)) >= 0) {
            // Step past "xmlns:"; this also guarantees progress on
            // every pass through the loop.
            nspos += 6;
            final int eqpos = data.indexOf('=', nspos);
            if (eqpos < 0) {
                continue;
            }
            final String xmlns = data.substring(nspos, eqpos).trim();

            // Find the opening quote, allowing white space after '='.
            int qstart = eqpos + 1;
            while ((qstart < dataLen) && (data.charAt(qstart) <= ' ')) {
                qstart++;
            }
            if (qstart >= dataLen) {
                continue;
            }
            final char quote = data.charAt(qstart);
            if ((quote != '\"') && (quote != '\'')) {
                continue;
            }
            // The value ends at the matching quote character.
            final int qpos = data.indexOf(quote, qstart + 1);
            if (qpos < 0) {
                continue;
            }
            final String url = data.substring(qstart + 1, qpos);
//#ifdef DLOGGING
//@ if (finerLoggable) {logger.finer("xmlns,url=" + xmlns + "," + url);}
//#endif
            vnamespaces.addElement(xmlns);
            vnamesurls.addElement(url);
        }
        final int vlen = vnamespaces.size();
        if (vlen == 0) {
            return new String[0][0];
        }
        String [][] ns = new String[2][vlen];
        for (int ic = 0; ic < vlen; ic++) {
            ns[0][ic] = (String)vnamespaces.elementAt(ic);
            ns[1][ic] = (String)vnamesurls.elementAt(ic);
        }
        return ns;
    } catch (Throwable t) {
//#ifdef DLOGGING
//@ logger.severe("parseNamespaces error.", t);
//#endif
        System.out.println("parseNamespaces error." + t + " " +
                t.getMessage());
        return new String[0][0];
    }
}
/**
 * Set the namespace prefixes that are kept as part of element names.
 * The array is stored as given, without copying.
 *
 * @param namespaces prefixes to keep, or null for none
 */
public void setNamespaces(String [] namespaces) {
    m_namespaces = namespaces;
}
/**
 * Get the namespace prefixes array that was last stored.
 *
 * @return the stored array itself (not a copy); null when none was set
 */
public String [] getNamespaces() {
    return m_namespaces;
}
/**
 * Set the encoding name used to decode element text and attribute values.
 *
 * @param docEncoding encoding name; an empty string switches the
 *        re-decoding off
 */
public void setDocEncoding(String docEncoding) {
    m_docEncoding = docEncoding;
}
/**
 * Get the encoding name currently used to decode element text and
 * attribute values.
 *
 * @return the stored document encoding name
 */
public String getDocEncoding() {
    return m_docEncoding;
}
/**
 * Report the Windows flag of the encoding helper.
 *
 * @return whatever m_encodingUtil.isWindows() returns
 */
public boolean isWindows() {
    return m_encodingUtil.isWindows();
}
/**
 * Report the UTF flag of the encoding helper.
 *
 * @return whatever m_encodingUtil.isUtf() returns
 */
public boolean isUtf() {
    return m_encodingUtil.isUtf();
}
/**
 * Get the encoding helper used by this parser.
 *
 * @return the EncodingUtil instance held by this parser
 */
public EncodingUtil getEncodingUtil() {
    return m_encodingUtil;
}
}