/**
 * Creates a parser that reads its HTML input from the given character stream.
 * <p>
 * The reader is wrapped in an {@link InputSource} and passed on to
 * {@link #Parser(InputSource)}, which performs the actual parsing.
 *
 * @param reader character stream supplying the document to parse
 * @throws IOException propagated from {@link #Parser(InputSource)}
 * @throws SAXException propagated from {@link #Parser(InputSource)}
 */
public Parser(Reader reader) throws IOException, SAXException {
this(new InputSource(reader));
}
public Parser(InputSource source) throws IOException, SAXException {
final SAXParser parser = new SAXParser();
parser.setFeature("http://xml.org/sax/features/namespaces", true);
parser.setFeature("http://cyberneko.org/html/features/balance-tags", true);
parser.setFeature("http://cyberneko.org/html/features/report-errors", false);
parser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
parser.setProperty("http://cyberneko.org/html/properties/names/attrs", "lower");
final StringBuilder title = new StringBuilder(), body = new StringBuilder();
final DefaultHandler handler = new DefaultHandler() {
private int inBODY = 0, inHEAD = 0, inTITLE = 0, suppressed = 0;
/**
 * Tracks which document section is being entered and records the pieces of
 * markup the parser keeps: {@code <meta>} name/content pairs while inside the
 * head, and {@code <img alt>} text while inside the body.
 *
 * @throws SAXException if a {@code <frameset>} element is encountered outside
 *         of both head and body
 */
@Override
public void startElement(String namespaceURI, String localName, String qName, Attributes atts) throws SAXException {
  // --- inside <head>: only <title> and <meta> are of interest ---
  if (inHEAD > 0) {
    if ("title".equals(localName)) {
      inTITLE++;
      return;
    }
    if (!"meta".equals(localName)) {
      return;
    }
    // A meta tag is keyed by "name", falling back to "http-equiv".
    final String named = atts.getValue("name");
    final String key = (named != null) ? named : atts.getValue("http-equiv");
    final String content = atts.getValue("content");
    if (key == null || content == null) {
      return;
    }
    metaTags.setProperty(key.toLowerCase(Locale.ROOT), content);
    return;
  }

  // --- inside <body>: count suppressed elements, keep image alt text ---
  if (inBODY > 0) {
    if (SUPPRESS_ELEMENTS.contains(localName)) {
      suppressed++;
      return;
    }
    if ("img".equals(localName)) {
      // the original javacc-based parser preserved <IMG alt="..."/>
      // attribute as body text in [] parenthesis:
      final String altText = atts.getValue("alt");
      if (altText != null) {
        body.append('[');
        body.append(altText);
        body.append(']');
      }
    }
    return;
  }

  // --- outside both sections: detect section starts, reject framesets ---
  if ("body".equals(localName)) {
    inBODY++;
    return;
  }
  if ("head".equals(localName)) {
    inHEAD++;
    return;
  }
  if ("frameset".equals(localName)) {
    throw new SAXException("This parser does not support HTML framesets.");
  }
}
/**
 * Updates the section counters when an element closes and, inside the body,
 * emits a line break after elements listed in {@code ENDLINE_ELEMENTS}.
 * While the body is open it takes precedence: head/title bookkeeping is only
 * done when no body is open.
 */
@Override
public void endElement(String namespaceURI, String localName, String qName) throws SAXException {
  if (inBODY > 0) {
    if ("body".equals(localName)) {
      inBODY--;
      return;
    }
    if (ENDLINE_ELEMENTS.contains(localName)) {
      body.append('\n');
      return;
    }
    if (SUPPRESS_ELEMENTS.contains(localName)) {
      suppressed--;
    }
    return;
  }

  if (inHEAD <= 0) {
    // neither in body nor in head: nothing to track
    return;
  }

  if ("head".equals(localName)) {
    inHEAD--;
    return;
  }
  if ("title".equals(localName) && inTITLE > 0) {
    inTITLE--;
  }
}
/**
 * Appends character data to the body buffer or the title buffer, depending on
 * the section currently open.
 * <p>
 * Body context takes precedence over title context, mirroring
 * {@code endElement}: while a body is open, text either goes to the body
 * buffer or, when inside a suppressed element, is dropped. It is never
 * redirected to the title, even if the title counter is still positive.
 *
 * @param ch     buffer holding the characters
 * @param start  offset of the first character to use
 * @param length number of characters to use
 */
@Override
public void characters(char[] ch, int start, int length) throws SAXException {
  if (inBODY > 0) {
    // Suppressed text is discarded here rather than falling through to the
    // title branch below.
    if (suppressed == 0) {
      body.append(ch, start, length);
    }
  } else if (inTITLE > 0) {
    title.append(ch, start, length);
  }
}
/**
 * Answers every external entity request with an empty character stream, so
 * that DTD references never trigger network access.
 *
 * @param publicId public identifier of the requested entity (ignored)
 * @param systemId system identifier of the requested entity (ignored)
 * @return an input source backed by an empty reader
 */
@Override
public InputSource resolveEntity(String publicId, String systemId) {
  // disable network access caused by DTDs
  final Reader nothing = new StringReader("");
  final InputSource emptySource = new InputSource(nothing);
  return emptySource;
}
};
parser.setContentHandler(handler);
parser.setErrorHandler(handler);
parser.parse(source);
// the javacc-based parser trimmed title (which should be done for HTML in all cases):
this.title = title.toString().trim();
// assign body text