// $Header: /home/cvs/jakarta-jmeter/src/htmlparser/org/htmlparser/Parser.java,v 1.2 2004/02/10 13:41:10 woolfel Exp $
/*
* ====================================================================
* Copyright 2002-2004 The Apache Software Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
// The developers of JMeter and Apache are grateful to the developers
// of HTMLParser for giving Apache Software Foundation a non-exclusive
// license. The performance benefits of HTMLParser are clear and the
// users of JMeter will benefit from the hard work of the HTMLParser
// team. For detailed information about HTMLParser, the project is
// hosted on sourceforge at http://htmlparser.sourceforge.net/.
//
// HTMLParser was originally created by Somik Raha in 2000. Since then
// a healthy community of users has formed and helped refine the
// design so that it is able to tackle the difficult task of parsing
// dirty HTML. Derrick Oswald is the current lead developer and was kind
// enough to assist JMeter.
package org.htmlparser;
//////////////////
// Java Imports //
//////////////////
import java.io.BufferedInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URLConnection;
import java.util.HashMap;
import java.util.Hashtable;
import java.util.Map;
import org.htmlparser.parserHelper.ParserHelper;
import org.htmlparser.parserHelper.TagParser;
import org.htmlparser.scanners.AppletScanner;
import org.htmlparser.scanners.BodyScanner;
import org.htmlparser.scanners.BulletListScanner;
import org.htmlparser.scanners.DivScanner;
import org.htmlparser.scanners.DoctypeScanner;
import org.htmlparser.scanners.FormScanner;
import org.htmlparser.scanners.FrameSetScanner;
import org.htmlparser.scanners.HeadScanner;
import org.htmlparser.scanners.HtmlScanner;
import org.htmlparser.scanners.JspScanner;
import org.htmlparser.scanners.LinkScanner;
import org.htmlparser.scanners.MetaTagScanner;
import org.htmlparser.scanners.ScriptScanner;
import org.htmlparser.scanners.StyleScanner;
import org.htmlparser.scanners.TableScanner;
import org.htmlparser.scanners.TagScanner;
import org.htmlparser.scanners.TitleScanner;
import org.htmlparser.tags.EndTag;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.MetaTag;
import org.htmlparser.tags.Tag;
import org.htmlparser.util.DefaultParserFeedback;
import org.htmlparser.util.IteratorImpl;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.util.ParserFeedback;
import org.htmlparser.visitors.NodeVisitor;
/**
* This is the class that the user will use, either to get an iterator into
* the html page or to directly parse the page and print the results
* <BR>
* Typical usage of the parser is as follows : <BR>
* [1] Create a parser object - passing the URL and a feedback object to the parser<BR>
* [2] Register the common scanners. See {@link #registerScanners()} <BR>
* You wouldnt do this if you want to configure a custom lightweight parser. In that case,
* you would add the scanners of your choice using {@link #addScanner(TagScanner)}<BR>
* [3] Enumerate through the elements from the parser object <BR>
* It is important to note that the parsing occurs when you enumerate, ON DEMAND. This is a thread-safe way,
* and you only get the control back after a particular element is parsed and returned.
*
* <BR>
* Below is some sample code to parse Yahoo.com and print all the tags.
* <pre>
* Parser parser = new Parser("http://www.yahoo.com",new DefaultParserFeedback());
* // In this example, we are registering all the common scanners
* parser.registerScanners();
* for (NodeIterator i = parser.elements();i.hasMoreNodes();) {
* Node node = i.nextNode();
* node.print();
* }
* </pre> Below is some sample code to parse Yahoo.com and print only the text
* information. This scanning will run faster, as there are no scanners
* registered here.
* <pre>
* Parser parser = new Parser("http://www.yahoo.com",new DefaultParserFeedback());
* // In this example, none of the scanners need to be registered
* // as a string node is not a tag to be scanned for.
* for (NodeIterator i = parser.elements();i.hasMoreNodes();) {
* Node node = i.nextNode();
* if (node instanceof StringNode) {
* StringNode stringNode =
* (StringNode)node;
* System.out.println(stringNode.getText());
* }
* }
* </pre>
* The above snippet will print out only the text contents in the html document.<br>
* Here's another snippet that will only print out the link urls in a document.
* This is an example of adding a link scanner.
* <pre>
* Parser parser = new Parser("http://www.yahoo.com",new DefaultParserFeedback());
* parser.addScanner(new LinkScanner("-l"));
* for (NodeIterator i = parser.elements();i.hasMoreNodes();) {
* Node node = i.nextNode();
* if (node instanceof LinkTag) {
* LinkTag linkTag = (LinkTag)node;
* System.out.println(linkTag.getLink());
* }
* }
* </pre>
* @see Parser#elements()
*/
public class Parser implements Serializable
{
// Please don't change the formatting of the version variables below.
// This is done so as to facilitate ant script processing.
/**
* The floating point version number.
*/
public final static double VERSION_NUMBER = 1.3;
/**
* The type of version.
*/
public final static String VERSION_TYPE = "Release Build";
/**
* The date of the version.
*/
public final static String VERSION_DATE = "May 25, 2003";
/**
* The display version.
*/
public final static String VERSION_STRING =
"" + VERSION_NUMBER + " (" + VERSION_TYPE + " " + VERSION_DATE + ")";
// End of formatting
/**
* The default charset.
* This should be <code>ISO-8859-1</code>,
* see RFC 2616 (http://www.ietf.org/rfc/rfc2616.txt?number=2616) section 3.7.1
* Another alias is "8859_1".
*/
protected static final String DEFAULT_CHARSET = "ISO-8859-1";
/**
* Trigger for charset detection.
*/
protected static final String CHARSET_STRING = "charset";
/**
* Feedback object.
*/
protected ParserFeedback feedback;
/**
* The URL or filename to be parsed.
*/
protected String resourceLocn;
/**
* The html reader associated with this parser.
*/
protected transient NodeReader reader;
/**
* The list of scanners to apply at the top level.
*/
private Map scanners;
/**
* The encoding being used to decode the connection input stream.
*/
protected String character_set;
/**
* The source for HTML.
*/
protected transient URLConnection url_conn;
/**
* The bytes extracted from the source.
*/
protected transient BufferedInputStream input;
/**
* A quiet message sink.
* Use this for no feedback.
*/
public static ParserFeedback noFeedback =
new DefaultParserFeedback(DefaultParserFeedback.QUIET);
/**
* A verbose message sink.
* Use this for output on <code>System.out</code>.
*/
public static ParserFeedback stdout = new DefaultParserFeedback();
private ParserHelper parserHelper = new ParserHelper();
//
// Static methods
//
/**
 * Set the line separator by forwarding it, unchanged, to
 * {@link Node#setLineSeparator(String)}.
 * @param lineSeparator New Line separator to be used
 */
public static void setLineSeparator(String lineSeparator)
{
Node.setLineSeparator(lineSeparator);
}
/**
 * Return the display version of this parser.
 * @return The value of {@link #VERSION_STRING}, a string of the form:
 * <pre>
 * "[floating point number] ([build-type] [build-date])"
 * </pre>
 */
public static String getVersion()
{
    return VERSION_STRING;
}
/**
 * Return the numeric version of this parser.
 * @return The value of {@link #VERSION_NUMBER}: the whole number part is
 * the major version, and the fractional part is the minor version.
 */
public static double getVersionNumber()
{
    return VERSION_NUMBER;
}
//
// Constructors
//
/**
* Zero argument constructor.
* The parser is in a safe but useless state.
* Set the reader or connection using setReader() or setConnection().
* @see #setReader(NodeReader)
* @see #setConnection(URLConnection)
*/
public Parser()
{
setFeedback(null);
setScanners(null);
resourceLocn = null;
reader = null;
character_set = DEFAULT_CHARSET;
url_conn = null;
input = null;
Tag.setTagParser(new TagParser(getFeedback()));
}
/**
* This constructor enables the construction of test cases, with readers
* associated with test string buffers. It can also be used with readers of the user's choice
* streaming data into the parser.<p/>
* <B>Important:</B> If you are using this constructor, and you would like to use the parser
* to parse multiple times (multiple calls to parser.elements()), you must ensure the following:<br>
* <ul>
* <li>Before the first parse, you must mark the reader for a length that you anticipate (the size of the stream).</li>
* <li>After the first parse, calls to elements() must be preceded by calls to :
* <pre>
* parser.getReader().reset();
* </pre>
* </li>
* </ul>
* @param rd The reader to draw characters from.
* @param fb The object to use when information,
* warning and error messages are produced. If <em>null</em> no feedback
* is provided.
*/
public Parser(NodeReader rd, ParserFeedback fb)
{
setFeedback(fb);
setScanners(null);
resourceLocn = null;
reader = null;
character_set = DEFAULT_CHARSET;
url_conn = null;
input = null;
setReader(rd);
Tag.setTagParser(new TagParser(feedback));
}
/**
* Constructor for custom HTTP access.
* @param connection A fully conditioned connection. The connect()
* method will be called so it need not be connected yet.
* @param fb The object to use for message communication.
*/
public Parser(URLConnection connection, ParserFeedback fb)
throws ParserException
{
setFeedback(fb);
setScanners(null);
resourceLocn = null;
reader = null;
character_set = DEFAULT_CHARSET;
url_conn = null;
input = null;
Tag.setTagParser(new TagParser(feedback));
setConnection(connection);
}
/**
 * Creates a Parser object with the location of the resource (URL or file).
 * The location is turned into a connection by
 * <code>ParserHelper.openConnection()</code> and then handled exactly like
 * {@link #Parser(URLConnection,ParserFeedback)}.
 * You would typically create a DefaultParserFeedback object and pass it in.
 * @param resourceLocn Either the URL or the filename (autodetects).
 * A standard HTTP GET is performed to read the content of the URL.
 * @param feedback The ParserFeedback object to use when information,
 * warning and error messages are produced. If <em>null</em> no feedback
 * is provided.
 * @exception ParserException if the connection cannot be opened or the
 * reader cannot be created.
 * @see #Parser(URLConnection,ParserFeedback)
 */
public Parser(String resourceLocn, ParserFeedback feedback)
throws ParserException
{
this(ParserHelper.openConnection(resourceLocn, feedback), feedback);
}
/**
 * Creates a Parser object with the location of the resource (URL or file).
 * The shared {@link #stdout} feedback object (a DefaultParserFeedback) is
 * used for feedback.
 * @param resourceLocn Either the URL or the filename (autodetects).
 * @exception ParserException if the connection cannot be opened or the
 * reader cannot be created.
 * @see #Parser(String,ParserFeedback)
 */
public Parser(String resourceLocn) throws ParserException
{
this(resourceLocn, stdout);
}
/**
 * This constructor is present to enable users to plug in their own readers.
 * The shared {@link #stdout} feedback object (a DefaultParserFeedback) is
 * used for feedback. It can also be used with readers of the user's choice
 * streaming data into the parser.
 * <p>
 * <B>Important:</B> If you are using this constructor, and you would like to use the parser
 * to parse multiple times (multiple calls to parser.elements()), you must ensure the following:<br>
 * <ul>
 * <li>Before the first parse, you must mark the reader for a length that you anticipate (the size of the stream).</li>
 * <li>After the first parse, calls to elements() must be preceded by calls to :
 * <pre>
 * parser.getReader().reset();
 * </pre>
 * </li>
 * </ul>
 * @param reader The source for HTML to be parsed.
 * @see #Parser(NodeReader,ParserFeedback)
 */
public Parser(NodeReader reader)
{
this(reader, stdout);
}
/**
 * Constructor for non-standard access.
 * The shared {@link #stdout} feedback object (a DefaultParserFeedback) is
 * used for feedback.
 * @param connection A fully conditioned connection. The connect()
 * method will be called so it need not be connected yet.
 * @exception ParserException if the connection cannot be opened or the
 * reader cannot be created.
 * @see #Parser(URLConnection,ParserFeedback)
 */
public Parser(URLConnection connection) throws ParserException
{
this(connection, stdout);
}
//
// Serialization support
//
/**
 * Serialization hook: writes the non-transient state of this parser using
 * default serialization.
 * Parsers without a URL (for example those built on a reader) may be
 * serialized as well; an earlier restriction that rejected them was
 * deliberately dropped. The transient fields (reader, connection and
 * input stream) are not written.
 * @param out The stream to write this parser to.
 * @exception IOException if the underlying stream fails.
 */
private void writeObject(ObjectOutputStream out) throws IOException
{
    out.defaultWriteObject();
}
/**
 * Serialization hook: restores the non-transient state and then rebuilds
 * the transient connection and reader by re-opening the stored URL.
 * When no URL was stored (null or empty) nothing is re-opened, because
 * {@link #setURL(String)} ignores such values.
 * @param in The stream to read this parser from.
 * @exception IOException if the stream fails, or if the URL cannot be
 * re-opened; in the latter case the original ParserException is attached
 * as the cause.
 * @exception ClassNotFoundException if a serialized class cannot be found.
 */
private void readObject(ObjectInputStream in)
    throws IOException, ClassNotFoundException
{
    in.defaultReadObject();
    try
    {
        // reopen the connection and create a reader which are transient fields
        setURL(getURL());
    }
    catch (ParserException hpe)
    {
        // keep the message format used so far, but do not lose the stack trace
        IOException failure = new IOException(hpe.toString());
        failure.initCause(hpe);
        throw failure;
    }
}
//
// Bean patterns
//
/**
 * Set the connection for this parser.
 * This method sets five of the fields in the parser object;
 * <code>resourceLocn</code>, <code>url_conn</code>, <code>character_set</code>,
 * <code>input</code> and <code>reader</code>. It does not adjust the
 * <code>scanners</code> list or <code>feedback</code> object. The fields
 * are set atomically by this method: either they are all set or, when an
 * i/o error occurs, they are all restored to their previous values.
 * Trying to set the connection to null is a noop.
 * @param connection A fully conditioned connection. The connect()
 * method will be called so it need not be connected yet.
 * @exception ParserException if an i/o exception occurs connecting or
 * creating the reader. The error is also reported to the feedback object.
 */
public void setConnection(URLConnection connection) throws ParserException
{
    if (null == connection)
        return;

    // snapshot everything the attempt below may overwrite
    final String previousLocation = getURL();
    final NodeReader previousReader = getReader();
    final String previousCharset = getEncoding();
    final URLConnection previousConnection = getConnection();
    final BufferedInputStream previousInput = input;
    try
    {
        resourceLocn = connection.getURL().toExternalForm();
        url_conn = connection;
        url_conn.connect();
        character_set = getCharacterSet(url_conn);
        createReader();
    }
    catch (IOException ioe)
    {
        // roll back first so the parser is consistent even if reporting fails;
        // createReader() replaces 'input' before it can fail, so restore it too
        resourceLocn = previousLocation;
        url_conn = previousConnection;
        character_set = previousCharset;
        reader = previousReader;
        input = previousInput;

        String msg =
            "setConnection() : Error in opening a connection to "
                + connection.getURL().toExternalForm();
        ParserException ex = new ParserException(msg, ioe);
        feedback.error(msg, ex);
        throw ex;
    }
}
/**
 * Return the current connection.
 * @return The connection stored by <code>setConnection</code>, or
 * <em>null</em> when the parser works from a reader or has no source.
 * @see #setConnection(URLConnection)
 */
public URLConnection getConnection()
{
    return url_conn;
}
/**
 * Set the URL for this parser.
 * The URL is opened through <code>ParserHelper.openConnection()</code> and
 * the resulting connection is handed to
 * {@link #setConnection(URLConnection)}, which updates the source-related
 * fields of the parser. The <code>scanners</code> list and the
 * <code>feedback</code> object are not adjusted.
 * Trying to set the url to null or an empty string is a noop.
 * @param url The URL (or file name) to parse.
 * @exception ParserException if the connection cannot be opened or the
 * reader cannot be created.
 * @see #setConnection(URLConnection)
 */
public void setURL(String url) throws ParserException
{
    if ((null == url) || "".equals(url))
        return;
    final URLConnection opened = ParserHelper.openConnection(url, getFeedback());
    setConnection(opened);
}
/**
 * Return the current URL being parsed.
 * @return The resource location stored in this parser: the external form
 * of the connection's URL, the URL reported by the reader, or
 * <em>null</em> if no source has been set.
 */
public String getURL()
{
    return resourceLocn;
}
/**
 * Set the encoding for this parser.
 * If there is no connection (getConnection() returns null) it simply sets
 * the character set name stored in the parser (Note: the reader object
 * which must have been set in the constructor or by <code>setReader()</code>,
 * may or may not be using this character set).
 * Otherwise it rewinds the input stream of the connection and creates a
 * reader that uses this character set. In this case the fields
 * <code>character_set</code>, <code>reader</code> and <code>input</code>
 * are updated together: if recreating the reader fails they are all put
 * back to their previous values. <code>resourceLocn</code>,
 * <code>url_conn</code>, <code>scanners</code> and <code>feedback</code>
 * are never touched.
 * Trying to set the encoding to null or an empty string is a noop.
 * @param encoding The name of the character set to use.
 * @exception ParserException If the reader cannot be recreated on the
 * connection's input stream. The error is also reported to the feedback
 * object.
 */
public void setEncoding(String encoding) throws ParserException
{
    if ((null == encoding) || "".equals(encoding))
        return;

    if (null == getConnection())
    {
        // no stream to re-decode, just remember the name
        character_set = encoding;
        return;
    }

    // remember the current state so a failure can be undone
    final NodeReader oldReader = getReader();
    final String oldCharset = getEncoding();
    final BufferedInputStream oldInput = input;
    try
    {
        character_set = encoding;
        recreateReader();
    }
    catch (IOException ioe)
    {
        final String msg =
            "setEncoding() : Error in opening a connection to "
                + getConnection().getURL().toExternalForm();
        final ParserException ex = new ParserException(msg, ioe);
        feedback.error(msg, ex);
        character_set = oldCharset;
        reader = oldReader;
        input = oldInput;
        throw ex;
    }
}
/**
 * The current encoding.
 * This item is set from the HTTP header but may be overridden by meta
 * tags in the head, so this may change after the head has been parsed.
 * @return The name of the character set currently stored in this parser.
 */
public String getEncoding()
{
    return character_set;
}
/**
 * Set the reader for this parser.
 * This method sets four of the fields in the parser object;
 * <code>resourceLocn</code>, <code>url_conn</code>, <code>character_set</code>
 * and <code>reader</code>. It does not adjust the <code>scanners</code> list
 * or <code>feedback</code> object. The <code>resourceLocn</code> is taken
 * from the reader's URL. The <code>url_conn</code> is set to null and the
 * <code>character_set</code> to the default character set, since neither
 * can be determined from the reader.
 * Trying to set the reader to <code>null</code> is a noop.
 * @param rd The reader object to use. This reader will be bound to this
 * parser after this call.
 */
public void setReader(NodeReader rd)
{
    if (null == rd)
        return;

    url_conn = null;
    character_set = DEFAULT_CHARSET;
    resourceLocn = rd.getURL();
    reader = rd;
    // bind the reader back to this parser
    rd.setParser(this);
}
/**
 * Returns the reader associated with the parser.
 * @return The current NodeReader, or <em>null</em> if no source has been
 * set yet.
 */
public NodeReader getReader()
{
    return this.reader;
}
/**
 * Get the number of entries currently held in the scanner map of this
 * parser. A scanner is stored once for each of its ids, so a scanner with
 * several ids is counted several times.
 * @return int number of scanner entries registered
 */
public int getNumScanners()
{
    final int registered = scanners.size();
    return registered;
}
/**
 * This method is to be used to change the set of scanners in the current parser.
 * @param newScanners Map holding the scanner objects to be used during the
 * parsing process. The map is used as is (not copied); passing
 * <em>null</em> installs a new, empty map.
 */
public void setScanners(Map newScanners)
{
    if (null == newScanners)
        scanners = new HashMap();
    else
        scanners = newScanners;
}
/**
 * Get the scanners currently registered in the parser.
 * @return The live map of registered scanners (not a copy).
 */
public Map getScanners()
{
    return this.scanners;
}
/**
 * Sets the feedback object used in scanning.
 * @param fb The new feedback object to use. If <em>null</em>, the quiet
 * {@link #noFeedback} sink is used instead.
 */
public void setFeedback(ParserFeedback fb)
{
    if (null == fb)
        feedback = noFeedback;
    else
        feedback = fb;
}
/**
 * Returns the feedback object currently used by this parser.
 * @return ParserFeedback
 */
public ParserFeedback getFeedback()
{
    return this.feedback;
}
//
// Internal methods
//
/**
 * Open a stream reader on the <code>InputStream</code>.
 * The current character set is tried first. If it is not supported, a
 * warning is sent to the feedback object, the character set is revised to
 * its default value and the reader is opened with that instead.
 * @return A reader decoding the <code>input</code> stream.
 * @exception UnsupportedEncodingException in the unlikely event that
 * the default character set is not supported on this platform.
 */
protected InputStreamReader createInputStreamReader()
    throws UnsupportedEncodingException
{
    try
    {
        return new InputStreamReader(input, character_set);
    }
    catch (UnsupportedEncodingException uee)
    {
        final String message =
            url_conn.getURL().toExternalForm()
                + " has an encoding ("
                + character_set
                + ") which is not supported, using "
                + DEFAULT_CHARSET;
        feedback.warning(message);
        // fall back to the default; if even that fails the exception propagates
        character_set = DEFAULT_CHARSET;
        return new InputStreamReader(input, character_set);
    }
}
/**
 * Create a new reader for the URLConnection object.
 * The connection's input stream is wrapped in a buffered stream that is
 * marked at its start, so that {@link #recreateReader()} can rewind it.
 * The current character set is used to transform the input stream
 * into a character reader, which is then bound to this parser.
 * @exception IOException if there is a problem constructing the reader.
 * @see #createInputStreamReader()
 * @see #getEncoding()
 */
protected void createReader() throws IOException
{
    input = new BufferedInputStream(url_conn.getInputStream());
    // mark the very beginning so the stream can be reset and decoded again
    input.mark(Integer.MAX_VALUE);
    final InputStreamReader decoded = createInputStreamReader();
    reader = new NodeReader(decoded, resourceLocn);
    reader.setParser(this);
}
/**
 * Create a new reader for the URLConnection object but reuse the input stream.
 * The existing stream is reset to its mark and marked again, then the
 * current character set is used to transform it into a character reader.
 * Defaults to <code>createReader()</code> if there is no existing input
 * stream.
 * @exception IOException if there is a problem constructing the reader.
 * @see #createReader()
 * @see #createInputStreamReader()
 * @see #getEncoding()
 */
protected void recreateReader() throws IOException
{
    if (null == input)
    {
        createReader();
        return;
    }

    // rewind to the start and re-arm the mark for a possible later rewind
    input.reset();
    input.mark(Integer.MAX_VALUE);
    final InputStreamReader decoded = createInputStreamReader();
    reader = new NodeReader(decoded, resourceLocn);
    reader.setParser(this);
}
/**
 * Try and extract the character set from the HTTP header.
 * Only the <code>Content-Type</code> header field is consulted.
 * @param connection The connection with the charset info.
 * @return The character set name to use for this HTML page; the default
 * character set if the connection has no <code>Content-Type</code> field.
 * @see #getCharset(String)
 */
protected String getCharacterSet(URLConnection connection)
{
    final String contentType = connection.getHeaderField("Content-Type");
    if (null == contentType)
        return DEFAULT_CHARSET;
    return getCharset(contentType);
}
/**
 * Get a CharacterSet name corresponding to a charset parameter.
 * @param content A text line of the form:
 * <pre>
 * text/html; charset=Shift_JIS
 * </pre>
 * which is applicable both to the HTTP header field Content-Type and
 * the meta tag http-equiv="Content-Type".
 * The parameter name is matched without regard to case, so
 * <code>Charset=</code> and <code>CHARSET=</code> are recognized too.
 * Note this method also handles non-compliant quoted charset directives such as:
 * <pre>
 * text/html; charset="UTF-8"
 * </pre>
 * and
 * <pre>
 * text/html; charset='UTF-8'
 * </pre>
 * Whitespace around the value, including before a following <code>;</code>,
 * is ignored.
 * @return The character set name to use when reading the input stream.
 * The name found is passed to <code>ParserHelper.findCharset()</code>
 * together with the default, and its result is returned; if that differs
 * from the name found, an informational message is sent to the feedback
 * object. If the charset parameter is not found in the given string
 * (or the string is <em>null</em>), the default character set is returned.
 * @see ParserHelper#findCharset
 * @see #DEFAULT_CHARSET
 */
protected String getCharset(String content)
{
    String ret = DEFAULT_CHARSET;
    if (null == content)
        return ret;

    // parameter names are case-insensitive, so "Charset=" must match as well
    final int start = indexOfIgnoreCase(content, CHARSET_STRING);
    if (start == -1)
        return ret;

    String value = content.substring(start + CHARSET_STRING.length()).trim();
    if (!value.startsWith("="))
        return ret;
    value = value.substring(1).trim();

    // cut off any following parameter, and the blanks that preceded its ';'
    final int end = value.indexOf(";");
    if (end != -1)
        value = value.substring(0, end).trim();

    //remove any double quotes from around charset string
    if (value.startsWith("\"")
        && value.endsWith("\"")
        && (1 < value.length()))
        value = value.substring(1, value.length() - 1);
    //remove any single quote from around charset string
    if (value.startsWith("'")
        && value.endsWith("'")
        && (1 < value.length()))
        value = value.substring(1, value.length() - 1);

    ret = ParserHelper.findCharset(value, ret);
    // Charset names are not case-sensitive;
    // that is, case is always ignored when comparing charset names.
    if (!ret.equalsIgnoreCase(value))
    {
        feedback.info(
            "detected charset \""
                + value
                + "\", using \""
                + ret
                + "\"");
    }
    return ret;
}

/**
 * Find the first occurrence of <code>token</code> in <code>text</code>,
 * ignoring case. Works on the original string so the returned index can
 * be used with it directly.
 * @return The index of the first match, or -1 if there is none.
 */
private static int indexOfIgnoreCase(String text, String token)
{
    final int length = token.length();
    final int last = text.length() - length;
    for (int i = 0; i <= last; i++)
        if (text.regionMatches(true, i, token, 0, length))
            return i;
    return -1;
}
//
// Public methods
//
/**
 * Add a new Tag Scanner.
 * For a no-frills parser, use registerScanners() to add the most common
 * scanners. Use this method to compose a parser with only certain scanners
 * registered - registering only the scanners you want gives faster parsing -
 * or to register custom scanners you have developed.
 * The scanner is stored under each of the ids it reports, replacing any
 * scanner previously stored under the same id, and is given this parser's
 * feedback object.
 * @param scanner TagScanner object (or derivative) to be added to the list of registered scanners
 */
public void addScanner(TagScanner scanner)
{
    final String[] keys = scanner.getID();
    int k = 0;
    while (k < keys.length)
    {
        scanners.put(keys[k], scanner);
        k++;
    }
    scanner.setFeedback(feedback);
}
/**
 * Returns an iterator (enumeration) to the html nodes. Each node can be a tag/endtag/
 * string/link/image<br>
 * This is perhaps the most important method of this class. In typical situations, you will need to use
 * the parser like this :
 * <pre>
 * Parser parser = new Parser("http://www.yahoo.com");
 * parser.registerScanners();
 * for (NodeIterator i = parser.elements();i.hasMoreNodes();) {
 *     Node node = i.nextNode();
 *     if (node instanceof StringNode) {
 *         // Downcasting to StringNode
 *         StringNode stringNode = (StringNode)node;
 *         // Do whatever processing you want with the string node
 *         System.out.println(stringNode.getText());
 *     }
 *     // Check for the node or tag that you want
 *     if (node instanceof ...) {
 *         // Downcast, and process
 *     }
 * }
 * </pre>
 * When the parser works on a connection, the start of the document is
 * pre-read by {@link #createIteratorImpl(boolean,IteratorImpl)} looking
 * for a charset directive before the iterator is returned.
 * @return An iterator over the nodes of the current source.
 * @exception ParserException if the pre-read for the charset directive fails.
 */
public NodeIterator elements() throws ParserException
{
    final IteratorImpl iterator =
        new IteratorImpl(reader, resourceLocn, feedback);
    // false: no removal of the "-m" scanner is requested by this caller
    return createIteratorImpl(false, iterator);
}
/**
 * Prepare an iterator for use, pre-reading the document head to detect a
 * charset directive when the parser works on a connection.
 * Without a connection the iterator is returned untouched. Otherwise nodes
 * are peeked from the iterator until either a META tag with
 * HTTP-EQUIV="Content-Type" or a &lt;/HEAD&gt; end tag is seen. If the META
 * tag names a character set different from the current one, the reader is
 * recreated with the new character set and a fresh iterator over it is
 * returned instead.
 * <p>
 * A meta tag scanner is needed for the pre-read. One is registered
 * temporarily under each of its ids that has no scanner yet, and exactly
 * those entries are removed again before returning, so scanners registered
 * by the user are neither replaced nor left behind.
 * @param remove_scanner If <code>true</code> (and there is a connection),
 * the entry stored under the key "-m" is removed from the scanner map
 * after the pre-read, as this method has always done on request.
 * @param ret The iterator to prepare.
 * @return The iterator to use: <code>ret</code>, or a new iterator if the
 * character set changed.
 * @exception ParserException if the encoding is not supported or the
 * stream cannot be re-read. The error is also reported to the feedback
 * object.
 */
public IteratorImpl createIteratorImpl(
    boolean remove_scanner,
    IteratorImpl ret)
    throws ParserException
{
    if (null == url_conn)
        return ret;

    Node node;
    MetaTag meta;
    String httpEquiv;
    String charset;
    EndTag end;

    // addScanner() keys the map by the ids a scanner reports, not by its
    // filter string, so the temporary scanner is tracked by those ids
    final MetaTagScanner metaScanner = new MetaTagScanner("-m");
    final String[] metaIds = metaScanner.getID();
    final boolean[] registeredHere = new boolean[metaIds.length];
    try
    {
        boolean anyRegistered = false;
        for (int i = 0; i < metaIds.length; i++)
        {
            if (null == scanners.get(metaIds[i]))
            {
                scanners.put(metaIds[i], metaScanner);
                registeredHere[i] = true;
                anyRegistered = true;
            }
        }
        if (anyRegistered)
            metaScanner.setFeedback(feedback);

        /* pre-read up to </HEAD> looking for charset directive */
        while (null != (node = ret.peek()))
        {
            if (node instanceof MetaTag)
            { // check for charset on Content-Type
                meta = (MetaTag) node;
                httpEquiv = meta.getAttribute("HTTP-EQUIV");
                if ("Content-Type".equalsIgnoreCase(httpEquiv))
                {
                    charset = getCharset(meta.getAttribute("CONTENT"));
                    if (!charset.equalsIgnoreCase(character_set))
                    { // oops, different character set, restart
                        character_set = charset;
                        recreateReader();
                        ret =
                            new IteratorImpl(
                                reader,
                                resourceLocn,
                                feedback);
                    }
                    // once we see the Content-Type meta tag we're finished the pre-read
                    break;
                }
            }
            else if (node instanceof EndTag)
            {
                end = (EndTag) node;
                if (end.getTagName().equalsIgnoreCase("HEAD"))
                    // or, once we see the </HEAD> tag we're finished the pre-read
                    break;
            }
        }
    }
    catch (UnsupportedEncodingException uee)
    {
        String msg =
            "elements() : The content of "
                + url_conn.getURL().toExternalForm()
                + " has an encoding which is not supported";
        ParserException ex = new ParserException(msg, uee);
        feedback.error(msg, ex);
        throw ex;
    }
    catch (IOException ioe)
    {
        String msg =
            "elements() : Error in opening a connection to "
                + url_conn.getURL().toExternalForm();
        ParserException ex = new ParserException(msg, ioe);
        feedback.error(msg, ex);
        throw ex;
    }
    finally
    {
        // take out only what was put in above, and only if it is still ours
        for (int i = 0; i < metaIds.length; i++)
        {
            if (registeredHere[i]
                && (metaScanner == scanners.get(metaIds[i])))
                scanners.remove(metaIds[i]);
        }
        // honour an explicit request from the caller as before
        if (remove_scanner)
            scanners.remove("-m");
    }
    return ret;
}
/**
 * Flush the current scanners registered. The registered scanners list becomes empty with this call.
 * A new, empty map of the same kind the parser starts out with replaces
 * the current one; a map previously handed to
 * {@link #setScanners(Map)} is left unmodified.
 */
public void flushScanners()
{
    // same map type as setScanners(null), so lookups behave the same before and after a flush
    scanners = new HashMap();
}
/**
 * Return the scanner registered in the parser under the given id.
 * @param id The id of the requested scanner
 * @return TagScanner The Tag Scanner, or <em>null</em> if the scanner map
 * holds no entry for that id.
 */
public TagScanner getScanner(String id)
{
    final Object registered = scanners.get(id);
    return (TagScanner) registered;
}
/**
 * Parse the given resource and print nodes to <code>System.out</code>,
 * using the filter provided.
 * Without a filter every node is printed. With a filter only tags are
 * printed whose scanner reports a filter string equal to the one given.
 * @param filter The filter string to match against each tag's scanner, or
 * <em>null</em> to print every node.
 * @exception Exception if iterating over the nodes fails.
 */
public void parse(String filter) throws Exception
{
    final NodeIterator nodes = elements();
    while (nodes.hasMoreNodes())
    {
        final Node current = nodes.nextNode();
        if (current == null)
        {
            System.out.println("Node is null");
            continue;
        }
        if (filter == null)
        {
            System.out.println(current.toString());
            continue;
        }
        // There is a filter. Only tags that came from a scanner carry one.
        if (!(current instanceof Tag))
            continue;
        final TagScanner owner = ((Tag) current).getThisScanner();
        if (owner == null)
            continue;
        final String ownerFilter = owner.getFilter();
        if ((ownerFilter != null) && ownerFilter.equals(filter))
            System.out.println(current.toString());
    }
}
/**
 * This method should be invoked in order to register some common scanners.
 * The scanners that get added, in this order, are : <br>
 * LinkScanner (filter key LinkTag.LINK_TAG_FILTER)<br>
 * the image scanner created by the link scanner (filter key ImageTag.IMAGE_TAG_FILTER)<br>
 * ScriptScanner (filter key "-s") <br>
 * StyleScanner (filter key "-t") <br>
 * JspScanner (filter key "-j") <br>
 * AppletScanner (filter key "-a") <br>
 * MetaTagScanner (filter key "-m") <br>
 * TitleScanner (filter key "-T") <br>
 * DoctypeScanner (filter key "-d") <br>
 * FormScanner (filter key "-f") <br>
 * FrameSetScanner (filter key "-r") <br>
 * the base href scanner created by the link scanner (filter key "-b") <br>
 * BulletListScanner (filter key "-bulletList") <br>
 * DivScanner (filter key "-div") <br>
 * TableScanner <br>
 * <br>
 * Call this method after creating the Parser object, before any other
 * scanner is added: if scanners are already registered a message is
 * printed on <code>System.err</code> and nothing is registered. e.g. <BR>
 * <pre>
 * Parser parser = new Parser("http://www.yahoo.com");
 * parser.registerScanners();
 * </pre>
 */
public void registerScanners()
{
    if (!scanners.isEmpty())
    {
        System.err.println(
            "registerScanners() should be called first, when no other scanner has been registered.");
        System.err.println(
            "Other scanners already exist, hence this method call wont have any effect");
        return;
    }

    // The BaseHREF and Image scanners are created by the link scanner's
    // factory methods, so that all three share the same link processor.
    final LinkScanner links = new LinkScanner(LinkTag.LINK_TAG_FILTER);
    addScanner(links);
    addScanner(links.createImageScanner(ImageTag.IMAGE_TAG_FILTER));

    addScanner(new ScriptScanner("-s"));
    addScanner(new StyleScanner("-t"));
    addScanner(new JspScanner("-j"));
    addScanner(new AppletScanner("-a"));
    addScanner(new MetaTagScanner("-m"));
    addScanner(new TitleScanner("-T"));
    addScanner(new DoctypeScanner("-d"));
    addScanner(new FormScanner("-f", this));
    addScanner(new FrameSetScanner("-r"));
    addScanner(links.createBaseHREFScanner("-b"));
    addScanner(new BulletListScanner("-bulletList", this));
    // a SpanScanner ("-p") is intentionally not registered
    addScanner(new DivScanner("-div"));
    addScanner(new TableScanner(this));
}
/**
 * Make a call to registerDomScanners(), instead of registerScanners(),
 * when you are interested in retrieving a Dom representation of the html
 * page. Upon parsing, you will receive an Html object - which will contain
 * children, one of which would be the body. This is still evolving, and in
 * future releases, you might see consolidation of Html - to provide you
 * with methods to access the body and the head.
 * <p>
 * The common scanners are registered through {@link #registerScanners()},
 * which does nothing but print a message if scanners are already
 * registered. The Html, Body and Head scanners are added in either case.
 */
public void registerDomScanners()
{
registerScanners();
// added unconditionally, even when registerScanners() declined to register anything
addScanner(new HtmlScanner());
addScanner(new BodyScanner());
addScanner(new HeadScanner());
}
/**
 * Removes a specified scanner object. You can create
 * an anonymous object as a parameter. This method
 * will use the scanner's keys and remove them from the
 * registry of scanners.
 * e.g.
 * <pre>
 * removeScanner(new FormScanner(""));
 * </pre>
 * Every id reported by the scanner is removed, mirroring
 * {@link #addScanner(TagScanner)} which registers the scanner under every
 * id. Removal is by key: whatever scanner is stored under those ids is
 * removed, whether or not it is the instance passed in.
 * @param scanner TagScanner object to be removed from the list of registered scanners
 */
public void removeScanner(TagScanner scanner)
{
    final String[] ids = scanner.getID();
    for (int i = 0; i < ids.length; i++)
    {
        scanners.remove(ids[i]);
    }
}
/**
 * The main program, which can be executed from the command line.
 * The first argument is the resource to parse: an http(s) URL, or
 * otherwise a file name that is converted to a file URL. An optional
 * second argument is a filter string passed to {@link #parse(String)}.
 * With no arguments, or with "-help", the usage is printed and the
 * program exits with status -1.
 * @param args The command line arguments.
 */
public static void main(String[] args)
{
    System.out.println("HTMLParser v" + VERSION_STRING);
    if (args.length < 1 || args[0].equals("-help"))
    {
        System.out.println();
        System.out.println(
            "Syntax : java -jar htmlparser.jar <resourceLocn/website> -l");
        System.out.println(
            " <resourceLocn> the name of the file to be parsed (with complete path if not in current directory)");
        System.out.println(
            " -l Show only the link tags extracted from the document");
        System.out.println(
            " -i Show only the image tags extracted from the document");
        System.out.println(
            " -s Show only the Javascript code extracted from the document");
        System.out.println(
            " -t Show only the Style code extracted from the document");
        System.out.println(
            " -a Show only the Applet tag extracted from the document");
        System.out.println(" -j Parse JSP tags");
        System.out.println(" -m Parse Meta tags");
        System.out.println(" -T Extract the Title");
        System.out.println(" -f Extract forms");
        System.out.println(" -r Extract frameset");
        System.out.println(" -help This screen");
        System.out.println();
        System.out.println(
            "HTML Parser home page : http://htmlparser.sourceforge.net");
        System.out.println();
        System.out.println(
            "Example : java -jar htmlparser.jar http://www.yahoo.com");
        System.out.println();
        System.out.println(
            "If you have any doubts, please join the HTMLParser mailing list (user/developer) from the HTML Parser home page instead of mailing any of the contributors directly. You will be surprised with the quality of open source support. ");
        System.exit(-1);
    }
    try
    {
        // only a leading http(s) scheme marks a URL; "http" elsewhere in a
        // file name must not prevent the conversion to a file URL
        if (!isHttpUrl(args[0]))
        {
            File input = new File(args[0]);
            try
            {
                // toURI() escapes characters that are illegal in URLs
                args[0] = input.toURI().toURL().toString();
                System.out.println("file converted to URL: " + args[0]);
            }
            catch (MalformedURLException e)
            {
                // carry on with the argument as given; the parser may still resolve it
                e.printStackTrace();
            }
        }
        Parser parser = new Parser(args[0]);
        System.out.println("Parsing " + parser.getURL());
        parser.registerScanners();
        try
        {
            long start = System.currentTimeMillis();
            if (args.length == 2)
            {
                parser.parse(args[1]);
            }
            else
                parser.parse(null);
            System.out.println(
                "Elapsed Time ms: " + (System.currentTimeMillis() - start));
        }
        catch (Exception e)
        {
            e.printStackTrace();
        }
    }
    catch (ParserException e)
    {
        e.printStackTrace();
    }
}

/**
 * Tell whether the location starts with an http or https scheme,
 * ignoring case.
 */
private static boolean isHttpUrl(String location)
{
    return location.regionMatches(true, 0, "http://", 0, 7)
        || location.regionMatches(true, 0, "https://", 0, 8);
}
/**
 * Apply the visitor to every top level node produced by
 * {@link #elements()}, in order, and then tell the visitor that parsing
 * has finished.
 * @param visitor The visitor each node is asked to accept.
 * @exception ParserException if parsing fails.
 */
public void visitAllNodesWith(NodeVisitor visitor) throws ParserException
{
    final NodeIterator nodes = elements();
    while (nodes.hasMoreNodes())
    {
        nodes.nextNode().accept(visitor);
    }
    visitor.finishedParsing();
}
/**
 * Initializes the parser with the given input HTML String.
 * A reader over the string is created and installed with
 * {@link #setReader(NodeReader)}, the same way {@link #createParser(String)}
 * sets up its parser, so any previous connection is dropped and the
 * encoding goes back to the default.
 * Passing <em>null</em> is a noop.
 * @param inputHTML the input HTML that is to be parsed.
 */
public void setInputHTML(String inputHTML)
{
    // the test used to be inverted: only the empty string ever got a reader
    if (null != inputHTML)
    {
        setReader(new NodeReader(new StringReader(inputHTML), ""));
    }
}
/**
 * Parse the source and gather nodes of the given type.
 * Every top level node produced by {@link #elements()} is asked to collect
 * into one shared list via <code>collectInto()</code>.
 * @param nodeType The class of the nodes wanted.
 * @return The collected nodes as an array.
 * @exception ParserException if parsing fails.
 */
public Node[] extractAllNodesThatAre(Class nodeType) throws ParserException
{
    final NodeList collected = new NodeList();
    final NodeIterator nodes = elements();
    while (nodes.hasMoreNodes())
    {
        final Node current = nodes.nextNode();
        current.collectInto(collected, nodeType);
    }
    return collected.toNodeArray();
}
/**
 * Creates the parser on an input string.
 * The string is wrapped in a reader with an empty resource location and
 * handed to {@link #Parser(NodeReader)}; no scanners are registered.
 * @param inputHTML the HTML text to parse
 * @return Parser a parser reading from the given string
 */
public static Parser createParser(String inputHTML)
{
    final StringReader characters = new StringReader(inputHTML);
    return new Parser(new NodeReader(characters, ""));
}
/**
 * Creates a parser on an input string with a single link scanner
 * registered, using LinkTag.LINK_TAG_FILTER as its filter.
 * @param inputHTML the HTML text to parse
 * @return Parser a parser reading from the given string
 * @see #createParser(String)
 */
public static Parser createLinkRecognizingParser(String inputHTML)
{
    final Parser linkParser = createParser(inputHTML);
    final LinkScanner links = new LinkScanner(LinkTag.LINK_TAG_FILTER);
    linkParser.addScanner(links);
    return linkParser;
}
}