// $Header: /home/cvs/jakarta-jmeter/src/htmlparser/org/htmlparser/tags/Tag.java,v 1.4 2005/07/12 20:50:38 mstover1 Exp $
/*
* ====================================================================
* Copyright 2002-2004 The Apache Software Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
// The developers of JMeter and Apache are grateful to the developers
// of HTMLParser for giving Apache Software Foundation a non-exclusive
// license. The performance benefits of HTMLParser are clear and the
// users of JMeter will benefit from the hard work of the HTMLParser
// team. For detailed information about HTMLParser, the project is
// hosted on sourceforge at http://htmlparser.sourceforge.net/.
//
// HTMLParser was originally created by Somik Raha in 2000. Since then
// a healthy community of users has formed and helped refine the
// design so that it is able to tackle the difficult task of parsing
// dirty HTML. Derrick Oswald is the current lead developer and was kind
// enough to assist JMeter.
package org.htmlparser.tags;
import java.util.Enumeration;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Map;
import org.htmlparser.Node;
import org.htmlparser.NodeReader;
import org.htmlparser.parserHelper.AttributeParser;
import org.htmlparser.parserHelper.TagParser;
import org.htmlparser.scanners.TagScanner;
import org.htmlparser.tags.data.TagData;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.NodeVisitor;
/**
* Tag represents a generic tag. This class allows users to register specific
* tag scanners, which can identify links, or image references. This tag asks
* the scanners to run over the text, and identify. It can be used to
* dynamically configure a parser.
*
* @author Kaarle Kaila 23.10.2001
*/
public class Tag extends Node {
public static final String TYPE = "TAG";
/**
* Constant used as value for the value of the tag name in parseParameters
* (Kaarle Kaila 3.8.2001)
*/
public final static String TAGNAME = "$<TAGNAME>$";
public final static String EMPTYTAG = "$<EMPTYTAG>$";
private final static int TAG_BEFORE_PARSING_STATE = 1;
private final static int TAG_BEGIN_PARSING_STATE = 2;
private final static int TAG_FINISHED_PARSING_STATE = 3;
private final static int TAG_ILLEGAL_STATE = 4;
private final static int TAG_IGNORE_DATA_STATE = 5;
private final static int TAG_IGNORE_BEGIN_TAG_STATE = 6;
private final static String EMPTY_STRING = "";
private static AttributeParser paramParser = new AttributeParser();
private static TagParser tagParser;
/**
* Tag contents will have the contents of the comment tag.
*/
protected StringBuffer tagContents;
private boolean emptyXmlTag = false;
/**
* tag parameters parsed into this hashtable not implemented yet added by
* Kaarle Kaila 23.10.2001
*/
protected Hashtable attributes = null;
/**
* Scanner associated with this tag (useful for extraction of filtering data
* from a HTML node)
*/
protected TagScanner thisScanner = null;
private java.lang.String tagLine;
/**
* The combined text of all the lines spanned by this tag
*/
private String[] tagLines;
/**
* The line number on which this tag starts
*/
private int startLine;
/**
* Set of tags that breaks the flow.
*/
protected static HashSet breakTags;
static {
breakTags = new HashSet(30);
breakTags.add("BLOCKQUOTE");
breakTags.add("BODY");
breakTags.add("BR");
breakTags.add("CENTER");
breakTags.add("DD");
breakTags.add("DIR");
breakTags.add("DIV");
breakTags.add("DL");
breakTags.add("DT");
breakTags.add("FORM");
breakTags.add("H1");
breakTags.add("H2");
breakTags.add("H3");
breakTags.add("H4");
breakTags.add("H5");
breakTags.add("H6");
breakTags.add("HEAD");
breakTags.add("HR");
breakTags.add("HTML");
breakTags.add("ISINDEX");
breakTags.add("LI");
breakTags.add("MENU");
breakTags.add("NOFRAMES");
breakTags.add("OL");
breakTags.add("P");
breakTags.add("PRE");
breakTags.add("TD");
breakTags.add("TH");
breakTags.add("TITLE");
breakTags.add("UL");
}
/**
* Set the Tag with the beginning posn, ending posn and tag contents (in a
* tagData object.
*
* @param tagData
* The data for this tag
*/
public Tag(TagData tagData) {
super(tagData.getTagBegin(), tagData.getTagEnd());
this.startLine = tagData.getStartLine();
this.tagContents = new StringBuffer();
this.tagContents.append(tagData.getTagContents());
this.tagLine = tagData.getTagLine();
this.tagLines = new String[] { tagData.getTagLine() };
this.emptyXmlTag = tagData.isEmptyXmlTag();
}
public void append(char ch) {
tagContents.append(ch);
}
public void append(String ch) {
tagContents.append(ch);
}
/**
* Locate the tag withing the input string, by parsing from the given
* position
*
* @param reader
* HTML reader to be provided so as to allow reading of next line
* @param input
* Input String
* @param position
* Position to start parsing from
*/
public static Tag find(NodeReader reader, String input, int position) {
return tagParser.find(reader, input, position);
}
/**
* This method is not to be called by any scanner or tag. It is an expensive
* method, hence it has been made private. However, there might be some
* circumstances when a scanner wishes to force parsing of attributes over
* and above what has already been parsed. To make the choice clear - we
* have a method - redoParseAttributes(), which can be used.
*
* @return Hashtable
*/
private Hashtable parseAttributes() {
return paramParser.parseAttributes(this);
}
/**
* In case the tag is parsed at the scan method this will return value of a
* parameter not implemented yet
*
* @param name
* of parameter
*/
public String getAttribute(String name) {
return (String) getAttributes().get(name.toUpperCase());
}
/**
* Set attribute with given key, value pair.
*
* @param key
* @param value
*/
public void setAttribute(String key, String value) {
attributes.put(key, value);
}
/**
* In case the tag is parsed at the scan method this will return value of a
* parameter not implemented yet
*
* @param name
* of parameter
* @deprecated use getAttribute instead
*/
public String getParameter(String name) {
return (String) getAttributes().get(name.toUpperCase());
}
/**
* Gets the attributes in the tag.
*
* @return Returns a Hashtable of attributes
*/
public Hashtable getAttributes() {
if (attributes == null) {
attributes = parseAttributes();
}
return attributes;
}
public String getTagName() {
return (String) getAttributes().get(TAGNAME);
}
/**
* Returns the line where the tag was found
*
* @return java.lang.String
*/
public String getTagLine() {
return tagLine;
}
/**
* Returns the combined text of all the lines spanned by this tag
*
* @return java.lang.String
*/
public String[] getTagLines() {
return tagLines;
}
/**
* Return the text contained in this tag
*/
public String getText() {
return tagContents.toString();
}
/**
* Return the scanner associated with this tag.
*/
public TagScanner getThisScanner() {
return thisScanner;
}
/**
* Extract the first word from the given string. Words are delimited by
* whitespace or equals signs.
*
* @param s
* The string to get the word from.
* @return The first word.
*/
public static String extractWord(String s) {
int length;
boolean parse;
char ch;
StringBuffer ret;
length = s.length();
ret = new StringBuffer(length);
parse = true;
for (int i = 0; i < length && parse; i++) {
ch = s.charAt(i);
if (Character.isWhitespace(ch) || ch == '=')
parse = false;
else
ret.append(Character.toUpperCase(ch));
}
return (ret.toString());
}
/**
* Scan the tag to see using the registered scanners, and attempt
* identification.
*
* @param url
* URL at which HTML page is located
* @param reader
* The NodeReader that is to be used for reading the url
*/
public Node scan(Map scanners, String url, NodeReader reader) throws ParserException {
if (tagContents.length() == 0)
return this;
try {
boolean found = false;
Node retVal = null;
// Find the first word in the scanners
String firstWord = extractWord(tagContents.toString());
// Now, get the scanner associated with this.
TagScanner scanner = (TagScanner) scanners.get(firstWord);
// Now do a deep check
if (scanner != null && scanner.evaluate(tagContents.toString(), reader.getPreviousOpenScanner())) {
found = true;
TagScanner save;
save = reader.getPreviousOpenScanner();
reader.setPreviousOpenScanner(scanner);
retVal = scanner.createScannedNode(this, url, reader, tagLine);
reader.setPreviousOpenScanner(save);
}
if (!found)
return this;
else {
return retVal;
}
} catch (Exception e) {
String errorMsg;
if (tagContents != null)
errorMsg = tagContents.toString();
else
errorMsg = "null";
throw new ParserException("Tag.scan() : Error while scanning tag, tag contents = " + errorMsg
+ ", tagLine = " + tagLine, e);
}
}
/**
* Sets the parsed.
*
* @param parsed
* The parsed to set
*/
public void setAttributes(Hashtable attributes) {
this.attributes = attributes;
}
/**
* Sets the nodeBegin.
*
* @param nodeBegin
* The nodeBegin to set
*/
public void setTagBegin(int tagBegin) {
this.nodeBegin = tagBegin;
}
/**
* Gets the nodeBegin.
*
* @return The nodeBegin value.
*/
public int getTagBegin() {
return (nodeBegin);
}
/**
* Sets the nodeEnd.
*
* @param nodeEnd
* The nodeEnd to set
*/
public void setTagEnd(int tagEnd) {
this.nodeEnd = tagEnd;
}
/**
* Gets the nodeEnd.
*
* @return The nodeEnd value.
*/
public int getTagEnd() {
return (nodeEnd);
}
/**
* Gets the line number on which this tag starts.
*
* @return the start line number
*/
public int getTagStartLine() {
return startLine;
}
/**
* Gets the line number on which this tag ends.
*
* @return the end line number
*/
public int getTagEndLine() {
return startLine + tagLines.length - 1;
}
public void setTagLine(java.lang.String newTagLine) {
tagLine = newTagLine;
// Note: Incur the overhead of resizing each time (versus
// preallocating a larger array), since the average tag
// generally doesn't span multiple lines
String[] newTagLines = new String[tagLines.length + 1];
for (int i = 0; i < tagLines.length; i++)
newTagLines[i] = tagLines[i];
newTagLines[tagLines.length] = newTagLine;
tagLines = newTagLines;
}
public void setText(String text) {
tagContents = new StringBuffer(text);
}
public void setThisScanner(TagScanner scanner) {
thisScanner = scanner;
}
public String toPlainTextString() {
return EMPTY_STRING;
}
/**
* A call to a tag's toHTML() method will render it in HTML Most tags that
* do not have children and inherit from Tag, do not need to override
* toHTML().
*
* @see org.htmlparser.Node#toHTML()
*/
public String toHtml() {
StringBuffer sb = new StringBuffer();
sb.append("<");
sb.append(getTagName());
if (containsMoreThanOneKey())
sb.append(" ");
String key, value;
String empty = null;
int i = 0;
for (Enumeration e = attributes.keys(); e.hasMoreElements();) {
key = (String) e.nextElement();
i++;
if (!key.equals(TAGNAME)) {
if (key.equals(EMPTYTAG)) {
empty = "/";
} else {
value = getAttribute(key);
sb.append(key + "=\"" + value + "\"");
if (i < attributes.size())
sb.append(" ");
}
}
}
if (empty != null)
sb.append(empty);
if (isEmptyXmlTag())
sb.append("/");
sb.append(">");
return sb.toString();
}
private boolean containsMoreThanOneKey() {
return attributes.keySet().size() > 1;
}
/**
* Print the contents of the tag
*/
public String toString() {
return "Begin Tag : " + tagContents + "; begins at : " + elementBegin() + "; ends at : " + elementEnd();
}
/**
* Sets the tagParser.
*
* @param tagParser
* The tagParser to set
*/
public static void setTagParser(TagParser tagParser) {
Tag.tagParser = tagParser;
}
/**
* Determines if the given tag breaks the flow of text.
*
* @return <code>true</code> if following text would start on a new line,
* <code>false</code> otherwise.
*/
public boolean breaksFlow() {
return (breakTags.contains(getText().toUpperCase()));
}
/**
* This method verifies that the current tag matches the provided filter.
* The match is based on the string object and not its contents, so ensure
* that you are using static final filter strings provided in the tag
* classes.
*
* @see org.htmlparser.Node#collectInto(NodeList, String)
*/
public void collectInto(NodeList collectionList, String filter) {
if (thisScanner != null && thisScanner.getFilter().equals(filter))
collectionList.add(this);
}
/**
* Returns table of attributes in the tag
*
* @return Hashtable
* @deprecated This method is deprecated. Use getAttributes() instead.
*/
public Hashtable getParsed() {
return attributes;
}
/**
* Sometimes, a scanner may need to request a re-evaluation of the
* attributes in a tag. This may happen when there is some correction
* activity. An example of its usage can be found in ImageTag. <br>
* <B>Note:<B> This is an intensive task, hence call only when really
* necessary
*
* @return Hashtable
*/
public Hashtable redoParseAttributes() {
return parseAttributes();
}
public void accept(NodeVisitor visitor) {
visitor.visitTag(this);
}
public String getType() {
return TYPE;
}
/**
* Is this an empty xml tag of the form<br>
* <tag/>
*
* @return boolean
*/
public boolean isEmptyXmlTag() {
return emptyXmlTag;
}
public void setEmptyXmlTag(boolean emptyXmlTag) {
this.emptyXmlTag = emptyXmlTag;
}
}