Source Code of org.htmlparser.tags.Tag

// $Header: /home/cvs/jakarta-jmeter/src/htmlparser/org/htmlparser/tags/Tag.java,v 1.2.2.1 2004/10/24 01:22:59 sebb Exp $
/*
 * ====================================================================
 * Copyright 2002-2004 The Apache Software Foundation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * 
 */


// The developers of JMeter and Apache are greatful to the developers
// of HTMLParser for giving Apache Software Foundation a non-exclusive
// license. The performance benefits of HTMLParser are clear and the
// users of JMeter will benefit from the hard work the HTMLParser
// team. For detailed information about HTMLParser, the project is
// hosted on sourceforge at http://htmlparser.sourceforge.net/.
//
// HTMLParser was originally created by Somik Raha in 2000. Since then
// a healthy community of users has formed and helped refine the
// design so that it is able to tackle the difficult task of parsing
// dirty HTML. Derrick Oswald is the current lead developer and was kind
// enough to assist JMeter.


package org.htmlparser.tags;


import java.util.Enumeration;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Map;


import org.htmlparser.Node;
import org.htmlparser.NodeReader;
import org.htmlparser.parserHelper.AttributeParser;
import org.htmlparser.parserHelper.TagParser;
import org.htmlparser.scanners.TagScanner;
import org.htmlparser.tags.data.TagData;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.NodeVisitor;


/**
 * Tag represents a generic tag. This class allows users to register specific
 * tag scanners, which can identify links, or image references. This tag asks the
 * scanners to run over the text, and identify. It can be used to dynamically
 * configure a parser.
 * @author Kaarle Kaila 23.10.2001
 */
public class Tag extends Node
{
    public static final String TYPE = "TAG";
    /**
     * Constant used as value for the value of the tag name
     * in parseParameters  (Kaarle Kaila 3.8.2001)
     */
    public final static String TAGNAME = "$<TAGNAME>$";
    public final static String EMPTYTAG = "$<EMPTYTAG>$";
    private final static int TAG_BEFORE_PARSING_STATE = 1;
    private final static int TAG_BEGIN_PARSING_STATE = 2;
    private final static int TAG_FINISHED_PARSING_STATE = 3;
    private final static int TAG_ILLEGAL_STATE = 4;
    private final static int TAG_IGNORE_DATA_STATE = 5;
    private final static int TAG_IGNORE_BEGIN_TAG_STATE = 6;
    private final static String EMPTY_STRING = "";


    private static AttributeParser paramParser = new AttributeParser();
    private static TagParser tagParser;
    /**
     * Tag contents will have the contents of the comment tag.
     */
    protected StringBuffer tagContents;
    private boolean emptyXmlTag = false;
    /**
     * tag parameters parsed into this hashtable
     * not implemented yet
     * added by Kaarle Kaila 23.10.2001
     */
    protected Hashtable attributes = null;


    /**
     * Scanner associated with this tag (useful for extraction of filtering data from a
     * HTML node)
     */
    protected TagScanner thisScanner = null;
    private java.lang.String tagLine;


    /**
     * The combined text of all the lines spanned by this tag
     */
    private String[] tagLines;


    /**
     * The line number on which this tag starts
     */
    private int startLine;


    /**
     * Set of tags that breaks the flow.
     */
    protected static HashSet breakTags;
    static {
        breakTags = new HashSet(30);
        breakTags.add("BLOCKQUOTE");
        breakTags.add("BODY");
        breakTags.add("BR");
        breakTags.add("CENTER");
        breakTags.add("DD");
        breakTags.add("DIR");
        breakTags.add("DIV");
        breakTags.add("DL");
        breakTags.add("DT");
        breakTags.add("FORM");
        breakTags.add("H1");
        breakTags.add("H2");
        breakTags.add("H3");
        breakTags.add("H4");
        breakTags.add("H5");
        breakTags.add("H6");
        breakTags.add("HEAD");
        breakTags.add("HR");
        breakTags.add("HTML");
        breakTags.add("ISINDEX");
        breakTags.add("LI");
        breakTags.add("MENU");
        breakTags.add("NOFRAMES");
        breakTags.add("OL");
        breakTags.add("P");
        breakTags.add("PRE");
        breakTags.add("TD");
        breakTags.add("TH");
        breakTags.add("TITLE");
        breakTags.add("UL");
    }


    /**
     * Set the Tag with the beginning posn, ending posn and tag contents (in
     * a tagData object.
     * @param tagData The data for this tag
     */
    public Tag(TagData tagData)
    {
        super(tagData.getTagBegin(), tagData.getTagEnd());
        this.startLine = tagData.getStartLine();
        this.tagContents = new StringBuffer();
        this.tagContents.append(tagData.getTagContents());
        this.tagLine = tagData.getTagLine();
        this.tagLines = new String[] { tagData.getTagLine()};
        this.emptyXmlTag = tagData.isEmptyXmlTag();
    }


    public void append(char ch)
    {
        tagContents.append(ch);
    }


    public void append(String ch)
    {
        tagContents.append(ch);
    }


    /**
     * Locate the tag withing the input string, by parsing from the given position
     * @param reader HTML reader to be provided so as to allow reading of next line
     * @param input Input String
     * @param position Position to start parsing from
     */
    public static Tag find(NodeReader reader, String input, int position)
    {
        return tagParser.find(reader, input, position);
    }


    /**
     * This method is not to be called by any scanner or tag. It is
     * an expensive method, hence it has been made private. However,
     * there might be some circumstances when a scanner wishes to force
     * parsing of attributes over and above what has already been parsed.
     * To make the choice clear - we have a method - redoParseAttributes(),
     * which can be used.
     * @return Hashtable
     */
    private Hashtable parseAttributes()
    {
        return paramParser.parseAttributes(this);
    }


    /**
     * In case the tag is parsed at the scan method this will return value of a
     * parameter not implemented yet
     * @param name of parameter
     */
    public String getAttribute(String name)
    {
        return (String) getAttributes().get(name.toUpperCase());
    }


    /**
     * Set attribute with given key, value pair.
     * @param key
     * @param value
     */
    public void setAttribute(String key, String value)
    {
        attributes.put(key, value);
    }


    /**
     * In case the tag is parsed at the scan method this will return value of a
     * parameter not implemented yet
     * @param name of parameter
     * @deprecated use getAttribute instead
     */
    public String getParameter(String name)
    {
        return (String) getAttributes().get(name.toUpperCase());
    }


    /**
     * Gets the attributes in the tag.
     * @return Returns a Hashtable of attributes
     */
    public Hashtable getAttributes()
    {
        if (attributes == null)
        {
            attributes = parseAttributes();
        }
        return attributes;
    }


    public String getTagName()
    {
        return (String) getAttributes().get(TAGNAME);
    }


    /**
     * Returns the line where the tag was found
     * @return java.lang.String
     */
    public String getTagLine()
    {
        return tagLine;
    }


    /**
     * Returns the combined text of all the lines spanned by this tag
     * @return java.lang.String
     */
    public String[] getTagLines()
    {
        return tagLines;
    }


    /**
     * Return the text contained in this tag
     */
    public String getText()
    {
        return tagContents.toString();
    }


    /**
     * Return the scanner associated with this tag.
     */
    public TagScanner getThisScanner()
    {
        return thisScanner;
    }


    /**
     * Extract the first word from the given string.
     * Words are delimited by whitespace or equals signs.
     * @param s The string to get the word from.
     * @return The first word.
     */
    public static String extractWord(String s)
    {
        int length;
        boolean parse;
        char ch;
        StringBuffer ret;


        length = s.length();
        ret = new StringBuffer(length);
        parse = true;
        for (int i = 0; i < length && parse; i++)
        {
            ch = s.charAt(i);
            if (Character.isWhitespace(ch) || ch == '=')
                parse = false;
            else
                ret.append(Character.toUpperCase(ch));
        }


        return (ret.toString());
    }


    /**
     * Scan the tag to see using the registered scanners, and attempt identification.
     * @param url URL at which HTML page is located
     * @param reader The NodeReader that is to be used for reading the url
     */
    public Node scan(Map scanners, String url, NodeReader reader)
        throws ParserException
    {
        if (tagContents.length() == 0)
            return this;
        try
        {
            boolean found = false;
            Node retVal = null;
            // Find the first word in the scanners
            String firstWord = extractWord(tagContents.toString());
            // Now, get the scanner associated with this.
            TagScanner scanner = (TagScanner) scanners.get(firstWord);


            // Now do a deep check
            if (scanner != null
                && scanner.evaluate(
                    tagContents.toString(),
                    reader.getPreviousOpenScanner()))
            {
                found = true;
                TagScanner save;
                save = reader.getPreviousOpenScanner();
                reader.setPreviousOpenScanner(scanner);
                retVal = scanner.createScannedNode(this, url, reader, tagLine);
                reader.setPreviousOpenScanner(save);
            }


            if (!found)
                return this;
            else
            {
                return retVal;
            }
        }
        catch (Exception e)
        {
            String errorMsg;
            if (tagContents != null)
                errorMsg = tagContents.toString();
            else
                errorMsg = "null";
            throw new ParserException(
                "Tag.scan() : Error while scanning tag, tag contents = "
                    + errorMsg
                    + ", tagLine = "
                    + tagLine,
                e);
        }
    }


    /**
     * Sets the parsed.
     * @param parsed The parsed to set
     */
    public void setAttributes(Hashtable attributes)
    {
        this.attributes = attributes;
    }


    /**
     * Sets the nodeBegin.
     * @param nodeBegin The nodeBegin to set
     */
    public void setTagBegin(int tagBegin)
    {
        this.nodeBegin = tagBegin;
    }


    /**
     * Gets the nodeBegin.
     * @return The nodeBegin value.
     */
    public int getTagBegin()
    {
        return (nodeBegin);
    }


    /**
     * Sets the nodeEnd.
     * @param nodeEnd The nodeEnd to set
     */
    public void setTagEnd(int tagEnd)
    {
        this.nodeEnd = tagEnd;
    }


    /**
     * Gets the nodeEnd.
     * @return The nodeEnd value.
     */
    public int getTagEnd()
    {
        return (nodeEnd);
    }


    /**
     * Gets the line number on which this tag starts.
     * @return the start line number
     */
    public int getTagStartLine()
    {
        return startLine;
    }


    /**
     * Gets the line number on which this tag ends.
     * @return the end line number
     */
    public int getTagEndLine()
    {
        return startLine + tagLines.length - 1;
    }


    public void setTagLine(java.lang.String newTagLine)
    {
        tagLine = newTagLine;


        // Note: Incur the overhead of resizing each time (versus
        // preallocating a larger array), since the average tag
        // generally doesn't span multiple lines
        String[] newTagLines = new String[tagLines.length + 1];
        for (int i = 0; i < tagLines.length; i++)
            newTagLines[i] = tagLines[i];
        newTagLines[tagLines.length] = newTagLine;
        tagLines = newTagLines;
    }


    public void setText(String text)
    {
        tagContents = new StringBuffer(text);
    }


    public void setThisScanner(TagScanner scanner)
    {
        thisScanner = scanner;
    }


    public String toPlainTextString()
    {
        return EMPTY_STRING;
    }


    /**
     * A call to a tag's toHTML() method will render it in HTML
     * Most tags that do not have children and inherit from Tag,
     * do not need to override toHTML().
     * @see org.htmlparser.Node#toHTML()
     */
    public String toHtml()
    {
        StringBuffer sb = new StringBuffer();
        sb.append("<");
        sb.append(getTagName());
        if (containsMoreThanOneKey())
            sb.append(" ");
        String key, value;
        String empty = null;
        int i = 0;
        for (Enumeration e = attributes.keys(); e.hasMoreElements();)
        {
            key = (String) e.nextElement();
            i++;
            if (!key.equals(TAGNAME))
            {
                if (key.equals(EMPTYTAG))
                {
                    empty = "/";
                }
                else
                {
                    value = getAttribute(key);
                    sb.append(key + "=\"" + value + "\"");
                    if (i < attributes.size())
                        sb.append(" ");
                }
            }
        }
        if (empty != null)
            sb.append(empty);
        if (isEmptyXmlTag())
            sb.append("/");
        sb.append(">");
        return sb.toString();
    }


    private boolean containsMoreThanOneKey()
    {
        return attributes.keySet().size() > 1;
    }


    /**
     * Print the contents of the tag
     */
    public String toString()
    {
        return "Begin Tag : "
            + tagContents
            + "; begins at : "
            + elementBegin()
            + "; ends at : "
            + elementEnd();
    }


    /**
     * Sets the tagParser.
     * @param tagParser The tagParser to set
     */
    public static void setTagParser(TagParser tagParser)
    {
        Tag.tagParser = tagParser;
    }


    /**
     * Determines if the given tag breaks the flow of text.
     * @return <code>true</code> if following text would start on a new line,
     * <code>false</code> otherwise.
     */
    public boolean breaksFlow()
    {
        return (breakTags.contains(getText().toUpperCase()));
    }


    /**
     * This method verifies that the current tag matches the provided
     * filter. The match is based on the string object and not its contents,
     * so ensure that you are using static final filter strings provided
     * in the tag classes.
     * @see org.htmlparser.Node#collectInto(NodeList, String)
     */
    public void collectInto(NodeList collectionList, String filter)
    {
        if (thisScanner != null && thisScanner.getFilter().equals(filter))
            collectionList.add(this);
    }


    /**
     * Returns table of attributes in the tag
     * @return Hashtable
     * @deprecated This method is deprecated. Use getAttributes() instead.
     */
    public Hashtable getParsed()
    {
        return attributes;
    }


    /**
     * Sometimes, a scanner may need to request a re-evaluation of the
     * attributes in a tag. This may happen when there is some correction
     * activity. An example of its usage can be found in ImageTag.
     * <br>
     * <B>Note:<B> This is an intensive task, hence call only when
     * really necessary
     * @return Hashtable
     */
    public Hashtable redoParseAttributes()
    {
        return parseAttributes();
    }


    public void accept(NodeVisitor visitor)
    {
        visitor.visitTag(this);
    }


    public String getType()
    {
        return TYPE;
    }


    /**
     * Is this an empty xml tag of the form<br>
     * &lt;tag/&gt; 
     * @return boolean
     */
    public boolean isEmptyXmlTag()
    {
        return emptyXmlTag;
    }


    public void setEmptyXmlTag(boolean emptyXmlTag)
    {
        this.emptyXmlTag = emptyXmlTag;
    }


}
Source Code of org.htmlparser.tags.Tag

Related Classes of org.htmlparser.tags.Tag