Package org.htmlparser.scanners

Source Code of org.htmlparser.scanners.LinkScanner

// $Header: /home/cvs/jakarta-jmeter/src/htmlparser/org/htmlparser/scanners/LinkScanner.java,v 1.2 2004/02/10 13:41:09 woolfel Exp $
/*
* ====================================================================
* Copyright 2002-2004 The Apache Software Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*   http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/

// The developers of JMeter and Apache are grateful to the developers
// of HTMLParser for giving Apache Software Foundation a non-exclusive
// license. The performance benefits of HTMLParser are clear and the
// users of JMeter will benefit from the hard work of the HTMLParser
// team. For detailed information about HTMLParser, the project is
// hosted on sourceforge at http://htmlparser.sourceforge.net/.
//
// HTMLParser was originally created by Somik Raha in 2000. Since then
// a healthy community of users has formed and helped refine the
// design so that it is able to tackle the difficult task of parsing
// dirty HTML. Derrick Oswald is the current lead developer and was kind
// enough to assist JMeter.

package org.htmlparser.scanners;

//////////////////
// Java Imports //
//////////////////
import java.util.Hashtable;

import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.Tag;
import org.htmlparser.tags.data.CompositeTagData;
import org.htmlparser.tags.data.LinkData;
import org.htmlparser.tags.data.TagData;
import org.htmlparser.util.LinkProcessor;
import org.htmlparser.util.ParserException;
import org.htmlparser.util.ParserUtils;

/**
* Scans for the Link Tag. This is a subclass of TagScanner, and is called using a
* variant of the template method. If the evaluate() method returns true, that means the
* given string contains an image tag. Extraction is done by the scan method thereafter
* by the user of this class.
*/
public class LinkScanner extends CompositeTagScanner
{
    private static final String MATCH_NAME[] = { "A" };
    public static final String LINK_SCANNER_ID = "A";
    public static final String DIRTY_TAG_MESSAGE =
        " is a dirty link tag - the tag was not closed. \nWe encountered an open tag, before the previous end tag was found.\nCorrecting this..";
    private LinkProcessor processor;
    private final static String ENDERS[] =
        { "TD", "TR", "FORM", "LI", "BODY", "HTML" };
    private final static String ENDTAG_ENDERS[] =
        { "TD", "TR", "FORM", "LI", "BODY", "HTML" };

    /**
     * Overriding the default constructor
     */
    public LinkScanner()
    {
        this("");
    }

    /**
     * Overriding the constructor to accept the filter
     */
    public LinkScanner(String filter)
    {
        super(filter, MATCH_NAME, ENDERS, ENDTAG_ENDERS, false);
        processor = new LinkProcessor();
    }

    public Tag createTag(TagData tagData, CompositeTagData compositeTagData)
        throws ParserException
    {

        String link =
            extractLink(
                compositeTagData.getStartTag(),
                tagData.getUrlBeingParsed());
        int mailto = link.indexOf("mailto");
        boolean mailLink = false;
        if (mailto == 0)
        {
            // yes it is
            mailto = link.indexOf(":");
            link = link.substring(mailto + 1);
            mailLink = true;
        }
        int javascript = link.indexOf("javascript:");
        boolean javascriptLink = false;
        if (javascript == 0)
        {
            link = link.substring(11);
            // this magic number is "javascript:".length()
            javascriptLink = true;
        }
        String accessKey = getAccessKey(compositeTagData.getStartTag());
        String myLinkText = compositeTagData.getChildren().toString();

        LinkTag linkTag =
            new LinkTag(
                tagData,
                compositeTagData,
                new LinkData(
                    link,
                    myLinkText,
                    accessKey,
                    mailLink,
                    javascriptLink));
        linkTag.setThisScanner(this);
        return linkTag;
    }

    /**
     * Template Method, used to decide if this scanner can handle the Link tag type. If
     * the evaluation returns true, the calling side makes a call to scan().
     * @param s The complete text contents of the Tag.
     * @param previousOpenScanner Indicates any previous scanner which hasnt completed, before the current
     * scan has begun, and hence allows us to write scanners that can work with dirty html
     */
    public boolean evaluate(String s, TagScanner previousOpenScanner)
    {
        char ch;
        boolean ret;

        // eat up leading blanks
        s = absorbLeadingBlanks(s);
        if (5 > s.length())
            ret = false;
        else
        {
            ch = s.charAt(0);
            if ((ch == 'a' || ch == 'A')
                && Character.isWhitespace(s.charAt(1)))
                ret = -1 != s.toUpperCase().indexOf("HREF");
            else
                ret = false;
        }

        return (ret);
    }

    /**
     * Extract the link from the given string. The URL of the actual html page is also
     * provided.   
     */
    public String extractLink(Tag tag, String url) throws ParserException
    {
        try
        {
            Hashtable table = tag.getAttributes();
            String relativeLink = (String) table.get("HREF");
            if (relativeLink != null)
            {
                relativeLink = ParserUtils.removeChars(relativeLink, '\n');
                relativeLink = ParserUtils.removeChars(relativeLink, '\r');
            }
            return processor.extract(relativeLink, url);
        }
        catch (Exception e)
        {
            String msg;
            if (tag != null)
                msg = tag.getText();
            else
                msg = "null";
            throw new ParserException(
                "HTMLLinkScanner.extractLink() : Error while extracting link from tag "
                    + msg
                    + ", url = "
                    + url,
                e);
        }
    }

    /**
     * Extract the access key from the given tag.
     * @param text Text to be parsed to pick out the access key.
     * @return The value of the ACCESSKEY attribute.
     */
    private String getAccessKey(Tag tag)
    {
        return tag.getAttribute("ACCESSKEY");
    }

    public BaseHrefScanner createBaseHREFScanner(String filter)
    {
        return new BaseHrefScanner(filter, processor);
    }

    public ImageScanner createImageScanner(String filter)
    {
        return new ImageScanner(filter, processor);
    }

    /**
     * @see org.htmlparser.scanners.TagScanner#getID()
     */
    public String[] getID()
    {
        return MATCH_NAME;
    }

}
TOP

Related Classes of org.htmlparser.scanners.LinkScanner

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.