Package au.id.jericho.lib.html

Examples of au.id.jericho.lib.html.Attribute


    }


    protected void processGeneralTag(CrawlURI curi, Element element,
            Attributes attributes) {
        Attribute attr;
        String attrValue;
        List<Attribute> attrList;
        String elementName = element.getName();

        // Just in case it's an OBJECT or APPLET tag
        String codebase = null;
        ArrayList<String> resources = null;

        final boolean framesAsEmbeds = getTreatFramesAsEmbedLinks();

        final boolean ignoreFormActions = getIgnoreFormActionUrls();
       
        final boolean overlyEagerLinkDetection = getExtractValueAttributes();

        // HREF
        if (((attr = attributes.get("href")) != null) &&
            ((attrValue = attr.getValue()) != null)) {
            CharSequence context = elementContext(elementName, attr
                    .getKey());
            if ("link".equals(elementName)) {
                // <LINK> elements treated as embeds (css, ico, etc)
                processEmbed(curi, attrValue, context);
            } else {
                // other HREFs treated as links
                processLink(curi, attrValue, context);
            }
            if ("base".equals(elementName)) {
                try {
                    UURI base = UURIFactory.getInstance(attrValue);
                    curi.setBaseURI(base);
                } catch (URIException e) {
                    logUriError(e, curi.getUURI(), attrValue);
                }
            }
        }
        // ACTION
        if (((attr = attributes.get("action")) != null) &&
                 ((attrValue = attr.getValue()) != null)) {
            if (!ignoreFormActions) {
                CharSequence context = elementContext(elementName, attr
                        .getKey());
                processLink(curi, attrValue, context);
            }
        }
        // ON_
        if ((attrList = findOnAttributes(attributes)).size() != 0) {
            for (Iterator<Attribute> attrIter = attrList.iterator(); attrIter.hasNext();) {
                attr = (Attribute) attrIter.next();
                CharSequence valueSegment = attr.getValueSegment();
                if (valueSegment != null)
                    processScriptCode(curi, valueSegment);

            }
        }
        // SRC etc.
        if ((((attr = attributes.get("src")) != null)
                || ((attr = attributes.get("lowsrc")) != null)
                || ((attr = attributes.get("background")) != null)
                || ((attr = attributes.get("cite")) != null)
                || ((attr = attributes.get("longdesc")) != null)
                || ((attr = attributes.get("usemap")) != null)
                || ((attr = attributes.get("profile")) != null)
                || ((attr = attributes.get("datasrc")) != null)) &&
                   ((attrValue = attr.getValue()) != null)) {

            final Hop hopType;
            CharSequence context = elementContext(elementName, attr.getKey());

            if (!framesAsEmbeds
                    && ("frame".equals(elementName) || "iframe"
                            .equals(elementName)))
                hopType = Hop.NAVLINK;
            else
                hopType = Hop.EMBED;

            processEmbed(curi, attrValue, context, hopType);
        }
        // CODEBASE
        if (((attr = attributes.get("codebase")) != null) &&
                 ((attrValue = attr.getValue()) != null)) {
            codebase = StringEscapeUtils.unescapeHtml(attrValue);
            CharSequence context = elementContext(elementName, attr.getKey());
            processEmbed(curi, codebase, context);
        }
        // CLASSID DATA
        if ((((attr = attributes.get("classid")) != null)
                || ((attr = attributes.get("data")) != null)) &&
                   ((attrValue = attr.getValue()) != null)) {
            if (resources == null)
                resources = new ArrayList<String>();
            resources.add(attrValue);
        }
        // ARCHIVE
        if (((attr = attributes.get("archive")) != null) &&
                 ((attrValue = attr.getValue()) != null)) {
            if (resources == null)
                resources = new ArrayList<String>();
            String[] multi = TextUtils.split(WHITESPACE, attrValue);
            for (int i = 0; i < multi.length; i++) {
                resources.add(multi[i]);
            }
        }
        // CODE
        if (((attr = attributes.get("code")) != null) &&
                 ((attrValue = attr.getValue()) != null)) {
            if (resources == null)
                resources = new ArrayList<String>();
            // If element is applet and code value does not end with
            // '.class' then append '.class' to the code value.
            if (APPLET.equals(elementName) && !attrValue.endsWith(CLASSEXT)) {
                resources.add(attrValue + CLASSEXT);
            } else {
                resources.add(attrValue);
            }
        }
        // VALUE
        if (((attr = attributes.get("value")) != null) &&
                 ((attrValue = attr.getValue()) != null)) {
            CharSequence valueContext = elementContext(elementName, attr.getKey());
            if("PARAM".equalsIgnoreCase(elementName)
                    && "flashvars".equalsIgnoreCase(attributes.get("name").getValue())) {
                // special handling for <PARAM NAME="flashvars" VALUE="">
                String queryStringLike = attrValue.toString();
                // treat value as query-string-like "key=value[;key=value]*" pairings
                considerQueryStringValues(curi, queryStringLike, valueContext,Hop.SPECULATIVE);
            } else {
                // regular VALUE handling
                if (overlyEagerLinkDetection) {
                    considerIfLikelyUri(curi,attrValue,valueContext,Hop.NAVLINK);
                }
            }
        }
        // STYLE
        if (((attr = attributes.get("style")) != null) &&
                 ((attrValue = attr.getValue()) != null)) {
            // STYLE inline attribute
            // then, parse for URIs
            numberOfLinksExtracted.addAndGet(ExtractorCSS.processStyleCode(
                    this, curi, attrValue));
        }
       
        // FLASHVARS
        if (((attr = attributes.get("flashvars")) != null) &&
                ((attrValue = attr.getValue()) != null)) {
            // FLASHVARS inline attribute
            CharSequence valueContext = elementContext(elementName, attr.getKey());
            considerQueryStringValues(curi, attrValue, valueContext,Hop.SPECULATIVE);
       }

        // handle codebase/resources
        if (resources == null)
View Full Code Here

TOP

Related Classes of au.id.jericho.lib.html.Attribute

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.