Package org.archive.modules.extractor

Source Code of org.archive.modules.extractor.ExtractorSWF$CrawlUriSWFAction

/*
*  This file is part of the Heritrix web crawler (crawler.archive.org).
*
*  Licensed to the Internet Archive (IA) by one or more individual
*  contributors.
*
*  The IA licenses this file to You under the Apache License, Version 2.0
*  (the "License"); you may not use this file except in compliance with
*  the License.  You may obtain a copy of the License at
*
*      http://www.apache.org/licenses/LICENSE-2.0
*
*  Unless required by applicable law or agreed to in writing, software
*  distributed under the License is distributed on an "AS IS" BASIS,
*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*  See the License for the specific language governing permissions and
*  limitations under the License.
*/
package org.archive.modules.extractor;

import java.io.IOException;
import java.io.InputStream;
import java.util.logging.Logger;

import org.archive.modules.CrawlURI;
import org.archive.util.UriUtils;
import org.springframework.beans.factory.annotation.Autowired;

import com.anotherbigidea.flash.interfaces.SWFActions;
import com.anotherbigidea.flash.interfaces.SWFTagTypes;
import com.anotherbigidea.flash.interfaces.SWFTags;
import com.anotherbigidea.flash.readers.ActionParser;
import com.anotherbigidea.flash.readers.SWFReader;
import com.anotherbigidea.flash.readers.TagParser;
import com.anotherbigidea.flash.structs.AlphaTransform;
import com.anotherbigidea.flash.structs.Matrix;
import com.anotherbigidea.flash.writers.SWFActionsImpl;
import com.anotherbigidea.io.InStream;

/**
* Extracts URIs from SWF (flash/shockwave) files.
*
* To test, here is a link to an swf that has links
* embedded inside of it: http://www.hitspring.com/index.swf.
*
* @author Igor Ranitovic
*/
public class ExtractorSWF extends ContentExtractor {

    @SuppressWarnings("unused")
    private static final long serialVersionUID = 3L;

    private static Logger logger =
        Logger.getLogger(ExtractorSWF.class.getName());

    private static final int MAX_READ_SIZE = 1024 * 1024; // 1MB

    protected static final String JSSTRING = "javascript:";

    /**
     * Javascript extractor to use to process inline javascript. Autowired if
     * available. If null, links will not be extracted from inline javascript.
     */
    transient protected ExtractorJS extractorJS;
    public ExtractorJS getExtractorJS() {
        return extractorJS;
    }
    @Autowired
    public void setExtractorJS(ExtractorJS extractorJS) {
        this.extractorJS = extractorJS;
    }
   
    /**
     * No-argument constructor; it takes no parameters and performs no
     * initialization of its own.
     */
    public ExtractorSWF() {
    }

   
    @Override
    protected boolean shouldExtract(CrawlURI uri) {
        String contentType = uri.getContentType();
        if (contentType == null) {
            return false;
        }
        if ((contentType.toLowerCase().indexOf("x-shockwave-flash") < 0)
                && (!uri.toString().toLowerCase().endsWith(".swf"))) {
            return false;
        }
        return true;
    }

   
    @Override
    protected boolean innerExtract(CrawlURI curi) {
        InputStream documentStream = null;
        // Get the SWF file's content stream.
        try {
            documentStream = curi.getRecorder().getContentReplayInputStream();
            if (documentStream == null) {
                return false;
            }

            // Create SWF action that will add discovered URIs to CrawlURI
            // alist(s).
            CrawlUriSWFAction curiAction = new CrawlUriSWFAction(curi,this);

            // Overwrite parsing of specific tags that might have URIs.
            CustomSWFTags customTags = new CustomSWFTags(curiAction);
            // Get a SWFReader instance.
            SWFReader reader =
                new ExtractorSWFReader(new ExtractorTagParser(customTags), documentStream);
           
            reader.readFile();
            numberOfLinksExtracted.addAndGet(curiAction.getLinkCount());
            logger.fine(curi + " has " + curiAction.getLinkCount() + " links.");
        } catch (IOException e) {
            curi.getNonFatalFailures().add(e);
        } finally {
            try {
                documentStream.close();
            } catch (IOException e) {
                curi.getNonFatalFailures().add(e);
            }
        }


        // Set flag to indicate that link extraction is completed.
        return true;
    }

    class ExtractorSWFReader extends SWFReader
    {
        public ExtractorSWFReader(SWFTags consumer, InputStream inputstream) {
            super(consumer, inputstream);
        }

        public ExtractorSWFReader(SWFTags consumer, InStream instream) {
            super(consumer, instream);
        }   

        /**
         * Override because a corrupt SWF file can cause us to try read lengths
         * that are hundreds of megabytes in size causing us to OOME.
         *
         * Below is copied from SWFReader parent class.
         */
        public int readOneTag() throws IOException {
            int header = mIn.readUI16();
            int type = header >> 6; // only want the top 10 bits
            int length = header & 0x3F; // only want the bottom 6 bits
            boolean longTag = (length == 0x3F);
            if (longTag) {
                length = (int) mIn.readUI32();
            }
            // Below test added for Heritrix use.
            if (length > MAX_READ_SIZE) {
                // skip to next, rather than throw IOException ending
                // processing
                mIn.skipBytes(length);
                logger.info("oversized SWF tag (type=" + type + ";length="
                        + length + ") skipped");
            } else {
                byte[] contents = mIn.read(length);
                mConsumer.tag(type, longTag, contents);
            }
            return type;
        }
    }
    /**
     * TagParser customized to ignore SWFTags that
     * will never contain extractable URIs.
     */
    protected class ExtractorTagParser extends TagParser {

        protected ExtractorTagParser(SWFTagTypes tagtypes) {
            super(tagtypes);
        }

        protected void parseDefineBits(InStream in) throws IOException {
            // DO NOTHING - no URLs to be found in bits
        }

        protected void parseDefineBitsJPEG3(InStream in) throws IOException {
            // DO NOTHING - no URLs to be found in bits
        }

        protected void parseDefineBitsLossless(InStream in, int length, boolean hasAlpha) throws IOException {
            // DO NOTHING - no URLs to be found in bits
        }

        protected void parseDefineButtonSound(InStream in) throws IOException {
            // DO NOTHING - no URLs to be found in sound
        }

        protected void parseDefineFont(InStream in) throws IOException {
            // DO NOTHING - no URLs to be found in font
        }

        protected void parseDefineJPEG2(InStream in, int length) throws IOException {
            // DO NOTHING - no URLs to be found in jpeg
        }

        protected void parseDefineJPEGTables(InStream in) throws IOException {
            // DO NOTHING - no URLs to be found in jpeg
        }

        protected void parseDefineShape(int type, InStream in) throws IOException {
            // DO NOTHING - no URLs to be found in shape
        }

        protected void parseDefineSound(InStream in) throws IOException {
            // DO NOTHING - no URLs to be found in sound
        }

        protected void parseFontInfo(InStream in, int length, boolean isFI2) throws IOException {
            // DO NOTHING - no URLs to be found in font info
        }

        protected void parseDefineFont2(InStream in) throws IOException {
            // DO NOTHING - no URLs to be found in bits
        }
       
        // heritrix: Overridden to use our TagParser and SWFReader. The rest of
        // the code is the same.
        @Override
        protected void parseDefineSprite(InStream in) throws IOException {
            int id = in.readUI16();
            in.readUI16(); // frame count

            SWFTagTypes sstt = mTagtypes.tagDefineSprite(id);

            if (sstt == null)
                return;

            // heritrix: only these two lines differ from
            // super.parseDefineSprite()
            TagParser parser = new ExtractorTagParser(sstt);
            SWFReader reader = new SWFReader(parser, in);

            reader.readTags();
        }
       
        // Overridden to read 32 bit clip event flags when flash version >= 6.
        // All the rest of the code is copied directly. Fixes HER-1509.
        @Override
        protected void parsePlaceObject2( InStream in ) throws IOException
        {
            boolean hasClipActions    = in.readUBits(1) != 0;
            boolean hasClipDepth      = in.readUBits(1) != 0;
            boolean hasName           = in.readUBits(1) != 0;
            boolean hasRatio          = in.readUBits(1) != 0;
            boolean hasColorTransform = in.readUBits(1) != 0;
            boolean hasMatrix         = in.readUBits(1) != 0;
            boolean hasCharacter      = in.readUBits(1) != 0;
            boolean isMove            = in.readUBits(1) != 0;

            int depth = in.readUI16();

            int            charId    = hasCharacter      ? in.readUI16()            : 0;
            Matrix         matrix    = hasMatrix         ? new Matrix( in )         : null;
            AlphaTransform cxform    = hasColorTransform ? new AlphaTransform( in ) : null;
            int            ratio     = hasRatio          ? in.readUI16()            : -1;       
            String         name      = hasName           ? in.readString(mStringEncoding: null
            int            clipDepth = hasClipDepth      ? in.readUI16()            : 0;

            int clipEventFlags = 0;

            if (hasClipActions) {
                in.readUI16(); // reserved

                // heritrix: flags size changed in swf version 6
                clipEventFlags = mFlashVersion < 6 ? in.readUI16() : in.readSI32();
            }

            SWFActions actions = mTagtypes.tagPlaceObject2(isMove, clipDepth,
                    depth, charId, matrix, cxform, ratio, name, clipEventFlags);

            if (hasClipActions && actions != null) {
                int flags = 0;

                // heritrix: flags size changed in swf version 6
                while ((flags = mFlashVersion < 6 ? in.readUI16() : in.readSI32()) != 0) {
                    in.readUI32(); // length

                    actions.start(flags);
                    ActionParser parser = new ActionParser(actions, mFlashVersion);

                    parser.parse(in);
                }

                actions.done();
            }
        }

    }
   
   
    /**
     * SWF action that handles discovered URIs.
     *
     * @author Igor Ranitovic
     */
    public class CrawlUriSWFAction extends SWFActionsImpl {
       
        protected CrawlURI curi;
       
        private long linkCount;
        private Extractor ext;

        /**
         *
         * @param curi
         */
        public CrawlUriSWFAction(CrawlURI curi, Extractor ext) {
            assert (curi != null) : "CrawlURI should not be null";
            this.curi = curi;
            this.linkCount = 0;
            this.ext = ext;
        }
       
        /**
         * Overwrite handling of discovered URIs.
         *
         * @param url Discovered URL.
         * @param target Discovered target (currently not being used.)
         * @throws IOException
         */
        public void getURL(String url, String target)
        throws IOException {
            if (url.startsWith(JSSTRING)) {
                if (getExtractorJS() != null) {
                    linkCount += getExtractorJS().considerStrings(ext, curi, url);
                }
            } else {
                int max = ext.getExtractorParameters().getMaxOutlinks();
                CrawlURI relToVia = addRelativeToVia(curi, max, url,
                        LinkContext.EMBED_MISC, Hop.EMBED);
                CrawlURI relToBase = addRelativeToBase(curi, max, url,
                        LinkContext.EMBED_MISC, Hop.EMBED);
                addAnnotations(relToVia, relToBase);
                linkCount++;
            }
        }

        protected void addAnnotations(CrawlURI relToVia, CrawlURI relToBase) {
            if (relToVia != null && relToBase != null
                    && relToVia.getUURI().equals(relToBase.getUURI())) {
                relToVia.getAnnotations().add("extractorSWFRelToBoth");
                relToBase.getAnnotations().add("extractorSWFRelToBoth");
            } else {
                if (relToVia != null) {
                    relToVia.getAnnotations().add("extractorSWFRelToVia");
                }
                if (relToBase != null) {
                    relToBase.getAnnotations().add("extractorSWFRelToBase");
                }
            }
        }

        public void considerStringAsUri(String str) throws IOException {
            if (UriUtils.isVeryLikelyUri(str)) {
                int max = ext.getExtractorParameters().getMaxOutlinks();
                CrawlURI relToVia = addRelativeToVia(curi, max, str,
                        LinkContext.SPECULATIVE_MISC, Hop.SPECULATIVE);
                CrawlURI relToBase = addRelativeToBase(curi, max, str,
                        LinkContext.SPECULATIVE_MISC, Hop.SPECULATIVE);
                addAnnotations(relToVia, relToBase);
                linkCount++;
            }
        }

        public void lookupTable(String[] strings) throws IOException {
            for (String str : strings) {
                considerStringAsUri(str);
            }
        }

        public void push(String value) throws IOException {
            considerStringAsUri(value);
        }
       
        /**
         * @return Total number of links extracted from a swf file.
         */
        public long getLinkCount() {
            return linkCount;
        }
    }
}
TOP

Related Classes of org.archive.modules.extractor.ExtractorSWF$CrawlUriSWFAction

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.