Package slash.navigation.download.tools.base

Source Code of slash.navigation.download.tools.base.WebsiteDataSourcesXmlGenerator

/*
    This file is part of RouteConverter.

    RouteConverter is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    RouteConverter is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with RouteConverter; if not, write to the Free Software
    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA

    Copyright (C) 2007 Christian Pesch. All Rights Reserved.
*/
package slash.navigation.download.tools.base;

import slash.navigation.datasources.binding.*;
import slash.navigation.download.tools.helpers.AnchorParser;
import slash.navigation.download.tools.helpers.ContentLengthAndLastModified;
import slash.navigation.rest.Get;
import slash.navigation.rest.Head;

import java.io.File;
import java.io.IOException;
import java.util.*;

import static java.lang.Integer.MAX_VALUE;
import static java.lang.Integer.parseInt;
import static java.lang.Math.min;
import static java.lang.System.currentTimeMillis;
import static java.util.Arrays.asList;
import static java.util.Arrays.sort;

/**
* Base for generation of data sources XML from websites.
*
* @author Christian Pesch
*/

public abstract class WebsiteDataSourcesXmlGenerator extends BaseDataSourcesXmlGenerator {

    public void run(String[] args) throws Exception {
        if (args.length < 6) {
            System.err.println(getClass().getSimpleName() + ": <id> <name> <startUrl> <baseUrl> <directory> <writeXmlFile> [<startindex>] [<endindex>]");
            System.exit(20);
        }

        long start = currentTimeMillis();
        DatasourceType datasourceType = new ObjectFactory().createDatasourceType();
        datasourceType.setId(args[0]);
        datasourceType.setName(args[1]);
        String baseUrl = args[2];
        datasourceType.setBaseUrl(baseUrl);
        datasourceType.setDirectory(args[4]);

        List<String> collectedUris = collectUris(args[3], baseUrl);
        System.out.println("Collected URIs: " + collectedUris + " (" + collectedUris.size() + " elements)");

        int startIndex = args.length > 7 ? parseInt(args[6]) : 0;
        int endIndex = args.length > 8 ? parseInt(args[7]) : MAX_VALUE;
        endIndex = min(collectedUris.size(), endIndex);
        List<String> parsingUris = collectedUris.subList(startIndex, endIndex);
        System.out.println("Parsing URIs from " + startIndex + " to " + endIndex + " (" + parsingUris.size() + " elements)");

        List<FileType> fileTypes = new ArrayList<>();
        List<ThemeType> themeTypes = new ArrayList<>();
        List<MapType> mapTypes = new ArrayList<>();

        parseUris(baseUrl, parsingUris, fileTypes, mapTypes, themeTypes);

        datasourceType.getFile().addAll(sortFileTypes(fileTypes));
        datasourceType.getMap().addAll(sortMapTypes(mapTypes));
        datasourceType.getTheme().addAll(sortThemeTypes(themeTypes));

        File writeXmlFile = new File(args[5]);
        writeXml(datasourceType, writeXmlFile);

        long end = currentTimeMillis();
        System.out.println(getClass().getSimpleName() + ": Took " + ((end - start) / 1000) + " seconds to collect " +
                fileTypes.size() + " files, " + mapTypes.size() + " maps and " + themeTypes.size() + " themes");
        System.exit(0);
    }

    private void recursiveCollect(String startUrl, String baseUrl, String uri, Set<String> uris, Set<String> visitedUris) throws IOException {
        if(visitedUris.contains(uri))
            return;
        visitedUris.add(uri);

        // avoid server overload
        try {
            Thread.sleep(100);
        } catch (InterruptedException e) {
            // intentionally do nothing
        }

        System.out.println(getClass().getSimpleName() + ": Downloading Webpage from " + startUrl + uri);
        Get get = new Get(startUrl + uri);
        String result = get.executeAsString();
        // System.out.println(result);

        AnchorParser parser = new AnchorParser();
        List<String> anchors = parser.parseAnchors(result);

        for (String anchor : anchors) {
            // System.out.println(anchor);

            if (anchor.startsWith("./"))
                anchor = anchor.substring(2);

            if (isRecurseAnchor(anchor)) {
                String nextPath = createPath(uri, anchor);
                if (nextPath.startsWith(startUrl))
                    nextPath = nextPath.substring(startUrl.length());
                recursiveCollect(startUrl, baseUrl, nextPath, uris, visitedUris);
            } else if (isIncludeAnchor(anchor)) {
                String nextUri = createUri(uri, anchor);
                if (nextUri.startsWith(baseUrl))
                    nextUri = nextUri.substring(baseUrl.length());
                uris.add(nextUri);
            }
        }
    }

    private String createPath(String uri, String anchor) {
        int index = uri.lastIndexOf('/');
        return index != -1 ? uri.substring(0, index + 1) + anchor : anchor;
    }

    private String createUri(String uri, String anchor) {
        int index = uri.lastIndexOf('/');
        return index != -1 && !anchor.contains("/") ? uri.substring(0, index + 1) + anchor : anchor;
    }

    private List<String> collectUris(String startUrl, String baseUrl) throws IOException {
        Set<String> uris = new HashSet<>();
        recursiveCollect(startUrl, baseUrl, "", uris, new HashSet<String>());

        String[] sortedUris = uris.toArray(new String[uris.size()]);
        sort(sortedUris);
        return asList(sortedUris);
    }

    protected abstract boolean isRecurseAnchor(String anchor);

    protected abstract boolean isIncludeAnchor(String anchor);

    private void parseUris(String baseUrl, List<String> uris, List<FileType> fileTypes, List<MapType> mapTypes, List<ThemeType> themeTypes) throws IOException {
        System.out.println(getClass().getSimpleName() + ": Parsing " + uris.size() + " URIs");
        for (int i = 0; i < uris.size(); i++)
            parseUri(baseUrl, uris.get(i), i, fileTypes, mapTypes, themeTypes);
    }

    protected ContentLengthAndLastModified extractContentLengthAndLastModified(String baseUrl, String uri, int index) {
        // avoid server overload
        try {
            Thread.sleep(100);
        } catch (InterruptedException e) {
            // intentionally do nothing
        }
        System.out.println(getClass().getSimpleName() + ": Extracting content length and last modified from " + baseUrl + uri + " (" + index + ")");
        try {
            Head head = new Head(baseUrl + uri);
            head.executeAsString();
            if (head.isSuccessful()) {
                return new ContentLengthAndLastModified(head.getContentLength(), head.getLastModified());
            }
        } catch (IOException e) {
            System.err.println(getClass().getSimpleName() + ": " + e.getMessage());
        }
        return null;
    }

    protected abstract void parseUri(String baseUrl, String uri, int index, List<FileType> fileTypes, List<MapType> mapTypes, List<ThemeType> themeTypes) throws IOException;
}
TOP

Related Classes of slash.navigation.download.tools.base.WebsiteDataSourcesXmlGenerator

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.