// Package org.apache.sling.samples.webloader.internal
//
// Source code of org.apache.sling.samples.webloader.internal.WebloaderJob$DocTooBigException

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*      http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.sling.samples.webloader.internal;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Calendar;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;

import javax.jcr.Item;
import javax.jcr.Node;
import javax.jcr.RepositoryException;
import javax.jcr.Session;
import javax.swing.text.BadLocationException;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLDocument;
import javax.swing.text.html.HTMLEditorKit;

import org.apache.sling.commons.mime.MimeTypeService;
import org.apache.sling.samples.webloader.WebloaderException;
import org.apache.sling.samples.webloader.WebloaderJobStatus;
import org.apache.sling.jcr.api.SlingRepository;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/** A Webloader job, manages retrieval of documents from the web and storage
*  in the Sling repository. This code is based on the "populate.jsp" example
*  of the jackrabbit-webapp module.
*/
class WebloaderJob extends Thread implements WebloaderJobStatus {
    private Throwable error;
    private int numDocsLoaded;
    private final int maxDocsToRetrieve;
    private final int maxDocSize;
    private String statusInfo = "initialized";
    private String statusDetails = "";
    private boolean running = true;
    private final String jobId;
    private final String webQuery;
    private String storagePath;
    private final SlingRepository repository;
    private final MimeTypeService mimeTypeService;
    private Session session;
    private Node storageRoot;
    private static int idCounter;
    private final String [] filetypes;
   
    public static final String [] DEFAULT_FILETYPES = { "pdf", "rtf", "ppt", "doc", "xls" };
    public static final int URL_RETRIEVE_TIMEOUT_SECONDS = 10;
   
    private static final Logger log = LoggerFactory.getLogger(WebloaderJob.class);
   
    @SuppressWarnings("serial")
    static class DocTooBigException extends IOException {
        DocTooBigException(URL url, int size) {
            super("Document at URL " + url + " too big (" + size + " bytes), will be ignored");
        }
    }
   
    WebloaderJob(SlingRepository repository, MimeTypeService mimeTypeService,
            String webQuery, String storagePath, String fileExtensions, int maxDocsToRetrieve, int maxDocSize) {
        synchronized (WebloaderJob.class) {
            jobId = String.valueOf(idCounter++);
        }
       
        this.repository = repository;
        this.mimeTypeService = mimeTypeService;
        this.webQuery = webQuery;
        this.storagePath = storagePath;
        this.maxDocsToRetrieve = maxDocsToRetrieve;
        this.maxDocSize = maxDocSize;
       
        final String [] ft = fileExtensions == null ? null : fileExtensions.split(",");
        if(ft!=null && ft.length > 0) {
            filetypes = new String[ft.length];
            for(int i=0; i < ft.length; i++) {
                filetypes[i] = ft[i].trim().toLowerCase();
            }
        } else {
            filetypes = DEFAULT_FILETYPES;
        }
       
        if(mimeTypeService == null) {
            throw new WebloaderException("Missing MimeTypeService");
        }
        if(repository == null) {
            throw new WebloaderException("Missing Repository");
        }
       
        setDaemon(true);
        start();
    }
   
    @Override
    public String toString() {
        final StringBuffer sb = new StringBuffer();
        for(String str : filetypes) {
            if(sb.length() > 0) {
                sb.append(",");
            }
            sb.append(str);
        }
       
        return getClass().getSimpleName() + ", webQuery=" + webQuery
            + ", storagePath=" + storagePath
            + ", fileTypes=" + sb.toString()
            + ", maxDocsToRetrieve=" + maxDocsToRetrieve
            + ", maxDocSize=" + maxDocSize
        ;
    }
   
    @Override
    public void run() {
        log.debug("Job thread starting: {}", this);
       
        // TODO should use a session provided by client, but can we use it
        // safely for our async job?
        session = null;
       
        if(storagePath.charAt(0) == '/') {
            storagePath = storagePath.substring(1);
        }
        final String absStoragePath = "/" + storagePath;
       
        try {
            session = repository.loginAdministrative(null);
            if(session.itemExists(absStoragePath)) {
                final Item i = session.getItem(absStoragePath);
                if(i.isNode()) {
                    storageRoot = (Node)i;
                } else {
                    throw new WebloaderException("Item at " + storagePath + " is not a Node");
                }
            } else {
                // TODO deep-create hierarchy if needed
                storageRoot = session.getRootNode().addNode(storagePath);
                session.save();
            }
           
            int offset = 0;
            for(String type : filetypes) {
                final URL[] urls = getDocumentUrlsFromGoogle(type, offset);
                for(URL url : urls) {
                    try {
                        getAndStoreDocument(url);
                        session.save();
                        numDocsLoaded++;
                        if(numDocsLoaded >= maxDocsToRetrieve) {
                            break;
                        }
                    } catch(DocTooBigException dtb) {
                        log.info(dtb.getMessage());
                    } catch(Exception e) {
                        log.warn("Exception while retrieving url " + url, e);
                    } finally {
                        session.refresh(false);
                    }
                }
                offset += 10;
               
                if(numDocsLoaded >= maxDocsToRetrieve) {
                    break;
                }
            }
           
            statusInfo = "All done.";
           
        } catch(Exception e) {
            error = e;
            log.warn("Exception in WebloaderJob.run()", e);
            statusInfo = "Exception while running job: " + e;
           
        } finally {
            if(session != null) {
                session.logout();
            }
            statusDetails = "";
            running = false;
        }
       
        if(numDocsLoaded >= maxDocsToRetrieve) {
            log.info("Stopped after retrieving maximum number of documents ({})", maxDocsToRetrieve);
        }
       
        log.info("Job thread ends: {}, {} documents loaded", this, numDocsLoaded);
    }
   
    private URL [] getDocumentUrlsFromGoogle(String currentFiletype, int start) throws IOException, BadLocationException {
        final List urls = new ArrayList();
        String query = webQuery + " filetype:" + currentFiletype;
        final URL google = new URL("http://www.google.com/search?q=" +
                URLEncoder.encode(query, "UTF-8") + "&start=" + start);
        log.debug("Querying {}", google.toString());
        statusInfo = "Querying " + google.toString();
        statusDetails = "";
        URLConnection con = google.openConnection();
        con.setRequestProperty("User-Agent", "");
        InputStream in = con.getInputStream();
        try {
            HTMLEditorKit kit = new HTMLEditorKit();
            HTMLDocument doc = new HTMLDocument();
            doc.putProperty("IgnoreCharsetDirective", Boolean.TRUE);
            kit.read(new InputStreamReader(in, "UTF-8"), doc, 0);
            HTMLDocument.Iterator it = doc.getIterator(HTML.Tag.A);
            while (it.isValid()) {
                if(it.getAttributes() != null) {
                    String href = (String) it.getAttributes().getAttribute(HTML.Attribute.HREF);
                    if (href != null && href.endsWith("." + currentFiletype)) {
                        URL url = new URL(new URL("http", "www.google.com", "dummy"), href);
                        if (url.getHost().indexOf("google") == -1) {
                            log.debug("Got document URL from google: {}", url);
                            statusDetails = "Got URL " + url;
                            urls.add(url);
                        }
                    }
                }
                it.next();
            }
        } finally {
            in.close();
        }
        return (URL[]) urls.toArray(new URL[urls.size()]);

    }
   
    private void getAndStoreDocument(URL currentURL) throws RepositoryException, IOException {

        statusInfo = "Retrieving document " + currentURL;
        statusDetails = "";
       
        // build JCR path for storing document, based on its URL
        String path = currentURL.getPath();
        if (path.startsWith("/")) {
            path = path.substring(1);
        }
        final String host = currentURL.getHost();
        final List folderNames = new ArrayList();
        folderNames.addAll(Arrays.asList(host.split("\\.")));
        Collections.reverse(folderNames);
        folderNames.addAll(Arrays.asList(path.split("/", 0)));
        final String filename = URLDecoder.decode((String) folderNames.remove(folderNames.size() - 1), "UTF-8").replaceAll(":", "_");
        Node node = storageRoot;
        for (Iterator fn = folderNames.iterator(); fn.hasNext(); ) {
            String name = URLDecoder.decode((String) fn.next(), "UTF-8");
            name = name.replaceAll(":", "_");
            if (name.length() == 0) {
                continue;
            }
            if (!node.hasNode(name)) {
                node.addNode(name, "nt:folder");
            }
            node = node.getNode(name);
        }
       
        log.debug("Retrieving document {}, will be stored at {}", currentURL, node.getPath() + "/" + filename);
       
        if (!node.hasNode(filename)) {
            Node file = node.addNode(filename, "nt:file");
            final Node resource = file.addNode("jcr:content", "nt:resource");
            getAndStoreContent(currentURL, resource, filename);
        }

    }
   
    private void getAndStoreContent(URL currentURL, Node resource, String filename)
    throws RepositoryException, IOException {
        statusInfo = "Retrieving content from " + currentURL;
        statusDetails = "";
       
        final URLConnection con = currentURL.openConnection();
        con.setReadTimeout(URL_RETRIEVE_TIMEOUT_SECONDS * 1000);
        InputStream in = con.getInputStream();
        try {
            // Read with a ProgressInputStream, so that our status is updated while
            // downloading
            int length = con.getContentLength();
            if (length != -1) {
                if(length > maxDocSize * 1024) {
                    throw new DocTooBigException(currentURL, length);
                }
                in = new ProgressInputStream(in, length) {
                    int nextReport = 0;
                    protected void reportProgress(int bytesRead, int totalBytesToRead) {
                        if(bytesRead > nextReport) {
                            nextReport += 1024;
                            statusDetails = "Downloaded " + bytesRead + " bytes out of " + totalBytesToRead;
                        }
                    }
                };
            }
           
            resource.setProperty("jcr:data", in);
            final String mimeType = mimeTypeService.getMimeType(filename);
            resource.setProperty("jcr:mimeType", mimeType);
            final Calendar lastModified = Calendar.getInstance();
            lastModified.setTimeInMillis(con.getLastModified());
            resource.setProperty("jcr:lastModified", lastModified);
        } finally {
            if(in != null) {
                in.close();
            }
        }
    }
   
    String getJobId() {
        return jobId;
    }
   
    /** {@inheritDoc} */
    public Throwable getError() {
        return error;
    }

    /** {@inheritDoc} */
    public int getNumberOfDocumentsLoaded() {
        return numDocsLoaded;
    }

    /** {@inheritDoc} */
    public String getStatusInfo() {
        return statusInfo;
    }

    /** {@inheritDoc} */
    public String getStatusDetails() {
        return statusDetails;
    }

    /** {@inheritDoc} */
    public boolean isRunning() {
        return running;
    }

}
// TOP
//
// Related Classes of org.apache.sling.samples.webloader.internal.WebloaderJob$DocTooBigException
//
// TOP
// Copyright © 2018 www.massapi.com. All rights reserved.
// All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by Oracle Inc. Contact coftware#gmail.com.