Package org.apache.oodt.cas.pushpull.protocol.http

Source Code of org.apache.oodt.cas.pushpull.protocol.http.HttpClient

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.oodt.cas.pushpull.protocol.http;

//OODT imports
import org.apache.oodt.cas.pushpull.exceptions.ProtocolException;
import org.apache.oodt.cas.pushpull.protocol.Protocol;
import org.apache.oodt.cas.pushpull.protocol.ProtocolFile;
import org.apache.oodt.cas.pushpull.protocol.ProtocolPath;
import org.apache.oodt.cas.metadata.util.MimeTypeUtils;

//TIKA imports
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.html.HtmlParser;
import org.apache.tika.sax.Link;
import org.apache.tika.sax.LinkContentHandler;

//JDK imports
import java.io.BufferedOutputStream;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Scanner;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

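/**
*
* HTTP implementation of {@link Protocol}: pages whose content type resolves
* to text/html are treated as directories and the links found on them as
* their children
*
*
* @author bfoster
* @version $Revision$
*
*/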
public class HttpClient extends Protocol {

  // Link-type labels. None of the three is referenced elsewhere in this
  // class.
  static String DIR = "dir";

  static String FILE = "file";

  static String IGNORE = "ignore";

  // Cache of already-parsed child links, keyed by the URL string of the page
  // they were read from. Static, so it is shared by all HttpClient instances
  // (filled and read in parseLink).
  static HashMap<String, LinkedList<ProtocolFile>> linkChildren = new HashMap<String, LinkedList<ProtocolFile>>();

  // Not referenced elsewhere in this class.
  static boolean takeAllFiles = true;

  // Root path ("/") of the connected host; assigned in connect().
  HttpPath parentPath;

  // Raised by abortCurFileTransfer() and polled by the copy loop in getFile().
  // NOTE(review): the field is not volatile; if abort requests arrive from
  // another thread, confirm that visibility is guaranteed some other way.
  boolean abort;

  // Current working path; changed by chDir(), cdToRoot() and connect().
  HttpPath currentPath;

  // Set to true by a successful connect().
  boolean isConnected;

  // Content-type resolver created in the constructor; used by isDirectory().
  MimeTypeUtils mimeTypes;

  public HttpClient() throws InstantiationException {
    super("http");
    try {
      mimeTypes = new MimeTypeUtils();
    } catch (Exception e) {
      e.printStackTrace();
      throw new InstantiationException(
          "Failed to load tika configuration file : " + e.getMessage());
    }
    isConnected = false;
  }

  protected void chDir(ProtocolPath path) throws ProtocolException {
    if (!(path instanceof HttpPath))
      throw new ProtocolException(
          "HttpClient must receive a HttpPath - failed to cd");

    HttpPath httpPath = (HttpPath) path;
    try {
      if (!this
          .isDirectory(httpPath.getLink().toString(), path.getPathString()))
        throw new ProtocolException(path
            + " is not a directory (mime type must be text/html)");
      this.currentPath = httpPath;
    } catch (Exception e) {
      throw new ProtocolException("Failed to cd to " + path + " : "
          + e.getMessage());
    }
  }

  public void cdToRoot() {
    this.currentPath = this.parentPath;
  }

  public void connect(String host, String username, String password)
      throws ProtocolException {
    try {
      URL newURL = new URL("http://" + host + "/");
      newURL.openStream().close();
      currentPath = parentPath = new HttpPath("/", true, newURL, null);
      isConnected = true;
    } catch (Exception e) {
      throw new ProtocolException("Failed to connect to http://" + host + " : "
          + e.getMessage());
    }
  }

  public void disconnectFromServer() throws ProtocolException {
    currentPath = parentPath = null;
  }

  public void getFile(ProtocolFile file, File toLocalFile)
      throws ProtocolException {

    OutputStream out = null;
    InputStream in = null;
    try {
      this.abort = false;
      out = new BufferedOutputStream(new FileOutputStream(toLocalFile));
      in = ((HttpPath) file.getProtocolPath()).getLink().openStream();

      byte[] buffer = new byte[1024];
      int numRead;
      long numWritten = 0;
      while ((numRead = in.read(buffer)) != -1 && !this.abort) {
        out.write(buffer, 0, numRead);
        numWritten += numRead;
      }
      in.close();
      out.close();
    } catch (Exception e) {
      throw new ProtocolException("Failed to get file " + file + " : "
          + e.getMessage());
    } finally {
      if (in != null)
        try {
          in.close();
        } catch (Exception e) {
          // log failure
        }
      if (out != null)
        try {
          out.close();
        } catch (Exception e) {
          // log failure
        }
    }
  }

  public void abortCurFileTransfer() {
    this.abort = true;
  }

  public List<ProtocolFile> listFiles() throws ProtocolException {
    return parseLink(currentPath);
  }

  public ProtocolFile getCurrentWorkingDir() throws ProtocolException {
    try {
      return new ProtocolFile(this.getRemoteSite(), currentPath);
    } catch (Exception e) {
      throw new ProtocolException("Failed to get current working directory : "
          + e.getMessage());
    }
  }

  public boolean isConnected() throws ProtocolException {
    return this.isConnected;
  }

  public LinkedList<ProtocolFile> parseLink(HttpPath path)
      throws ProtocolException {
    LinkedList<ProtocolFile> children = linkChildren.get(path.getLink()
        .toString());
    if (path.isDirectory() && children == null) {
      try {

        // Open link
        HttpURLConnection con = (HttpURLConnection) path.getLink()
            .openConnection();
        con.connect();
        con.getResponseMessage();

        // if redirection took place, then change the ProtocolFile's URL
        if (!path.getLink().toString().equals(con.getURL().toString()))
          path = new HttpPath(path.getPathString(), path.isDirectory(), con
              .getURL(), path);

        // create URL source reader
        Scanner scanner = new Scanner(con.getInputStream());

        // Read in link
        StringBuffer sb = new StringBuffer("");
        while (scanner.hasNext())
          sb.append(scanner.nextLine());

        HtmlParser parser = new HtmlParser();
        Metadata met = new Metadata();
        LinkContentHandler handler = new LinkContentHandler();

        parser.parse(new ByteArrayInputStream(sb.toString().getBytes()),
            handler, met);
        List<Link> links = handler.getLinks();
        children = new LinkedList<ProtocolFile>();
        for (Link link : links) {
          String href = link.getUri();
          String linkName = link.getTitle();
          String curPath = this.pwd().getProtocolPath().getPathString();
          String linkPath = curPath + (curPath.endsWith("/") ? "" : "/")
              + linkName;
          children.add(new ProtocolFile(this.getRemoteSite(), new HttpPath(
              linkPath, isDirectory(href, linkPath), new URL(href), path)));
        }
        linkChildren.put(path.getLink().toString(), children);

      } catch (Exception e) {
        e.printStackTrace();
        throw new ProtocolException("Failed to get children links for " + path
            + " : " + e.getMessage());
      }
    }
    return children;
  }

  public static String findLinkInATag(String aTag) {
    // find 'href' attribute
    String find = aTag.substring(aTag.indexOf("href") + 4);
    // USE STRICT FINDING FIRST
    // (['\"])\s*?[(http)(./)(..)/#].+?\\1
    // finds link between ' or ", which starts with one of
    // the following: http, ./, .., /, #
    // these starting possibilities can then be followed any
    // number of characters until the corresponding
    // ' or " is reached.
    String patternRegExp = "(['\"])\\s*?[\\(http\\)\\(\\./\\)\\(\\.\\.\\)/#].+?\\1";
    Pattern linkPattern = Pattern.compile(patternRegExp);
    Matcher linkMatch = linkPattern.matcher(find);
    if (linkMatch.find())
      find = find.substring(linkMatch.start() + 1, linkMatch.end() - 1);
    else {
      // RELAX FINDING SOME
      patternRegExp = "(['\"])\\s*?[^./].+?\\1";
      linkPattern = Pattern.compile(patternRegExp);
      linkMatch = linkPattern.matcher(find);
      if (linkMatch.find())
        find = find.substring(linkMatch.start() + 1, linkMatch.end() - 1);
      else {
        // EXTREMELY RELAX FINDING
        patternRegExp = "[^\"='/>\\s]+?[^\\s>\"']*?";
        linkPattern = Pattern.compile(patternRegExp);
        linkMatch = linkPattern.matcher(find);
        if (linkMatch.find())
          find = find.substring(linkMatch.start(), linkMatch.end());
        else {
          return null;
        }
      }
    }
    return find;
  }

  public boolean isDirectory(String link, String virtualPath)
      throws ProtocolException, IOException {
    // connect URL and get content type
    try {
      String mime = this.mimeTypes.autoResolveContentType(link, MimeTypeUtils
          .readMagicHeader(new URL(link).openStream()));
      return (mime.equals("text/html") && !virtualPath.endsWith(".html"));
    } catch (Exception e) {
      throw new IOException("URL does not exist " + link);
    }
  }

  public static String createLinkFromHref(HttpPath parent, String href) {
    if (!href.startsWith("http")) {
      String link = parent.getLink().toExternalForm();
      if (href.startsWith("..")) {
        int index = link.substring(0, link.lastIndexOf("/")).lastIndexOf("/");
        href = (index < 7) ? link + href.substring(2) : link.substring(0, link
            .substring(0, link.lastIndexOf("/")).lastIndexOf("/"))
            + href.substring(2);
      } else if (href.startsWith("./")) {
        int index = link.lastIndexOf("/");
        href = (index < 7) ? link + href.substring(1) : link
            .substring(0, index)
            + href.substring(1);
      } else if (href.startsWith("/")) {
        URL url = parent.getLink();
        href = url.getProtocol() + "://" + url.getHost() + href;
      } else {
        // find the last / in current link
        int index = link.lastIndexOf("/");
        // (index < 7) checks if in the current link, "/" only exists
        // in the protocol section of link (i.e. http://jpl.nasa.gov)
        href = (index < 7) ? link + "/" + href : link.substring(0, index) + "/"
            + href;
      }
    }

    // remove "/" at end of link
    if (href.endsWith("/"))
      href = href.substring(0, href.length() - 1);
    href = href.trim();

    return href;
  }

  public ProtocolFile getProtocolFileFor(String path, boolean isDir)
      throws ProtocolException {
    try {
      StringTokenizer st = new StringTokenizer(path, "/ ");
      HttpPath curPath = this.parentPath;
      // System.out.println(parentPath);
      if (st.hasMoreTokens()) {
        do {
          String token = st.nextToken();
          LinkedList<ProtocolFile> children = this.parseLink(curPath);
          for (ProtocolFile pFile : children) {
            if (pFile.getName().equals(token)) {
              // System.out.println("token " + token + " " +
              // pFile);
              curPath = (HttpPath) pFile.getProtocolPath();
              continue;
            }
          }
        } while (st.hasMoreTokens());
        if (curPath.equals(this.parentPath))
          return new ProtocolFile(this.getRemoteSite(), new HttpPath(path,
              isDir, new URL("http://"
                  + this.getRemoteSite().getURL().getHost() + path), curPath));
      }
      return new ProtocolFile(this.getRemoteSite(), curPath);
    } catch (Exception e) {
      throw new ProtocolException("Failed to get ProtocolPath for " + path);
    }
  }

  /**
   * Performs no deletion: the given file is ignored and <code>false</code> is
   * always returned.
   *
   * @param file
   *          ignored
   * @return always <code>false</code>
   */
  @Override
  public boolean deleteFile(ProtocolFile file) {
    return false;
  }

  public static void main(String[] args) throws Exception {
    String urlString = null, downloadToDir = null;
    for (int i = 0; i < args.length; i++) {
      if (args[i].equals("--url"))
        urlString = args[++i];
      else if (args[i].equals("--downloadToDir"))
        downloadToDir = args[++i];
    }

    if (urlString == null)
      throw new Exception("Must specify a url to download: --url <url>");

    URL url = new URL(urlString);
    ProtocolFile urlFile = new ProtocolFile(null, new HttpPath(url.getPath(),
        false, url, null));
    File toFile = new File(downloadToDir, urlFile.getName());
    toFile = toFile.getAbsoluteFile();
    toFile.createNewFile();
    new HttpClient().getFile(urlFile, toFile);
  }

}
TOP

Related Classes of org.apache.oodt.cas.pushpull.protocol.http.HttpClient

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.