Package org.archive.wayback.util.url

Source Code of org.archive.wayback.util.url.UrlOperations

/*
*  This file is part of the Wayback archival access software
*   (http://archive-access.sourceforge.net/projects/wayback/).
*
*  Licensed to the Internet Archive (IA) by one or more individual
*  contributors.
*
*  The IA licenses this file to You under the Apache License, Version 2.0
*  (the "License"); you may not use this file except in compliance with
*  the License.  You may obtain a copy of the License at
*
*      http://www.apache.org/licenses/LICENSE-2.0
*
*  Unless required by applicable law or agreed to in writing, software
*  distributed under the License is distributed on an "AS IS" BASIS,
*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*  See the License for the specific language governing permissions and
*  limitations under the License.
*/
package org.archive.wayback.util.url;

import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.httpclient.URIException;
import org.archive.url.UsableURI;
import org.archive.url.UsableURIFactory;
import org.archive.wayback.archivalurl.ArchivalUrl;
import org.archive.wayback.core.WaybackRequest;
import org.archive.wayback.webapp.AccessPoint;

/**
* Class containing common static URL methods. Primarily resolveUrl() and
* the (currently) unused isAuthority().
*
* @author brad
* @version $Date$, $Revision$
*/
public class UrlOperations {
  private static final Logger LOGGER = Logger.getLogger(
      UrlOperations.class.getName());
 
  /**
   * ARC/WARC specific DNS resolution record.
   */
  public final static String DNS_SCHEME = "dns:";
  /**
   * HTTP
   */
  public final static String HTTP_SCHEME = "http://";
  /**
   * HTTPS
   */
  public final static String HTTPS_SCHEME = "https://";
  /**
   * FTP
   */
  public final static String FTP_SCHEME = "ftp://";
  /**
   * MMS
   */
  public final static String MMS_SCHEME = "mms://";
  /**
   * RTSP
   */
  public final static String RTSP_SCHEME = "rtsp://";
 
  /**
   * Default scheme to assume if unspecified. No context implied...
   */
  public final static String DEFAULT_SCHEME = HTTP_SCHEME; 
 
  /**
   * go brewster
   */
  public final static String WAIS_SCHEME = "wais://";
 
  /**
   * array of static Strings for all "known" schemes
   */
  public final static String ALL_SCHEMES[] = {
    HTTP_SCHEME,
    HTTPS_SCHEME,
    FTP_SCHEME,
    MMS_SCHEME,
    RTSP_SCHEME,
    WAIS_SCHEME
  };
 
 
  /**
   * character separating host from port within a URL authority
   */
  public final static char PORT_SEPARATOR = ':';
  /**
   * character which delimits the path from the authority in a... in some
   * URLs.
   */
  public final static char PATH_START = '/';
 
  private static final String ALL_TLDS = "ac|ad|ae|aero|af|ag|ai|al|am|an" +
      "|ao|aq|ar|arpa|as|asia|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi" +
      "|biz|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cat|cc|cd|cf|cg|ch|ci" +
      "|ck|cl|cm|cn|co|com|coop|cr|cu|cv|cx|cy|cz|de|dj|dk|dm|do|dz|ec" +
      "|edu|ee|eg|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh" +
      "|gi|gl|gm|gn|gov|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id" +
      "|ie|il|im|in|info|int|io|iq|ir|is|it|je|jm|jo|jobs|jp|ke|kg|kh" +
      "|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc" +
      "|md|me|mg|mh|mil|mk|ml|mm|mn|mo|mobi|mp|mq|mr|ms|mt|mu|museum" +
      "|mv|mw|mx|my|mz|na|name|nc|ne|net|nf|ng|ni|nl|no|np|nr|nu|nz" +
      "|om|org|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|pro|ps|pt|pw|py|qa|re|ro" +
      "|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|st|su|sv" +
      "|sy|sz|tc|td|tel|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|travel|tt|tv" +
      "|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|xn--0zwm56d" +
      "|xn--11b5bs3a9aj6g|xn--80akhbyknj4f|xn--9t4b11yi5a|xn--deba0ad" +
      "|xn--g6w251d|xn--hgbk6aj7f53bba|xn--hlcj6aya9esc7a|xn--jxalpdlp" +
      "|xn--kgbechtv|xn--mgbaam7a8h|xn--mgberp4a5d4ar|xn--p1ai" +
      "|xn--wgbh1c|xn--zckzah|ye|yt|za|zm|zw";

  private static final String IP_PATTERN = "[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+";
 
    private static final Pattern AUTHORITY_REGEX =
        Pattern.compile("(([0-9a-z_.-]+)\\.(" + ALL_TLDS + "))|" +
            "(" + IP_PATTERN + ")");

//    private static final Pattern AUTHORITY_REGEX_SIMPLE =
//        Pattern.compile("([0-9a-z_.-]++)");
    private static final Pattern HOST_REGEX_SIMPLE =
        Pattern.compile("(?:[0-9a-z_.:-]+@)?([0-9a-z_.-]++)");
    private static final Pattern USERINFO_REGEX_SIMPLE =
        Pattern.compile("^([0-9a-z_.:-]+)(?:@[0-9a-z_.-]++)");
   
    /**
     * Tests if the String argument looks like it could be a legitimate
     * authority fragment of a URL, that is, is it an IP address, or, are the
     * characters legal in an authority, and does the string end with a legal
     * TLD.
     *
   * @param authString String representation of a fragment of a URL
   * @return boolean indicating whether urlPart might be an Authority.
   */
  public static boolean isAuthority(String authString) {
    Matcher m = AUTHORITY_REGEX.matcher(authString);
   
    return (m != null) && m.matches();
  }
 
  /** Resolve URL, but return a minimally escaped version in case of
   *  error
   * @param baseUrl the base URL against which the url should be resolved
   * @param url the URL, possibly relative, to make absolute.
   * @return url resolved against baseUrl, unless it is absolute already, and
   * further transformed by whatever escaping normally takes place with a
   * UsableURI.
   * In case of error, return URL.
   */
  public static String resolveUrl(String baseUrl, String url) {
    String resolvedUrl = resolveUrl(baseUrl, url, null);
    if (resolvedUrl == null) {
      resolvedUrl = url.replace(" ", "%20");
      resolvedUrl = resolvedUrl.replace("\r", "%0D");
    }
    return resolvedUrl;
  }
 
  /**
   * Resolve a possibly relative url argument against a base URL.
   * @param baseUrl the base URL against which the url should be resolved
   * @param url the URL, possibly relative, to make absolute.
         * @param defaultValue The default value to return if the supplied values can't be resolved.
   * @return url resolved against baseUrl, unless it is absolute already, and
   * further transformed by whatever escaping normally takes place with a
   * UsableURI.
   * In case of error, return the defaultValue
   */
  public static String resolveUrl(String baseUrl, String url, String defaultValue) {
       
    for(final String scheme : ALL_SCHEMES) {
      if(url.startsWith(scheme)) {
        try {
          return UsableURIFactory.getInstance(url).getEscapedURI();
        } catch (URIException e) {
          LOGGER.warning(e.getLocalizedMessage() + ": " + url);
          // can't let a space exist... send back close to whatever came
          // in...
          return defaultValue;
        }
      }
    }
    UsableURI absBaseURI;
    UsableURI resolvedURI = null;
    try {
      absBaseURI = UsableURIFactory.getInstance(baseUrl);
      resolvedURI = UsableURIFactory.getInstance(absBaseURI, url);
    } catch (URIException e) {
      LOGGER.warning(e.getLocalizedMessage() + ": " + url);
      return defaultValue;
    }
   
    return resolvedURI.getEscapedURI();
  }
 
  /**
   * Attempt to find the scheme (http://, https://, etc) from a given URL.
   * @param url URL String to parse for a scheme.
   * @return the scheme, including trailing "://" if known, null otherwise.
   */
  public static String urlToScheme(final String url) {
    for(final String scheme : ALL_SCHEMES) {
      if(url.startsWith(scheme)) {
        return scheme;
      }
    }
    return null;
  }
 
  /**
   * Return the default port for the scheme String argument, if known.
   * @param scheme String scheme, including '://', as in, "http://", "ftp://"
   * @return the default port for the scheme, or -1 if the scheme isn't known.
   */
  public static int schemeToDefaultPort(final String scheme) {
    if(scheme.equals(HTTP_SCHEME)) {
      return 80;
    }
    if(scheme.equals(HTTPS_SCHEME)) {
      return 443;
    }
    if(scheme.equals(FTP_SCHEME)) {
      return 21;
    }
    if(scheme.equals(RTSP_SCHEME)) {
      return 554;
    }
    if(scheme.equals(MMS_SCHEME)) {
      return 1755;
    }
    return -1;
  }

  /**
   * Attempt to extract the path component of a url String argument.
   * @param url the URL which may contain a path, sans scheme.
   * @return the path component of the URL, or "" if it contains no path.
   */
  public static String getURLPath(String url) {
    url = stripURLScheme(url);
    int pathIdx = url.indexOf(UrlOperations.PATH_START);
    if(pathIdx == -1) {
      return "/";
    }
    return url.substring(pathIdx);
  }
  /**
   * Attempt to extract the path component of a url String argument.
   * @param url the URL which may contain a path, sans scheme.
   * @return the path component of the URL, or "" if it contains no path.
   */
  public static String stripURLScheme(String url) {
    String lcUrl = url.toLowerCase();
    for(String scheme : ALL_SCHEMES) {
      if(lcUrl.startsWith(scheme)) {
        return url.substring(scheme.length());
      }
    }
    return url;
 
  /**
   * Attempt to strip default ports out of URL strings.
   * @param url the original URL possibly including a port
   * @return the URL sans port, if the scheme was recognized and the default
   * port was supplied, otherwise, the original URL.
   */
  public static String stripDefaultPortFromUrl(String url) {
    String scheme = urlToScheme(url);
    if(scheme == null) {
      return url;
    }
    int defaultPort = schemeToDefaultPort(scheme);
    if(defaultPort == -1) {
      return url;
    }

    String portStr = null;
    // is there a slash after the scheme?
    int slashIdx = url.indexOf('/', scheme.length());
    if(slashIdx == -1) {
      portStr = String.format(":%d", defaultPort);
      if(url.endsWith(portStr)) {
        return url.substring(0,url.length() - portStr.length());
      }
    }
    portStr = String.format(":%d/", defaultPort);
    int idx = url.indexOf(portStr);
    if(idx == -1) {
      return url;
    }
    // if that occurred before the first / (after the scheme) then strip it:
    if(slashIdx < idx) {
      return url;
    }
    // we want to strip out the portStr:
    StringBuilder sb = new StringBuilder(url.length());
    sb.append(url.substring(0,idx));
    sb.append(url.substring(idx + (portStr.length()-1)));
    return sb.toString();
  }

  /**
   * @param orig String containing a URL, possibly beginning with "http:/".
   * @return original string if orig begins with "http://", or a new String
   * with the extra slash, if orig only had one slash.
   * @see #fixupScheme
   */
  public static String fixupHTTPUrlWithOneSlash(String orig) {
    if(orig.startsWith("http:/") && ! orig.startsWith(HTTP_SCHEME)) {
      // very likely the IE "you must have meant 1 slash, not 2 bug:
      StringBuilder sb = new StringBuilder(orig.length()+1);
      sb.append(HTTP_SCHEME);
      return sb.append(orig.substring(6)).toString();
    }
    return orig;
  }
   
  /**
   * fixes up malformed scheme part.
   * <p>currently supports fixing missing second slash for protocols
   * {@code http}, {@code https}, {@code ftp}, {@code rtsp} and
   * {@code mms}. For example fixing {@code http:/} to {@code https://}</p>
   * @param url URL to be checked and fixed
   * @return new String, or {@code url} if not fix is required.
   * @since 1.8.1
   */
  public static String fixupScheme(String url) {
    final String[] SCHEMES = {
      "http:/", "https:/", "ftp:/", "rtsp:/", "mms:/"
    };
    int ul = url.length();
    for (String scheme : SCHEMES) {
      int sl = scheme.length();
      if (url.startsWith(scheme) && (ul == sl || url.charAt(sl) != '/')) {
        return scheme + "/" + url.substring(sl);
      }
    }
    return url;
  }

  /**
   * Attempt to extract the hostname component of an absolute URL argument.
   * @param url the url String from which to extract the hostname
   * @return the hostname within the URL, or the url argument if the host
   * cannot be found.
   */
  public static String urlToHost(String url) {
    String lcUrl = url.toLowerCase();
    if(lcUrl.startsWith(DNS_SCHEME)) {
      return lcUrl.substring(DNS_SCHEME.length());
    }
    for(String scheme : ALL_SCHEMES) {
      if(lcUrl.startsWith(scheme)) {
        int authorityIdx = scheme.length();

        Matcher m =
          HOST_REGEX_SIMPLE.matcher(lcUrl.substring(authorityIdx));
        if(m.find()) {
          return m.group(1);
        }
      }
    }
    return url;
  }

  /**
   * Extract userinfo from the absolute URL argument, that is, "username@", or
   * "username:password@" if present.
   * @param url the URL from which to extract the userinfo
   * @return the userinfo found, not including the "@", or null if no userinfo
   * is found
   */
  public static String urlToUserInfo(String url) {
    String lcUrl = url.toLowerCase();
    if(lcUrl.startsWith(DNS_SCHEME)) {
      return null;
    }
    for(String scheme : ALL_SCHEMES) {
      if(lcUrl.startsWith(scheme)) {
        int authorityIdx = scheme.length();

        Matcher m =
          USERINFO_REGEX_SIMPLE.matcher(lcUrl.substring(authorityIdx));
        if(m.find()) {
          return m.group(1);
        }
      }
    }
    return null;
  }
 
  /**
   * Find and return the parent directory of the URL argument
   * @param url to find the parent directory of
   * @return parent directory of URL, or null, if either the url argument is
   * invalid, or if the url is the root of the authority.
   */
  public static String getUrlParentDir(String url) {
   
    try {
      UsableURI uri = UsableURIFactory.getInstance(url);
      String path = uri.getPath();
      if(path.length() > 1) {
        int startIdx = path.length()-1;
        if(path.charAt(path.length()-1) == '/') {
          startIdx--;
        }
        int idx = path.lastIndexOf('/',startIdx);
        if(idx >= 0) {
          uri.setPath(path.substring(0,idx+1));
          uri.setQuery(null);
          return uri.toString();
        }
      }
    } catch (URIException e) {
      LOGGER.warning(e.getLocalizedMessage() + ": " + url);
    }
    return null;
  }
 
  public static String computeIdentityUrl(WaybackRequest wbRequest)
  {
    AccessPoint accessPoint = wbRequest.getAccessPoint();

    boolean origIdentity = wbRequest.isIdentityContext();
    wbRequest.setIdentityContext(true);
   
    ArchivalUrl aUrl = new ArchivalUrl(wbRequest);
    String bestPath = aUrl.toString();
    String betterURI = accessPoint.getReplayPrefix() + bestPath;

    //reset the isIdentity flag just in case
    wbRequest.setIdentityContext(origIdentity);
   
    return betterURI;
  }
}
TOP

Related Classes of org.archive.wayback.util.url.UrlOperations

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.