// Source code of thredds.crawlabledataset.CrawlableDatasetDods

/*
 * Copyright 1998-2009 University Corporation for Atmospheric Research/Unidata
 *
 * Portions of this software were developed by the Unidata Program at the
 * University Corporation for Atmospheric Research.
 *
 * Access and use of this software shall impose the following obligations
 * and understandings on the user. The user is granted the right, without
 * any fee or cost, to use, copy, modify, alter, enhance and distribute
 * this software, and any derivative works thereof, and its supporting
 * documentation for any purpose whatsoever, provided that this entire
 * notice appears in all copies of the software, derivative works and
 * supporting documentation.  Further, UCAR requests that the user credit
 * UCAR/Unidata in any publications that result from the use of this
 * software or in any product that includes this software. The names UCAR
 * and/or Unidata, however, may not be used in any advertising or publicity
 * to endorse or promote any products or commercial entity unless specific
 * written permission is obtained from UCAR/Unidata. The user also
 * understands that UCAR/Unidata is not obligated to provide the user with
 * any support, consulting, training or assistance of any kind with regard
 * to the use, operation and performance of this software nor to provide
 * the user with any updates, revisions, new versions or "bug fixes."
 *
 * THIS SOFTWARE IS PROVIDED BY UCAR/UNIDATA "AS IS" AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL UCAR/UNIDATA BE LIABLE FOR ANY SPECIAL,
 * INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING
 * FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
 * NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION
 * WITH THE ACCESS, USE OR PERFORMANCE OF THIS SOFTWARE.
 */
package thredds.crawlabledataset;


import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;


import thredds.util.DodsURLExtractor;


/**
 * A description
 * 
 * @author Ethan Davis
 * @author Bas Retsios
 * @since Jun 8, 2005 15:34:04 -0600
 */
public class CrawlableDatasetDods implements CrawlableDataset {
  static private org.slf4j.Logger log = org.slf4j.LoggerFactory
      .getLogger(CrawlableDatasetDods.class);


  private static DodsURLExtractor urlExtractor = null;
  
  private static Map listDatasetsMap = null; // maintain an in-memory copy for performance reasons .. TODO: add a version-check
  
  private String path;
  
  private URLConnection pathUrlConnection = null; // store this, for performance reasons


  private String name;


  private Object configObj = null;
  
  protected CrawlableDatasetDods() {
  }


  protected CrawlableDatasetDods(String path, Object configObj)
  {


    if (urlExtractor == null)
      urlExtractor = new DodsURLExtractor();
    
    if (listDatasetsMap == null) // for performance
      listDatasetsMap = new HashMap();


    if (configObj != null) {
      log.debug("CrawlableDatasetDods(): config object not null, it will be ignored <"
              + configObj.toString() + ">.");
      this.configObj = configObj;
    }
    
    if (path.startsWith("http:")) {
      this.path = path;


      try {
        new URI(path); // check syntax .. URISyntaxException if its not good
        name = getName(path);
      } catch (URISyntaxException e) {
            String tmpMsg = "Bad URI syntax for path <" + path + ">: " + e.getMessage();
            log.debug( "CrawlableDatasetDods(): " + tmpMsg);
            throw new IllegalArgumentException( tmpMsg);
      }


        // Check if this accessPoint URL is an OPeNDAP server URL.
      // For now commented-out because it takes far too long when expanding a directory:
      // all links would be tested, because a CrawlableDataset is new-ed too fast (when its parent is expanded).
      /*
        String apVersionString = path + (path.endsWith("/") ? "version" : "/version");
        String apVersionResultContent = null;
        try
        {
          apVersionResultContent = urlExtractor.getTextContent( apVersionString);
        }
        catch (java.io.IOException e)
        {
          String tmpMsg = "The accessPoint URL is not an OPeNDAP server URL (no version info) <" + apVersionString + ">";
          log.error( "CrawlableDatasetDods(): " + tmpMsg, e);
        }
        if ( apVersionResultContent == null ||
           (apVersionResultContent.indexOf( "DODS") == -1 &&
             apVersionResultContent.indexOf( "OPeNDAP") == -1 &&
             apVersionResultContent.indexOf( "DAP") == -1))
        {
          String tmpMsg = "The accessPoint URL version info is not valid <" + apVersionResultContent + ">";
          log.error(  "CrawlableDatasetDods(): " + tmpMsg);
        }
        */
    } else {
      String tmpMsg = "Invalid url <" + path + ">.";
      log.debug("CrawlableDatasetDods(): " + tmpMsg);
      throw new IllegalArgumentException(tmpMsg);
    }
  }
  
  private CrawlableDatasetDods(CrawlableDatasetDods parent, String childPath)
  {
    String normalChildPath = childPath.startsWith("/")?childPath.substring(1):childPath;
    this.path = parent.getPath();
    this.path += this.path.endsWith("/") ? normalChildPath : "/" + normalChildPath;
    this.name = getName(path);
    this.configObj = null;
  }


  private String getName(String path) {
    // Attempt to return the last name in the path name sequence.
    if (!path.equals("/")) {
      String tmpName = path.endsWith("/")
                       ? path.substring(0, path.length() - 1) : path;
      int index = tmpName.lastIndexOf("/");
      if (index != -1)
        tmpName = tmpName.substring(index + 1);
      return tmpName;
    } else
      return path;
  }


  /**
   * Provide access to the java.net.URI that this CrawlableDataset represents.
   *
   * @return the java.net.URI that this CrawlableDataset represents.
   */
  public URI getUri()
  {
    try
    {
      return new URI( this.path);
    }
    catch ( URISyntaxException e )
    {
      return null;
    }
  }


  public Object getConfigObject() {
    return configObj;
  }


  public String getPath() {
    return (this.path);
  }


  public String getName() {
    return (this.name);
  }


  public boolean isCollection() {
    return isCollection(path);
  }


  public CrawlableDataset getDescendant( String relativePath )
  {
    if ( relativePath.startsWith( "/"))
      throw new IllegalArgumentException( "Path must be relative <" + relativePath + ">.");
    return new CrawlableDatasetDods(this, relativePath);
  }


  // how do we determine if a url is a collection?
  // we can't count on a trailing backslash, as this was removed by CrawlableDatasetFactory
  // for now, assume collection unless a known file extension is encountered
  private static String [] knownFileExtensions = {".hdf", ".xml", ".nc", ".bz2", ".cdp", ".jpg"};
  
  private static boolean isCollection(String path)
  {
    String testPath = path.toLowerCase(); // otherwise our matches may fail
    if (isDodsDataset(testPath))
      return false;
    else
    {
      int i = 0;
      while ((i < (knownFileExtensions.length)) && !testPath.endsWith(knownFileExtensions[i]))
          ++i;
      return (i >= knownFileExtensions.length); // i < length means we deal with a known file ==> no collection 
    }
  }


  private static String [] dodsExtensions = {".html", ".htm", ".das", ".dds", ".info"};


  private static String getDodsExtension(String path)
  {
    String extension = "";
    String testPath = path.toLowerCase(); // otherwise our matches may fail


    int i = 0;
    while ((i < (dodsExtensions.length)) && !testPath.endsWith(dodsExtensions[i]))
      ++i;
    if (i < dodsExtensions.length)
      extension = dodsExtensions[i];
    return extension;
  }


  private static boolean isDodsDataset(String path)
  {
    return getDodsExtension(path).length() > 0;
  }
  
  private static String removeDodsExtension(String path)
  {
    String dodsExtension = getDodsExtension(path);
    if (dodsExtension.length() > 0)
      path = path.substring(0, path.length() - dodsExtension.length());


    return path;
  }
  
  // This function shouldn't be here !!!
  // It is a workaround for many OPeNDAP servers that crop part of their urls (the /opendap-bin/nph-dods/ part)
  // e.g. of server with problem (2-Nov-2006): http://acdisc.sci.gsfc.nasa.gov/opendap-bin/nph-dods/OPENDAP/Giovanni/
  private String forceChild(String url)
  {
    String prefix = path;
    if (prefix.endsWith("/"))
      prefix = path.substring(0, path.length() - 1); // because the url also contains a '/' that we will use      
    int j = url.substring(0, url.length() - 1).lastIndexOf('/'); // url.length() - 1 was intentional .. if the last char is a '/', we're interested in the previous one.
    if (j >= 0)
    {
      String ret = prefix + url.substring(j);
      return ret;
    }
    else // relative paths .. leave intact
      return url;
  }


  public List listDatasets() throws IOException {


    if (!this.isCollection()) {
      String tmpMsg = "This dataset <" + this.getPath()
          + "> is not a collection dataset.";
      log.error("listDatasets(): " + tmpMsg);
      throw new IllegalStateException(tmpMsg);
    }
    
    if (listDatasetsMap.containsKey(path)) // shortcut .. for performance
      return (List)listDatasetsMap.get(path);
    else
    {
      List list = new ArrayList();
      List pathList = new ArrayList(); // only for detecting duplicates (after removing the extension, sometimes we end up with duplicates)
  
      // Get list of possible datasets from current URL.
      List possibleDsList = null;
      try {
        String openPath = path;
        if (!openPath.endsWith("/")) // if you skip this, you will find that relative URLs don't work (fails in "extract", and in particular in URL u = new URL(baseURL, value))
          openPath += "/";
        possibleDsList = urlExtractor.extract(openPath);
      } catch (java.io.IOException e) {
        log.warn("listDatasets(): IOException while extracting dataset info from given OPeNDAP directory <"
                + path + ">, return empty list: " + e.getMessage());
        return (list);
      }
  
      // Handle each link in the current access path.
      String curDsUrlString = null;
      for (Iterator it = possibleDsList.iterator(); it.hasNext(); ) {
        curDsUrlString = (String) it.next();
        // Perform some tests on curDsUrlString
        // Skip datasets that aren't OPeNDAP datasets (".html") or
        // collection datasets ("/").
        if ((!isDodsDataset(curDsUrlString)) && (!isCollection(curDsUrlString))) {
          log.warn("expandThisLevel(): Dataset isn't an OPeNDAP dataset or collection dataset, skip <"
                  + path + ">.");
          continue;
        }
  
        curDsUrlString = removeDodsExtension(curDsUrlString);


        // This function goes a bit too far trying to recover from servers that drop part of URL path.
        // However, it also converts URLs that point to external servers to be subdirectories of this CrDS.
        //curDsUrlString = forceChild(curDsUrlString);


        // Skip any URLs that aren't children of this CrDs
        if ( !curDsUrlString.startsWith( path ) )
        {
          log.debug( "listDatasets(): skipping URL <" + curDsUrlString + ">, not child of this CrDs <" + path + ">." );
          continue;
        }


        if (pathList.contains(curDsUrlString))
          continue; // duplicate
        else
          pathList.add(curDsUrlString);
        
  
        // Avoid links back down the path hierarchy (i.e., parent directory links).
        // Comment: this call was taken over from CrawlableDatasetFile. Since we use forceChild, this call is currently useless.
        if (!curDsUrlString.startsWith(path)) {
          log.debug("listDatasets(): current path <" + curDsUrlString
              + "> not child of given" + " location <" + path
              + ">, skip.");
          continue;
        }
  
        try {
          new URI(curDsUrlString); // syntax check
        } catch (URISyntaxException e) {
          log.error("listDatasets(): Skipping dataset  <"
              + curDsUrlString + "> due to URISyntaxException: "
              + e.getMessage());
          continue;
        }
  
        log.debug("listDatasets(): handle dataset (" + curDsUrlString
                + ")");
  
        // So far so good .. curDsUrlString passed all tests, thus add it to the list
        try {
          list.add(CrawlableDatasetFactory.createCrawlableDataset(
              curDsUrlString, this.getClass().getName(), null));
        } catch (ClassNotFoundException e) {
          log.warn("listDatasets(): Can't make CrawlableDataset for child url <"
                  + curDsUrlString + ">: " + e.getMessage());
        } catch (NoSuchMethodException e) {
          log.warn("listDatasets(): Can't make CrawlableDataset for child url <"
                  + curDsUrlString + ">: " + e.getMessage());
        } catch (IllegalAccessException e) {
          log.warn("listDatasets(): Can't make CrawlableDataset for child url <"
                  + curDsUrlString + ">: " + e.getMessage());
        } catch (InvocationTargetException e) {
          log.warn("listDatasets(): Can't make CrawlableDataset for child url <"
                  + curDsUrlString + ">: " + e.getMessage());
        } catch (InstantiationException e) {
          log.warn("listDatasets(): Can't make CrawlableDataset for child url <"
                  + curDsUrlString + ">: " + e.getMessage());
        }
      }
      
      listDatasetsMap.put(path, list); // remember it next time, for performance
  
      return list;
    }
  }


  public List listDatasets(CrawlableDatasetFilter filter) throws IOException {
    List list = this.listDatasets();
    if (filter == null)
      return list;
    List retList = new ArrayList();
    for (Iterator it = list.iterator(); it.hasNext();) {
      CrawlableDataset curDs = (CrawlableDataset) it.next();
      if (filter.accept(curDs)) {
        retList.add(curDs);
      }
    }
    return (retList);
  }


  public CrawlableDataset getParentDataset() {
    if (!path.equals("/")) {
      String parentPath = path;
      int index = parentPath.lastIndexOf( "/", parentPath.endsWith( "/") ? parentPath.length() - 2 : parentPath.length() - 1 );
      if ( index != -1 )
        parentPath = parentPath.substring( 0, index + 1 );
      
      return new CrawlableDatasetDods( parentPath, null);
    } else
      return null;
  }


  public boolean exists() {
  if (pathUrlConnection == null)
  try {
    URL u = new URL(path);
    pathUrlConnection = u.openConnection();
  } catch (MalformedURLException e) {
  } catch (IOException e) {
  }
  if ( pathUrlConnection != null )
  try {
    int responseCode = ((HttpURLConnection)pathUrlConnection).getResponseCode();
    if (responseCode >= 200 && responseCode < 300) // Successful
      return true;
  } catch (IOException e) {
  }
  return false;
  }


  public long length() {
    if (this.isCollection())
      return (0);
    if (pathUrlConnection == null)
    {
      try {
          URL u = new URL(path);
          pathUrlConnection = u.openConnection();
      } catch (MalformedURLException e) {
      } catch (IOException e) {
      }
    }
    if (pathUrlConnection != null)
      return pathUrlConnection.getContentLength();
    else
      return (-1);
  }


  public Date lastModified() {
    if (pathUrlConnection == null)
    {
      try {
        URL u = new URL(path);
        pathUrlConnection = u.openConnection();
      } catch (MalformedURLException e) {
      } catch (IOException e) {
      }
    }


    if (pathUrlConnection != null)
    {
      long lastModified = pathUrlConnection.getLastModified();
      if (lastModified != 0)
      {
        Calendar cal = Calendar.getInstance();
        cal.clear();
        cal.setTimeInMillis(lastModified);
        return (cal.getTime());
      }
      else
        return null;
    }
    else
      return null;
  }


  public String toString()
  {
    return this.path;
  }
}
// Source code of thredds.crawlabledataset.CrawlableDatasetDods

// Related classes of thredds.crawlabledataset.CrawlableDatasetDods