Source Code of org.archive.wayback.resourceindex.NutchResourceIndex

/*
 *  This file is part of the Wayback archival access software
 *   (http://archive-access.sourceforge.net/projects/wayback/).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.wayback.resourceindex;


import it.unimi.dsi.lang.MutableString;


import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;


import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;


import org.archive.wayback.ResourceIndex;
import org.archive.wayback.core.CaptureSearchResult;
import org.archive.wayback.core.CaptureSearchResults;
import org.archive.wayback.core.SearchResults;
import org.archive.wayback.core.WaybackRequest;
import org.archive.wayback.exception.AccessControlException;
import org.archive.wayback.exception.BadQueryException;
import org.archive.wayback.exception.ConfigurationException;
import org.archive.wayback.exception.ResourceIndexNotAvailableException;
import org.archive.wayback.exception.ResourceNotInArchiveException;
import org.archive.wayback.util.Timestamp;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;


/**
 *
 *
 * @author brad
 * @version $Date$, $Revision$
 */
public class NutchResourceIndex implements ResourceIndex {
     private static final Logger LOGGER =
          Logger.getLogger(NutchResourceIndex.class.getName());


  private final static int MAX_RECORDS = 1000;
  private int maxRecords = MAX_RECORDS;


  
   private static final String NUTCH_NS =
       "http://www.nutch.org/opensearchrss/1.0/";
   private String searchUrlBase;
   private DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
   private DocumentBuilder builder;
//   private static final String NUTCH_ARCNAME = "arcname";
//   private static final String NUTCH_ARCOFFSET = "arcoffset";
   private static final String NUTCH_FILENAME = "filename";
   private static final String NUTCH_FILEOFFSET = "fileoffset";
   private static final String NUTCH_ARCDATE = "date";
//   private static final String NUTCH_ARCDATE_ALT = "arcdate";
   private static final String NUTCH_DIGEST = "digest";
   private static final String NUTCH_MIME_TYPE = "type";
//   private static final String NUTCH_PRIMARY_TYPE = "primaryType";
//   private static final String NUTCH_SUB_TYPE = "subType";
//   private static final String NUTCH_CAPTURE_HOST = "site";
   private static final String NUTCH_CAPTURE_URL = "link";


   private static final String NUTCH_SEARCH_RESULT_TAG = "item";
   private static final String NUTCH_SEARCH_RESULTS_TAG = "channel";
   private static final String NUTCH_FIRST_RESULT = "opensearch:startIndex";
   private static final String NUTCH_NUM_RESULTS = "opensearch:totalResults";
   private static final String NUTCH_NUM_RETURNED = "opensearch:itemsPerPage";
   
   private static final String NUTCH_DEFAULT_HTTP_CODE = "200";
   private static final String NUTCH_DEFAULT_REDIRECT_URL = "-";
   
  /**
   * @throws ConfigurationException
   */
  public void init() throws ConfigurationException {
    LOGGER.info("initializing NutchResourceIndex...");
    LOGGER.info("Using base search url " + this.searchUrlBase);


    this.factory.setNamespaceAware(true);
    try {
      this.builder = this.factory.newDocumentBuilder();
    } catch (ParserConfigurationException e) {
      // TODO: quiet extra stacktrace..
      e.printStackTrace();
      throw new ConfigurationException(e.getMessage());
    }
  }
  /* (non-Javadoc)
   * @see org.archive.wayback.ResourceIndex#query(org.archive.wayback.core.WaybackRequest)
   */
  public SearchResults query(WaybackRequest wbRequest) 
    throws ResourceIndexNotAvailableException,
    ResourceNotInArchiveException, BadQueryException, 
    AccessControlException {


    // Get the URL for the request:
    String requestUrl = getRequestUrl(wbRequest);
    Document document = null;
    try {
      // HTTP Request + parse
      LOGGER.info("Requesting OpenSearch: " + requestUrl);
      document =  getHttpDocument(requestUrl);
    } catch (IOException e) {
      // TODO: better error for user:
      e.printStackTrace();
      throw new ResourceIndexNotAvailableException(e.getMessage());
    } catch (SAXException e) {
      e.printStackTrace();
      throw new ResourceIndexNotAvailableException("Unexpected SAX: " + 
          e.getMessage());
    }


    CaptureSearchResults results;
    if(wbRequest.isReplayRequest() || wbRequest.isCaptureQueryRequest()) {
      results = new CaptureSearchResults();      
    } else {
      // TODO: this is wrong, but needs exploration into what NutchWax 
      //       can actually do.
      throw new BadQueryException("Unable to perform path " +
          "prefix requests with this index type");
    }
    NodeList channel = getSearchChannel(document);
    NodeList nodes = getSearchItems(document);


    if (channel == null || channel.getLength() != 1) {
      // TODO: better error for user:
         throw new ResourceNotInArchiveException("No results for " +
             requestUrl);
       }


       if (nodes == null) {
      // TODO: better error for user:
         throw new ResourceNotInArchiveException("No results for " +
             requestUrl);
       }


       for (int i = 0; i < nodes.getLength(); i++) {
         
           Element e = (Element) nodes.item(i);


           List<CaptureSearchResult> resultsList = itemToSearchResults(e);
           if(resultsList != null) {
             for(CaptureSearchResult result : resultsList) {
               results.addSearchResult(result);
             }
           }
       }
       Element channelElement = (Element) channel.item(0);
       
       results.putFilter(SearchResults.RESULTS_FIRST_RETURNED,
           getNodeContent(channelElement,NUTCH_FIRST_RESULT));
       
       results.putFilter(SearchResults.RESULTS_NUM_RESULTS,
           getNodeContent(channelElement,NUTCH_NUM_RESULTS));
       
       results.putFilter(SearchResults.RESULTS_NUM_RETURNED,
           getNodeContent(channelElement,NUTCH_NUM_RETURNED));
       
       results.putFilter(SearchResults.RESULTS_REQUESTED,
           String.valueOf(wbRequest.getResultsPerPage()));
       
    results.putFilter(WaybackRequest.REQUEST_START_DATE,
        Timestamp.earliestTimestamp().getDateStr());
    
       results.putFilter(WaybackRequest.REQUEST_END_DATE,
           Timestamp.latestTimestamp().getDateStr());
    return results;
  }


  private List<CaptureSearchResult> itemToSearchResults(Element e)
    throws ResourceIndexNotAvailableException {


    String fileName = getNodeNutchContent(e,NUTCH_FILENAME);
    String httpCode = NUTCH_DEFAULT_HTTP_CODE;
    String digest = getNodeNutchContent(e,NUTCH_DIGEST);
    String mimeType = getNodeNutchContent(e,NUTCH_MIME_TYPE);
    String offsetStr = getNodeNutchContent(e,NUTCH_FILEOFFSET);
    long offset = 0;
    if(offsetStr != null && offsetStr.length() > 0) {
      offset = Long.parseLong(offsetStr);
    }
    String redirectUrl = NUTCH_DEFAULT_REDIRECT_URL;
    String originalUrl = getNodeContent(e,NUTCH_CAPTURE_URL);
    String urlKey = originalUrl;
    
    NodeList nodes = e.getElementsByTagNameNS(NUTCH_NS,NUTCH_ARCDATE);
    int numDates = nodes.getLength();
    ArrayList<CaptureSearchResult> results = null;


    if(numDates > 0) {
      results = new ArrayList<CaptureSearchResult>();
    
      for(int i = 0; i < numDates; i++) {
        String captureDate = getNodeTextValue(nodes.item(i));
        CaptureSearchResult result = new CaptureSearchResult();
        result.setFile(fileName);
        result.setCaptureTimestamp(captureDate);
        result.setHttpCode(httpCode);
        result.setDigest(digest);
        result.setMimeType(mimeType);
        result.setOffset(offset);
        result.setRedirectUrl(redirectUrl);
        result.setOriginalUrl(originalUrl);
        result.setUrlKey(urlKey);
        results.add(result);
      }
    }
    return results;
  }
  
   protected NodeList getSearchChannel(Document d) {
       if (d ==  null) {
           return null;
       }
       // Jump to the search item list.
       NodeList nodes = d.getElementsByTagName(NUTCH_SEARCH_RESULTS_TAG);
       return (nodes.getLength() <= 0)? null: nodes;
   }
   
   protected NodeList getSearchItems(Document d) {
       if (d ==  null) {
           return null;
       }
       // Jump to the search item list.
       NodeList nodes = d.getElementsByTagName(NUTCH_SEARCH_RESULT_TAG);
       return (nodes.getLength() <= 0)? null: nodes;
   }


  
   protected String getRequestUrl(WaybackRequest wbRequest) 
   throws BadQueryException {


     String urlStr = wbRequest.getRequestUrl();
     String exactDateStr = wbRequest.getReplayTimestamp();
      if (exactDateStr != null && exactDateStr.length() == 0) {
          exactDateStr = null;
      }
       String endDateStr = wbRequest.getEndTimestamp();
       if (endDateStr == null || endDateStr.length() == 0) {
            endDateStr = Timestamp.latestTimestamp().getDateStr();
       }
       String startDateStr = wbRequest.getStartTimestamp();
       if (startDateStr == null || startDateStr.length() == 0) {
         startDateStr = Timestamp.earliestTimestamp().getDateStr();
       }
       int hitsPerPage = wbRequest.getResultsPerPage();
       if(hitsPerPage < 1) {
         throw new BadQueryException("Hits per page must be positive");
       }
       if(hitsPerPage > maxRecords) {
         throw new BadQueryException("Hits per page must be less than " +
             maxRecords);
       }
       int start = (wbRequest.getPageNum()-1) * hitsPerPage;
       if (urlStr == null || urlStr.length() <= 0) {
           throw new BadQueryException("Url is empty.");
       }
       // Construct the search url.
       MutableString ms = new MutableString(this.searchUrlBase)
           .append("?query=");
       // Add 'date:...+' to query string.
       ms.append("date%3A").append(startDateStr).append('-').append(endDateStr);
       ms.append('+');
       // Add 'url:URL'.
       if(wbRequest.isUrlQueryRequest()) {
           ms.append("url%3A");
       } else {
           ms.append("exacturl%3A");
       }
       try {
            ms.append(java.net.URLEncoder.encode("\""+urlStr+"\"", "UTF-8"));
       } catch (UnsupportedEncodingException e) {
         throw new BadQueryException(e.toString());
       }
       ms.append("&hitsPerPage=").append(hitsPerPage);
       ms.append("&start=").append(start);
       ms.append("&dedupField=site");
       // As we are always searching agains an url, a
       // higher perDup/Site will return just more versions
       ms.append("&hitsPerDup=").append(hitsPerPage);
       ms.append("&hitsPerSite=").append(hitsPerPage);
       
       return ms.toString();
   }


  
  // extract the text content of a single nutch: tag under a node
   protected String getNodeNutchContent(Element e, String key) {
       NodeList nodes = e.getElementsByTagNameNS(NUTCH_NS, key);
       String result = null;
       if (nodes != null && nodes.getLength() > 0) {
           result = getNodeTextValue(nodes.item(0));
       }
       return (result == null || result.length() == 0)? null: result;
   }


   // extract the text content of a single tag under a node
   protected String getNodeContent(Element e, String key) {
       NodeList nodes = e.getElementsByTagName(key);
       String result = null;
       if (nodes != null && nodes.getLength() > 0) {
           result = getNodeTextValue(nodes.item(0));
       }
       return (result == null || result.length() == 0)? null: result;
   }
  private String getNodeTextValue(Node n) {
    if(n.hasChildNodes()) {
      if(n.getFirstChild().getNodeName().equals("#text")) {
        return n.getFirstChild().getNodeValue();
      }
    }
    return "";
  }


   // do an HTTP request, plus parse the result into an XML DOM
   protected synchronized Document getHttpDocument(String url) 
     throws IOException, SAXException {
     
       Document d = null;
       d = this.builder.parse(url);
       return d;
   }
  /**
   * @return the searchUrlBase
   */
  public String getSearchUrlBase() {
    return searchUrlBase;
  }
  /**
   * @param searchUrlBase the searchUrlBase to set
   */
  public void setSearchUrlBase(String searchUrlBase) {
    this.searchUrlBase = searchUrlBase;
  }
  /**
   * @return the maxRecords
   */
  public int getMaxRecords() {
    return maxRecords;
  }
  /**
   * @param maxRecords the maxRecords to set
   */
  public void setMaxRecords(int maxRecords) {
    this.maxRecords = maxRecords;
  }
  public void shutdown() throws IOException {
    
  }
}
Source Code of org.archive.wayback.resourceindex.NutchResourceIndex

Related Classes of org.archive.wayback.resourceindex.NutchResourceIndex