Package org.archive.wayback.resourceindex

Source Code of org.archive.wayback.resourceindex.NutchResourceIndex

/* NutchResourceIndex
*
* $Id: NutchResourceIndex.java 2226 2008-04-11 03:48:48Z bradtofel $
*
* Created on 2:29:41 PM Oct 11, 2006.
*
* Copyright (C) 2006 Internet Archive.
*
* This file is part of Wayback.
*
* Wayback is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or
* any later version.
*
* Wayback is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU Lesser Public License for more details.
*
* You should have received a copy of the GNU Lesser Public License
* along with Wayback; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/
package org.archive.wayback.resourceindex;

import it.unimi.dsi.mg4j.util.MutableString;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.logging.Logger;
import java.net.URL;
import java.net.MalformedURLException;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;

import org.archive.wayback.ResourceIndex;
import org.archive.wayback.WaybackConstants;
import org.archive.wayback.core.CaptureSearchResults;
import org.archive.wayback.core.SearchResult;
import org.archive.wayback.core.SearchResults;
import org.archive.wayback.core.Timestamp;
import org.archive.wayback.core.WaybackRequest;
import org.archive.wayback.exception.AccessControlException;
import org.archive.wayback.exception.BadQueryException;
import org.archive.wayback.exception.ConfigurationException;
import org.archive.wayback.exception.ResourceIndexNotAvailableException;
import org.archive.wayback.exception.ResourceNotInArchiveException;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;


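/**
* ResourceIndex implementation that answers Wayback queries by calling a
* remote NutchWAX OpenSearch service over HTTP.
*
* A request is translated into a search URL below the configured
* searchUrlBase, the XML response is parsed into a DOM, and every "item"
* element is converted into a SearchResult. Only replay and URL queries are
* supported; other request types are rejected with a BadQueryException.
*
* @author brad
* @version $Date: 2008-04-11 04:48:48 +0100 (Sex, 11 Abr 2008) $, $Revision: 2226 $
*/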
public class NutchResourceIndex implements ResourceIndex {
     /** Class-wide logger. */
     private static final Logger LOGGER =
          Logger.getLogger(NutchResourceIndex.class.getName());
 
   /**
    * Default upper bound on the hits-per-page value accepted by getRequestUrl.
    * TODO should be parameterized with the same value of entry maxRRecords in
    * webapps/wayback/WEB-INF/wayback.xml
    * NOTE(review): "maxRRecords" is copied from the original note; presumably
    * the entry is named "maxRecords" - confirm against wayback.xml.
    */
   public final static int MAX_RECORDS = 10000;
   /** Effective hits-per-page limit; starts at MAX_RECORDS, changeable via setMaxRecords. */
   private int maxRecords = MAX_RECORDS; 
    
   /** Base URL of the search service; "?query=..." is appended to it for each request. */
   private String searchUrlBase;
 
   // Names of the child elements read from every result item.
   private static final String NUTCH_ARCNAME = "pwa:arcname";
   private static final String NUTCH_ARCOFFSET = "pwa:arcoffset";
   // Capture date: "pwa:tstamp" is tried first, "pwa:arcdate" is the fallback.
   private static final String NUTCH_ARCDATE = "pwa:tstamp";
   private static final String NUTCH_ARCDATE_ALT = "pwa:arcdate";
   private static final String NUTCH_DIGEST = "pwa:digest";
   // Primary and sub type are joined with "/" to form the result's MIME type.
   private static final String NUTCH_PRIMARY_TYPE = "pwa:primaryType";
   private static final String NUTCH_SUB_TYPE = "pwa:subType";
   private static final String NUTCH_CAPTURE_HOST = "pwa:site";
   // Element whose "url" attribute carries the captured URL.
   private static final String NUTCH_CAPTURE_URL = "source";  
   private static final String NUTCH_DOC_ID = "pwa:id";
   private static final String NUTCH_INDEX_ID = "pwa:index";   

   // Element name of a single result and of the element that wraps the result list.
   private static final String NUTCH_SEARCH_RESULT_TAG = "item";
   private static final String NUTCH_SEARCH_RESULTS_TAG = "channel";
   // Paging elements read from the channel and copied into the result filters.
   private static final String NUTCH_FIRST_RESULT = "opensearch:startIndex";
   private static final String NUTCH_NUM_RESULTS = "opensearch:totalResults";
   private static final String NUTCH_NUM_RETURNED = "opensearch:itemsPerPage";
  
   // Fixed values stored in every result; they are not read from the search response.
   private static final String NUTCH_DEFAULT_HTTP_CODE = "200";
   private static final String NUTCH_DEFAULT_REDIRECT_URL = "-";
  
  /**
   * Initialization hook; the current body performs no work.
   * <p>
   * Both the logging calls and the former shared-parser setup are commented
   * out. The commented-out setup refers to <code>factory</code> and
   * <code>builder</code> fields that this class no longer declares; a parser
   * is created per call in getHttpDocument instead.
   *
   * @throws ConfigurationException declared in the signature; never thrown by
   *         the current body
   */
  public void init() throws ConfigurationException {     
    //LOGGER.info("initializing NutchResourceIndex...");
    //LOGGER.info("Using base search url " + this.searchUrlBase);

    /*
    this.factory.setNamespaceAware(true);
    try {
      this.builder = this.factory.newDocumentBuilder();
    } catch (ParserConfigurationException e) {
      // TODO: quiet extra stacktrace..
      e.printStackTrace();
      throw new ConfigurationException(e.getMessage());
    }
    */
  }
  /* (non-Javadoc)
   * @see org.archive.wayback.ResourceIndex#query(org.archive.wayback.core.WaybackRequest)
   */
  public SearchResults query(WaybackRequest wbRequest)
    throws ResourceIndexNotAvailableException,
    ResourceNotInArchiveException, BadQueryException,
    AccessControlException {

    // Get the URL for the request:
    String requestUrl = getRequestUrl(wbRequest);
    Document document = null;
    try {
      // HTTP Request + parse
      document =  getHttpDocument(requestUrl);
    } catch (IOException e) {
      // TODO: better error for user:
      e.printStackTrace();
      throw new ResourceIndexNotAvailableException(e.getMessage());
    } catch (SAXException e) {
      e.printStackTrace();
      throw new ResourceIndexNotAvailableException("Unexpected SAX: " +
          e.getMessage());
    }

    SearchResults results;
    String type = wbRequest.get(WaybackConstants.REQUEST_TYPE);
    if(type.equals(WaybackConstants.REQUEST_REPLAY_QUERY) ||
        type.equals(WaybackConstants.REQUEST_URL_QUERY)) {
      results = new CaptureSearchResults();     
    } else {
      // TODO: this is wrong, but needs exploration into what NutchWax can actually do.
      throw new BadQueryException("Unable to perform path prefix requests with this index type");
    }
    NodeList channel = getSearchChannel(document);
    NodeList nodes = getSearchItems(document);

    if (channel == null || channel.getLength() != 1) {
      // TODO: better error for user:
         throw new ResourceNotInArchiveException("No results for " +
             requestUrl);
       }

       if (nodes == null) {
      // TODO: better error for user:
         throw new ResourceNotInArchiveException("No results for " +
             requestUrl);
       }

       for (int i = 0; i < nodes.getLength(); i++) {
        
           Element e = (Element) nodes.item(i);

           SearchResult result = elementToSearchResult(e);
           results.addSearchResult(result);
       }
       Element channelElement = (Element) channel.item(0);
      
       results.putFilter(WaybackConstants.RESULTS_FIRST_RETURNED,
           getNodeContent(channelElement,NUTCH_FIRST_RESULT));
      
       results.putFilter(WaybackConstants.RESULTS_NUM_RESULTS,
           getNodeContent(channelElement,NUTCH_NUM_RESULTS));
      
       results.putFilter(WaybackConstants.RESULTS_NUM_RETURNED,
           getNodeContent(channelElement,NUTCH_NUM_RETURNED));
      
       results.putFilter(WaybackConstants.RESULTS_REQUESTED,
           String.valueOf(wbRequest.getResultsPerPage()));
      
    results.putFilter(WaybackConstants.REQUEST_START_DATE,
        Timestamp.earliestTimestamp().getDateStr());
   
       results.putFilter(WaybackConstants.REQUEST_END_DATE,
           Timestamp.latestTimestamp().getDateStr());
    return results;
  }

  private SearchResult elementToSearchResult(Element e)
    throws ResourceIndexNotAvailableException {

    SearchResult result = new SearchResult();

    String arcFile = getNodeNutchContent(e,NUTCH_ARCNAME);
    if (arcFile!=null) {
      result.put(WaybackConstants.RESULT_ARC_FILE,arcFile);
    }
   
        // The date in nutchwax is now named 'tstamp' and its
        // 17 characters rather than 14.  Pass first 14 only.
        String d = getNodeNutchContent(e,NUTCH_ARCDATE);
        if(d == null) {
          d = getNodeNutchContent(e,NUTCH_ARCDATE_ALT);
        }
        if(d == null) {
          throw new ResourceIndexNotAvailableException("Missing arcdate field in search results");
        }
        if (d.length() == 17) {
            d = d.substring(0, 14);
        }
    result.put(WaybackConstants.RESULT_CAPTURE_DATE, d);
   
    //result.put(WaybackConstants.RESULT_HTTP_CODE,getNodeContent(e,""));
    result.put(WaybackConstants.RESULT_HTTP_CODE,NUTCH_DEFAULT_HTTP_CODE);
   
    String digest = getNodeNutchContent(e,NUTCH_DIGEST);
    if (digest!=null) {
      result.put(WaybackConstants.RESULT_MD5_DIGEST, digest);
    }
   
    if (getNodeNutchContent(e,NUTCH_PRIMARY_TYPE)!=null && getNodeNutchContent(e,NUTCH_SUB_TYPE)!=null) {
      result.put(WaybackConstants.RESULT_MIME_TYPE,
        getNodeNutchContent(e,NUTCH_PRIMARY_TYPE) + "/" +
        getNodeNutchContent(e,NUTCH_SUB_TYPE));
    }
   
    String arcOffset = getNodeNutchContent(e,NUTCH_ARCOFFSET);
    if (arcOffset!=null) {
      result.put(WaybackConstants.RESULT_OFFSET, arcOffset);
    }
   
    String host = getNodeNutchContent(e,NUTCH_CAPTURE_HOST);
    if (host!=null) {
      result.put(WaybackConstants.RESULT_ORIG_HOST, host);
    }

    result.put(WaybackConstants.RESULT_REDIRECT_URL,NUTCH_DEFAULT_REDIRECT_URL);
    String url = getAttributeContent(e,NUTCH_CAPTURE_URL,"url");
    if (url!=null) {
      result.put(WaybackConstants.RESULT_URL, url);
    }
   
    /* BUG 0000155 */
    String digestDiff=getNodeNutchContent(e,WaybackConstants.RESULT_DIGEST_DIFF);
    if (digestDiff!=null) {
      result.put(WaybackConstants.RESULT_DIGEST_DIFF, digestDiff);
    }
   
    String docId=getNodeNutchContent(e,NUTCH_DOC_ID);
    if (docId!=null) {
      result.put(WaybackConstants.REQUEST_DOC_ID, docId);
    }
   
    String indexId=getNodeNutchContent(e,NUTCH_INDEX_ID);
    if (indexId!=null) {
      result.put(WaybackConstants.REQUEST_INDEX_ID, indexId);
    }
    /* BUG 0000155 */
   
    return result;
  }
 
   protected NodeList getSearchChannel(Document d) {
       if (d ==  null) {
           return null;
       }
       // Jump to the search item list.
       NodeList nodes = d.getElementsByTagName(NUTCH_SEARCH_RESULTS_TAG);
       return (nodes.getLength() <= 0)? null: nodes;
   }
  
   protected NodeList getSearchItems(Document d) {
       if (d ==  null) {
           return null;
       }
       // Jump to the search item list.
       NodeList nodes = d.getElementsByTagName(NUTCH_SEARCH_RESULT_TAG);
       return (nodes.getLength() <= 0)? null: nodes;
   }

 
   protected String getRequestUrl(WaybackRequest wbRequest)
   throws BadQueryException {

     boolean existStartDate=true; // BUG 120608
     boolean existEndDate=true; // BUG 120608
    
     String urlStr = wbRequest.get(WaybackConstants.REQUEST_URL);
     String exactDateStr = wbRequest.get(WaybackConstants.REQUEST_EXACT_DATE);
      if (exactDateStr != null && exactDateStr.length() == 0) {
          exactDateStr = null;
      }
       String endDateStr = wbRequest.get(WaybackConstants.REQUEST_END_DATE);
       if (endDateStr == null || endDateStr.length() == 0) {
         existEndDate=false; // BUG 120608
            endDateStr = Timestamp.latestTimestamp().getDateStr();
       }
       String startDateStr = wbRequest.get(WaybackConstants.REQUEST_START_DATE);
       if (startDateStr == null || startDateStr.length() == 0) {
         existStartDate=false; // BUG 120608
         startDateStr = Timestamp.earliestTimestamp().getDateStr();
       }
       int hitsPerPage = wbRequest.getResultsPerPage();
       if(hitsPerPage < 1) {
         throw new BadQueryException("Hits per page must be positive");
       }
       if(hitsPerPage > maxRecords) {
         throw new BadQueryException("Hits per page must be less than " +
             maxRecords);
       }
       int start = (wbRequest.getPageNum()-1) * hitsPerPage;
      
     /* BUG 0000155 */
        String multDet = wbRequest.get(WaybackConstants.REQUEST_MULT_DETAILS);
     String docId = wbRequest.get(WaybackConstants.REQUEST_DOC_ID);
     String indexId = wbRequest.get(WaybackConstants.REQUEST_INDEX_ID);
     /* BUG 0000155 */                          
    
       if ((urlStr==null || urlStr.length()<=0) && (docId==null || indexId==null)) {
           throw new BadQueryException("Url is empty.");
       }
       // Construct the search url.
       MutableString ms = new MutableString(this.searchUrlBase)
           .append("?query=");
       // Add 'date:...+' to query string.   
       if (existStartDate || existEndDate) { // BUG wayback 0000051;  if exist startDate OR endDate
         ms.append("date%3A").append(startDateStr).append('-').append(endDateStr);  
       }    
       else if (exactDateStr!=null) { // BUG wayback 0000153
         ms.append("closestdate%3A").append(exactDateStr);
       }
      
       ms.append('+');
       // Add 'url:URL'.
       if(wbRequest.get(WaybackConstants.REQUEST_TYPE).equals(
                  WaybackConstants.REQUEST_URL_PREFIX_QUERY)) {
           ms.append("url%3A").append(urlStr);
       } else {
           try {       
           if (docId!=null && indexId!=null) {
             // do nothing
           }
           else if (wbRequest.get(WaybackConstants.REQUEST_ALIASES)!=null && wbRequest.get(WaybackConstants.REQUEST_ALIASES).equals("true")) {
                ms.append("exacturlexpand%3A").append(java.net.URLEncoder.encode(urlStr, "UTF-8"));
           }
           else {                        
            URL url=null;
            boolean error=false;
            try {
              url=new URL(urlStr);
            }
            catch (MalformedURLException e) {
              error=true;
            }
            
            if (!error && !urlStr.endsWith("/") && url.getQuery()==null && url.getPath().indexOf('.')==-1) { // BUG nutchwax 0000357 - add also a "/" if the url's query is null and is not a file
              ms.append("exacturlexpandmin%3A").append(java.net.URLEncoder.encode(urlStr, "UTF-8"));
            }
            else {            
              ms.append("exacturl%3A").append(java.net.URLEncoder.encode(urlStr, "UTF-8"));
            }
           }
           }
           catch (UnsupportedEncodingException e) {
             throw new BadQueryException(e.toString());
           }
           catch (NullPointerException e) {
               throw new BadQueryException(e.toString());
           }
       }
       ms.append("&hitsPerPage=").append(hitsPerPage);
       ms.append("&start=").append(start);
       ms.append("&dedupField=site");       
       // As we are always searching agains an url, a
       // higher perDup/Site will return just more versions
       ms.append("&hitsPerDup=").append(hitsPerPage);
       ms.append("&hitsPerSite=").append(hitsPerPage);      
       ms.append("&waybackQuery=true"); // indicates that this OpenSearch request came from wayback      
                     
       /* BUG 0000155 */       
        if (multDet!=null) {
          ms.append("&multDet=").append(multDet);
        }              
     if (docId!=null) {
       ms.append("&id=").append(docId);
     }        
     if (indexId!=null) {
       ms.append("&index=").append(indexId);
     }
     /* BUG 0000155 */       
      
       return ms.toString();
   }

 
  // extract the text content of a single nutch: tag under a node
   protected String getNodeNutchContent(Element e, String key) {
       //NodeList nodes = e.getElementsByTagNameNS(NUTCH_NS, key); TODO remove
     NodeList nodes = e.getElementsByTagName(key);
       String result = null;
       if (nodes != null && nodes.getLength() > 0) {
           result = nodes.item(0).getTextContent();
       }
       return (result == null || result.length() == 0)? null: result;
   }

   // extract the text content of a single tag under a node
   protected String getNodeContent(Element e, String key) {
       NodeList nodes = e.getElementsByTagName(key);
       String result = null;
       if (nodes != null && nodes.getLength() > 0) {
           result = nodes.item(0).getTextContent();
       }
       return (result == null || result.length() == 0)? null: result;
   }
  
   /**
    * Get attribute from element
    * @param e parent element
    * @param elemKey element name  
    * @param attrKey attribute name
    */
   protected String getAttributeContent(Element e, String elemKey, String attrKey) {
       NodeList nodes = e.getElementsByTagName(elemKey);
       String result = null;
       if (nodes != null && nodes.getLength() > 0) {
           result = ((Element)nodes.item(0)).getAttribute(attrKey);
       }
       return (result == null || result.length() == 0) ? null: result;
   }

   // do an HTTP request, plus parse the result into an XML DOM
   protected /*synchronized*/ Document getHttpDocument(String url) // BUG wayback - 0000393 - deadlock
     throws IOException, SAXException {
    
     DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
     factory.setNamespaceAware(true);
     DocumentBuilder builder = null;
     try {   
       builder = factory.newDocumentBuilder();
     }
     catch (ParserConfigurationException e) {   
       e.printStackTrace();
       throw new IOException(e.getMessage());
     }
                  
       Document d = builder.parse(url);
       return d;
   }
  /**
   * Returns the base URL of the search service, to which getRequestUrl
   * appends the query string.
   *
   * @return the searchUrlBase
   */
  public String getSearchUrlBase() {
    return searchUrlBase;
  }
  /**
   * Sets the base URL of the search service. The value is stored as given;
   * getRequestUrl appends "?query=..." directly to it.
   *
   * @param searchUrlBase the searchUrlBase to set
   */
  public void setSearchUrlBase(String searchUrlBase) {
    this.searchUrlBase = searchUrlBase;
  }
  /**
   * Returns the largest hits-per-page value that getRequestUrl accepts.
   *
   * @return the maxRecords
   */
  public int getMaxRecords() {
    return maxRecords;
  }
  /**
   * Sets the largest hits-per-page value that getRequestUrl accepts. The
   * value is stored without validation.
   *
   * @param maxRecords the maxRecords to set
   */
  public void setMaxRecords(int maxRecords) {
    this.maxRecords = maxRecords;
  }
  /**
   * Shutdown hook; the body is empty, so calling it has no effect.
   *
   * @throws IOException declared in the signature; never thrown by the
   *         current body
   */
  public void shutdown() throws IOException {
   
  }
}
TOP

Related Classes of org.archive.wayback.resourceindex.NutchResourceIndex

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.