Source Code of org.archive.access.nutch.indexer.WaxIndexingFilter

/* WaxIndexingFilter
 * 
 * $Id: WaxIndexingFilter.java 1896 2007-08-01 21:44:31Z jlee-archive $
 * 
 * Copyright (C) 2005 Internet Archive.
 * 
 * This file is part of the archive-access tools project
 * (http://sourceforge.net/projects/archive-access).
 * 
 * The archive-access tools are free software; you can redistribute them and/or
 * modify them under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or any
 * later version.
 * 
 * The archive-access tools are distributed in the hope that they will be
 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser
 * Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser Public License along with
 * the archive-access tools; if not, write to the Free Software Foundation,
 * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
package org.archive.access.nutch.indexer;


import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;


import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.crawl.MapWritable;
import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.util.mime.MimeType;
import org.apache.nutch.util.mime.MimeTypeException;
import org.apache.nutch.util.mime.MimeTypes;
import org.archive.access.nutch.NutchwaxConfiguration;
import org.archive.util.ArchiveUtils;
import org.archive.util.Base32;




/**
  * Add to the index fields needed by Internet Archive searching.
  * 
  * @author Stack
  */
public class WaxIndexingFilter implements IndexingFilter
{
  public static final Log LOGGER =
    LogFactory.getLog(WaxIndexingFilter.class.getName());


  private Configuration conf;


  // Below names need to sync with whats in
  // {@link org.archive.access.nutch.Arc2Segment}
  public static final String ARCFILENAME_KEY = "arcname";
  public static final String ARCFILEOFFSET_KEY = "arcoffset";
  public static final String ARCCOLLECTION_KEY = "collection";
  public static final String DATE_KEY = "date";
  private static final String CONTENT_TYPE_KEY = "content-type";


  public static final String EXACTURL_KEY = "exacturl";
  public static final String DOMAIN_KEY = "domain"; // TODO MC


  /**
    * Set into metadata by the nutch html parser.
    */
    private static final String ENCODING_KEY = "CharEncodingForConversion";


  private MessageDigest md = null;


  /** Get the MimeTypes resolver instance. */
  private final static MimeTypes MIME =
    MimeTypes.get(NutchwaxConfiguration.getConfiguration().
    get("mime.types.file"));


  public WaxIndexingFilter() throws NoSuchAlgorithmException
  {
    super();
    this.md = MessageDigest.getInstance("MD5");
  }


  public Document filter(Document doc, Parse parse, Text url,
    CrawlDatum datum, Inlinks inlinks)
  {
    if (url == null || url.getLength() <= 0)
    {
      LOGGER.error(doc.toString() + " has no url");
      
      return doc;
    }
    
    String urlStr = url.toString();
    
    // Stored, indexed and un-tokenized. Date is already GMT so don't
    // mess w/ timezones. Date is stored as seconds since epoch to
    // facilitate sorting (The Lucene Sort interprets the IA 14-char
    // date string as a float; rounding of float values equates floats
    // that shouldn't equate: e.g:
    // float f = Float.parseFloat("20050524133833");
    // float g = Float.parseFloat("20050524133834");
    // float h = Float.parseFloat("20050524133835");
    // System.out.println(f == g);
    // System.out.println(f == h);
    // ...prints true twice.
    // So, have seconds since epoch for the date we index.
    long seconds = datum.getFetchTime() / 1000;
    
    if (seconds > Integer.MAX_VALUE)
    {
      LOGGER.warn("Fetch time " + Long.toString(seconds) +
        " is > Integer.MAX_VALUE. Setting to zero");
        
      seconds = 0;
    }
    
    doc.add(new Field(DATE_KEY, ArchiveUtils.zeroPadInteger((int) seconds),
      Field.Store.YES, Field.Index.UN_TOKENIZED));


    // Add as stored, unindexed, and untokenized. Don't warn if absent.
    // Its not a tradegy.
    add(urlStr, doc, "encoding", parse.getData().getMeta(ENCODING_KEY),
      false, true, true, false, false);


    // Get metadatas.
    MapWritable mw = datum.getMetaData();
    ParseData pd = parse.getData();


    // Add as stored, indexed, and untokenized but not lowercased.
    add(urlStr, doc, ARCCOLLECTION_KEY,
      getMetadataValue(ARCCOLLECTION_KEY, pd, mw),
      false, true, true, false);


    // Add as stored, indexed, and untokenized. Preserve case for
    // arcname since eventually it will be used to find an arc on
    // filesystem.
    add(urlStr, doc, ARCFILENAME_KEY,
      getMetadataValue(ARCFILENAME_KEY, pd, mw),
      false, true, true, false);
      
    add(urlStr, doc, ARCFILEOFFSET_KEY,
      getMetadataValue(ARCFILEOFFSET_KEY, pd, mw),
      false, true, false, false);


    // This is a nutch 'more' field.
    add(urlStr, doc, "contentLength",
      parse.getData().getMeta("contentLength"),
      false, true, false, false);
      
    // Mimetype. The ARC2Segment tool stores the content-type into
    // metadata with a key of 'content-type'.
    String mimetype = parse.getData().getMeta(CONTENT_TYPE_KEY);
    
    if (mimetype == null || mimetype.length() == 0)
    {
      MimeType mt = (MIME.getMimeType(urlStr));
      
      if (mt != null)
      {
        mimetype = mt.getName();
      }
    }
    
    try
    {
      // Test the mimetype makes some sense. If not, don't add.
      mimetype = (new MimeType(mimetype)).getName();
    }
    catch (MimeTypeException e)
    {
      LOGGER.error(urlStr + ", mimetype " + mimetype + ": "
        + e.toString());


      // Clear mimetype because caused exception.
      mimetype = null;
    }


    if (mimetype != null)
    {
      // wera wants the sub and primary types in index. So they are
      // stored but not searchable. nutch adds primary and subtypes
      // as well as complete type all to one 'type' field.
      final String type = "type";
      add(urlStr, doc, type, mimetype, true, false, true, false);
      int index = mimetype.indexOf('/');
      
      if (index > 0)
      {
        String tmp = mimetype.substring(0, index);
        add(urlStr, doc, "primaryType", tmp, true, true, false, false);
        add(urlStr, doc, type, tmp, true, false, true, false);
        
        if (index + 1 < mimetype.length())
        {
          tmp = mimetype.substring(index + 1);
          add(urlStr, doc, "subType", tmp, true, true, false, false);
          add(urlStr, doc, type, tmp, true, false, true, false);
        }
      }
    }


    // Add as not lowercased, not stored, indexed, and not tokenized.
    add(urlStr, doc, EXACTURL_KEY, escapeUrl(url.toString()), false, false,
      true, false);
    
    // TODO MC - for site search
    try {
      java.net.URL netUrl=new java.net.URL(urlStr);    
      String reverseDomain = (new StringBuffer(netUrl.getHost())).reverse().toString();
      add(urlStr, doc, DOMAIN_KEY, reverseDomain, false, true, true, false);
    }
    catch (Exception MalformedURLException) {
      LOGGER.error("Malformed url "+urlStr+".");
    }
    // TODO MC - for site search


    return doc;
  }


  private String getMetadataValue(final String key, final ParseData pd,
    final MapWritable mw)
  {       
    String v = pd.getMeta(key);    
    if (v == null || v.length() == 0 && mw != null)
    {
      Writable w = mw.get(new Text(key));
      
      if (w != null)
      {
        v = w.toString();       
      }
    }
    
    return v;
  }


  private String escapeUrl(String url)
  {
    this.md.reset();       
    return Base32.encode(this.md.digest(url.getBytes()));
  }


  private void add(final String url, final Document doc,
    final String fieldName, final String fieldValue,
    boolean lowerCase, boolean store, boolean index,
    boolean tokenize)
  {
    add(url, doc, fieldName, fieldValue, lowerCase, store, index, tokenize,
      true);
  }


  private void add(final String url, final Document doc,
    final String fieldName, final String fieldValue,
    boolean lowerCase, boolean store, boolean index,
    boolean tokenize, final boolean warn)
  {
    if (fieldValue == null || fieldValue.length() <= 0)
    {
      if (warn)
      {
        LOGGER.error("No " + fieldName + " for url " + url);
      }
      
      return;
    }
    
    doc.add(new Field(fieldName,
      (lowerCase? fieldValue.toLowerCase(): fieldValue),
      store? Field.Store.YES: Field.Store.NO,
      index?
      (tokenize? Field.Index.TOKENIZED: Field.Index.UN_TOKENIZED):
      Field.Index.NO));
  }


  public Configuration getConf()
  {
    return this.conf;
  }


  public void setConf(Configuration conf)
  {
    this.conf = conf;
  }
}
Source Code of org.archive.access.nutch.indexer.WaxIndexingFilter

Related Classes of org.archive.access.nutch.indexer.WaxIndexingFilter