// Source code of net.sf.regain.crawler.document.AbstractPreparator

/*
 * regain - A file search engine providing plenty of formats
 * Copyright (C) 2004  Til Schneider
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 * Contact: Til Schneider, info@murfman.de
 *
 * CVS information:
 *  $RCSfile$
 *   $Source$
 *     $Date: 2010-11-07 16:02:14 +0100 (So, 07 Nov 2010) $
 *   $Author: thtesche $
 * $Revision: 465 $
 */
package net.sf.regain.crawler.document;


import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;


import net.sf.regain.RegainException;
import net.sf.regain.crawler.config.PreparatorConfig;


import org.apache.regexp.RE;
import org.apache.regexp.RESyntaxException;


/**
 * Abstract implementation of a preparator.
 * <p>
 * Implements the getter methods and assumes the clean-up between two
 * preparations (See {@link #cleanUp()}).
 * <p>
 * Child class may set the values using the protected setter methods.
 *
 * @author Til Schneider, www.murfman.de
 */
public abstract class AbstractPreparator implements Preparator {


  /**
   * The regular expression a URL must match to, to be prepared by this
   * preparator.
   */
  private RE mUrlRegex;
  /** Der gefundene Titel. */
  private String mTitle;
  /** The cleaned content. */
  private String mCleanedContent;
  /** Die Zusammenfassung des Dokuments. */
  private String mSummary;
  /** The cleaned meta data of the document. */
  private String mCleanedMetaData;
  /** Die extrahierten Überschriften. Kann <code>null</code> sein */
  private String mHeadlines;
  /** Der Pfad, über den das Dokument zu erreichen ist. */
  private PathElement[] mPath;
  /** The additional fields that should be indexed. */
  private HashMap mAdditionalFieldMap;
  /** The assigned mimetypes for the preparator */
  private String[] mMimeTypes;
  /** The priority of the preparator. Used for the selection of preparators */
  private int mPriority;


  /**
   * Creates a new instance of AbstractPreparator.
   * <p>
   * The preparator won't accept any documents until a new rule was defined
   * using {@link #setUrlRegex(RE)}.
   *
   * @see #setUrlRegex(RE)
   * @see #accepts(RawDocument)
   */
  public AbstractPreparator() {
  }


  /**
   * Creates a new instance of AbstractPreparator.
   * <p>
   * If <code>urlRegex</code> is null, the preparator won't accept any documents.
   *
   * @param urlRegex the regex a URL must match to to be accepted by this
   *        preparator (may be null)
   *
   * @see #setUrlRegex(RE)
   * @see #accepts(RawDocument)
   */
  public AbstractPreparator(RE urlRegex) {
    mUrlRegex = urlRegex;
  }


  /**
   * Creates a new instance of AbstractPreparator.
   * <p>
   * If <code>extention</code> is null or empty, the preparator won't accept any
   * documents.
   * 
   * @param extention The file extension a URL must have to be accepted by
   *        this preparator.
   * @throws RegainException If creating the preparator failed.
   *
   * @see #setUrlRegex(RE)
   * @see #accepts(RawDocument)
   */
  public AbstractPreparator(String mimeType) throws RegainException {
    mMimeTypes = new String[]{mimeType};
    // this(createExtentionRegex(extention));
  }


  /**
   * Creates a new instance of AbstractPreparator.
   * <p>
   * If <code>extentionArr</code> is null or empty, the preparator won't accept
   * any documents.
   *
   * @param extentionArr The file extensions a URL must have one to be accepted
   *        by this preparator.
   * @throws RegainException If creating the preparator failed.
   *
   * @see #setUrlRegex(RE)
   * @see #accepts(RawDocument)
   */
  public AbstractPreparator(String[] mimeTypeArr) throws RegainException {
    mMimeTypes = mimeTypeArr;
    // this(createExtentionRegex(extentionArr));
  }


  /**
   * Creates a regex that matches a file extensions.
   * <p>
   * If <code>extention</code> is null or empty, null will be returned.
   *
   * @param extention The file extension to create the regex for.
   * @return The regex.
   * @throws RegainException If the regex couldn't be created.
   */
  private static RE createExtentionRegex(String extention)
          throws RegainException {
    if (extention == null || extention.length() == 0) {
      return null;
    }


    String regex = "\\." + extention + "$";
    try {
      return new RE(regex, RE.MATCH_CASEINDEPENDENT);
    } catch (RESyntaxException exc) {
      throw new RegainException("Creating accept regex for preparator failed: "
              + regex, exc);
    }
  }


  /**
   * Creates a regex that matches a set of file extensions.
   * <p>
   * If <code>extentionArr</code> is null or empty, null will be returned.
   *
   * @param extentionArr The file extensions to create the regex for.
   * @return The regex.
   * @throws RegainException If the regex couldn't be created.
   */
  private static RE createExtentionRegex(String[] extentionArr)
          throws RegainException {
    if (extentionArr == null || extentionArr.length == 0) {
      return null;
    }


    StringBuilder buffer = new StringBuilder("\\.(");
    for (int i = 0; i < extentionArr.length; i++) {
      if (i > 0) {
        buffer.append("|");
      }
      buffer.append(extentionArr[i]);
    }
    buffer.append(")$");


    String urlRegex = buffer.toString();
    try {
      return new RE(urlRegex, RE.MATCH_CASEINDEPENDENT);
    } catch (RESyntaxException exc) {
      throw new RegainException("Creating accept regex for preparator failed: "
              + urlRegex, exc);
    }
  }


  /**
   * Initializes the preparator.
   * <p>
   * Does nothing by default. May be overridden by subclasses.
   *
   * @param config The configuration for this preparator.
   * @throws RegainException If the regular expression or the configuration
   *         has an error.
   */
  @Override
  public void init(PreparatorConfig config) throws RegainException {
  }


  /**
   * Sets the regular expression a URL must match to, to be prepared by this
   * preparator.
   * <p>
   * If <code>urlRegex</code> is null, the preparator won't accept any documents.
   *
   * @param urlRegex the new URL regex (may be null)
   * @see #accepts(RawDocument)
   */
  @Override
  public void setUrlRegex(RE urlRegex) {
    mUrlRegex = urlRegex;
  }


  /**
   * Gets whether the preparator is able to process the given document. This is
   * the case, if its URL matches the URL regex.
   *
   * @param rawDocument The document to check.
   * @return Whether the preparator is able to process the given document.
   * @see #setUrlRegex(RE)
   */
  @Override
  public boolean accepts(RawDocument rawDocument) {
    if (mUrlRegex == null) {
      if (mMimeTypes != null && mMimeTypes.length > 0) {
        for (String mimeType : mMimeTypes) {
          if (mimeType.equals(rawDocument.getMimeType())) {
            return true;
          }
        }
        return false;
      } else {
        return false;
      }
    } else {
      return mUrlRegex.match(rawDocument.getUrl());
    }
  }


  /**
   * Gibt den Titel des Dokuments zurück.
   * <p>
   * Falls kein Titel extrahiert werden konnte, wird <CODE>null</CODE>
   * zurückgegeben.
   *
   * @return Der Titel des Dokuments.
   */
  @Override
  public String getTitle() {
    return mTitle;
  }


  /**
   * Setzt den Titel des Dokuments, das gerade Präpariert wird.
   *
   * @param title Der Titel.
   */
  protected void setTitle(String title) {
    mTitle = title;
  }


  /**
   * Gibt den von Formatierungsinformation befreiten Inhalt des Dokuments zurück.
   *
   * @return Der ges�uberte Inhalt.
   */
  @Override
  public String getCleanedContent() {
    return mCleanedContent;
  }


  /**
   * Setzt von Formatierungsinformation befreiten Inhalt des Dokuments, das
   * gerade Präpariert wird.
   *
   * @param The cleanedContent
   */
  protected void setCleanedContent(String cleanedContent) {
    mCleanedContent = cleanedContent;
  }


  /**
   * @return the mCleanedMetaData
   */
  @Override
  public String getCleanedMetaData() {
    return mCleanedMetaData;
  }


  /**
   * @param mCleanedMetaData the mCleanedMetaData to set
   */
  public void setCleanedMetaData(String mCleanedMetaData) {
    this.mCleanedMetaData = mCleanedMetaData;
  }


  /**
   * Gibt eine Zusammenfassung für das Dokument zurück.
   * <p>
   * Da eine Zusammenfassung nicht einfach m�glich ist, wird <CODE>null</CODE>
   * zurückgegeben.
   *
   * @return Eine Zusammenfassung für das Dokument
   */
  @Override
  public String getSummary() {
    return mSummary;
  }


  /**
   * Setzt die Zusammenfassung des Dokuments, das gerade Präpariert wird.
   *
   * @param summary Die Zusammenfassung
   */
  protected void setSummary(String summary) {
    mSummary = summary;
  }


  /**
   * Gibt die überschriften des Dokuments zurück.
   * <p>
   * Es handelt sich dabei nicht um die überschrift des Dokuments selbst,
   * sondern lediglich um Unter-überschriften, die in dem Dokument verwendendet
   * werden. Mit Hilfe dieser überschriften läßt sich eine bessere Relevanz
   * berechnen.
   * <p>
   * Wenn keine überschriften gefunden wurden, dann wird <code>null</code>
   * zurückgegeben.
   *
   * @return Die überschriften des Dokuments.
   */
  @Override
  public String getHeadlines() {
    return mHeadlines;
  }


  /**
   * Setzt die überschriften, in im Dokument, das gerade Präpariert wird,
   * gefunden wurden.
   *
   * @param headlines Die Zusammenfassung
   */
  protected void setHeadlines(String headlines) {
    mHeadlines = headlines;
  }


  /**
   * Gibt den Pfad zurück, über den das Dokument zu erreichen ist.
   * <p>
   * Falls kein Pfad verfügbar ist, wird <code>null</code> zurückgegeben.
   *
   * @return Der Pfad, über den das Dokument zu erreichen ist.
   */
  @Override
  public PathElement[] getPath() {
    return mPath;
  }


  /**
   * Setzt den Pfad, über den das Dokument zu erreichen ist.
   *
   * @param path Der Pfad, über den das Dokument zu erreichen ist.
   */
  public void setPath(PathElement[] path) {
    mPath = path;
  }


  /**
   * Gets additional fields that should be indexed.
   * <p>
   * These fields will be indexed and stored.
   * 
   * @return The additional fields or <code>null</code>.
   */
  @Override
  public Map getAdditionalFields() {
    return mAdditionalFieldMap;
  }


  /**
   * Adds an additional field to the current document.
   * <p>
   * This field will be indexed and stored.
   * 
   * @param fieldName The name of the field.
   * @param fieldValue The value of the field.
   */
  public void addAdditionalField(String fieldName, String fieldValue) {
    if (mAdditionalFieldMap == null) {
      mAdditionalFieldMap = new HashMap();
    }
    mAdditionalFieldMap.put(fieldName, fieldValue);
  }


  /** 
   * Gets the priority of the preparator
   * @return int the priority
   */
  @Override
  public int getPriority() {
    return mPriority;
  }


  /**
   * Sets the priority of the preparator
   * @param priority read from config or default value settings
   */
  @Override
  public void setPriority(int priority) {
    this.mPriority = priority;
  }


  /**
   * Release all ressources used for handling a document.
   */
  @Override
  public void cleanUp() {
    mTitle = null;
    mCleanedContent = null;
    mSummary = null;
    mHeadlines = null;
    mPath = null;
    mAdditionalFieldMap = null;
  }


  /**
   * Concatenate all parts together, use ', ' as delimiter. If a parts is empty or consists
   * only of whitespaces the part will be negleted.
   * 
   * @param parts for concatenation
   * @param maxPartsUsed number of partsused for concatenation
   * @return the resulting string whith all single parts concatenated
   */
  protected String concatenateStringParts(ArrayList<String> parts, int maxPartsUsed) {


    String result = "";


    if (parts.size() > 0) {
      int end = parts.size();
      if (maxPartsUsed < parts.size()) {
        end = maxPartsUsed;
      }
      for (int i = 0; i < end; i++) {
        // Iterate over single parts
        if (parts.get(i).length() > 0) {
          result += parts.get(i);
          if (i < end - 1) {
            result += ", ";
          }
        }
      }
    }
    return result;
  }


  /**
   * Frees all resources reserved by the preparator.
   * <p>
   * Is called at the end of the crawler process after all documents were
   * processed.
   * 
   * @throws RegainException If freeing the resources failed.
   */
  @Override
  public void close() throws RegainException {
  }
}
// Source code of net.sf.regain.crawler.document.AbstractPreparator
//
// Related classes of net.sf.regain.crawler.document.AbstractPreparator