// Source code of it.unimi.dsi.mg4j.document.HtmlDocumentFactory (including the nested class HtmlDocumentFactory$HtmlDocument)

package it.unimi.dsi.mg4j.document;


/*     
 * MG4J: Managing Gigabytes for Java
 *
 * Copyright (C) 2005-2010 Paolo Boldi and Sebastiano Vigna 
 *
 *  This library is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License as published by the Free
 *  Software Foundation; either version 3 of the License, or (at your option)
 *  any later version.
 *
 *  This library is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 */


import it.unimi.dsi.fastutil.chars.CharArrays;
import it.unimi.dsi.fastutil.objects.Reference2ObjectMap;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.io.WordReader;
import it.unimi.dsi.lang.ObjectParser;
import it.unimi.dsi.mg4j.util.MG4JClassParser;
import it.unimi.dsi.mg4j.util.parser.callback.AnchorExtractor;
import it.unimi.dsi.parser.BulletParser;
import it.unimi.dsi.parser.callback.ComposedCallbackBuilder;
import it.unimi.dsi.parser.callback.TextExtractor;
import it.unimi.dsi.util.Properties;


import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.ObjectInputStream;
import java.io.Reader;
import java.nio.charset.Charset;


import org.apache.commons.configuration.ConfigurationException;


/** A factory that provides fields for body and title of HTML documents. 
 * It uses internally a {@link BulletParser}. 
 * A default encoding can be provided
 * using the property {@link it.unimi.dsi.mg4j.document.PropertyBasedDocumentFactory.MetadataKeys#ENCODING}.
 * 
 * <p>By default, the {@link WordReader} provided by this factory
 * is just a {@link FastBufferedReader}, but you can specify
 * an alternative word reader using the property 
 * {@link it.unimi.dsi.mg4j.document.PropertyBasedDocumentFactory.MetadataKeys#WORDREADER}.
 */


public class HtmlDocumentFactory extends PropertyBasedDocumentFactory {
  private static final long serialVersionUID = 1L;


  public static enum MetadataKeys {
    /** The maximum number of characters before an anchor. */
    MAXPREANCHOR,
    /** The maximum number of characters in an anchor. */
    MAXANCHOR,
    /** The maximum number of characters after an anchor. */
    MAXPOSTANCHOR,
  };


  private static final int DEFAULT_BUFFER_SIZE = 16 * 1024;
  /** A parser that will be used to extract text from HTML documents. */
  private transient BulletParser parser;
  /** The callback recording text. */
  private transient TextExtractor textExtractor;
  /** The callback for anchors. */
  private transient AnchorExtractor anchorExtractor;
  /** The word reader used for all documents. */
  private transient WordReader wordReader;
  /** The maximum number of characters before an anchor. */
  private int maxPreAnchor;
  /** The maximum number of characters in an anchor. */
  private int maxAnchor;
  /** The maximum number of characters after an anchor. */
  private int maxPostAnchor;
  
  
  private transient char[] text;


  protected boolean parseProperty( final String key, final String[] values, final Reference2ObjectMap<Enum<?>,Object> metadata ) throws ConfigurationException {
    if ( sameKey( PropertyBasedDocumentFactory.MetadataKeys.MIMETYPE, key ) ) {
      metadata.put( PropertyBasedDocumentFactory.MetadataKeys.MIMETYPE, ensureJustOne( key, values ) );
      return true;
    }
    else if ( sameKey( PropertyBasedDocumentFactory.MetadataKeys.ENCODING, key ) ) {
      metadata.put( PropertyBasedDocumentFactory.MetadataKeys.ENCODING, Charset.forName( ensureJustOne( key, values ) ).toString() );
      return true;
    }
    else if ( sameKey( PropertyBasedDocumentFactory.MetadataKeys.WORDREADER, key ) ) {
      try {
        final String spec = ( ensureJustOne( key, values ) ).toString();
        metadata.put( PropertyBasedDocumentFactory.MetadataKeys.WORDREADER, spec );
        // Just to check
        ObjectParser.fromSpec( spec, WordReader.class, MG4JClassParser.PACKAGE );
      }
      catch ( ClassNotFoundException e ) {
        throw new ConfigurationException( e );
      }
      // TODO: this must turn into a more appropriate exception
      catch ( Exception e ) {
        throw new ConfigurationException( e );
      }
      return true;
    }
    else if ( sameKey( MetadataKeys.MAXPREANCHOR, key ) ) {
      metadata.put( MetadataKeys.MAXPREANCHOR, Integer.valueOf( ensureJustOne( key, values ) ) );
      return true;
    }
    else if ( sameKey( MetadataKeys.MAXANCHOR, key ) ) {
      metadata.put( MetadataKeys.MAXANCHOR, Integer.valueOf( ensureJustOne( key, values ) ) );
      return true;
    }
    else if ( sameKey( MetadataKeys.MAXPOSTANCHOR, key ) ) {
      metadata.put( MetadataKeys.MAXPOSTANCHOR, Integer.valueOf( ensureJustOne( key, values ) ) );
      return true;
    }
    
    return super.parseProperty( key, values, metadata );
  }


  private void init() {
    this.parser = new BulletParser();
    
    ComposedCallbackBuilder composedBuilder = new ComposedCallbackBuilder();
    composedBuilder.add( this.textExtractor = new TextExtractor() );
    composedBuilder.add( this.anchorExtractor = new AnchorExtractor( maxPreAnchor, maxAnchor, maxPostAnchor ) ); 
    parser.setCallback( composedBuilder.compose() );


    Object o;
    try {
      o = defaultMetadata.get( PropertyBasedDocumentFactory.MetadataKeys.WORDREADER );
      wordReader = o == null ? new FastBufferedReader() : ObjectParser.fromSpec( o.toString(), WordReader.class, MG4JClassParser.PACKAGE );
    }
    catch ( Exception e ) {
      throw new RuntimeException( e );
    }
    text = new char[ DEFAULT_BUFFER_SIZE ];
  }


  @SuppressWarnings("boxing")
  private void initVars() {
    maxPreAnchor = (Integer)resolve( MetadataKeys.MAXPREANCHOR, defaultMetadata, 8 );
    maxAnchor = (Integer)resolve( MetadataKeys.MAXANCHOR, defaultMetadata, 256 );
    maxPostAnchor = (Integer)resolve( MetadataKeys.MAXPOSTANCHOR, defaultMetadata, 4 );
  }
  
  /** Returns a copy of this document factory. A new parser is allocated for the copy. */
  public HtmlDocumentFactory copy() {
    return new HtmlDocumentFactory( defaultMetadata );
  }
  
  public HtmlDocumentFactory( final Properties properties ) throws ConfigurationException {
    super( properties );
    initVars();
    init();
  }


  public HtmlDocumentFactory( final Reference2ObjectMap<Enum<?>,Object> defaultMetadata ) {
    super( defaultMetadata );
    initVars();
    init();
  }


  public HtmlDocumentFactory( final String[] property ) throws ConfigurationException {
    super( property );
    initVars();
    init();
  }


  public HtmlDocumentFactory() {
    super();
    initVars();
    init();
  }
  
  public int numberOfFields() {
    return 3;
  }


  public String fieldName( final int field ) {
    ensureFieldIndex( field );
    switch( field ) {
      case 0: return "text";
      case 1: return "title";
      case 2: return "anchor";
      default: throw new IllegalArgumentException();
    }
  }
  
  public int fieldIndex( final String fieldName ) {
    for ( int i = 0; i < numberOfFields(); i++ )
      if ( fieldName( i ).equals( fieldName ) ) return i;
    return -1;
  }
  
  public FieldType fieldType( final int field ) {
    ensureFieldIndex( field );
    switch( field ) {
      case 0: return FieldType.TEXT;
      case 1: return FieldType.TEXT;
      case 2: return FieldType.VIRTUAL;
      default: throw new IllegalArgumentException();
    }
  }


  private void readObject( final ObjectInputStream s ) throws IOException, ClassNotFoundException {
    s.defaultReadObject();
    init();
  }


  /** An HTML document. If a <samp>TITLE</samp> element is available, it will be used for {@link #title()}
   *   instead of the default value. 
   * 
   * <p>We delay the actual parsing until it is actually necessary, so operations like
   * getting the document URI will not require parsing. */
  
  protected class HtmlDocument extends AbstractDocument {
    private final Reference2ObjectMap<Enum<?>,Object> metadata;
    /** Whether we already parsed the document. */
    private boolean parsed;
    /** The cached raw content. */
    private final InputStream rawContent;


    private void ensureParsed() throws IOException {
      if ( parsed ) return;


      int offset = 0, l;
      Reader r = new InputStreamReader( rawContent, (String)resolveNotNull( PropertyBasedDocumentFactory.MetadataKeys.ENCODING, metadata ) );
      while( ( l = r.read( text, offset, text.length - offset ) ) > 0 ) {
        offset += l;
        text = CharArrays.grow( text, offset + 1 );
      }
      parser.parse( text, 0, offset );
      textExtractor.title.trim();


      parsed = true;
    }
    
    protected HtmlDocument( final InputStream rawContent, final Reference2ObjectMap<Enum<?>,Object> metadata ) {
      this.metadata = metadata;
      this.rawContent = rawContent;
    }


    public CharSequence title() {
      try {
        ensureParsed();
      }
      catch ( IOException e ) {
        throw new RuntimeException( e );
      }
      return (CharSequence)( textExtractor.title.length() == 0 ? resolve( PropertyBasedDocumentFactory.MetadataKeys.TITLE, metadata ): textExtractor.title );
    }


    public String toString() {
      return title().toString();
    }


    public CharSequence uri() {
      return (CharSequence)resolve( PropertyBasedDocumentFactory.MetadataKeys.URI, metadata );
    }


    public Object content( final int field ) throws IOException {
      ensureFieldIndex( field );
      ensureParsed();
      switch( field ) {
        case 0: return new FastBufferedReader( textExtractor.text );
        case 1: return new FastBufferedReader( textExtractor.title );
        case 2: return anchorExtractor.anchors;
        default: throw new IllegalArgumentException();
      }
    }


    public WordReader wordReader( final int field ) {
      ensureFieldIndex( field );
      return wordReader; 
    }
  }


  public Document getDocument( final InputStream rawContent, final Reference2ObjectMap<Enum<?>,Object> metadata ) throws IOException {
    return new HtmlDocument( rawContent, metadata );
  }
}
// Source code of it.unimi.dsi.mg4j.document.HtmlDocumentFactory$HtmlDocument

// Related classes of it.unimi.dsi.mg4j.document.HtmlDocumentFactory$HtmlDocument