package it.unimi.dsi.mg4j.document;
/*
* MG4J: Managing Gigabytes for Java
*
* Copyright (C) 2005-2010 Paolo Boldi and Sebastiano Vigna
*
* This library is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by the Free
* Software Foundation; either version 3 of the License, or (at your option)
* any later version.
*
* This library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, see <http://www.gnu.org/licenses/>.
*
*/
import it.unimi.dsi.fastutil.chars.CharArrays;
import it.unimi.dsi.fastutil.objects.Reference2ObjectMap;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.io.WordReader;
import it.unimi.dsi.lang.ObjectParser;
import it.unimi.dsi.mg4j.util.MG4JClassParser;
import it.unimi.dsi.mg4j.util.parser.callback.AnchorExtractor;
import it.unimi.dsi.parser.BulletParser;
import it.unimi.dsi.parser.callback.ComposedCallbackBuilder;
import it.unimi.dsi.parser.callback.TextExtractor;
import it.unimi.dsi.util.Properties;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.ObjectInputStream;
import java.io.Reader;
import java.nio.charset.Charset;
import org.apache.commons.configuration.ConfigurationException;
/** A factory that provides fields for body and title of HTML documents.
* It uses internally a {@link BulletParser}.
* A default encoding can be provided
* using the property {@link it.unimi.dsi.mg4j.document.PropertyBasedDocumentFactory.MetadataKeys#ENCODING}.
*
* <p>By default, the {@link WordReader} provided by this factory
* is just a {@link FastBufferedReader}, but you can specify
* an alternative word reader using the property
* {@link it.unimi.dsi.mg4j.document.PropertyBasedDocumentFactory.MetadataKeys#WORDREADER}.
*/
public class HtmlDocumentFactory extends PropertyBasedDocumentFactory {
private static final long serialVersionUID = 1L;
public static enum MetadataKeys {
/** The maximum number of characters before an anchor. */
MAXPREANCHOR,
/** The maximum number of characters in an anchor. */
MAXANCHOR,
/** The maximum number of characters after an anchor. */
MAXPOSTANCHOR,
};
private static final int DEFAULT_BUFFER_SIZE = 16 * 1024;
/** A parser that will be used to extract text from HTML documents. */
private transient BulletParser parser;
/** The callback recording text. */
private transient TextExtractor textExtractor;
/** The callback for anchors. */
private transient AnchorExtractor anchorExtractor;
/** The word reader used for all documents. */
private transient WordReader wordReader;
/** The maximum number of characters before an anchor. */
private int maxPreAnchor;
/** The maximum number of characters in an anchor. */
private int maxAnchor;
/** The maximum number of characters after an anchor. */
private int maxPostAnchor;
private transient char[] text;
protected boolean parseProperty( final String key, final String[] values, final Reference2ObjectMap<Enum<?>,Object> metadata ) throws ConfigurationException {
if ( sameKey( PropertyBasedDocumentFactory.MetadataKeys.MIMETYPE, key ) ) {
metadata.put( PropertyBasedDocumentFactory.MetadataKeys.MIMETYPE, ensureJustOne( key, values ) );
return true;
}
else if ( sameKey( PropertyBasedDocumentFactory.MetadataKeys.ENCODING, key ) ) {
metadata.put( PropertyBasedDocumentFactory.MetadataKeys.ENCODING, Charset.forName( ensureJustOne( key, values ) ).toString() );
return true;
}
else if ( sameKey( PropertyBasedDocumentFactory.MetadataKeys.WORDREADER, key ) ) {
try {
final String spec = ( ensureJustOne( key, values ) ).toString();
metadata.put( PropertyBasedDocumentFactory.MetadataKeys.WORDREADER, spec );
// Just to check
ObjectParser.fromSpec( spec, WordReader.class, MG4JClassParser.PACKAGE );
}
catch ( ClassNotFoundException e ) {
throw new ConfigurationException( e );
}
// TODO: this must turn into a more appropriate exception
catch ( Exception e ) {
throw new ConfigurationException( e );
}
return true;
}
else if ( sameKey( MetadataKeys.MAXPREANCHOR, key ) ) {
metadata.put( MetadataKeys.MAXPREANCHOR, Integer.valueOf( ensureJustOne( key, values ) ) );
return true;
}
else if ( sameKey( MetadataKeys.MAXANCHOR, key ) ) {
metadata.put( MetadataKeys.MAXANCHOR, Integer.valueOf( ensureJustOne( key, values ) ) );
return true;
}
else if ( sameKey( MetadataKeys.MAXPOSTANCHOR, key ) ) {
metadata.put( MetadataKeys.MAXPOSTANCHOR, Integer.valueOf( ensureJustOne( key, values ) ) );
return true;
}
return super.parseProperty( key, values, metadata );
}
private void init() {
this.parser = new BulletParser();
ComposedCallbackBuilder composedBuilder = new ComposedCallbackBuilder();
composedBuilder.add( this.textExtractor = new TextExtractor() );
composedBuilder.add( this.anchorExtractor = new AnchorExtractor( maxPreAnchor, maxAnchor, maxPostAnchor ) );
parser.setCallback( composedBuilder.compose() );
Object o;
try {
o = defaultMetadata.get( PropertyBasedDocumentFactory.MetadataKeys.WORDREADER );
wordReader = o == null ? new FastBufferedReader() : ObjectParser.fromSpec( o.toString(), WordReader.class, MG4JClassParser.PACKAGE );
}
catch ( Exception e ) {
throw new RuntimeException( e );
}
text = new char[ DEFAULT_BUFFER_SIZE ];
}
@SuppressWarnings("boxing")
private void initVars() {
maxPreAnchor = (Integer)resolve( MetadataKeys.MAXPREANCHOR, defaultMetadata, 8 );
maxAnchor = (Integer)resolve( MetadataKeys.MAXANCHOR, defaultMetadata, 256 );
maxPostAnchor = (Integer)resolve( MetadataKeys.MAXPOSTANCHOR, defaultMetadata, 4 );
}
/** Returns a copy of this document factory. A new parser is allocated for the copy. */
public HtmlDocumentFactory copy() {
return new HtmlDocumentFactory( defaultMetadata );
}
public HtmlDocumentFactory( final Properties properties ) throws ConfigurationException {
super( properties );
initVars();
init();
}
public HtmlDocumentFactory( final Reference2ObjectMap<Enum<?>,Object> defaultMetadata ) {
super( defaultMetadata );
initVars();
init();
}
public HtmlDocumentFactory( final String[] property ) throws ConfigurationException {
super( property );
initVars();
init();
}
public HtmlDocumentFactory() {
super();
initVars();
init();
}
public int numberOfFields() {
return 3;
}
public String fieldName( final int field ) {
ensureFieldIndex( field );
switch( field ) {
case 0: return "text";
case 1: return "title";
case 2: return "anchor";
default: throw new IllegalArgumentException();
}
}
public int fieldIndex( final String fieldName ) {
for ( int i = 0; i < numberOfFields(); i++ )
if ( fieldName( i ).equals( fieldName ) ) return i;
return -1;
}
public FieldType fieldType( final int field ) {
ensureFieldIndex( field );
switch( field ) {
case 0: return FieldType.TEXT;
case 1: return FieldType.TEXT;
case 2: return FieldType.VIRTUAL;
default: throw new IllegalArgumentException();
}
}
private void readObject( final ObjectInputStream s ) throws IOException, ClassNotFoundException {
s.defaultReadObject();
init();
}
/** An HTML document. If a <samp>TITLE</samp> element is available, it will be used for {@link #title()}
* instead of the default value.
*
* <p>We delay the actual parsing until it is actually necessary, so operations like
* getting the document URI will not require parsing. */
protected class HtmlDocument extends AbstractDocument {
private final Reference2ObjectMap<Enum<?>,Object> metadata;
/** Whether we already parsed the document. */
private boolean parsed;
/** The cached raw content. */
private final InputStream rawContent;
private void ensureParsed() throws IOException {
if ( parsed ) return;
int offset = 0, l;
Reader r = new InputStreamReader( rawContent, (String)resolveNotNull( PropertyBasedDocumentFactory.MetadataKeys.ENCODING, metadata ) );
while( ( l = r.read( text, offset, text.length - offset ) ) > 0 ) {
offset += l;
text = CharArrays.grow( text, offset + 1 );
}
parser.parse( text, 0, offset );
textExtractor.title.trim();
parsed = true;
}
protected HtmlDocument( final InputStream rawContent, final Reference2ObjectMap<Enum<?>,Object> metadata ) {
this.metadata = metadata;
this.rawContent = rawContent;
}
public CharSequence title() {
try {
ensureParsed();
}
catch ( IOException e ) {
throw new RuntimeException( e );
}
return (CharSequence)( textExtractor.title.length() == 0 ? resolve( PropertyBasedDocumentFactory.MetadataKeys.TITLE, metadata ): textExtractor.title );
}
public String toString() {
return title().toString();
}
public CharSequence uri() {
return (CharSequence)resolve( PropertyBasedDocumentFactory.MetadataKeys.URI, metadata );
}
public Object content( final int field ) throws IOException {
ensureFieldIndex( field );
ensureParsed();
switch( field ) {
case 0: return new FastBufferedReader( textExtractor.text );
case 1: return new FastBufferedReader( textExtractor.title );
case 2: return anchorExtractor.anchors;
default: throw new IllegalArgumentException();
}
}
public WordReader wordReader( final int field ) {
ensureFieldIndex( field );
return wordReader;
}
}
public Document getDocument( final InputStream rawContent, final Reference2ObjectMap<Enum<?>,Object> metadata ) throws IOException {
return new HtmlDocument( rawContent, metadata );
}
}