Package net.bpiwowar.mg4j.extensions.utils

Source Code of net.bpiwowar.mg4j.extensions.utils.HTMLDocumentFactory$WARCSegmentedDocument

package net.bpiwowar.mg4j.extensions.utils;

/*    
* MG4J: Managing Gigabytes for Java
*
* Copyright (C) 2005-2007 Paolo Boldi and Sebastiano Vigna
*
*  This library is free software; you can redistribute it and/or modify it
*  under the terms of the GNU Lesser General Public License as published by the Free
*  Software Foundation; either version 2.1 of the License, or (at your option)
*  any later version.
*
*  This library is distributed in the hope that it will be useful, but
*  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
*  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
*  for more details.
*
*  You should have received a copy of the GNU Lesser General Public License
*  along with this program; if not, write to the Free Software
*  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
*/

import it.unimi.di.big.mg4j.document.AbstractDocument;
import it.unimi.di.big.mg4j.document.PropertyBasedDocumentFactory;
import it.unimi.di.big.mg4j.util.parser.callback.AnchorExtractor;
import it.unimi.dsi.fastutil.objects.Reference2ObjectMap;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.io.WordReader;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.parser.BulletParser;
import it.unimi.dsi.parser.HTMLFactory;
import it.unimi.dsi.parser.callback.ComposedCallbackBuilder;
import it.unimi.dsi.util.Properties;
import net.bpiwowar.mg4j.extensions.MarkedUpDocument;
import net.bpiwowar.mg4j.extensions.TagPointer;
import net.bpiwowar.mg4j.extensions.warc.WarcHTMLResponseRecord;
import net.bpiwowar.mg4j.extensions.warc.WarcRecord;
import org.apache.commons.configuration.ConfigurationException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.*;
import java.nio.charset.Charset;
import java.util.Iterator;

/**
* A factory that provides fields for body and title of HTML documents. It uses
* internally a {@link BulletParser}.
* @author B. Piwowarski <benjamin@bpiwowar.net>
*/

public class HTMLDocumentFactory extends PropertyBasedDocumentFactory {
  final static private Logger LOGGER = LoggerFactory.getLogger(HTMLDocumentFactory.class);

  private static final long serialVersionUID = 1L;

  private static final int DEFAULT_BUFFER_SIZE = 16 * 1024;
  /** A parser that will be used to extract text from HTML documents. */
  private transient BulletParser parser;
  /** The callback recording text. */
  private transient StructuredTextExtractor textExtractor;
  /** The callback for anchors. */
  private transient AnchorExtractor anchorExtractor;
  /** The word reader used for all documents. */
  private transient WordReader wordReader;

  private transient char[] text;

  /** The collection type (e.g. TREC, WARC) */
  private CollectionType collectionType = CollectionType.WARC018;

  @Override
  protected boolean parseProperty(final String key, final String[] values,
      final Reference2ObjectMap<Enum<?>, Object> metadata) throws ConfigurationException {
    if (sameKey(PropertyBasedDocumentFactory.MetadataKeys.MIMETYPE, key)) {
      metadata.put(PropertyBasedDocumentFactory.MetadataKeys.MIMETYPE,
          ensureJustOne(key, values));
      return true;
    } else if (sameKey(PropertyBasedDocumentFactory.MetadataKeys.ENCODING,
        key)) {
      metadata.put(PropertyBasedDocumentFactory.MetadataKeys.ENCODING,
          Charset.forName(ensureJustOne(key, values)).toString());
      return true;
    }

    return super.parseProperty(key, values, metadata);
  }

 
 
  private void init() {
    /** The HTML bullet parser */
    this.parser = new BulletParser(HTMLFactory.INSTANCE);

    ComposedCallbackBuilder composedBuilder = new ComposedCallbackBuilder();

    composedBuilder
        .add(this.textExtractor = new StructuredTextExtractor());
    parser.setCallback(composedBuilder.compose());

    this.wordReader = new FastBufferedReader();
    text = new char[DEFAULT_BUFFER_SIZE];

    if (collectionType == null)
      collectionType = CollectionType.WARC018;

  }

  private void initVars() {
  }

  /**
   * Returns a copy of this document factory. A new parser is allocated for
   * the copy.
   */
  @Override
  public HTMLDocumentFactory copy() {
    return new HTMLDocumentFactory(defaultMetadata);
  }

  public HTMLDocumentFactory(final Properties properties)
      throws ConfigurationException {
    super(properties);
    initVars();
    init();
  }

  public HTMLDocumentFactory(
            final Reference2ObjectMap<Enum<?>, Object> defaultMetadata) {
    super(defaultMetadata);
    initVars();
    init();
  }

  public HTMLDocumentFactory(final String[] property)
      throws ConfigurationException {
    super(property);
    initVars();
    init();
  }

  public HTMLDocumentFactory() {
    super();
    initVars();
    init();
  }

  @Override
  public int numberOfFields() {
    return Fields.values().length;
  }

  public static enum Fields {
    TEXT, TITLE
  }

    /**
     * Type of collection
     */
  public static enum CollectionType {
    // collections in WARC 0.18 format
    WARC018
  }

  @Override
  public String fieldName(final int field) {
    ensureFieldIndex(field);

    switch (field) {
    case 0:
      return "text";
    case 1:
      return "title";
    default:
      throw new IllegalArgumentException();
    }
  }

  @Override
  public int fieldIndex(final String fieldName) {
    for (int i = 0; i < numberOfFields(); i++)
      if (fieldName(i).equals(fieldName))
        return i;
    return -1;
  }

  @Override
  public FieldType fieldType(final int field) {
    ensureFieldIndex(field);
    switch (field) {
    case 0:
      return FieldType.TEXT;
    case 1:
      return FieldType.TEXT;
    default:
      throw new IllegalArgumentException();
    }
  }

  private void readObject(final ObjectInputStream s) throws IOException,
      ClassNotFoundException {
    s.defaultReadObject();
    init();
  }


  @Override
  public WARCSegmentedDocument getDocument(final InputStream rawContent,
      final Reference2ObjectMap<Enum<?>, Object> metadata)
      throws IOException {
    return new WARCSegmentedDocument(rawContent, metadata);
  }

  /**
   * A TREC document. If a <samp>TITLE</samp> element is available, it will be
   * used for {@link #title()} instead of the default value.
   *
   * <p>
   * The document may be segmented.
   * <p>
   * We delay the actual parsing until it is actually necessary, so operations
   * like getting the document URI will not require parsing.
   */
  public class WARCSegmentedDocument extends AbstractDocument implements
            MarkedUpDocument {
    private final Reference2ObjectMap<Enum<?>, Object> metadata;
    /** Whether we already parsed the document. */
    private boolean parsed;
    /** The cached raw content. */
    private final InputStream rawContent;

 
    /**
     * Here's where the actual document is parsed (using the BulletParser)
     * @throws java.io.IOException
     */
    private void ensureParsed() throws IOException {
      if (parsed)
        return;

      switch(collectionType) {
      case WARC018:
        parseWAR018CDocument();
        break;
      }
    }

    /**
     * Parses a document from a WARC collection
     *
     * @throws java.io.IOException
     */
    private void parseWAR018CDocument() throws IOException {
      WarcRecord warcRecord = null;
      DataInputStream dis = new DataInputStream(rawContent);

      // Regardless of what the stream gives us, we read and return
      // the first entry which is a response.
      WarcHTMLResponseRecord warcResponse = null;
      while ((warcRecord = WarcRecord.readNextWarcRecord(dis)) != null) {
        // ignore if no WARC response type, otherwise read and finish
        if (warcRecord.getHeaderRecordType().equals("response")) {
          warcResponse = new WarcHTMLResponseRecord(warcRecord);
          break;
        }
      }

      if (warcResponse != null) {

        // parse the HTML content (skip HTTP header)
        Reader reader = warcResponse.getContentReader();
                int len = 0;
                while ((len = reader.read(text, 0, text.length)) >= 0) {
                    parser.parse(text, 0, len);
                }
                reader.close();


                // We use the TrecID for the URI
                metadata.put(MetadataKeys.URI, warcResponse.getTargetTrecID());
                // Set the title
                metadata.put(MetadataKeys.TITLE, textExtractor.title.trim());

                parsed = true;
      }
      else {
        LOGGER.warn("Couldn't find WARC response in stream!");
        if (warcRecord != null) {
          LOGGER.warn("{}", warcRecord);
        }
        else LOGGER.warn("WARC record is null!");
        throw new IOException("Couldn't find WARC response in stream!");
      }
    }


        @Override
    public Iterator<TagPointer> tags(int field) {
      try {
        ensureParsed();
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
      switch (field) {
      case 0:
        return textExtractor.tagPointer();
      case 1:
        return null;
      case 2:
        return null;
      default:
        throw new IllegalArgumentException();
      }
    }

    public WARCSegmentedDocument(final InputStream rawContent,
                                     final Reference2ObjectMap<Enum<?>, Object> metadata) {
      this.metadata = metadata;
      this.rawContent = rawContent;
    }

    @Override
    public CharSequence title() {
      try {
        ensureParsed();
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
      return (CharSequence) (textExtractor.title.length() == 0 ? resolve(
          PropertyBasedDocumentFactory.MetadataKeys.TITLE, metadata)
          : textExtractor.title);
    }

    @Override
    public String toString() {
      return title().toString();
    }

    @Override
    public CharSequence uri() {
      return (CharSequence) resolve(
          PropertyBasedDocumentFactory.MetadataKeys.URI, metadata);
    }

    @Override
    public Object content(final int field) throws IOException {
      ensureFieldIndex(field);
      ensureParsed();
      switch (field) {
      case 0:
                return new FastBufferedReader(textExtractor.text);
      case 1:
        return new FastBufferedReader(textExtractor.title);
      case 2:
        return anchorExtractor.anchors;
      default:
        throw new IllegalArgumentException();
      }
    }

    /**
     * Returns the text of this document
     *
     * @return the text
     */
    public MutableString getText() throws IOException {
      ensureFieldIndex(0);
      ensureParsed();
      return textExtractor.getText();
    }

    @Override
    public WordReader wordReader(final int field) {
      ensureFieldIndex(field);
      return wordReader;
    }

  }

  public WordReader getWordReader() {
    return wordReader;
  }

  /**
   * Sets the type of the underlying collection (e.g. standard TREC
   * collection, WARC collection)
   *
   * @param t
   *            the collection type
   */
  public void setCollectionType(CollectionType t) {
    this.collectionType = t;
  }
}
TOP

Related Classes of net.bpiwowar.mg4j.extensions.utils.HTMLDocumentFactory$WARCSegmentedDocument

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.