Package net.bpiwowar.mg4j.extensions.warc

Source Code of net.bpiwowar.mg4j.extensions.warc.WARCDocumentCollection

/**
* $Author:$
* $Id:$
* $Rev:$
*/

package net.bpiwowar.mg4j.extensions.warc;

import it.unimi.di.big.mg4j.document.DocumentFactory;
import it.unimi.di.big.mg4j.document.PropertyBasedDocumentFactory;
import it.unimi.dsi.fastutil.objects.ObjectBigArrayBigList;
import it.unimi.dsi.fastutil.objects.Reference2ObjectArrayMap;
import it.unimi.dsi.fastutil.objects.Reference2ObjectMap;
import net.bpiwowar.mg4j.extensions.Compression;
import net.bpiwowar.mg4j.extensions.segmented.SegmentedDocumentCollection;
import net.bpiwowar.mg4j.extensions.segmented.SegmentedDocumentDescriptor;
import net.bpiwowar.mg4j.extensions.utils.HTMLDocumentFactory;
import org.apache.log4j.Logger;

import java.io.*;

/**
* Managing TREC collections provided in a WARC format, as used for instance
* by the TREC session track. A document collection basically consists of a set
* of descriptors pointing to important locations in the (possibly zipped)
* document archive. This is called a <em>sequence</em>.
*
* @author <a href="mailto:ingo@dcs.gla.ac.uk">Ingo Frommholz</a>
* @author <a href="mailto:benjamin@bpiwowar.net">Benjamin Piwowarski</a>
* @see net.bpiwowar.mg4j.extensions.trec.TRECDocumentCollection
* @see DocumentFactory
*/
public class WARCDocumentCollection extends SegmentedDocumentCollection {

  private static final long serialVersionUID = 1;
  private static final Logger LOGGER = Logger.getLogger(WARCDocumentCollection.class);
  final boolean debugEnabled = LOGGER.isDebugEnabled();


  /**
   * Creates a new TREC WARC collection by parsing the given files.
   *
   * @param file
   *            an array of file names containing documents in TREC WARC
   *            format.
   * @param bufferSize
   *            the buffer size.
   * @param compression
   *            true if the files are gzipped.
   */
  public WARCDocumentCollection(String[] file,
      int bufferSize, Compression compression, File metadataFile) throws IOException {
    super(file, new HTMLDocumentFactory(), bufferSize, compression, metadataFile);
  }

  /**
   * Copy constructor (that is, the one used by {@link #copy()}. Just
   * initializes final fields
   */
  public WARCDocumentCollection(String[] file, DocumentFactory factory,
                                  ObjectBigArrayBigList<SegmentedDocumentDescriptor> descriptors,
      int bufferSize, Compression compression, File metadataFile) {
    super(file, factory, descriptors, bufferSize, compression, metadataFile);
  }

    @Override
    public Reference2ObjectMap<Enum<?>, Object> metadata(long index) {
        ensureDocumentIndex(index);
        final Reference2ObjectArrayMap<Enum<?>, Object> metadata
                = new Reference2ObjectArrayMap<>(4);

        try {
            metadataRandomAccess.seek(descriptors.get(index).metadataPosition);
            String docno = metadataRandomAccess.readUTF();
            metadata.put(PropertyBasedDocumentFactory.MetadataKeys.URI, docno);
        } catch (IOException e) {
            LOGGER.error(String.format("Could not retrieve metadata for file %d [%s]", index, e));
        }
        return metadata;
    }

    @Override
  protected void parseContent(int fileIndex, InputStream is, DataOutputStream metadataOutput)
      throws IOException {
    WarcRecord.newFile();
    WarcRecord warcRecord;
    DataInputStream dis = new DataInputStream(is);
    boolean oldReadContentFlag =
        WarcRecord.readContent(false); // don't read content
    while ((warcRecord = WarcRecord.readNextWarcRecord(dis)) != null) {
      // ignore if no WARC response type
      if (warcRecord.getHeaderRecordType().equals("response")) {
        WarcHTMLResponseRecord warcResponse =
          new WarcHTMLResponseRecord(warcRecord);
        String docno = warcResponse.getTargetTrecID();
        long currStart = warcResponse.getStartMarker();
        long currStop = warcResponse.getStopMarker();
        if (debugEnabled)
          LOGGER.debug(String.format("Setting markers {%s, %d, %d}", docno,
              currStart, currStop));

                long metadataPos = metadataOutput.size();
                metadataOutput.writeUTF(docno);

                descriptors.add(SegmentedDocumentDescriptor.create(fileIndex, currStart, currStop, metadataPos));
        LOGGER.debug("Descriptor size is " + size());
      }
    }
    dis.close();
    WarcRecord.readContent(oldReadContentFlag); // reset readContent flag
  }


    @Override
    public WARCDocumentCollection copy() {
        return new WARCDocumentCollection(files, factory().copy(), descriptors,
                bufferSize, compression, metadataFile);
    }


 

}
TOP

Related Classes of net.bpiwowar.mg4j.extensions.warc.WARCDocumentCollection

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.