Package net.bpiwowar.mg4j.extensions.tasks

Source Code of net.bpiwowar.mg4j.extensions.tasks.CatCollectionBatches

package net.bpiwowar.mg4j.extensions.tasks;

import bpiwowar.argparser.Argument;
import bpiwowar.argparser.handlers.XStreamHandler;
import bpiwowar.experiments.AbstractTask;
import bpiwowar.experiments.TaskDescription;
import it.unimi.di.big.mg4j.document.*;
import it.unimi.di.big.mg4j.index.NullTermProcessor;
import it.unimi.di.big.mg4j.index.TermProcessor;
import it.unimi.di.big.mg4j.tool.Scan;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.io.WordReader;
import it.unimi.dsi.lang.MutableString;
import net.bpiwowar.mg4j.extensions.segmented.SegmentedDocumentCollection;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;

import static java.lang.StrictMath.random;

/**
* Wrapper for Index in MG4J
*
* @author B. Piwowarski <benjamin@bpiwowar.net>
* @date 13/7/12
*/
@TaskDescription(name = "cat-batch", project = {"ir", "mg4j"}, description = "Outputs a document collection as a stream")
public class CatCollectionBatches extends AbstractTask {
    static final Logger LOGGER = LoggerFactory.getLogger(CatCollectionBatches.class);

    @Argument(name = "collection", help = "Sequence to index", required = true)
    String sequence;

    @Argument(name = "term-processor", help = "An XML serialized form of the term processor (default: no processor)", handler = XStreamHandler.class, group = "Options")
    TermProcessor termProcessor = NullTermProcessor.getInstance();

    @Argument(name = "fields", help = "The fields to output", required = true)
    ArrayList<String> fieldNames = new ArrayList<>();

    @Argument(name = "word-reader")
    WordReaderType wordReaderType = WordReaderType.DEFAULT;

    @Argument(name = "batch-size", required = true, help = "Number of elements in each batch")
    long batchSize;

    @Override
    public int execute() throws Throwable {
        // -*- Loads the document collection
        DocumentCollection collection = (DocumentCollection) Scan.getSequence(this.sequence,
                IdentityDocumentFactory.class, new String[]{},
                Scan.DEFAULT_DELIMITER, LOGGER);
        DocumentFactory factory = collection.factory();

        final WordReader wordReader = wordReaderType.getWordReader();
        LOGGER.info(String.format("Term processor class is %s", termProcessor.getClass()));

        // -*- Get the fields
        int[] fields = new int[fieldNames.size()];
        DocumentFactory.FieldType[] types = new DocumentFactory.FieldType[fields.length];
        for (int i = fields.length; --i >= 0; ) {
            fields[i] = factory.fieldIndex(fieldNames.get(i));
            types[i] = factory.fieldType(i);
        }


        final long numberOfDocuments = collection.size();
        MutableString word = new MutableString();
        MutableString delimiter = new MutableString();

        while (true) {
            // Select a starting document
            final int start = (int) (Math.random() * (numberOfDocuments - batchSize));
            try(final DocumentIterator iterator =
                    collection instanceof SegmentedDocumentCollection ?
                            ((SegmentedDocumentCollection) collection).iterator(start)
                            : null) {

                for (int docid = start; docid < start + batchSize; docid++) {
                    final Document document =
                            iterator == null ?
                                    collection.document(docid)
                                    : iterator.nextDocument();

                    System.out.format("%d\t%s", docid, document.uri());
                    for (int i = 0; i < fields.length; i++) {
                        final Object content = document.content(0);
                        switch (types[i]) {
                            case TEXT: {
                                wordReader.setReader((FastBufferedReader) content);
                                while (wordReader.next(word, delimiter)) {
                                    System.out.print('\t');
                                    System.out.print(word);
                                }
                                break;
                            }
                            default:
                                throw new RuntimeException("Cannot handle type " + types[i]);
                        }

                    }

                    System.out.println();
                }

            }
        }

    }
}
TOP

Related Classes of net.bpiwowar.mg4j.extensions.tasks.CatCollectionBatches

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.