DocumentCollection collection = (DocumentCollection) Scan.getSequence(this.sequence,
IdentityDocumentFactory.class, new String[]{},
Scan.DEFAULT_DELIMITER, LOGGER);
DocumentFactory factory = collection.factory();
final WordReader wordReader = wordReaderType.getWordReader();
LOGGER.info(String.format("Term processor class is %s", termProcessor.getClass()));
// -*- Get the fields
// Resolve every requested field name to its index in the document factory and
// record the type of that same field, so that fields[i] and types[i] always
// describe the same field.
int[] fields = new int[fieldNames.size()];
DocumentFactory.FieldType[] types = new DocumentFactory.FieldType[fields.length];
for (int i = 0; i < fields.length; i++) {
    fields[i] = factory.fieldIndex(fieldNames.get(i));
    // NOTE(review): this assumes fieldIndex reports an unknown name with a
    // negative index — confirm against the DocumentFactory contract.
    if (fields[i] < 0) {
        throw new IllegalArgumentException("Unknown field name: " + fieldNames.get(i));
    }
    // Look the type up by the resolved field index, not by the position of the
    // name in fieldNames: the two only coincide when the requested fields happen
    // to be the factory's fields 0..n-1 in order.
    types[i] = factory.fieldType(fields[i]);
}
final long numberOfDocuments = collection.size();
MutableString word = new MutableString();
MutableString delimiter = new MutableString();
while (true) {
// Select a starting document
final int start = (int) (Math.random() * (numberOfDocuments - batchSize));
try(final DocumentIterator iterator =
collection instanceof SegmentedDocumentCollection ?
((SegmentedDocumentCollection) collection).iterator(start)
: null) {
for (int docid = start; docid < start + batchSize; docid++) {
final Document document =
iterator == null ?
collection.document(docid)
: iterator.nextDocument();
System.out.format("%d\t%s", docid, document.uri());
for (int i = 0; i < fields.length; i++) {
final Object content = document.content(0);
switch (types[i]) {
case TEXT: {
wordReader.setReader((FastBufferedReader) content);
while (wordReader.next(word, delimiter)) {
System.out.print('\t');
System.out.print(word);
}
break;
}