@SuppressWarnings("unchecked")
public static void main( final String[] arg ) throws JSAPException, InvocationTargetException, NoSuchMethodException, ConfigurationException, ClassNotFoundException, IOException,
IllegalAccessException, InstantiationException {
SimpleJSAP jsap = new SimpleJSAP(
Scan.class.getName(),
"Builds a set of batches from a sequence of documents.",
new Parameter[] {
new FlaggedOption( "sequence", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'S', "sequence", "A serialised document sequence that will be used instead of stdin." ),
new FlaggedOption( "objectSequence", new ObjectParser( DocumentSequence.class, MG4JClassParser.PACKAGE ), JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'o', "object-sequence", "An object specification describing a document sequence that will be used instead of stdin." ),
new FlaggedOption( "delimiter", JSAP.INTEGER_PARSER, Integer.toString( DEFAULT_DELIMITER ), JSAP.NOT_REQUIRED, 'd', "delimiter", "The document delimiter (when indexing stdin)." ),
new FlaggedOption( "factory", MG4JClassParser.getParser(), IdentityDocumentFactory.class.getName(), JSAP.NOT_REQUIRED, 'f', "factory", "A document factory with a standard constructor (when indexing stdin)." ),
new FlaggedOption( "property", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'p', "property", "A 'key=value' specification, or the name of a property file (when indexing stdin)." )
.setAllowMultipleDeclarations( true ),
new FlaggedOption( "termProcessor", JSAP.STRING_PARSER, NullTermProcessor.class.getName(), JSAP.NOT_REQUIRED, 't', "term-processor",
"Sets the term processor to the given class." ),
new FlaggedOption( "completeness", JSAP.STRING_PARSER, Completeness.POSITIONS.name(), JSAP.NOT_REQUIRED, 'c', "completeness", "How complete the index should be " + Arrays.toString( Completeness.values() ) + "." ),
new Switch( "downcase", JSAP.NO_SHORTFLAG, "downcase", "A shortcut for setting the term processor to the downcasing processor." ),
new FlaggedOption( "indexedField", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'I', "indexed-field",
"The field(s) of the document factory that will be indexed. (default: all non-virtual fields)" ).setAllowMultipleDeclarations( true ),
new Switch( "allFields", 'a', "all-fields", "Index also all virtual fields; has no effect if indexedField has been used at least once." ),
new FlaggedOption( "buildCollection", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'B', "build-collection", "During the indexing phase, build a collection using this basename." ),
new FlaggedOption( "builderClass", MG4JClassParser.getParser(), SimpleCompressedDocumentCollectionBuilder.class.getName(), JSAP.NOT_REQUIRED, JSAP.NO_SHORTFLAG, "builder-class", "Specifies a builder class for a document collection that will be created during the indexing phase." ),
new Switch( "exact", 'e', "exact", "The builder class should be instantiated in its exact form, which records both words and nonwords." ),
new FlaggedOption( "batchSize", JSAP.INTSIZE_PARSER, Integer.toString( Scan.DEFAULT_BATCH_SIZE ), JSAP.NOT_REQUIRED, 's', "batch-size", "The maximum size of a batch, in documents. Batches will be smaller, however, if memory is exhausted or there are too many terms." ),
new FlaggedOption( "maxTerms", JSAP.INTSIZE_PARSER, Integer.toString( Scan.DEFAULT_MAX_TERMS ), JSAP.NOT_REQUIRED, 'M', "max-terms", "The maximum number of terms in a batch, in documents." ),
new FlaggedOption( "virtualDocumentResolver", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'v', "virtual-document-resolver",
"The virtual document resolver. It can be specified several times in the form [<field>:]<filename>. If the field is omitted, it sets the document resolver for all virtual fields." )
.setAllowMultipleDeclarations( true ),
new FlaggedOption( "virtualDocumentGap", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'g', "virtual-document-gap",
"The virtual document gap. It can be specified several times in the form [<field>:]<gap>. If the field is omitted, it sets the document gap for all virtual fields; the default gap is "
+ DEFAULT_VIRTUAL_DOCUMENT_GAP ).setAllowMultipleDeclarations( true ),
new FlaggedOption( "bufferSize", JSAP.INTSIZE_PARSER, Util.formatBinarySize( DEFAULT_BUFFER_SIZE ), JSAP.NOT_REQUIRED, 'b', "buffer-size", "The size of an I/O buffer." ),
new FlaggedOption( "renumber", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'r', "renumber", "The filename of a document renumbering." ),
new Switch( "keepUnsorted", 'u', "keep-unsorted", "Keep the unsorted term file." ),
new FlaggedOption( "logInterval", JSAP.LONG_PARSER, Long.toString( ProgressLogger.DEFAULT_LOG_INTERVAL ), JSAP.NOT_REQUIRED, 'l', "log-interval",
"The minimum time interval between activity logs in milliseconds." ),
new FlaggedOption( "tempDir", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'T', "temp-dir", "A directory for all temporary files (e.g., batches)." ),
new UnflaggedOption( "basename", JSAP.STRING_PARSER, JSAP.REQUIRED, "The basename of the resulting index." ) } );
JSAPResult jsapResult = jsap.parse( arg );
if ( jsap.messagePrinted() ) return;
if ( ( jsapResult.userSpecified( "builderClass" ) || jsapResult.userSpecified( "exact" ) ) && ! jsapResult.userSpecified( "buildCollection" ) ) throw new IllegalArgumentException( "To specify options about the collection building process, you must specify a basename first." );
if ( jsapResult.userSpecified( "sequence" ) && jsapResult.userSpecified( "objectSequence" ) ) throw new IllegalArgumentException( "You cannot specify both a serialised and an parseable-object sequence" );
final DocumentSequence documentSequence = jsapResult.userSpecified( "objectSequence" ) ? (DocumentSequence)jsapResult.getObject( "objectSequence" ) : Scan.getSequence( jsapResult.getString( "sequence" ), jsapResult.getClass( "factory" ), jsapResult.getStringArray( "property" ), jsapResult.getInt( "delimiter" ), LOGGER );