/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
This software is provided under the terms of the Common Public License,
version 1.0, as published by http://www.opensource.org. For further
information, see the file `LICENSE' included with this distribution. */
/** @author Andrew McCallum <a href="mailto:mccallum@cs.umass.edu">mccallum@cs.umass.edu</a> */
package cc.mallet.pipe.iterator;
import java.util.ArrayList;
import java.util.Iterator;
import java.net.URI;
import java.util.regex.*;
import java.io.*;
import cc.mallet.pipe.Pipe;
import cc.mallet.types.Alphabet;
import cc.mallet.types.Instance;
import cc.mallet.types.Label;
import cc.mallet.util.Strings;
* An iterator that generates instances from an initial
* directory or set of directories. The iterator will recurse through sub-directories.
* Each filename becomes the data field of an instance, and the result of
* a user-specified regular expression pattern applied to the filename becomes
* the target value of the instance.
* <p>
* In document classification it is common that the file name in the data field
* will be subsequently processed by one or more pipes until it contains a feature vector.
* The pattern applied to the file name is often
* used to extract a directory name
* that will be used as the true label of the instance; this label is kept in the target
* field.
* @author Andrew McCallum <a href="mailto:mccallum@cs.umass.edu">mccallum@cs.umass.edu</a>
public class FileIterator implements Iterator<Instance>
FileFilter fileFilter;
ArrayList<File> fileArray;
Iterator<File> subIterator;
Pattern targetPattern; // Set target slot to string coming from 1st group of this Pattern
File[] startingDirectories;
int[] minFileIndex;
int fileCount;
int commonPrefixIndex;
/** Special value that means to use the directories[i].getPath() as the target name */
/** Use as label names the directories specified in the constructor,
* optionally removing common prefix of all starting directories
// gdruck@cs.umass.edu 08/09/10:
// generalize regular expressions to work with Windows filenames
public static final String sep = "\\" + File.separatorChar;
public static final Pattern STARTING_DIRECTORIES = Pattern.compile ("_STARTING_DIRECTORIES_");
/** Use as label names the first directory in the filename. */
public static final Pattern FIRST_DIRECTORY =
Pattern.compile (sep+"?([^"+sep+"]*)"+sep+".+");
/** Use as label name the last directory in the filename. */
public static final Pattern LAST_DIRECTORY =
Pattern.compile(".*"+sep+"([^"+sep+"]+)"+sep+"[^"+sep+"]+"); // was ("([^/]*)/[^/]+");
/** Use as label names all the directory names in the filename. */
public static final Pattern ALL_DIRECTORIES =
Pattern.compile ("^(.*)"+sep+"[^"+sep+"]+");
// added by Fuchun Peng
public ArrayList<File> getFileArray()
return fileArray;
* Construct a FileIterator that will supply filenames within initial directories
* as instances
* @param directories Array of directories to collect files from
* @param fileFilter class implementing interface FileFilter that will decide which names to accept.
* May be null.
* @param targetPattern regex Pattern applied to the filename whose first parenthesized group
* on matching is taken to be the target value of the generated instance. The pattern is applied to
* the directory with the matcher.find() method. If null, then all instances
* will have target null.
* @param removeCommonPrefix boolean that modifies the behavior of the STARTING_DIRECTORIES pattern,
* removing the common prefix of all initially specified directories,
* leaving the remainder of each filename as the target value.
protected FileIterator(File[] directories, FileFilter fileFilter,
Pattern targetPattern, boolean removeCommonPrefix) {
this.startingDirectories = directories;
this.fileFilter = fileFilter;
this.minFileIndex = new int[directories.length];
this.fileArray = new ArrayList<File> ();
this.targetPattern = targetPattern;
for (int i = 0; i < directories.length; i++) {
if (!directories[i].isDirectory())
throw new IllegalArgumentException (directories[i].getAbsolutePath()
+ " is not a directory.");
minFileIndex[i] = fileArray.size();
fillFileArray (directories[i], fileFilter, fileArray);
this.subIterator = fileArray.iterator();
this.fileCount = 0;
String[] dirStrings = new String[directories.length];
for (int i = 0; i < directories.length; i++)
dirStrings[i] = directories[i].toString();
if (removeCommonPrefix)
this.commonPrefixIndex = Strings.commonPrefixIndex (dirStrings);
//print the files
// System.out.println("FileIterator fileArray");
// for(int i=0; i<fileArray.size(); i++){
// File file = (File) fileArray.get(i);
// System.out.println(file.toString());
// }
public FileIterator (File[] directories, FileFilter fileFilter, Pattern targetPattern)
this (directories, fileFilter, targetPattern, false);
/** Iterate over Files that pass the fileFilter test, setting... */
public FileIterator (File[] directories, Pattern targetPattern)
this (directories, null, targetPattern);
public FileIterator (File[] directories, Pattern targetPattern, boolean removeCommonPrefix )
this (directories, null, targetPattern, removeCommonPrefix);
public static File[] stringArray2FileArray (String[] sa)
File[] ret = new File[sa.length];
for (int i = 0; i < sa.length; i++)
ret[i] = new File (sa[i]);
return ret;
public FileIterator (String[] directories, FileFilter ff)
this (stringArray2FileArray(directories), ff, null);
public FileIterator (String[] directories, String targetPattern)
this (stringArray2FileArray(directories), Pattern.compile(targetPattern));
public FileIterator (String[] directories, Pattern targetPattern)
this (stringArray2FileArray(directories), targetPattern);
public FileIterator (String[] directories, Pattern targetPattern, boolean removeCommonPrefix)
this (stringArray2FileArray(directories), targetPattern, removeCommonPrefix);
public FileIterator (File directory, FileFilter fileFilter, Pattern targetPattern)
this (new File[] {directory}, fileFilter, targetPattern);
public FileIterator (File directory, FileFilter fileFilter,
Pattern targetPattern, boolean removeCommonPrefix)
this (new File[] {directory}, fileFilter, targetPattern, removeCommonPrefix);
public FileIterator (File directory, FileFilter fileFilter)
this (new File[] {directory}, fileFilter, null);
public FileIterator (File directory, Pattern targetPattern)
this (new File[] {directory}, null, targetPattern);
public FileIterator (File directory, Pattern targetPattern, boolean removeCommonPrefix)
this (new File[] {directory}, null, targetPattern, removeCommonPrefix);
public FileIterator (String directory, Pattern targetPattern)
this (new File[] {new File(directory)}, null, targetPattern);
public FileIterator (String directory, Pattern targetPattern, boolean removeCommonPrefix)
this (new File[] {new File(directory)}, null, targetPattern, removeCommonPrefix);
public FileIterator (File directory)
this (new File[] {directory}, null, null, false);
public FileIterator (String directory)
this (new File[] {new File(directory)}, null, null, false);
public FileIterator (String directory, FileFilter filter) {
this (new File[] {new File(directory) }, filter, null);
private int fillFileArray (File directory, FileFilter filter, ArrayList<File> files)
int count = 0;
File[] directoryContents = directory.listFiles();
for (int i = 0; i < directoryContents.length; i++) {
if (directoryContents[i].isDirectory())
count += fillFileArray (directoryContents[i], filter, files);
else if (filter == null || filter.accept(directoryContents[i])) {
files.add (directoryContents[i]);
return count;
// The PipeInputIterator interface
public Instance next ()
File nextFile = subIterator.next();
String path = nextFile.getAbsolutePath();
String targetName = null;
if (targetPattern == STARTING_DIRECTORIES) {
int i;
for (i = 0; i < minFileIndex.length; i++)
if (minFileIndex[i] > fileCount)
targetName = startingDirectories[--i].getPath().substring(commonPrefixIndex);
} else if (targetPattern != null) {
Matcher m = targetPattern.matcher(path);
if (m.find ()){
targetName = m.group (1);
return new Instance (nextFile, targetName, nextFile.toURI(), null);
public void remove () {
throw new IllegalStateException ("This Iterator<Instance> does not support remove().");
// culotta - 9.11.03
public File nextFile ()
return subIterator.next();
public boolean hasNext () { return subIterator.hasNext(); }