/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
http://www.cs.umass.edu/~mccallum/mallet
This software is provided under the terms of the Common Public License,
version 1.0, as published by http://www.opensource.org. For further
information, see the file `LICENSE' included with this distribution. */
/**
@author Andrew McCallum <a href="mailto:mccallum@cs.umass.edu">mccallum@cs.umass.edu</a>
*/
package cc.mallet.pipe.iterator;
import java.util.ArrayList;
import java.util.Iterator;
import java.net.URI;
import java.util.regex.*;
import java.io.*;
import cc.mallet.pipe.Pipe;
import cc.mallet.types.Alphabet;
import cc.mallet.types.Instance;
import cc.mallet.types.Label;
import cc.mallet.util.Strings;
/**
* An iterator that generates instances from an initial
* directory or set of directories. The iterator will recurse through sub-directories.
* Each filename becomes the data field of an instance, and the result of
* a user-specified regular expression pattern applied to the filename becomes
* the target value of the instance.
* <p>
* In document classification it is common that the file name in the data field
* will be subsequently processed by one or more pipes until it contains a feature vector.
* The pattern applied to the file name is often
* used to extract a directory name
* that will be used as the true label of the instance; this label is kept in the target
* field.
*
*
* @author Andrew McCallum <a href="mailto:mccallum@cs.umass.edu">mccallum@cs.umass.edu</a>
*/
public class FileIterator implements Iterator<Instance>
{
    /** Optional filter deciding which (non-directory) files are collected; may be null. */
    FileFilter fileFilter;
    /** All files collected from the starting directories, in traversal order. */
    ArrayList<File> fileArray;
    /** Iterator over {@link #fileArray}; drives {@link #next()}, {@link #nextFile()} and {@link #hasNext()}. */
    Iterator<File> subIterator;
    Pattern targetPattern; // Set target slot to string coming from 1st group of this Pattern
    /** The directories handed to the constructor. */
    File[] startingDirectories;
    /** minFileIndex[i] is the index in fileArray of the first file found under startingDirectories[i]. */
    int[] minFileIndex;
    /** Number of files handed out so far; used to map the current file back to its starting directory. */
    int fileCount;
    /** Number of leading characters stripped from the starting-directory path when it is used as a label. */
    int commonPrefixIndex;

    // gdruck@cs.umass.edu 08/09/10:
    // generalize regular expressions to work with Windows filenames
    /** The platform file separator, preceded by a backslash so it is taken literally inside a regex. */
    public static final String sep = "\\" + File.separatorChar;

    /**
     * Special value that means to use the path of the starting directory (as specified in the
     * constructor) as the target name, optionally removing the common prefix of all starting
     * directories. It is recognized by reference identity, never applied as a regex.
     */
    public static final Pattern STARTING_DIRECTORIES = Pattern.compile ("_STARTING_DIRECTORIES_");

    /** Use as label names the first directory in the filename. */
    public static final Pattern FIRST_DIRECTORY =
        Pattern.compile (sep+"?([^"+sep+"]*)"+sep+".+");

    /** Use as label name the last directory in the filename. */
    public static final Pattern LAST_DIRECTORY =
        Pattern.compile(".*"+sep+"([^"+sep+"]+)"+sep+"[^"+sep+"]+"); // was ("([^/]*)/[^/]+");

    /** Use as label names all the directory names in the filename. */
    public static final Pattern ALL_DIRECTORIES =
        Pattern.compile ("^(.*)"+sep+"[^"+sep+"]+");

    // added by Fuchun Peng
    /**
     * @return the internal list of collected files (not a copy).
     */
    public ArrayList<File> getFileArray()
    {
        return fileArray;
    }

    /**
     * Construct a FileIterator that will supply filenames within initial directories
     * as instances
     * @param directories Array of directories to collect files from
     * @param fileFilter class implementing interface FileFilter that will decide which names to accept.
     * May be null.
     * @param targetPattern regex Pattern applied to the filename whose first parenthesized group
     * on matching is taken to be the target value of the generated instance. The pattern is applied to
     * the absolute path of the file with the matcher.find() method. If null, then all instances
     * will have target null.
     * @param removeCommonPrefix boolean that modifies the behavior of the STARTING_DIRECTORIES pattern,
     * removing the common prefix of all initially specified directories,
     * leaving the remainder of each filename as the target value.
     * @throws IllegalArgumentException if an entry of <code>directories</code> is not a directory,
     * or if a directory encountered during traversal cannot be listed.
     */
    protected FileIterator(File[] directories, FileFilter fileFilter,
                           Pattern targetPattern, boolean removeCommonPrefix) {
        this.startingDirectories = directories;
        this.fileFilter = fileFilter;
        this.minFileIndex = new int[directories.length];
        this.fileArray = new ArrayList<File> ();
        this.targetPattern = targetPattern;

        for (int i = 0; i < directories.length; i++) {
            if (!directories[i].isDirectory())
                throw new IllegalArgumentException (directories[i].getAbsolutePath()
                                                    + " is not a directory.");
            // Remember where this directory's files begin so next() can recover the label.
            minFileIndex[i] = fileArray.size();
            fillFileArray (directories[i], fileFilter, fileArray);
        }
        this.subIterator = fileArray.iterator();
        this.fileCount = 0;

        if (removeCommonPrefix) {
            String[] dirStrings = new String[directories.length];
            for (int i = 0; i < directories.length; i++)
                dirStrings[i] = directories[i].toString();
            this.commonPrefixIndex = Strings.commonPrefixIndex (dirStrings);
        }
    }

    /** Iterate over files accepted by <code>fileFilter</code>, without common-prefix removal. */
    public FileIterator (File[] directories, FileFilter fileFilter, Pattern targetPattern)
    {
        this (directories, fileFilter, targetPattern, false);
    }

    /** Iterate over all files under <code>directories</code> (no filter). */
    public FileIterator (File[] directories, Pattern targetPattern)
    {
        this (directories, null, targetPattern);
    }

    /** Iterate over all files under <code>directories</code> (no filter). */
    public FileIterator (File[] directories, Pattern targetPattern, boolean removeCommonPrefix )
    {
        this (directories, null, targetPattern, removeCommonPrefix);
    }

    /**
     * Convert an array of path strings into an array of File objects.
     * @param sa path names
     * @return a new array with one File per entry of <code>sa</code>, in the same order
     */
    public static File[] stringArray2FileArray (String[] sa)
    {
        File[] ret = new File[sa.length];
        for (int i = 0; i < sa.length; i++)
            ret[i] = new File (sa[i]);
        return ret;
    }

    /** Iterate over filtered files under the named directories; all targets are null. */
    public FileIterator (String[] directories, FileFilter ff)
    {
        this (stringArray2FileArray(directories), ff, null);
    }

    /** As {@link #FileIterator(String[], Pattern)}, compiling <code>targetPattern</code> first. */
    public FileIterator (String[] directories, String targetPattern)
    {
        this (stringArray2FileArray(directories), Pattern.compile(targetPattern));
    }

    /** Iterate over all files under the named directories. */
    public FileIterator (String[] directories, Pattern targetPattern)
    {
        this (stringArray2FileArray(directories), targetPattern);
    }

    /** Iterate over all files under the named directories. */
    public FileIterator (String[] directories, Pattern targetPattern, boolean removeCommonPrefix)
    {
        this (stringArray2FileArray(directories), targetPattern, removeCommonPrefix);
    }

    /** Iterate over filtered files under a single directory. */
    public FileIterator (File directory, FileFilter fileFilter, Pattern targetPattern)
    {
        this (new File[] {directory}, fileFilter, targetPattern);
    }

    /** Iterate over filtered files under a single directory. */
    public FileIterator (File directory, FileFilter fileFilter,
                         Pattern targetPattern, boolean removeCommonPrefix)
    {
        this (new File[] {directory}, fileFilter, targetPattern, removeCommonPrefix);
    }

    /** Iterate over filtered files under a single directory; all targets are null. */
    public FileIterator (File directory, FileFilter fileFilter)
    {
        this (new File[] {directory}, fileFilter, null);
    }

    /** Iterate over all files under a single directory. */
    public FileIterator (File directory, Pattern targetPattern)
    {
        this (new File[] {directory}, null, targetPattern);
    }

    /** Iterate over all files under a single directory. */
    public FileIterator (File directory, Pattern targetPattern, boolean removeCommonPrefix)
    {
        this (new File[] {directory}, null, targetPattern, removeCommonPrefix);
    }

    /** Iterate over all files under a single named directory. */
    public FileIterator (String directory, Pattern targetPattern)
    {
        this (new File[] {new File(directory)}, null, targetPattern);
    }

    /** Iterate over all files under a single named directory. */
    public FileIterator (String directory, Pattern targetPattern, boolean removeCommonPrefix)
    {
        this (new File[] {new File(directory)}, null, targetPattern, removeCommonPrefix);
    }

    /** Iterate over all files under a single directory; all targets are null. */
    public FileIterator (File directory)
    {
        this (new File[] {directory}, null, null, false);
    }

    /** Iterate over all files under a single named directory; all targets are null. */
    public FileIterator (String directory)
    {
        this (new File[] {new File(directory)}, null, null, false);
    }

    /** Iterate over filtered files under a single named directory; all targets are null. */
    public FileIterator (String directory, FileFilter filter) {
        this (new File[] {new File(directory) }, filter, null);
    }

    /**
     * Recursively append to <code>files</code> every non-directory entry below
     * <code>directory</code> that <code>filter</code> accepts (or every such entry when
     * <code>filter</code> is null). Sub-directories are always descended into; the filter
     * is only consulted for non-directory entries.
     *
     * @return the number of files added
     * @throws IllegalArgumentException if the contents of a directory cannot be listed
     */
    private int fillFileArray (File directory, FileFilter filter, ArrayList<File> files)
    {
        File[] directoryContents = directory.listFiles();
        // File.listFiles() returns null (rather than throwing) when the path is not a
        // directory or an I/O error occurs, e.g. missing read permission. Fail with the
        // offending path instead of an anonymous NullPointerException.
        if (directoryContents == null)
            throw new IllegalArgumentException ("Unable to list contents of directory "
                                                + directory.getAbsolutePath());
        int count = 0;
        for (int i = 0; i < directoryContents.length; i++) {
            File entry = directoryContents[i];
            if (entry.isDirectory())
                count += fillFileArray (entry, filter, files);
            else if (filter == null || filter.accept(entry)) {
                files.add (entry);
                count++;
            }
        }
        return count;
    }

    // The PipeInputIterator interface
    /**
     * Build an Instance for the next file. The File object, the target name and the file's URI
     * are passed as the first three Instance constructor arguments; the fourth is null.
     * The target name is the starting directory's path (minus the common prefix) when the
     * pattern is {@link #STARTING_DIRECTORIES}, otherwise group 1 of the first match of
     * the target pattern against the file's absolute path, or null if there is no pattern
     * or no match.
     */
    @Override
    public Instance next ()
    {
        File nextFile = subIterator.next();
        String path = nextFile.getAbsolutePath();
        String targetName = null;

        // Identity comparison is intentional: STARTING_DIRECTORIES is a sentinel, not a regex to apply.
        if (targetPattern == STARTING_DIRECTORIES) {
            // Find the last starting directory whose first file index is <= the current file index.
            int i;
            for (i = 0; i < minFileIndex.length; i++)
                if (minFileIndex[i] > fileCount)
                    break;
            targetName = startingDirectories[--i].getPath().substring(commonPrefixIndex);
        } else if (targetPattern != null) {
            Matcher m = targetPattern.matcher(path);
            if (m.find ()){
                targetName = m.group (1);
            }
        }

        fileCount++;
        return new Instance (nextFile, targetName, nextFile.toURI(), null);
    }

    /**
     * Not supported.
     * NOTE(review): the java.util.Iterator contract specifies UnsupportedOperationException
     * here; IllegalStateException is kept so existing callers see the same exception type.
     */
    @Override
    public void remove () {
        throw new IllegalStateException ("This Iterator<Instance> does not support remove().");
    }

    // culotta - 9.11.03
    /**
     * Return the next file directly, without wrapping it in an Instance.
     * This consumes the same underlying iterator as {@link #next()}.
     */
    public File nextFile ()
    {
        File file = subIterator.next();
        // Keep the position counter in step with the underlying iterator; otherwise a later
        // next() would compute STARTING_DIRECTORIES labels from a stale file index.
        fileCount++;
        return file;
    }

    @Override
    public boolean hasNext () { return subIterator.hasNext(); }
}