Package cc.mallet.pipe.iterator

Source Code of cc.mallet.pipe.iterator.UnlabeledFileIterator

/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
   http://www.cs.umass.edu/~mccallum/mallet
   This software is provided under the terms of the Common Public License,
   version 1.0, as published by http://www.opensource.org.  For further
   information, see the file `LICENSE' included with this distribution. */




/**
   @author Andrew McCallum <a href="mailto:mccallum@cs.umass.edu">mccallum@cs.umass.edu</a>
*/

package cc.mallet.pipe.iterator;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.regex.*;
import java.io.*;

import cc.mallet.types.Instance;

/**
* An iterator that generates instances from an initial
* directory or set of directories. The iterator will recurse through sub-directories.
* Each filename becomes the data field of an instance, and the targets are set to null.
* To set the target values to the directory name, use FileIterator instead.
* <p>
@author Andrew McCallum <a href="mailto:mccallum@cs.umass.edu">mccallum@cs.umass.edu</a>
@author Gregory Druck <a href="mailto:gdruck@cs.umass.edu">gdruck@cs.umass.edu</a>
*/
public class UnlabeledFileIterator implements Iterator<Instance>
{
  FileFilter fileFilter;
  ArrayList<File> fileArray;
  Iterator<File> subIterator;
  File[] startingDirectories;
  int[] minFileIndex;
  int fileCount;

  /** Special value that means to use the directories[i].getPath() as the target name */
  // xxx Note that these are specific to UNIX directory delimiter characters!  Fix this.

  /** Use as label names the directories specified in the constructor,
   * optionally removing common prefix of all starting directories
   */
  public static final Pattern STARTING_DIRECTORIES = Pattern.compile ("_STARTING_DIRECTORIES_");
  /** Use as label names the first directory in the filename. */
  public static final Pattern FIRST_DIRECTORY = Pattern.compile ("/?([^/]*)/.+");
  /** Use as label name the last directory in the filename. */
  public static final Pattern LAST_DIRECTORY = Pattern.compile(".*/([^/]+)/[^/]+"); // was ("([^/]*)/[^/]+");
  /** Use as label names all the directory names in the filename. */
  public static final Pattern ALL_DIRECTORIES = Pattern.compile ("^(.*)/[^/]+");


  // added by Fuchun Peng 
  public ArrayList<File> getFileArray()
  {
    return fileArray;
  }

  /**
   * Construct a FileIterator that will supply filenames within initial directories
   * as instances
   * @param directories  Array of directories to collect files from
   * @param fileFilter   class implementing interface FileFilter that will decide which names to accept.
   *                     May be null.
   * @param targetPattern  regex Pattern applied to the filename whose first parenthesized group
   *                       on matching is taken to be the target value of the generated instance. The pattern is applied to
   *                       the directory with the matcher.find() method. If null, then all instances
   *                       will have target null.
   * @param removeCommonPrefix boolean that modifies the behavior of the STARTING_DIRECTORIES pattern,
   *                           removing the common prefix of all initially specified directories,
   *                          leaving the remainder of each filename as the target value.
   *
   */
  protected UnlabeledFileIterator(File[] directories, FileFilter fileFilter) {
    this.startingDirectories = directories;
    this.fileFilter = fileFilter;
    this.minFileIndex = new int[directories.length];
    this.fileArray = new ArrayList<File> ();

    for (int i = 0; i < directories.length; i++) {
      if (!directories[i].isDirectory())
        throw new IllegalArgumentException (directories[i].getAbsolutePath()
                                            + " is not a directory.");
      minFileIndex[i] = fileArray.size();
      fillFileArray (directories[i], fileFilter, fileArray);
    }
    this.subIterator = fileArray.iterator();
    this.fileCount = 0;

    String[] dirStrings = new String[directories.length];
    for (int i = 0; i < directories.length; i++)
      dirStrings[i] = directories[i].toString();
  }

  public static File[] stringArray2FileArray (String[] sa)
  {
    File[] ret = new File[sa.length];
    for (int i = 0; i < sa.length; i++)
      ret[i] = new File (sa[i]);
    return ret;
  }

  public UnlabeledFileIterator (String[] directories, FileFilter ff)
  {
    this (stringArray2FileArray(directories), ff);
  }

  public  UnlabeledFileIterator (File directory, FileFilter fileFilter)
  {
    this (new File[] {directory}, fileFilter);
  }
 
  public UnlabeledFileIterator (File directory)
  {
    this (new File[] {directory}, null);
  }
 
  public UnlabeledFileIterator (File[] directories)
  {
    this (directories, null);
  }

  public UnlabeledFileIterator (String directory)
  {
    this (new File[] {new File(directory)}, null);
  }

    public UnlabeledFileIterator (String directory, FileFilter filter) {
       this (new File[] {new File(directory) }, filter);
    }

  private int fillFileArray (File directory, FileFilter filter, ArrayList<File> files)
  {
    int count = 0;
    File[] directoryContents = directory.listFiles();
    for (int i = 0; i < directoryContents.length; i++) {
      if (directoryContents[i].isDirectory())
        count += fillFileArray (directoryContents[i], filter, files);
      else if (filter == null || filter.accept(directoryContents[i])) {
        files.add (directoryContents[i]);
        count++;
      }
    }
    return count;
  }

  // The PipeInputIterator interface
  public Instance next ()
  {
    File nextFile = subIterator.next();
    fileCount++;
    return new Instance (nextFile, null, nextFile.toURI(), null);
  }
 
  public void remove () {
    throw new IllegalStateException ("This Iterator<Instance> does not support remove().");
  }

  // culotta - 9.11.03
  public File nextFile ()
  {
    return subIterator.next();   
  }

  public boolean hasNext ()  {  return subIterator.hasNext()}
 
}
TOP

Related Classes of cc.mallet.pipe.iterator.UnlabeledFileIterator

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.