package edu.stanford.nlp.util;

import java.io.*;
import java.lang.ref.ReferenceQueue;
import java.lang.ref.SoftReference;
import java.nio.channels.FileChannel;
import java.nio.channels.FileLock;
import java.text.DecimalFormat;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

import static edu.stanford.nlp.util.logging.Redwood.Util.*;

/**
* <p>
* A Map backed by the filesystem.
* The primary use-case for this class is in reading a large cache which is convenient to store on disk.
* The class will load subsets of data on demand; if the JVM is in danger of running out of memory, these will
* be dropped from memory, and re-queried from disk if requested again.
* For best results, make sure to set a maximum number of files (by default, any number of files can be created);
* and, make sure this number is the same when reading and writing to the database.
* </p>
*
* <p>
* The keys should have a consistent hash code.
* That is, the value of the hash code of an object should be consistent between runs of the JVM.
* Note that this is <b>not</b> enforced in the specification of a hash code; in fact, in Java 7
* the hash code of a String may change between JVM invocations. The user is advised to be wary.
* </p>
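 *
 * <p>
 *   For example, a key class that does not override {@link Object#hashCode()} gets an
 *   identity-based hash code, which differs from run to run; such keys would land in a
 *   different block file each time the cache is opened, and so would effectively never
 *   be found again.
 * </p>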
*
* <p>
* Furthermore, note that many of the operations on this class are expensive, as they require traversing
*   a potentially large portion of disk, reading it into memory.
* Some operations, such as those requiring all the values to be enumerated, may cause a spike in memory
*   usage.
* </p>
*
* <p>
* This class is thread-safe, but not necessarily process-safe.
* If two processes write to the same block, there is no guarantee that both values will actually be written.
* This is very important -- <b>this class is a cache and not a database</b>.
* If you care about data integrity, you should use a real database.
* </p>
*
* <p>
*   The values in this map should not be modified once read -- the cache has no reliable way to pick up this change
*   and synchronize it with the disk.
*   To enforce this, the cache will cast collections to their unmodifiable counterparts -- to avoid class cast exceptions,
*   you should not parameterize the class with a particular type of collection
*   (e.g., use {@link java.util.Map} rather than {@link java.util.HashMap}).
* </p>
*
* <p>
 *   The serialization behavior can be safely changed by overriding:
 *   <ul>
 *     <li>{@link FileBackedCache#newInputStream(File)}</li>
 *     <li>{@link FileBackedCache#newOutputStream(File, boolean)}</li>
 *     <li>{@link FileBackedCache#writeNextObject(OutputStream, Pair)}</li>
 *     <li>{@link FileBackedCache#readNextObjectOrNull(InputStream)}</li>
 *   </ul>
* </p>
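 *
 * <p>
 *   A minimal usage sketch (hypothetical types and path; note the value is typed as
 *   {@link java.util.Map} rather than {@link java.util.HashMap}, per the note above):
 * </p>
 *
 * <pre>{@code
 *   FileBackedCache<String, Map<String, Double>> cache =
 *       new FileBackedCache<String, Map<String, Double>>("/tmp/feature-cache", 100);
 *   Map<String, Double> feats = new HashMap<String, Double>();
 *   feats.put("bias", 1.0);
 *   cache.put("doc-1", feats);                       // written through to disk
 *   Map<String, Double> cached = cache.get("doc-1"); // from memory, or re-read from disk
 * }</pre>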
*
* @param <KEY> The key to cache by
* @param <T> The object to cache
*
* @author Gabor Angeli (angeli at cs)
*/

public class FileBackedCache<KEY extends Serializable, T> implements Map<KEY, T>, Iterable<Map.Entry<KEY, T>> {
  //
  // Variables
  //
  /** The directory the cached elements are being written to */
  public final File cacheDir;

  /** The maximum number of files to create in that directory ('buckets' in the hash map) */
  public final int maxFiles;

  /** The implementation of the mapping */
  private final Map<KEY, SoftReference<T>> mapping = new ConcurrentHashMap<KEY, SoftReference<T>>();

  /** A reaper for soft references, to save memory on storing the keys */
  private final ReferenceQueue<T> reaper = new ReferenceQueue<T>();

  /**
   * A file canonicalizer, so that we can synchronize on blocks -- static, as it should work between instances.
   * In particular, an exception is thrown if the JVM attempts to take out two locks on a file.
   */
  private static final Interner<File> canonicalFile = new Interner<File>();
  /** A map indicating whether the JVM holds a file lock on the given file */
  private static final IdentityHashMap<File, FileSemaphore> fileLocks = Generics.newIdentityHashMap();

  //
  // Constructors
  //

  /**
   * Create a file backed cache in a particular directory; either inheriting the elements in the directory
   * or starting with an empty cache.
   * This constructor may throw an exception, and will create the directory in question if it does not exist.
   * @param directoryToCacheIn The directory to create the cache in
   */
  public FileBackedCache(File directoryToCacheIn) {
    this(directoryToCacheIn, -1);
  }

  /**
   * Create a file backed cache in a particular directory; either inheriting the elements in the directory
   * or starting with an empty cache.
   * This constructor may throw an exception, and will create the directory in question if it does not exist.
   * @param directoryToCacheIn The directory to create the cache in
   * @param maxFiles The maximum number of files to store on disk
   */
  public FileBackedCache(File directoryToCacheIn, int maxFiles) {
    // Ensure directory exists
    if (!directoryToCacheIn.exists()) {
      if (!directoryToCacheIn.mkdirs()) {
        throw new IllegalArgumentException("Could not create cache directory: " + directoryToCacheIn);
      }
    }
    // Ensure directory is directory
    if (!directoryToCacheIn.isDirectory()) {
      throw new IllegalArgumentException("Cache directory must be a directory: " + directoryToCacheIn);
    }
    // Ensure directory is readable
    if (!directoryToCacheIn.canRead()) {
      throw new IllegalArgumentException("Cannot read cache directory: " + directoryToCacheIn);
    }
    // Save cache directory
    this.cacheDir = directoryToCacheIn;
    this.maxFiles = maxFiles;
    // Start the cache cleaner: occasionally clean up the cache,
    // removing keys which have been garbage collected.
    Thread mappingCleaner = new Thread() {
      @SuppressWarnings({"unchecked", "StatementWithEmptyBody", "EmptyCatchBlock", "InfiniteLoopStatement"})
      @Override
      public void run() {
        while (true) {
          try {
            if (reaper.poll() != null) {
              // Clear reference queue
              while (reaper.poll() != null) {
              }
              // GC stale cache entries
              List<KEY> toRemove = Generics.newLinkedList();
              try {
                for (Entry<KEY, SoftReference<T>> entry : mapping.entrySet()) {
                  if (entry.getValue().get() == null) {
                    // Remove stale SoftReference
                    toRemove.add(entry.getKey());
                  }
                }
              } catch (ConcurrentModificationException e) {
                // Do nothing --
              }
              // Actually remove entries
              for (KEY key : toRemove) {
                mapping.remove(key);
              }
            }
            // Sleep a bit
            Thread.sleep(100);
          } catch (InterruptedException e) {
          }
        }
      }
    };
    mappingCleaner.setDaemon(true);
    mappingCleaner.start();
  }

  /**
   * Create a file backed cache in a particular directory; either inheriting the elements in the directory
   * with the initial mapping added, or starting with only the initial mapping.
   * This constructor may throw an exception, and will create the directory in question if it does not exist.
   * @param directoryToCacheIn The directory to create the cache in
   * @param initialMapping The initial elements to place into the cache.
   */
  public FileBackedCache(File directoryToCacheIn, Map<KEY, T> initialMapping) {
    this(directoryToCacheIn, -1);
    putAll(initialMapping);
  }

  /**
   * Create a file backed cache in a particular directory; either inheriting the elements in the directory
   * with the initial mapping added, or starting with only the initial mapping.
   * This constructor may throw an exception, and will create the directory in question if it does not exist.
   * @param directoryToCacheIn The directory to create the cache in
   * @param maxFiles The maximum number of files to store on disk
   * @param initialMapping The initial elements to place into the cache.
   */
  public FileBackedCache(File directoryToCacheIn, Map<KEY, T> initialMapping, int maxFiles) {
    this(directoryToCacheIn, maxFiles);
    putAll(initialMapping);
  }

  /**
   * Create a file backed cache in a particular directory; either inheriting the elements in the directory
   * or starting with an empty cache.
   * This constructor may throw an exception, and will create the directory in question if it does not exist.
   * @param directoryToCacheIn The directory to create the cache in
   */
  public FileBackedCache(String directoryToCacheIn) {
    this(new File(directoryToCacheIn), -1);
  }

  /**
   * Create a file backed cache in a particular directory; either inheriting the elements in the directory
   * or starting with an empty cache.
   * This constructor may throw an exception, and will create the directory in question if it does not exist.
   * @param directoryToCacheIn The directory to create the cache in
   * @param maxFiles The maximum number of files to store on disk
   */
  public FileBackedCache(String directoryToCacheIn, int maxFiles) {
    this(new File(directoryToCacheIn), maxFiles);
  }

  /**
   * Create a file backed cache in a particular directory; either inheriting the elements in the directory
   * with the initial mapping added, or starting with only the initial mapping.
   * This constructor may throw an exception, and will create the directory in question if it does not exist.
   * @param directoryToCacheIn The directory to create the cache in
   * @param initialMapping The initial elements to place into the cache.
   */
  public FileBackedCache(String directoryToCacheIn, Map<KEY, T> initialMapping) {
    this(new File(directoryToCacheIn), initialMapping);
  }

  /**
   * Create a file backed cache in a particular directory; either inheriting the elements in the directory
   * with the initial mapping added, or starting with only the initial mapping.
   * This constructor may throw an exception, and will create the directory in question if it does not exist.
   * @param directoryToCacheIn The directory to create the cache in
   * @param initialMapping The initial elements to place into the cache.
   * @param maxFiles The maximum number of files to store on disk
   */
  public FileBackedCache(String directoryToCacheIn, Map<KEY, T> initialMapping, int maxFiles) {
    this(new File(directoryToCacheIn), initialMapping, maxFiles);
  }

  //
  // Interface
  //

  /**
   * Gets the size of the cache, in terms of elements on disk.
   * Note that this is an expensive operation, as it reads the entire cache in from disk.
   * @return The size of the cache on disk.
   */
  @Override
  public int size() {
    return readCache();
  }

  /**
   * Gets the size of the cache, in terms of elements in memory.
   * In a multithreaded environment this is on a best-effort basis.
   * This method makes no disk accesses.
   * @return The size of the cache in memory.
   */
  public int sizeInMemory() {
    return mapping.size();
  }

  /**
   * Gets whether the cache is empty, including elements on disk.
   * Note that this is an expensive operation, as it reads the entire cache in from disk.
   * @return True if the cache is empty.
   */
  @Override
  public boolean isEmpty() {
    return size() == 0;
  }

  /**
   * Returns true if the specified key exists in the mapping (on a best-effort basis in a multithreaded
   * environment).
   * This method may require some disk access, up to a maximum of one file read (of unknown size a priori).
   * @param key The key to query.
   * @return True if this key is in the cache.
   */
  @Override
  public boolean containsKey(Object key) {
    // Early exits
    if (mapping.containsKey(key)) return true;
    if (!tryFile(key)) return false;
    // Read the block for this key
    Collection<Pair<KEY, T>> elementsRead = readBlock(key);
    for (Pair<KEY, T> pair : elementsRead) {
      if (pair.first.equals(key)) return true;
    }
    return false;
  }

  /**
   * Returns true if the specified value is contained.
   * It is nearly (if not always) a bad idea to call this method.
   * @param value The value being queried for
   * @return True if the specified value exists in the cache.
   */
  @SuppressWarnings({"unchecked", "SuspiciousMethodCalls"})
  @Override
  public boolean containsValue(Object value) {
    // Try to short circuit and save the user from their stupidity
    if (mapping.containsValue(new SoftReference(value))) { return true; }
    // Do an exhaustive check over the values
    return values().contains(value);
  }

  /**
   * Get a cached value based on a key.
   * If the key is in memory, this is a constant time operation.
   * Else, this requires a single disk access, of a priori indeterminate size, but roughly
   * correlated with the quality of the key's hash code.
   * @param key The key to look up.
   * @return The value cached under this key, or null if no value is found.
   */
  @SuppressWarnings({"SuspiciousMethodCalls", "unchecked"})
  @Override
  public T get(Object key) {
    SoftReference<T> likelyReferenceOrNull = mapping.get(key);
    T referenceOrNull = likelyReferenceOrNull == null ? null : likelyReferenceOrNull.get();
    if (likelyReferenceOrNull == null) {
      // Case: We don't know about this element being in the cache
      if (!tryFile(key)) { return null; }  // Case: there's no hope of finding this element
      // Read the block for this key
      Collection<Pair<KEY, T>> elemsRead = readBlock(key);
      for (Pair<KEY, T> pair : elemsRead) {
        if (pair.first.equals(key)) { return pair.second; }
      }
      return null;
    } else if (referenceOrNull == null) {
      // Case: This element once was in the cache, but its reference has since been collected
      mapping.remove(key);
      return get(key);  // try again, reading from disk this time
    } else {
      if (referenceOrNull instanceof Collection) {
        return (T) Collections.unmodifiableCollection((Collection) referenceOrNull);
      } else if (referenceOrNull instanceof Map) {
        return (T) Collections.unmodifiableMap((Map) referenceOrNull);
      } else {
        return referenceOrNull;
      }
    }
  }

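  /**
   * Adds a value to the cache, writing it through to disk: a new key is appended to its
   * block's file, while a changed value causes the block to be rewritten.
   * @param key The key to cache the value under.
   * @param value The value to cache.
   * @return The value previously associated with this key, or null if there was none.
   */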
  @Override
  public T put(KEY key, T value) {
    T existing = get(key);
    if (existing == value || (existing != null && existing.equals(value))) {
      // Make sure we flush objects which have changed
      if (existing != null && !existing.equals(value)) {
        updateBlockOrDelete(key, value);
      }
      // Return the same object back
      return existing;
    } else {
      // In-memory
      SoftReference<T> ref = new SoftReference<T>(value, this.reaper);
      mapping.put(key, ref);
      // On Disk
      if (existing == null) {
        appendBlock(key, value);
      } else {
        updateBlockOrDelete(key, value);
      }
      // Return
      return existing;
    }
  }

  @SuppressWarnings("unchecked")
  @Override
  public T remove(Object key) {
    if (!tryFile(key)) return null;
    try {
      return updateBlockOrDelete((KEY) key, null);
    } catch (ClassCastException e) {
      return null;
    }
  }

  @Override
  public void putAll(Map<? extends KEY, ? extends T> m) {
    for (Entry<? extends KEY, ? extends T> entry : m.entrySet()) {
      try {
        put( entry.getKey(), entry.getValue() );
      } catch (RuntimeException e) {
        err(e);
      }
    }
  }

  /**
   * Clear the IN-MEMORY portion of the cache. This does not delete any files.
   */
  @Override
  public void clear() {
    mapping.clear();
  }

  /**
   * Returns all the keys for this cache that are found ON DISK.
   * This is an expensive operation.
   * @return The set of keys for this cache as found on disk.
   */
  @Override
  public Set<KEY> keySet() {
    readCache();
    return mapping.keySet();
  }

  /**
   * Returns all the values for this cache that are found ON DISK.
   * This is an expensive operation, both in terms of disk access time,
   * and in terms of memory used.
   * Furthermore, the memory used in this function cannot be garbage collected -- you are loading the
   * entire cache into memory.
   * @return The set of values for this cache as found on disk.
   */
  @Override
  public Collection<T> values() {
    Set<Entry<KEY, T>> entries = entrySet();
    ArrayList<T> values = Generics.newArrayList(entries.size());
    for (Entry<KEY, T> entry : entries) {
      values.add(entry.getValue());
    }
    return values;
  }

  /**
   * Returns all the (key,value) pairs for this cache that are found ON DISK.
   * This is an expensive operation, both in terms of disk access time,
   * and in terms of memory used.
   * Furthermore, the memory used in this function cannot be garbage collected -- you are loading the
   * entire cache into memory.
   * @return The set of keys and associated values for this cache as found on disk.
   */
  @Override
  public Set<Entry<KEY, T>> entrySet() {
    readCache();
    Set<Entry<KEY, SoftReference<T>>> entries = mapping.entrySet();
    Set<Entry<KEY, T>> rtn = Generics.newHashSet();
    for (final Entry<KEY, SoftReference<T>> entry : entries) {
      T value = entry.getValue().get();
      if (value == null) value = get(entry.getKey());
      final T valueFinal = value;
      rtn.add(new Entry<KEY, T>(){
        private T valueImpl = valueFinal;
        @Override
        public KEY getKey() {
          return entry.getKey();
        }
        @Override
        public T getValue() {
          return valueImpl;
        }
        @Override
        public T setValue(T value) {
          T oldValue = valueImpl;
          valueImpl = value;
          return oldValue;
        }
      });
    }
    return rtn;
  }

  /**
   * Iterates over the entries of the cache.
   * In the end, this loads the entire cache, but it can do it incrementally.
   * @return An iterator over the entries in the cache.
   */
  @Override
  public Iterator<Entry<KEY,T>> iterator() {
    final File[] files = cacheDir.listFiles();
    if (files == null || files.length == 0) return Generics.<Entry<KEY,T>>newLinkedList().iterator();
    for (int i = 0; i < files.length; ++i) {
      try {
        files[i] = canonicalFile.intern(files[i].getCanonicalFile());
      } catch (IOException e) {
        throw throwSafe(e);
      }
    }

    return new Iterator<Entry<KEY,T>>() {
      Iterator<Pair<KEY, T>> elements = readBlock(files[0]).iterator();
      int index = 1;

      @Override
      public boolean hasNext() {
        // Still have elements in this block
        if (elements != null && elements.hasNext()) return true;
        // Still have files to traverse
        elements = null;
        while (index < files.length && elements == null) {
          try {
            elements = readBlock(files[index]).iterator();
          } catch (OutOfMemoryError e) {
            warn("FileBackedCache", "Caught out of memory error (clearing cache): " + e.getMessage());
            FileBackedCache.this.clear();
            //noinspection EmptyCatchBlock
            try { Thread.sleep(1000); } catch (InterruptedException e2) { }
            elements = readBlock(files[index]).iterator();
          } catch (RuntimeException e) {
            err(e);
          }
          index += 1;
        }
        // No more elements
        return elements != null && hasNext();
      }
      @Override
      public Entry<KEY, T> next() {
        if (!hasNext()) throw new NoSuchElementException();
        // Convert a pair to an entry
        final Pair<KEY, T> pair =  elements.next();
        return new Entry<KEY, T>() {
          @Override
          public KEY getKey() { return pair.first; }
          @Override
          public T getValue() { return pair.second; }
          @Override
          public T setValue(T value) { throw new RuntimeException("Cannot set entry"); }
        };
      }
      @Override
      public void remove() {
        throw new RuntimeException("Remove not implemented");
      }
    };
  }

  /**
   * Remove a given key from memory, without removing it from the disk.
   * @param key The key to remove from memory.
   * @return True if the key was in memory and has now been dropped.
   */
  public boolean removeFromMemory(KEY key) {
    return mapping.remove(key) != null;
  }

  /**
   * Get the list of files on which this JVM holds a lock.
   * @return A collection of files on which the JVM holds a file lock.
   */
  public static Collection<File> locksHeld() {
    ArrayList<File> files = Generics.newArrayList();
    for (Entry<File, FileSemaphore> entry : fileLocks.entrySet()) {
      if (entry.getValue().isActive()) {
        files.add(entry.getKey());
      }
    }
    return files;
  }

  //
  // Implementation
  // These are directly called by the interface methods
  //
  /** Reads the cache in its entirety -- this is potentially very slow */
  private int readCache() {
    File[] files = cacheDir.listFiles();
    if (files == null) { return 0; }
    for (int i = 0; i < files.length; ++i) {
      try {
        files[i] = canonicalFile.intern(files[i].getCanonicalFile());
      } catch (IOException e) {
        throw throwSafe(e);
      }
    }
    int count = 0;
    for (File f : files) {
      try {
        Collection<Pair<KEY, T>> block = readBlock(f);
        count += block.size();
      } catch (Exception e) {
        throw throwSafe(e);
      }
    }
    return count;
  }

  /** Checks for the existence of the block associated with the key */
  private boolean tryFile(Object key) {
    try {
      return hash2file(key.hashCode(), false).exists();
    } catch (IOException e) {
      throw throwSafe(e);
    }
  }

  /** Reads the block specified by the key in its entirety */
  private Collection<Pair<KEY, T>> readBlock(Object key) {
    try {
      return readBlock(hash2file(key.hashCode(), true));
    } catch (IOException e) {
      err("Could not read file: " + cacheDir.getPath() + File.separator + fileRoot(key.hashCode()));
      throw throwSafe(e);
    }
  }

  /** Appends a value to the block specified by the key */
  @SuppressWarnings("SynchronizationOnLocalVariableOrMethodParameter")
  private void appendBlock(KEY key, T value) {
    boolean haveTakenLock = false;
    Pair<? extends OutputStream, CloseAction> writer = null;
    try {
      // Get File
      File toWrite = hash2file(key.hashCode(), false);
      boolean exists = toWrite.exists();
      robustCreateFile(toWrite);
      synchronized (toWrite) {
        assert canonicalFile.intern(toWrite.getCanonicalFile()) == toWrite;
        // Write Object
        writer = newOutputStream(toWrite, exists);
        haveTakenLock = true;
        writeNextObject(writer.first, Pair.makePair(key, value));
        writer.second.apply();
        haveTakenLock = false;
      }
    } catch (IOException e) {
      try { if (haveTakenLock) { writer.second.apply(); } }
      catch (IOException e2) { throw throwSafe(e2); }
      throw throwSafe(e);
    }
  }

  /** Updates a block with the specified value; or deletes the block if the value is null */
  @SuppressWarnings({"unchecked", "SynchronizationOnLocalVariableOrMethodParameter"})
  private T updateBlockOrDelete(KEY key, T valueOrNull) {
    Pair<? extends InputStream, CloseAction> reader = null;
    Pair<? extends OutputStream, CloseAction> writer = null;
    boolean haveClosedReader = false;
    boolean haveClosedWriter = false;
    try {
      // Variables
      File blockFile = hash2file(key.hashCode(), true);
      synchronized (blockFile) {
        assert canonicalFile.intern(blockFile.getCanonicalFile()) == blockFile;
        reader = newInputStream(blockFile);
        writer = newOutputStream(blockFile, false); // Get write lock before reading
        List<Pair<KEY, T>> block = Generics.newLinkedList();
        T existingValue = null;
        // Read
        Pair<KEY, T> element;
        while ((element = readNextObjectOrNull(reader.first)) != null) {
          if (element.first.equals(key)) {
            if (valueOrNull != null) {
              // Update
              existingValue = element.second;
              element.second = valueOrNull;
              block.add(element);
            }
          } else {
            // Spurious read
            block.add(element);
          }
        }
        reader.second.apply();
        haveClosedReader = true;
        // Write
        for( Pair<KEY, T> elem : block ) {
          writeNextObject(writer.first, elem);
        }
        writer.second.apply();
        haveClosedWriter = true;
        // Return
        return existingValue;
      }
    } catch (IOException e) {
      err(e);
      throw throwSafe(e);
    } catch (ClassNotFoundException e) {
      err(e);
      throw throwSafe(e);
    } finally {
      try {
        if (reader != null && !haveClosedReader) { reader.second.apply(); }
        if (writer != null && !haveClosedWriter) { writer.second.apply(); }
      } catch (IOException e) {
        warn(e);
      }
    }
  }

  //
  // Implementation Helpers
  // These are factored bits of the implementation
  //

  /** Completely reads a block into local memory */
  @SuppressWarnings("SynchronizationOnLocalVariableOrMethodParameter")
  private Collection<Pair<KEY, T>> readBlock(File block) {
    boolean haveClosed = false;
    Pair<? extends InputStream, CloseAction> input = null;

    try {
      synchronized (block) {
        assert canonicalFile.intern(block.getCanonicalFile()) == block;
        List<Pair<KEY, T>> read = Generics.newLinkedList();
        // Get the reader
        input = newInputStream(block);
        // Get each object in the block
        Pair<KEY,T> element;
        while ((element = readNextObjectOrNull(input.first)) != null) {
          read.add(element);
        }
        input.second.apply();
        haveClosed = true;
        // Add elements
        for (Pair<KEY, T> elem : read) {
          SoftReference<T> ref = new SoftReference<T>(elem.second, this.reaper);
          mapping.put(elem.first, ref);
        }
        return read;
      }
    } catch (StreamCorruptedException e) {
      warn("Stream corrupted reading " + block);
      // Case: corrupted write
      if (!block.delete()) {
        throw new IllegalStateException("File corrupted, and cannot delete it: " + block.getPath());
      }
      return Generics.newLinkedList();
    } catch (EOFException e) {
      warn("Empty file (someone else is preparing to write to it?) " + block);
      return Generics.newLinkedList();
    } catch (IOException e) {
      // Case: General IO Error
      err("Could not read file: " + block + ": " + e.getMessage());
      return Generics.newLinkedList();
    } catch (ClassNotFoundException e) {
      // Case: Couldn't read class
      err("Could not read a class in file: " + block + ": " + e.getMessage());
      return Generics.newLinkedList();
    } catch (RuntimeException e) {
      // Case: Unknown error -- see if it's caused by StreamCorrupted
      if (e.getCause() != null && StreamCorruptedException.class.isAssignableFrom(e.getCause().getClass())) {
        // Yes -- caused by StreamCorrupted
        if (!block.delete()) {
          throw new IllegalStateException("File corrupted, and cannot delete it: " + block.getPath());
        }
        return Generics.newLinkedList();
      } else {
        // No -- random error (pass up)
        throw e;
      }
    } finally {
      if (input != null && !haveClosed) {
        try {
          input.second.apply();
        } catch (IOException e) { warn(e); }
      }
    }
  }

  /** Returns the canonical, interned file corresponding to a hash code, creating it first if requested */
  private File hash2file(int hashCode, boolean create) throws IOException {
    File candidate = canonicalFile.intern(new File(cacheDir.getCanonicalPath() + File.separator + fileRoot(hashCode) + ".block.ser.gz").getCanonicalFile());
    if (create) { robustCreateFile(candidate); }
    return candidate;
  }

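  /**
   * Maps a hash code to a file bucket: with no file limit the raw hash code is used;
   * otherwise hash codes are folded into [0, maxFiles) buckets. For example, with
   * maxFiles = 100 a hash code of -1234567 lands in bucket Math.abs(-1234567) % 100 = 67,
   * i.e. the file "67.block.ser.gz". (Edge case: Math.abs(Integer.MIN_VALUE) is itself
   * negative, so that one hash code yields a negative bucket name.)
   */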
  private int fileRoot(int hashCode) {
    if (this.maxFiles < 0) { return hashCode; }
    else { return Math.abs(hashCode) % this.maxFiles; }
  }

  //
  // Java Hacks
  //
  /** Turns out, an ObjectOutputStream cannot append to a file: its constructor writes a fresh
   *  stream header, which corrupts the stream if one is already present. This is dumb. */
  public class AppendingObjectOutputStream extends ObjectOutputStream {
    public AppendingObjectOutputStream(OutputStream out) throws IOException {
      super(out);
    }
    @Override
    protected void writeStreamHeader() throws IOException {
      // do not write a header, but reset
      reset();
    }
  }

  private static RuntimeException throwSafe(Throwable e) {
    if (e instanceof RuntimeException) return (RuntimeException) e;
    else if (e.getCause() == null) return new RuntimeException(e);
    else return throwSafe(e.getCause());
  }

  private static void robustCreateFile(File candidate) throws IOException {
    int tries = 0;
    while (!candidate.exists()) {
      if (tries > 30) { throw new IOException("Could not create file: " + candidate); }
      if (candidate.createNewFile()) { break; }
      tries += 1;
      try { Thread.sleep(1000); } catch (InterruptedException e) { log(e); }
    }
  }

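  /** The action used to close a stream opened by newInputStream() / newOutputStream():
   *  typically flushing it, releasing the associated file lock, and closing the stream. */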
  public interface CloseAction {
    void apply() throws IOException;
  }

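  /**
   * A reference-counted wrapper around a file lock: take() adds a license, release() removes
   * one, and when the last license is released the underlying lock is released and its
   * channel is closed.
   */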
  public static class FileSemaphore {
    private int licenses = 1;
    private final FileLock lock;
    private final FileChannel channel;

    public FileSemaphore(FileLock lock, FileChannel channel) { this.lock = lock; this.channel = channel; }

    public synchronized boolean isActive() {
      if (licenses == 0) { assert lock == null || !lock.isValid(); }
      if (licenses != 0 && lock != null) { assert lock.isValid(); }
      return licenses != 0;
    }

    public synchronized void take() {
      if (!isActive()) { throw new IllegalStateException("Taking a file license when the licenses have all been released"); }
      licenses += 1;
    }

    public synchronized void release() throws IOException {
      if (licenses <= 0) { throw new IllegalStateException("Already released all semaphore licenses"); }
      licenses -= 1;
      if (licenses <= 0) {
        if (lock != null) { lock.release(); }
        channel.close();
      }
    }
  }

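  /**
   * Acquires a file lock, wrapped in a {@link FileSemaphore}, on the given file; if this JVM
   * already holds an active lock on the file, the existing semaphore is re-used with an
   * additional license. Polls {@link FileChannel#tryLock()} roughly once a second, warning
   * each minute the lock stays busy, and gives up (proceeding without an OS-level lock)
   * after 1000 failed attempts.
   * @param f The file to lock; it must be the canonical, interned instance, as it is also used as a monitor.
   * @return A semaphore wrapping the file lock; release() it when done.
   * @throws IOException If the file could not be opened or locked.
   */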
  @SuppressWarnings("SynchronizationOnLocalVariableOrMethodParameter")
  protected FileSemaphore acquireFileLock(File f) throws IOException {
    assert canonicalFile.intern(f.getCanonicalFile()) == f;
    synchronized (f) {
      // Check semaphore
      synchronized (fileLocks) {
        if (fileLocks.containsKey(f)) {
          FileSemaphore sem = fileLocks.get(f);
          if (sem.isActive()) {
            sem.take();
            return sem;
          } else {
            fileLocks.remove(f);
          }
        }
      }
      // Get the channel
      FileChannel channel = new RandomAccessFile(f, "rw").getChannel();
      FileLock lockOrNull = null;
      // Try the lock
      for (int i = 0; i < 1000; ++i) {
        lockOrNull = channel.tryLock();
        if (lockOrNull == null || !lockOrNull.isValid()) {
          try { Thread.sleep(1000); } catch (InterruptedException e) { log(e); }
          if (i % 60 == 59) { warn("FileBackedCache", "Lock still busy after " + ((i+1)/60) + " minutes"); }
          //noinspection UnnecessaryContinue
          continue;
        } else {
          break;
        }
      }
      if (lockOrNull == null) { warn("FileBackedCache", "Could not acquire file lock! Continuing without lock"); }
      // Return
      FileSemaphore sem = new FileSemaphore(lockOrNull, channel);
      synchronized (fileLocks) {
        fileLocks.put(f, sem);
      }
      return sem;
    }
  }

  //
  //  POSSIBLE OVERRIDES
  //
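
  // A hedged sketch (hypothetical subclass, not part of this source) of overriding the methods
  // below as a matched set -- here dropping the GZIP layer while keeping Java serialization, so
  // readNextObjectOrNull() and writeNextObject() can stay inherited:
  //
  //   class UncompressedCache<K extends Serializable, V> extends FileBackedCache<K, V> {
  //     UncompressedCache(File dir, int maxFiles) { super(dir, maxFiles); }
  //     @Override
  //     protected Pair<? extends InputStream, CloseAction> newInputStream(File f) throws IOException {
  //       final FileSemaphore lock = acquireFileLock(f);
  //       final ObjectInputStream in = new ObjectInputStream(new BufferedInputStream(new FileInputStream(f)));
  //       return new Pair<ObjectInputStream, CloseAction>(in, () -> { lock.release(); in.close(); });
  //     }
  //     @Override
  //     protected Pair<? extends OutputStream, CloseAction> newOutputStream(File f, boolean isAppend) throws IOException {
  //       final FileSemaphore lock = acquireFileLock(f);
  //       final ObjectOutputStream out = isAppend
  //           ? new AppendingObjectOutputStream(new BufferedOutputStream(new FileOutputStream(f, true)))
  //           : new ObjectOutputStream(new BufferedOutputStream(new FileOutputStream(f, false)));
  //       return new Pair<ObjectOutputStream, CloseAction>(out, () -> { out.flush(); lock.release(); out.close(); });
  //     }
  //   }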

  /**
   * Create a new input stream, along with the code to close it and clean up.
   * This code may be overridden, but should match readNextObjectOrNull().
   * IMPORTANT NOTE: acquiring a lock (well, semaphore) with FileBackedCache#acquireFileLock(File)
   * is generally a good idea. Make sure to release() it in the close action as well.
   * @param f The file to read from
   * @return A pair, corresponding to the stream and the code to close it.
   * @throws IOException
   */
  protected Pair<? extends InputStream, CloseAction> newInputStream(File f) throws IOException {
    final FileSemaphore lock = acquireFileLock(f);
    final ObjectInputStream rtn = new ObjectInputStream(new GZIPInputStream(new BufferedInputStream(new FileInputStream(f))));
    return new Pair<ObjectInputStream, CloseAction>(rtn,
        () -> { lock.release(); rtn.close(); });
  }

  /**
   * Create a new output stream, along with the code to close it and clean up.
   * This code may be overridden, but should match writeNextObject().
   * IMPORTANT NOTE: acquiring a lock (well, semaphore) with FileBackedCache#acquireFileLock(File)
   * is generally a good idea. Make sure to release() it in the close action as well.
   * @param f The file to write to
   * @param isAppend Signals whether the file we are writing to exists, and we are appending to it.
   * @return A pair, corresponding to the stream and the code to close it.
   * @throws IOException
   */
  protected Pair<? extends OutputStream, CloseAction> newOutputStream(File f, boolean isAppend) throws IOException {
    final FileOutputStream stream = new FileOutputStream(f, isAppend);
    final FileSemaphore lock = acquireFileLock(f);
    final ObjectOutputStream rtn = isAppend
        ? new AppendingObjectOutputStream(new GZIPOutputStream(new BufferedOutputStream(stream)))
        : new ObjectOutputStream(new GZIPOutputStream(new BufferedOutputStream(stream)));
    return new Pair<ObjectOutputStream, CloseAction>(rtn,
        () -> { rtn.flush(); lock.release(); rtn.close(); });
  }

  /**
   * Return the next object in the given stream, or null if there is no such object.
   * This method may be overridden, but should match the implementation of newInputStream().
   * @param input The input stream to read the object from
   * @return A (key, value) pair corresponding to the read object
   * @throws IOException
   * @throws ClassNotFoundException
   */
  @SuppressWarnings("unchecked")
  protected Pair<KEY, T> readNextObjectOrNull(InputStream input) throws IOException, ClassNotFoundException {
    try {
      return (Pair<KEY, T>) ((ObjectInputStream) input).readObject();
    } catch (EOFException e) {
      return null; // I hate java
    }
  }

  /**
   * Write an object to a stream.
   * This method may be overridden, but should match the implementation of newOutputStream().
   * @param output The output stream to write the object to.
   * @param value The value to write to the stream, as a (key, value) pair.
   * @throws IOException
   */
  protected void writeNextObject(OutputStream output, Pair<KEY, T> value) throws IOException {
    ((ObjectOutputStream) output).writeObject(value);
  }

  /**
   * <p>Merge a number of caches together. This could be useful for creating large caches,
   * as (1) it can bypass NFS for local caching, and (2) it can allow for many small caches
   * that are then merged together, which is more efficient as the number of entries in a bucket
   * increases (e.g., if the cache becomes very large).</p>
   *
   * <p>If there are collisions, they are broken by accepting the entry in the destination (if applicable),
   *    and then by accepting the entry in the last constituent.</p>
   *
   * <p><b>IMPORTANT NOTE:</b> This method requires quite a bit of memory, and there is a brief time
   * when it deletes all the files in the destination, storing the data entirely in memory. If the program
   * crashes in this state, THE DATA IN |destination| MAY BE LOST.</p>
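   *
   * <p>A minimal usage sketch (hypothetical paths; key and value types must be {@link Serializable}):</p>
   *
   * <pre>{@code
   *   FileBackedCache<String, String> shard1 = new FileBackedCache<String, String>("/tmp/shard1", 100);
   *   FileBackedCache<String, String> shard2 = new FileBackedCache<String, String>("/tmp/shard2", 100);
   *   FileBackedCache<String, String> combined = new FileBackedCache<String, String>("/tmp/combined", 100);
   *   FileBackedCache.merge(combined, Arrays.asList(shard1, shard2));
   * }</pre>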
   *
   * @param destination The cache to append to. This might not be empty, in which case all entries
   *                   in the destination are preserved.
   * @param constituents The constituent caches. All entries in each of these caches are added to
   *                     the destination.
   */
  public static <KEY extends Serializable, T extends Serializable> void merge(
      FileBackedCache<KEY, T> destination, FileBackedCache<? extends KEY, ? extends T>[] constituents) {
    startTrack("Merging Caches");

    // (1) Read everything into memory
    forceTrack("Reading Constituents");
    Map<String, Map<KEY, T>> combinedMapping = Generics.newHashMap();
    try {
      // Accumulate constituents
      for (int i = 0; i < constituents.length; ++i) {
        FileBackedCache<? extends KEY, ? extends T> constituent = constituents[i];
        for (Entry<? extends KEY, ? extends T> entry : constituent) {
          String fileToWriteTo = destination.hash2file(entry.getKey().hashCode(), false).getName();
          if (!combinedMapping.containsKey(fileToWriteTo)) { combinedMapping.put(fileToWriteTo, Generics.<KEY,T>newHashMap()); }
          combinedMapping.get(fileToWriteTo).put(entry.getKey(), entry.getValue());
        }
        log("[" + new DecimalFormat("0000").format(i) + "/" + constituents.length + "] read " + constituent.cacheDir + " [" + (Runtime.getRuntime().freeMemory() / 1000000) + "MB free memory]");
        constituent.clear();
      }
      // Accumulate destination
      for (Entry<? extends KEY, ? extends T> entry : destination) {
        String fileToWriteTo = destination.hash2file(entry.getKey().hashCode(), false).getName();
        if (!combinedMapping.containsKey(fileToWriteTo)) { combinedMapping.put(fileToWriteTo, Generics.<KEY,T>newHashMap()); }
        combinedMapping.get(fileToWriteTo).put(entry.getKey(), entry.getValue());
      }
    } catch (IOException e) {
      err("Found exception in merge() -- all data is intact (but passing exception up)");
      throw new RuntimeException(e);
    }
    endTrack("Reading Constituents");

    // (2) Clear out Destination
    forceTrack("Clearing Destination");
    if (!destination.cacheDir.exists() && !destination.cacheDir.mkdirs()) {
      throw new RuntimeException("Could not create cache dir for destination (data is intact): " + destination.cacheDir);
    }
    File[] filesInDestination = destination.cacheDir.listFiles();
    if (filesInDestination == null) {
      throw new RuntimeException("Cannot list files in destination's cache dir (data is intact): " + destination.cacheDir);
    }
    for (File block : filesInDestination) {
      if (!block.delete()) {
        warn("FileBackedCache", "could not delete block: " + block);
      }
    }
    endTrack("Clearing Destination");

    // (3) Write new files
    forceTrack("Writing New Files");
    try {
      for (Entry<String, Map<KEY, T>> blockEntry : combinedMapping.entrySet()) {
        // Get File
        File toWrite = canonicalFile.intern(new File(destination.cacheDir + File.separator + blockEntry.getKey()).getCanonicalFile());
        boolean exists = toWrite.exists(); // should really be false;
        // Write Objects
        Pair<? extends OutputStream, CloseAction> writer = destination.newOutputStream(toWrite, exists);
        for (Entry<KEY, T> entry : blockEntry.getValue().entrySet()) {
          destination.writeNextObject(writer.first, Pair.makePair(entry.getKey(), entry.getValue()));
        }
        writer.second.apply();
      }
    } catch (IOException e) {
      err("Could not write constituent files to combined cache (DATA IS LOST)!");
      throw new RuntimeException(e);
    }
    endTrack("Writing New Files");
    endTrack("Merging Caches");
  }

  @SuppressWarnings("unchecked")
  public static <KEY extends Serializable, T extends Serializable> void merge(
      FileBackedCache<KEY, T> destination, Collection<FileBackedCache<KEY, T>> constituents) {
    merge(destination, constituents.toArray((FileBackedCache<KEY,T>[])new FileBackedCache[constituents.size()]));
  }

}