Package org.apache.mahout.cf.taste.impl.model.file

Source Code of org.apache.mahout.cf.taste.impl.model.file.FileDataModel

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.mahout.cf.taste.impl.model.file;

import java.io.File;
import java.io.FileFilter;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.concurrent.locks.ReentrantLock;

import com.google.common.base.Splitter;
import com.google.common.collect.Lists;
import com.google.common.io.Closeables;
import org.apache.mahout.cf.taste.common.Refreshable;
import org.apache.mahout.cf.taste.common.TasteException;
import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
import org.apache.mahout.cf.taste.impl.common.FastIDSet;
import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
import org.apache.mahout.cf.taste.impl.model.AbstractDataModel;
import org.apache.mahout.cf.taste.impl.model.GenericBooleanPrefDataModel;
import org.apache.mahout.cf.taste.impl.model.GenericDataModel;
import org.apache.mahout.cf.taste.impl.model.GenericPreference;
import org.apache.mahout.cf.taste.impl.model.GenericUserPreferenceArray;
import org.apache.mahout.cf.taste.model.DataModel;
import org.apache.mahout.cf.taste.model.Preference;
import org.apache.mahout.cf.taste.model.PreferenceArray;
import org.apache.mahout.common.iterator.FileLineIterator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Preconditions;

/**
* <p>
* A {@link DataModel} backed by a delimited file. This class expects a file where each line
* contains a user ID, followed by item ID, followed by optional preference value, followed by
* optional timestamp. Commas or tabs delimit fields:
* </p>
*
* <p>{@code userID,itemID[,preference[,timestamp]]}</p>
*
* <p>
* Preference value is optional to accommodate applications that have no notion of a
* preference value (that is, the user simply expresses a
* preference for an item, but no degree of preference).
* </p>
*
* <p>
* The preference value is assumed to be parseable as a {@code double}. The user IDs and item IDs are
* parsed as {@code long}s. The timestamp, if present, is assumed to be parseable as a
* {@code long}, though this can be overridden via {@link #readTimestampFromString(String)}.
* When a timestamp is present, the preference value field may be left empty to indicate
* "no preference value", but the field itself cannot be omitted. That is, this is legal:
* </p>
*
* <p>{@code 123,456,,129050099059}</p>
*
* <p>But this isn't:</p>
*
* <p>{@code 123,456,129050099059}</p>
*
* <p>
* It is also acceptable for the lines to contain additional fields. Fields beyond the fourth will be ignored.
* An empty line, or one that begins with '#' will be ignored as a comment.
* </p>
*
* <p>
* This class will reload data from the data file when {@link #refresh(Collection)} is called, unless the file
* has been reloaded very recently already.
* </p>
*
* <p>
* This class will also look for update "delta" files in the same directory, with file names that start the
* same way (up to the first period). These files have the same format, and provide updated data that
* supersedes what is in the main data file. This is a mechanism that allows an application to push updates to
* {@link FileDataModel} without re-copying the entire data file.
* </p>
*
* <p>
* One small format difference exists. Update files must also be able to express deletes.
* This is done by ending with a blank preference value, as in "123,456,".
* </p>
*
* <p>
* Note that it's all-or-nothing -- all of the items in the file must express no preference, or they all must.
* These cannot be mixed. Put another way there will always be the same number of delimiters on every line of
* the file!
* </p>
*
* <p>
* This class is not intended for use with very large amounts of data (over, say, tens of millions of rows).
* For that, a JDBC-backed {@link DataModel} and a database are more appropriate.
* </p>
*
* <p>
* It is possible and likely useful to subclass this class and customize its behavior to accommodate
* application-specific needs and input formats. See {@link #processLine(String, FastByIDMap, FastByIDMap, boolean)} and
* {@link #processLineWithoutID(String, FastByIDMap, FastByIDMap)}.
* </p>
*/
public class FileDataModel extends AbstractDataModel {

  private static final Logger log = LoggerFactory.getLogger(FileDataModel.class);

  /** Default minimum reload interval, in milliseconds, used by {@link #FileDataModel(File)}. */
  public static final long DEFAULT_MIN_RELOAD_INTERVAL_MS = 60 * 1000L; // 1 minute
  /** A line whose first character is this is skipped as a comment. */
  private static final char COMMENT_CHAR = '#';
  /** Candidate field delimiters, tried in this order by {@link #determineDelimiter(String)}. */
  private static final char[] DELIMIETERS = {',', '\t'};

  /** Absolute form of the data file passed to the constructor. */
  private final File dataFile;
  /** Last-modified time of {@link #dataFile} as observed by the most recent build of the model. */
  private long lastModified;
  /** Most recent last-modified time among update files as observed by the most recent build. */
  private long lastUpdateFileModified;
  /** Field delimiter detected from the first data line. */
  private final char delimiter;
  /** Splitter on {@link #delimiter}, used to tokenize every line. */
  private final Splitter delimiterPattern;
  /** True when the first data line has a non-empty third (preference value) field. */
  private final boolean hasPrefValues;
  /** In-memory model built from the files; all query methods forward to it. */
  private DataModel delegate;
  /** Guards {@link #reload()}; a reload is skipped if the lock cannot be taken immediately. */
  private final ReentrantLock reloadLock;
  /** When true, the user ID and item ID of every line are swapped after parsing. */
  private final boolean transpose;
  /** Minimum interval, in milliseconds, used when deciding whether file changes warrant a reload. */
  private final long minReloadIntervalMS;

  /**
   * Creates a model over the given file without transposing IDs, using
   * {@link #DEFAULT_MIN_RELOAD_INTERVAL_MS} as the minimum reload interval.
   *
   * @param dataFile
   *          file containing preferences data. If the file is compressed (and its name ends in .gz or .zip
   *          accordingly) it will be decompressed as it is read
   * @throws FileNotFoundException
   *           if dataFile does not exist
   * @throws IOException
   *           if file can't be read
   */
  public FileDataModel(File dataFile) throws IOException {
    this(dataFile, false, DEFAULT_MIN_RELOAD_INTERVAL_MS);
  }

  /**
   * @param transpose
   *          transposes user IDs and item IDs -- convenient for 'flipping' the data model this way
   * @param minReloadIntervalMS
   *  the minimum interval in milliseconds after which a full reload of the original datafile is done
   *  when refresh() is called
   * @see #FileDataModel(File)
   */
  public FileDataModel(File dataFile, boolean transpose, long minReloadIntervalMS) throws IOException {
    this.dataFile = Preconditions.checkNotNull(dataFile.getAbsoluteFile());
    if (!dataFile.exists() || dataFile.isDirectory()) {
      throw new FileNotFoundException(dataFile.toString());
    }
    Preconditions.checkArgument(dataFile.length() > 0L, "dataFile is empty");
    Preconditions.checkArgument(minReloadIntervalMS >= 0L, "minReloadIntervalMs must be non-negative");

    log.info("Creating FileDataModel for file {}", dataFile);

    this.lastModified = dataFile.lastModified();
    this.lastUpdateFileModified = readLastUpdateFileModified();

    FileLineIterator iterator = new FileLineIterator(dataFile, false);
    String firstLine = iterator.peek();
    while (firstLine.isEmpty() || firstLine.charAt(0) == COMMENT_CHAR) {
      iterator.next();
      firstLine = iterator.peek();
    }
    Closeables.close(iterator, true);

    delimiter = determineDelimiter(firstLine);
    delimiterPattern = Splitter.on(delimiter);
    List<String> firstLineSplit = Lists.newArrayList();
    for (String token : delimiterPattern.split(firstLine)) {
      firstLineSplit.add(token);
    }
    // If preference value exists and isn't empty then the file is specifying pref values
    hasPrefValues = firstLineSplit.size() >= 3 && !firstLineSplit.get(2).isEmpty();

    this.reloadLock = new ReentrantLock();
    this.transpose = transpose;
    this.minReloadIntervalMS = minReloadIntervalMS;

    reload();
  }

  /** @return the data file this model reads from, as an absolute {@link File} */
  public File getDataFile() {
    return dataFile;
  }

  /** @return the field delimiter that was detected from the first data line of the file */
  public char getDelimiter() {
    return delimiter;
  }

  protected void reload() {
    if (reloadLock.tryLock()) {
      try {
        delegate = buildModel();
      } catch (IOException ioe) {
        log.warn("Exception while reloading", ioe);
      } finally {
        reloadLock.unlock();
      }
    }
  }

  protected DataModel buildModel() throws IOException {

    long newLastModified = dataFile.lastModified();
    long newLastUpdateFileModified = readLastUpdateFileModified();

    boolean loadFreshData = delegate == null || newLastModified > lastModified + minReloadIntervalMS;

    long oldLastUpdateFileModifieid = lastUpdateFileModified;
    lastModified = newLastModified;
    lastUpdateFileModified = newLastUpdateFileModified;

    FastByIDMap<FastByIDMap<Long>> timestamps = new FastByIDMap<FastByIDMap<Long>>();

    if (hasPrefValues) {

      if (loadFreshData) {

        FastByIDMap<Collection<Preference>> data = new FastByIDMap<Collection<Preference>>();
        FileLineIterator iterator = new FileLineIterator(dataFile, false);
        processFile(iterator, data, timestamps, false);

        for (File updateFile : findUpdateFilesAfter(newLastModified)) {
          processFile(new FileLineIterator(updateFile, false), data, timestamps, false);
        }

        return new GenericDataModel(GenericDataModel.toDataMap(data, true), timestamps);

      } else {

        FastByIDMap<PreferenceArray> rawData = ((GenericDataModel) delegate).getRawUserData();

        for (File updateFile : findUpdateFilesAfter(Math.max(oldLastUpdateFileModifieid, newLastModified))) {
          processFile(new FileLineIterator(updateFile, false), rawData, timestamps, true);
        }

        return new GenericDataModel(rawData, timestamps);

      }

    } else {

      if (loadFreshData) {

        FastByIDMap<FastIDSet> data = new FastByIDMap<FastIDSet>();
        FileLineIterator iterator = new FileLineIterator(dataFile, false);
        processFileWithoutID(iterator, data, timestamps);

        for (File updateFile : findUpdateFilesAfter(newLastModified)) {
          processFileWithoutID(new FileLineIterator(updateFile, false), data, timestamps);
        }

        return new GenericBooleanPrefDataModel(data, timestamps);

      } else {

        FastByIDMap<FastIDSet> rawData = ((GenericBooleanPrefDataModel) delegate).getRawUserData();

        for (File updateFile : findUpdateFilesAfter(Math.max(oldLastUpdateFileModifieid, newLastModified))) {
          processFileWithoutID(new FileLineIterator(updateFile, false), rawData, timestamps);
        }

        return new GenericBooleanPrefDataModel(rawData, timestamps);

      }

    }
  }

  /**
   * Finds update delta files in the same directory as the data file. This finds any file whose name starts
   * the same way as the data file (up to first period) but isn't the data file itself. For example, if the
   * data file is /foo/data.txt.gz, you might place update files at /foo/data.1.txt.gz, /foo/data.2.txt.gz,
   * etc.
   */
  private Iterable<File> findUpdateFilesAfter(long minimumLastModified) {
    String dataFileName = dataFile.getName();
    int period = dataFileName.indexOf('.');
    String startName = period < 0 ? dataFileName : dataFileName.substring(0, period);
    File parentDir = dataFile.getParentFile();
    Map<Long, File> modTimeToUpdateFile = new TreeMap<Long,File>();
    FileFilter onlyFiles = new FileFilter() {
      @Override
      public boolean accept(File file) {
        return !file.isDirectory();
      }
    };
    for (File updateFile : parentDir.listFiles(onlyFiles)) {
      String updateFileName = updateFile.getName();
      if (updateFileName.startsWith(startName)
          && !updateFileName.equals(dataFileName)
          && updateFile.lastModified() >= minimumLastModified) {
        modTimeToUpdateFile.put(updateFile.lastModified(), updateFile);
      }
    }
    return modTimeToUpdateFile.values();
  }

  private long readLastUpdateFileModified() {
    long mostRecentModification = Long.MIN_VALUE;
    for (File updateFile : findUpdateFilesAfter(0L)) {
      mostRecentModification = Math.max(mostRecentModification, updateFile.lastModified());
    }
    return mostRecentModification;
  }

  public static char determineDelimiter(String line) {
    for (char possibleDelimieter : DELIMIETERS) {
      if (line.indexOf(possibleDelimieter) >= 0) {
        return possibleDelimieter;
      }
    }
    throw new IllegalArgumentException("Did not find a delimiter in first line");
  }

  protected void processFile(FileLineIterator dataOrUpdateFileIterator,
                             FastByIDMap<?> data,
                             FastByIDMap<FastByIDMap<Long>> timestamps,
                             boolean fromPriorData) {
    log.info("Reading file info...");
    int count = 0;
    while (dataOrUpdateFileIterator.hasNext()) {
      String line = dataOrUpdateFileIterator.next();
      if (!line.isEmpty()) {
        processLine(line, data, timestamps, fromPriorData);
        if (++count % 1000000 == 0) {
          log.info("Processed {} lines", count);
        }
      }
    }
    log.info("Read lines: {}", count);
  }

  /**
   * <p>
   * Reads one line from the input file and adds the data to a {@link FastByIDMap} data structure which maps user IDs
   * to preferences. This assumes that each line of the input file corresponds to one preference. After
   * reading a line and determining which user and item the preference pertains to, the method should look to
   * see if the data contains a mapping for the user ID already, and if not, add an empty data structure of preferences
   * as appropriate to the data.
   * </p>
   *
   * <p>
   * A line with an empty preference value and no timestamp field ("userID,itemID,") removes the
   * preference for that user and item, along with its timestamp, instead of adding one.
   * </p>
   *
   * <p>
   * Note that if the line is empty or begins with '#' it will be ignored as a comment.
   * </p>
   *
   * @param line
   *          line from input data file
   * @param data
   *          all data read so far, as a mapping from user IDs to preferences
   * @param timestamps
   *          all timestamps read so far, as a mapping from user ID to a mapping from item ID to timestamp
   * @param fromPriorData an implementation detail -- if true, data will map IDs to
   *  {@link PreferenceArray} since the framework is attempting to read and update raw
   *  data that is already in memory. Otherwise it maps to {@link Collection}s of
   *  {@link Preference}s, since it's reading fresh data. Subclasses must be prepared
   *  to handle this wrinkle.
   */
  protected void processLine(String line,
                             FastByIDMap<?> data,
                             FastByIDMap<FastByIDMap<Long>> timestamps,
                             boolean fromPriorData) {

    // Ignore empty lines and comments
    if (line.isEmpty() || line.charAt(0) == COMMENT_CHAR) {
      return;
    }

    // The first three fields are required here; the fourth (timestamp) is optional
    Iterator<String> tokens = delimiterPattern.split(line).iterator();
    String userIDString = tokens.next();
    String itemIDString = tokens.next();
    String preferenceValueString = tokens.next();
    boolean hasTimestamp = tokens.hasNext();
    String timestampString = hasTimestamp ? tokens.next() : null;

    long userID = readUserIDFromString(userIDString);
    long itemID = readItemIDFromString(itemIDString);

    if (transpose) {
      long tmp = userID;
      userID = itemID;
      itemID = tmp;
    }

    // This is kind of gross but need to handle two types of storage
    Object maybePrefs = data.get(userID);
    if (fromPriorData) {
      // Data are PreferenceArray

      PreferenceArray prefs = (PreferenceArray) maybePrefs;
      if (!hasTimestamp && preferenceValueString.isEmpty()) {
        // Then line is of form "userID,itemID,", meaning remove
        if (prefs != null) {
          boolean exists = false;
          int length = prefs.length();
          for (int i = 0; i < length; i++) {
            if (prefs.getItemID(i) == itemID) {
              exists = true;
              break;
            }
          }
          if (exists) {
            if (length == 1) {
              // That was the user's only preference, so drop the user entirely
              data.remove(userID);
            } else {
              // Copy every preference except the removed one into an array one element shorter;
              // j trails i by one once the removed item has been passed
              PreferenceArray newPrefs = new GenericUserPreferenceArray(length - 1);
              for (int i = 0, j = 0; i < length; i++, j++) {
                if (prefs.getItemID(i) == itemID) {
                  j--;
                } else {
                  newPrefs.set(j, prefs.get(i));
                }
              }
              ((FastByIDMap<PreferenceArray>) data).put(userID, newPrefs);
            }
          }
        }

        removeTimestamp(userID, itemID, timestamps);

      } else {

        float preferenceValue = Float.parseFloat(preferenceValueString);

        // Update the value in place if the user already has a preference for this item
        boolean exists = false;
        if (prefs != null) {
          for (int i = 0; i < prefs.length(); i++) {
            if (prefs.getItemID(i) == itemID) {
              exists = true;
              prefs.setValue(i, preferenceValue);
              break;
            }
          }
        }

        if (!exists) {
          if (prefs == null) {
            prefs = new GenericUserPreferenceArray(1);
          } else {
            // Grow by one, shifting existing preferences up so that slot 0 is free for the new one
            PreferenceArray newPrefs = new GenericUserPreferenceArray(prefs.length() + 1);
            for (int i = 0, j = 1; i < prefs.length(); i++, j++) {
              newPrefs.set(j, prefs.get(i));
            }
            prefs = newPrefs;
          }
          prefs.setUserID(0, userID);
          prefs.setItemID(0, itemID);
          prefs.setValue(0, preferenceValue);
          ((FastByIDMap<PreferenceArray>) data).put(userID, prefs);         
        }
      }

      // Reached after a removal too, but then timestampString is null and addTimestamp does nothing
      addTimestamp(userID, itemID, timestampString, timestamps);

    } else {
      // Data are Collection<Preference>

      Collection<Preference> prefs = (Collection<Preference>) maybePrefs;

      if (!hasTimestamp && preferenceValueString.isEmpty()) {
        // Then line is of form "userID,itemID,", meaning remove
        if (prefs != null) {
          // remove pref
          Iterator<Preference> prefsIterator = prefs.iterator();
          while (prefsIterator.hasNext()) {
            Preference pref = prefsIterator.next();
            if (pref.getItemID() == itemID) {
              prefsIterator.remove();
              break;
            }
          }
        }

        removeTimestamp(userID, itemID, timestamps);
       
      } else {

        float preferenceValue = Float.parseFloat(preferenceValueString);

        // Update the value in place if the user already has a preference for this item
        boolean exists = false;
        if (prefs != null) {
          for (Preference pref : prefs) {
            if (pref.getItemID() == itemID) {
              exists = true;
              pref.setValue(preferenceValue);
              break;
            }
          }
        }

        if (!exists) {
          if (prefs == null) {
            // First preference seen for this user: register a new collection for them
            prefs = Lists.newArrayListWithCapacity(2);
            ((FastByIDMap<Collection<Preference>>) data).put(userID, prefs);
          }
          prefs.add(new GenericPreference(userID, itemID, preferenceValue));
        }

        addTimestamp(userID, itemID, timestampString, timestamps);

      }

    }
  }

  protected void processFileWithoutID(FileLineIterator dataOrUpdateFileIterator,
                                      FastByIDMap<FastIDSet> data,
                                      FastByIDMap<FastByIDMap<Long>> timestamps) {
    log.info("Reading file info...");
    int count = 0;
    while (dataOrUpdateFileIterator.hasNext()) {
      String line = dataOrUpdateFileIterator.next();
      if (!line.isEmpty()) {
        processLineWithoutID(line, data, timestamps);
        if (++count % 100000 == 0) {
          log.info("Processed {} lines", count);
        }
      }
    }
    log.info("Read lines: {}", count);
  }

  protected void processLineWithoutID(String line,
                                      FastByIDMap<FastIDSet> data,
                                      FastByIDMap<FastByIDMap<Long>> timestamps) {

    if (line.isEmpty() || line.charAt(0) == COMMENT_CHAR) {
      return;
    }

    Iterator<String> tokens = delimiterPattern.split(line).iterator();
    String userIDString = tokens.next();
    String itemIDString = tokens.next();
    boolean hasPreference = tokens.hasNext();
    String preferenceValueString = hasPreference ? tokens.next() : "";
    boolean hasTimestamp = tokens.hasNext();
    String timestampString = hasTimestamp ? tokens.next() : null;

    long userID = readUserIDFromString(userIDString);
    long itemID = readItemIDFromString(itemIDString);

    if (transpose) {
      long tmp = userID;
      userID = itemID;
      itemID = tmp;
    }

    if (hasPreference && !hasTimestamp && preferenceValueString.isEmpty()) {
      // Then line is of form "userID,itemID,", meaning remove

      FastIDSet itemIDs = data.get(userID);
      if (itemIDs != null) {
        itemIDs.remove(itemID);
      }

      removeTimestamp(userID, itemID, timestamps);

    } else {

      FastIDSet itemIDs = data.get(userID);
      if (itemIDs == null) {
        itemIDs = new FastIDSet(2);
        data.put(userID, itemIDs);
      }
      itemIDs.add(itemID);

      addTimestamp(userID, itemID, timestampString, timestamps);

    }
  }

  private void addTimestamp(long userID,
                            long itemID,
                            String timestampString,
                            FastByIDMap<FastByIDMap<Long>> timestamps) {
    if (timestampString != null) {
      FastByIDMap<Long> itemTimestamps = timestamps.get(userID);
      if (itemTimestamps == null) {
        itemTimestamps = new FastByIDMap<Long>();
        timestamps.put(userID, itemTimestamps);
      }
      long timestamp = readTimestampFromString(timestampString);
      itemTimestamps.put(itemID, timestamp);
    }
  }

  private static void removeTimestamp(long userID,
                                      long itemID,
                                      FastByIDMap<FastByIDMap<Long>> timestamps) {
    FastByIDMap<Long> itemTimestamps = timestamps.get(userID);
    if (itemTimestamps != null) {
      itemTimestamps.remove(itemID);
    }
  }

  /**
   * Converts the user ID field of a line to a numeric ID. This implementation parses it as a decimal
   * {@code long}.
   *
   * <p>
   * Subclasses may wish to override this if ID values in the file are not numeric. This provides a hook by
   * which subclasses can inject an {@link org.apache.mahout.cf.taste.model.IDMigrator} to perform
   * translation.
   * </p>
   *
   * @param value user ID text as read from the file
   * @return the numeric user ID
   * @throws NumberFormatException if {@code value} is not a parseable {@code long}
   */
  protected long readUserIDFromString(String value) {
    return Long.parseLong(value);
  }

  /**
   * Converts the item ID field of a line to a numeric ID. This implementation parses it as a decimal
   * {@code long}.
   *
   * <p>
   * Subclasses may wish to override this if ID values in the file are not numeric. This provides a hook by
   * which subclasses can inject an {@link org.apache.mahout.cf.taste.model.IDMigrator} to perform
   * translation.
   * </p>
   *
   * @param value item ID text as read from the file
   * @return the numeric item ID
   * @throws NumberFormatException if {@code value} is not a parseable {@code long}
   */
  protected long readItemIDFromString(String value) {
    return Long.parseLong(value);
  }

  /**
   * Converts the timestamp field of a line to a numeric time value.
   *
   * <p>
   * Subclasses may wish to override this to change how time values in the input file are parsed.
   * By default they are expected to be numeric, expressing a time as milliseconds since the epoch.
   * </p>
   *
   * @param value timestamp text as read from the file
   * @return the numeric timestamp
   * @throws NumberFormatException if {@code value} is not a parseable {@code long}
   */
  protected long readTimestampFromString(String value) {
    return Long.parseLong(value);
  }

  /** Forwards to the in-memory model currently loaded from the file. */
  @Override
  public LongPrimitiveIterator getUserIDs() throws TasteException {
    return delegate.getUserIDs();
  }

  /** Forwards to the in-memory model currently loaded from the file. */
  @Override
  public PreferenceArray getPreferencesFromUser(long userID) throws TasteException {
    return delegate.getPreferencesFromUser(userID);
  }

  /** Forwards to the in-memory model currently loaded from the file. */
  @Override
  public FastIDSet getItemIDsFromUser(long userID) throws TasteException {
    return delegate.getItemIDsFromUser(userID);
  }

  /** Forwards to the in-memory model currently loaded from the file. */
  @Override
  public LongPrimitiveIterator getItemIDs() throws TasteException {
    return delegate.getItemIDs();
  }

  /** Forwards to the in-memory model currently loaded from the file. */
  @Override
  public PreferenceArray getPreferencesForItem(long itemID) throws TasteException {
    return delegate.getPreferencesForItem(itemID);
  }

  /** Forwards to the in-memory model currently loaded from the file. */
  @Override
  public Float getPreferenceValue(long userID, long itemID) throws TasteException {
    return delegate.getPreferenceValue(userID, itemID);
  }

  /** Forwards to the in-memory model currently loaded from the file. */
  @Override
  public Long getPreferenceTime(long userID, long itemID) throws TasteException {
    return delegate.getPreferenceTime(userID, itemID);
  }

  /** Forwards to the in-memory model currently loaded from the file. */
  @Override
  public int getNumItems() throws TasteException {
    return delegate.getNumItems();
  }

  /** Forwards to the in-memory model currently loaded from the file. */
  @Override
  public int getNumUsers() throws TasteException {
    return delegate.getNumUsers();
  }

  /** Forwards to the in-memory model currently loaded from the file. */
  @Override
  public int getNumUsersWithPreferenceFor(long itemID) throws TasteException {
    return delegate.getNumUsersWithPreferenceFor(itemID);
  }

  /** Forwards to the in-memory model currently loaded from the file. */
  @Override
  public int getNumUsersWithPreferenceFor(long itemID1, long itemID2) throws TasteException {
    return delegate.getNumUsersWithPreferenceFor(itemID1, itemID2);
  }

  /**
   * Sets a preference on the in-memory model by forwarding to it.
   *
   * <p>
   * Note that this method only updates the in-memory preference data that this
   * maintains; it does not modify any data on disk. Therefore any updates from this method are only
   * temporary, and lost when data is reloaded from a file. This method should also be considered relatively
   * slow.
   * </p>
   */
  @Override
  public void setPreference(long userID, long itemID, float value) throws TasteException {
    delegate.setPreference(userID, itemID, value);
  }

  /**
   * Removes a preference from the in-memory model by forwarding to it; nothing on disk is modified.
   * See the warning at {@link #setPreference(long, long, float)}.
   */
  @Override
  public void removePreference(long userID, long itemID) throws TasteException {
    delegate.removePreference(userID, itemID);
  }

  @Override
  public void refresh(Collection<Refreshable> alreadyRefreshed) {
    if (dataFile.lastModified() > lastModified + minReloadIntervalMS
        || readLastUpdateFileModified() > lastUpdateFileModified + minReloadIntervalMS) {
      log.debug("File has changed; reloading...");
      reload();
    }
  }

  /** Forwards to the in-memory model currently loaded from the file. */
  @Override
  public boolean hasPreferenceValues() {
    return delegate.hasPreferenceValues();
  }

  /** Forwards to the in-memory model currently loaded from the file. */
  @Override
  public float getMaxPreference() {
    return delegate.getMaxPreference();
  }

  /** Forwards to the in-memory model currently loaded from the file. */
  @Override
  public float getMinPreference() {
    return delegate.getMinPreference();
  }

  /** @return a short description of this model that names its data file */
  @Override
  public String toString() {
    return "FileDataModel[dataFile:" + dataFile + ']';
  }

}
TOP

Related Classes of org.apache.mahout.cf.taste.impl.model.file.FileDataModel

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.