Package de.kunysch.localimdb

Source Code of de.kunysch.localimdb.ImdbParser

/* $Id: ImdbParser.java 124 2008-03-06 07:19:25Z bananeweizen $
* GNU GPL Version 2, Copyright (C) 2005 Paul C. Kunysch */
package de.kunysch.localimdb;

import java.awt.Component;
import java.io.*;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;
import java.util.TimeZone;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
import javax.swing.ProgressMonitorInputStream;

import util.ui.Localizer;

/** This class offers a framework for parsing the IMDb list files. */
public abstract class ImdbParser {
  private static Localizer mLocalizer = util.ui.Localizer.getLocalizerFor(ImdbParser.class);

  private static final Charset ENCODING = Charset.forName("ISO-8859-1"); //$NON-NLS-1$

  private static final Logger LOG = Logger.getLogger(ImdbParser.class.getPackage().getName());

  private final Date timestamp = new Date(0);

  private final String dataStartMark;

  private final String filename;

  private final Pattern pattern;

  private final MoviesBuilder builder;

  /**
   * This configures an ImdbParser for a specific IMDb table.
   *
   * @param builder
   *          will collect data and create a Movies object.
   * @param filename
   *          is the basename of the file. ("foo.list.gz")
   * @param dataStartMark
   *          is a line that must appear befor the first data entries. This may
   *          be null.
   * @param pattern
   *          must match every line in the data region. The resulting matchers
   *          are passed to processMatchedLine().
   */
  protected ImdbParser(MoviesBuilder builder, String filename, String dataStartMark, Pattern pattern) {
    this.filename = filename;
    this.dataStartMark = dataStartMark;
    this.pattern = pattern;
    this.builder = builder;
  }

  /**
   * Fetch and parse a table from the specified URL.
   *
   * @param baseUrl
   *          the URL for the data directory.
   * @param parent
   *          a parent for a ProgressMonitorInputStream or null.
   * @return the number of added entries.
   * @throws MalformedURLException
   *           if the URL to the IMDb file can't be constructed.
   * @throws IOException
   *           if the data can't be read or if the user cancels the
   *           ProgressMonitorInputStream.
   * @throws ParseException
   *           if the date on the first line can't be read.
   */
  public int fetch(URL baseUrl, Component parent) throws IOException, ParseException {
    BufferedReader br = null;
    try {
      br = constructBufferedReader(baseUrl, parent);
      timestamp.setTime(parseTimestamp(br.readLine()));
      return processMatchingLines(skipToMatch(br), br);
    } finally {
      if (null != br) {
        br.close();
      }
    }
  }

  public int fetch(ReaderFactory factory) throws IOException, ParseException {
    BufferedReader br = null;
    try {
      br = factory.reader(getFilename(), ENCODING);
      timestamp.setTime(parseTimestamp(br.readLine()));
      return processMatchingLines(skipToMatch(br), br);
    } finally {
      if (null != br) {
        br.close();
      }
    }
  }

  /**
   * This internal function calls <code>processMatchedLine</code> for every
   * input while <code>pattern</code> matches.
   *
   * @param firstLine
   *          is the first line that will be processed.
   * @param input
   *          is is the source for all other lines.
   * @throws IOException
   *           if an I/O error occurs.
   * @return the number of lines that added entries.
   */
  private int processMatchingLines(String firstLine, BufferedReader input) throws IOException {
    int line_count = 0;
    String line = firstLine;
    Matcher matcher = pattern.matcher(line);
    while (null != line && matcher.matches()) {
      if (processMatchedLine(matcher)) {
        line_count = line_count + 1;
      }
      line = input.readLine();
      matcher = pattern.matcher(line);
    }
    if (null != line) {
      LOG.log(Level.INFO, mLocalizer.msg("1", "Stopped parsing file at: ''{0}''", line)); //$NON-NLS-1$
    }
    return line_count;
  }

  /**
   * This internal function reads lines from a BufferedReader until the
   * <code>pattern</code> matches.
   *
   * @param input
   *          is the data source.
   * @return the next line from <code>input</code> that matches the
   *         <code>pattern</code>.
   * @throws IOException
   */
  private String skipToMatch(BufferedReader input) throws IOException {
    String line = input.readLine();
    if (null != dataStartMark) {
      while (null != line && !line.equals(dataStartMark)) {
        line = input.readLine();
      }
    }
    while (null != line && !pattern.matcher(line).matches()) {
      line = input.readLine();
    }
    return line;
  }

  /**
   * This parses a timestamp. Usually the first line in a IMDb file contains
   * that information. This function assumes it's given in pacific standard
   * time.
   *
   * @param line
   *          is a line containing a timestamp.
   * @return the number of milliseconds since January 1, 1970 represented by
   *         this date.
   * @throws ParseException
   */
  private long parseTimestamp(String line) throws ParseException {
    final long time;
    SimpleDateFormat sdf = new SimpleDateFormat("EEE MMM d HH:mm:ss yyyy", Locale.US); //$NON-NLS-1$
    sdf.setTimeZone(TimeZone.getTimeZone("PST")); //$NON-NLS-1$
    time = sdf.parse(line.substring(line.lastIndexOf(": ") + 2)).getTime(); //$NON-NLS-1$
    return time;
  }

  /**
   * Internal function that opens an IMDb list file for reading.
   *
   * @param baseUrl
   *          is the folder where IMDb list files are located.
   * @param parent
   *          is the parent component for progress dialogs. It may be
   *          <code>null</code>.
   * @return a BufferedReader to the IMDb list file.
   * @throws IOException
   *           if an I/O exception occurs.
   * @throws MalformedURLException
   *           if the full URL can't be constructed.
   */
  private BufferedReader constructBufferedReader(URL baseUrl, Component parent) throws IOException,
      MalformedURLException {
    URLConnection con = new URL(baseUrl, getFilename()).openConnection();
    ProgressMonitorInputStream pmis = new ProgressMonitorInputStream(parent, filename, con.getInputStream());
    pmis.getProgressMonitor().setMillisToPopup(0);
    pmis.getProgressMonitor().setMaximum(con.getContentLength());
    InputStream input = new GZIPInputStream(pmis);
    return new BufferedReader(new InputStreamReader(input, ENCODING));
  }

  /**
   * This function returns the timestamp from the IMDb file. This might be used
   * to avoid downloading known data. Currently the files are updated once a
   * week and they need two hours until they are available online.
   *
   * @return the timestamp from the IMDb file.
   */
  public Date getTimestamp() {
    return new Date(timestamp.getTime());
  }

  /**
   * This returns the filename of the IMDb table file.
   *
   * @return the filename of the IMDb table file.
   */
  public String getFilename() {
    return filename;
  }

  /**
   * This is used to call derived classes that know how to interpret data lines.
   *
   * @param matcher
   *          is a matching matcher.
   * @return true if the parser added an entry for this line.
   */
  abstract protected boolean processMatchedLine(Matcher matcher);

  /**
   * @return Returns the builder.
   */
  protected MoviesBuilder getBuilder() {
    return builder;
  }
}
TOP

Related Classes of de.kunysch.localimdb.ImdbParser

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.