/* $Id: ImdbParser.java 124 2008-03-06 07:19:25Z bananeweizen $
* GNU GPL Version 2, Copyright (C) 2005 Paul C. Kunysch */
package de.kunysch.localimdb;
import java.awt.Component;
import java.io.*;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;
import java.util.TimeZone;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
import javax.swing.ProgressMonitorInputStream;
import util.ui.Localizer;
/**
 * This class offers a framework for parsing the IMDb list files.
 * <p>
 * A concrete parser supplies the file name, an optional marker line that
 * precedes the data region and a {@link Pattern} describing a data line. The
 * framework opens the (gzip compressed) file, reads the timestamp from its
 * first line, skips ahead to the data region and then hands every matching
 * line to {@link #processMatchedLine(Matcher)} until the first line that does
 * not match or until the end of the input is reached.
 */
public abstract class ImdbParser {
  private static final Localizer mLocalizer = util.ui.Localizer.getLocalizerFor(ImdbParser.class);
  private static final Charset ENCODING = Charset.forName("ISO-8859-1"); //$NON-NLS-1$
  private static final Logger LOG = Logger.getLogger(ImdbParser.class.getPackage().getName());

  /** Timestamp read from the first line of the most recently fetched file. */
  private final Date timestamp = new Date(0);
  private final String dataStartMark;
  private final String filename;
  private final Pattern pattern;
  private final MoviesBuilder builder;

  /**
   * This configures an ImdbParser for a specific IMDb table.
   *
   * @param builder
   *          will collect data and create a Movies object.
   * @param filename
   *          is the basename of the file. ("foo.list.gz")
   * @param dataStartMark
   *          is a line that must appear before the first data entries. This
   *          may be null.
   * @param pattern
   *          must match every line in the data region. The resulting matchers
   *          are passed to processMatchedLine().
   */
  protected ImdbParser(MoviesBuilder builder, String filename, String dataStartMark, Pattern pattern) {
    this.filename = filename;
    this.dataStartMark = dataStartMark;
    this.pattern = pattern;
    this.builder = builder;
  }

  /**
   * Fetch and parse a table from the specified URL.
   *
   * @param baseUrl
   *          the URL for the data directory.
   * @param parent
   *          a parent for a ProgressMonitorInputStream or null.
   * @return the number of added entries.
   * @throws MalformedURLException
   *           if the URL to the IMDb file can't be constructed.
   * @throws IOException
   *           if the data can't be read or if the user cancels the
   *           ProgressMonitorInputStream.
   * @throws ParseException
   *           if the date on the first line can't be read.
   */
  public int fetch(URL baseUrl, Component parent) throws IOException, ParseException {
    return parseAndClose(constructBufferedReader(baseUrl, parent));
  }

  /**
   * Fetch and parse a table from a reader supplied by the given factory. The
   * factory is asked for a reader for {@link #getFilename()} using the
   * ISO-8859-1 encoding.
   *
   * @param factory
   *          creates the reader for the IMDb list file.
   * @return the number of added entries.
   * @throws IOException
   *           if the data can't be read.
   * @throws ParseException
   *           if the date on the first line can't be read.
   */
  public int fetch(ReaderFactory factory) throws IOException, ParseException {
    return parseAndClose(factory.reader(getFilename(), ENCODING));
  }

  /**
   * Shared implementation of both <code>fetch</code> variants: reads the
   * timestamp from the first line, processes the data region and always
   * closes the reader afterwards.
   *
   * @param br
   *          is the source of the list file; it is closed before returning.
   * @return the number of lines that added entries.
   * @throws IOException
   *           if an I/O error occurs.
   * @throws ParseException
   *           if the first line is missing or holds no readable date.
   */
  private int parseAndClose(BufferedReader br) throws IOException, ParseException {
    try {
      timestamp.setTime(parseTimestamp(br.readLine()));
      return processMatchingLines(skipToMatch(br), br);
    } finally {
      if (null != br) {
        br.close();
      }
    }
  }

  /**
   * This internal function calls <code>processMatchedLine</code> for every
   * input while <code>pattern</code> matches. Parsing stops silently at the
   * end of the input; if it stops earlier because a line does not match, that
   * line is logged.
   *
   * @param firstLine
   *          is the first line that will be processed. It may be
   *          <code>null</code> if no data line was found.
   * @param input
   *          is the source for all other lines.
   * @throws IOException
   *           if an I/O error occurs.
   * @return the number of lines that added entries.
   */
  private int processMatchingLines(String firstLine, BufferedReader input) throws IOException {
    int lineCount = 0;
    String line = firstLine;
    // A matcher is only created for non-null lines: Pattern.matcher(null)
    // throws a NullPointerException, which would otherwise hit at end of file.
    while (null != line) {
      final Matcher matcher = pattern.matcher(line);
      if (!matcher.matches()) {
        LOG.log(Level.INFO, mLocalizer.msg("1", "Stopped parsing file at: ''{0}''", line)); //$NON-NLS-1$
        break;
      }
      if (processMatchedLine(matcher)) {
        lineCount++;
      }
      line = input.readLine();
    }
    return lineCount;
  }

  /**
   * This internal function reads lines from a BufferedReader until the
   * <code>pattern</code> matches. If a <code>dataStartMark</code> is
   * configured, everything in front of that marker line is skipped first.
   *
   * @param input
   *          is the data source.
   * @return the next line from <code>input</code> that matches the
   *         <code>pattern</code>, or <code>null</code> if the end of the
   *         input is reached first.
   * @throws IOException
   *           if an I/O error occurs.
   */
  private String skipToMatch(BufferedReader input) throws IOException {
    String line = input.readLine();
    if (null != dataStartMark) {
      while (null != line && !line.equals(dataStartMark)) {
        line = input.readLine();
      }
    }
    while (null != line && !pattern.matcher(line).matches()) {
      line = input.readLine();
    }
    return line;
  }

  /**
   * This parses a timestamp. Usually the first line in a IMDb file contains
   * that information. This function assumes it's given in pacific standard
   * time. The date is expected after the last <code>": "</code> of the line;
   * if the line contains no such separator the whole line is parsed.
   *
   * @param line
   *          is a line containing a timestamp.
   * @return the number of milliseconds since January 1, 1970 represented by
   *         this date.
   * @throws ParseException
   *           if <code>line</code> is <code>null</code> (empty input) or does
   *           not contain a date in the expected format.
   */
  private long parseTimestamp(String line) throws ParseException {
    if (null == line) {
      // Empty input: report it as an unreadable date instead of failing with
      // a NullPointerException.
      throw new ParseException("Missing timestamp line in " + filename, 0); //$NON-NLS-1$
    }
    final int separator = line.lastIndexOf(": "); //$NON-NLS-1$
    final String dateText = separator < 0 ? line : line.substring(separator + 2);
    final SimpleDateFormat sdf = new SimpleDateFormat("EEE MMM d HH:mm:ss yyyy", Locale.US); //$NON-NLS-1$
    sdf.setTimeZone(TimeZone.getTimeZone("PST")); //$NON-NLS-1$
    return sdf.parse(dateText).getTime();
  }

  /**
   * Internal function that opens an IMDb list file for reading. The download
   * is wrapped in a ProgressMonitorInputStream and decompressed with gzip. If
   * setting up the decompression fails, the already opened stream is closed
   * before the exception is propagated.
   *
   * @param baseUrl
   *          is the folder where IMDb list files are located.
   * @param parent
   *          is the parent component for progress dialogs. It may be
   *          <code>null</code>.
   * @return a BufferedReader to the IMDb list file.
   * @throws IOException
   *           if an I/O exception occurs.
   * @throws MalformedURLException
   *           if the full URL can't be constructed.
   */
  private BufferedReader constructBufferedReader(URL baseUrl, Component parent) throws IOException,
      MalformedURLException {
    final URLConnection con = new URL(baseUrl, getFilename()).openConnection();
    final ProgressMonitorInputStream pmis = new ProgressMonitorInputStream(parent, filename, con.getInputStream());
    boolean opened = false;
    try {
      pmis.getProgressMonitor().setMillisToPopup(0);
      pmis.getProgressMonitor().setMaximum(con.getContentLength());
      // The GZIPInputStream constructor already reads the gzip header and
      // throws if the data is not in gzip format.
      final InputStream input = new GZIPInputStream(pmis);
      final BufferedReader reader = new BufferedReader(new InputStreamReader(input, ENCODING));
      opened = true;
      return reader;
    } finally {
      if (!opened) {
        try {
          pmis.close();
        } catch (IOException ignored) {
          // Keep the original failure; a secondary close error adds nothing.
        }
      }
    }
  }

  /**
   * This function returns the timestamp from the IMDb file. This might be used
   * to avoid downloading known data. Currently the files are updated once a
   * week and they need two hours until they are available online.
   *
   * @return a copy of the timestamp from the IMDb file; it is the epoch
   *         (time 0) as long as no file has been fetched.
   */
  public Date getTimestamp() {
    return new Date(timestamp.getTime());
  }

  /**
   * This returns the filename of the IMDb table file.
   *
   * @return the filename of the IMDb table file.
   */
  public String getFilename() {
    return filename;
  }

  /**
   * This is used to call derived classes that know how to interpret data lines.
   *
   * @param matcher
   *          is a matching matcher.
   * @return true if the parser added an entry for this line.
   */
  abstract protected boolean processMatchedLine(Matcher matcher);

  /**
   * @return Returns the builder.
   */
  protected MoviesBuilder getBuilder() {
    return builder;
  }
}