/*
 * Copyright 2009 Keith Stevens
 *
 * This file is part of the S-Space package and is covered under the terms and
 * conditions therein.
 *
 * The S-Space package is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation and distributed hereunder to you.
 *
 * THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
 * EXPRESS OR IMPLIED ARE MADE. BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
 * NO REPRESENTATIONS OR WARRANTIES OF MERCHANTABILITY OR FITNESS FOR ANY
 * PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
 * WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
 * RIGHTS.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

package edu.ucla.sspace.tools;

import edu.ucla.sspace.common.ArgOptions;

import edu.ucla.sspace.text.DocumentPreprocessor;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.PrintWriter;

import java.sql.Timestamp;

import java.util.ArrayDeque;
import java.util.Collection;
import java.util.Iterator;
import java.util.LinkedList;

import java.util.logging.Logger;

/**
 * An informal tool class that extracts the date and content of cleaned XML
 * blog files.  No XML parsing is done; instead, each line is scanned for the
 * relevant opening and closing tags, and everything in between is extracted
 * and written to an output file.
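 *
 * <p> For reference, a rough sketch of the entry structure this tool assumes,
 * inferred from the tags it searches for; the exact layout of the cleaned
 * feed may differ:
 *
 * <pre>
 *   &lt;id&gt;some-entry-id&lt;/id&gt;
 *   &lt;content&gt;text of the blog post, possibly spanning lines&lt;/content&gt;
 *   &lt;updated&gt;2009-06-01 12:00:00&lt;/updated&gt;
 * </pre>
 *
 * Timestamps are parsed with {@link java.sql.Timestamp#valueOf(String)}, so
 * the updated value must use the {@code yyyy-mm-dd hh:mm:ss} form.  Each
 * accepted entry is written to the output file as a single line containing
 * the timestamp in milliseconds followed by the cleaned content.
 *
 * <p> A hypothetical invocation, where the file names are placeholders:
 *
 * <pre>
 *   java edu.ucla.sspace.tools.BlogPreProcessor -d blogs1.xml,blogs2.xml \
 *       -w wordlist.txt -h 4 cleaned-blogs.txt
 * </pre>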
 */
public class BlogPreProcessor {

    /** The logger used to report the status of processed blog files. */
    private static final Logger LOGGER =
        Logger.getLogger(BlogPreProcessor.class.getName());

    /** The preprocessor used to clean extracted blog content. */
    private DocumentPreprocessor processor;

    /** The writer for cleaned, timestamped output; shared by all threads. */
    private final PrintWriter pw;

    /** The earliest timestamp, in milliseconds, accepted for any entry. */
    private long beginTime;

    /** The latest timestamp, in milliseconds, accepted for any entry. */
    private long endTime;
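
    /**
     * Creates a {@code BlogPreProcessor} that writes cleaned entries to {@code
     * outFile}, cleaning them with the word list in {@code wordFile} and
     * keeping only entries whose timestamps fall between {@code begin} and
     * {@code end}, inclusive.  If either file cannot be opened, the stack
     * trace is printed and the process exits.
     */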
    private BlogPreProcessor(File wordFile, File outFile, long begin, long end) {
        PrintWriter writer = null;
        beginTime = begin;
        endTime = end;
        try {
            writer = new PrintWriter(outFile);
            processor = new DocumentPreprocessor(wordFile);
        } catch (FileNotFoundException fnfe) {
            fnfe.printStackTrace();
            System.exit(1);
        } catch (IOException ioe) {
            ioe.printStackTrace();
            System.exit(1);
        }
        pw = writer;
    }

    /**
     * Given a blog file, reads through each line and extracts the content and
     * updated date of each entry, printing them as a single line in the
     * result file.
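     *
     * @param blogFile the cleaned XML blog file to read
     *
     * @throws IOException if any error occurs while reading the file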
     */
    public void processFile(File blogFile) throws IOException {
        BufferedReader br = new BufferedReader(new FileReader(blogFile));
        String line = null;
        String date = null;
        String id = null;
        StringBuilder content = new StringBuilder();
        boolean needMoreContent = false;
        while ((line = br.readLine()) != null) {
            if (line.contains("<id>")) {
                int startIndex = line.indexOf(">") + 1;
                int endIndex = line.lastIndexOf("<");
                id = line.substring(startIndex, endIndex);
            } else if (line.contains("<content>")) {
                // Extract the start of a content node.  If the previous
                // content, updated pair was incomplete, i.e. updated had no
                // value, this will overwrite the previous content value.
                int startIndex = line.indexOf(">") + 1;
                int endIndex = line.lastIndexOf("<");
                content = new StringBuilder();
                if (endIndex > startIndex)
                    content.append(line.substring(startIndex, endIndex));
                else {
                    content.append(line.substring(startIndex));
                    needMoreContent = true;
                }
            } else if (needMoreContent) {
                // The content node might span several lines, so consider all
                // lines read until the closing </content> tag to be part of
                // the current content.
                int endIndex = (line.contains("</content>"))
                    ? line.lastIndexOf("<") : -1;
                if (endIndex > 0) {
                    content.append(line.substring(0, endIndex));
                    needMoreContent = false;
                } else
                    content.append(line);
            } else if (line.contains("<updated>")) {
                // The updated timestamp only spans one line.
                int startIndex = line.indexOf(">") + 1;
                int endIndex = line.lastIndexOf("<");
                date = line.substring(startIndex, endIndex);
                if (date.equals(""))
                    date = null;
            } else if (content != null && date != null) {
                // Clean and print out the content and date, skipping any
                // entry whose timestamp falls outside the requested range.
                long dateTime = Timestamp.valueOf(date).getTime();
                if (dateTime < beginTime || dateTime > endTime) {
                    needMoreContent = false;
                    date = null;
                    continue;
                }
                String cleanedContent = processor.process(content.toString());
                if (!cleanedContent.equals("")) {
                    synchronized (pw) {
                        pw.format("%d %s\n", dateTime, cleanedContent);
                        pw.flush();
                    }
                }
                LOGGER.info(String.format(
                        "Processed blog %s with timestamp %d", id, dateTime));
                needMoreContent = false;
                date = null;
            }
        }
        br.close();
    }
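
    /**
     * Returns the {@link ArgOptions} describing the command line options that
     * this tool accepts.
     */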
    public static ArgOptions setupOptions() {
        ArgOptions opts = new ArgOptions();
        opts.addOption('d', "docFiles",
                       "comma-separated list of blog files to process",
                       true, "FILE[,FILE,...]", "Required");
        opts.addOption('w', "wordlist", "word list used to clean documents",
                       true, "FILE", "Required");
        opts.addOption('s', "beginTime",
                       "earliest timestamp, in milliseconds, for any document",
                       true, "LONG", "Optional");
        opts.addOption('e', "endTime",
                       "latest timestamp, in milliseconds, for any document",
                       true, "LONG", "Optional");
        opts.addOption('h', "threads", "number of threads", true, "INT");
        return opts;
    }
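
    /**
     * Parses the command line arguments, then starts one worker thread per
     * requested thread (defaulting to the number of available processors) to
     * clean each of the given blog files, and waits for all threads to
     * finish.
     */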
    public static void main(String[] args)
            throws IOException, InterruptedException {
        ArgOptions options = setupOptions();
        options.parseOptions(args);

        if (!options.hasOption("docFiles") ||
            !options.hasOption("wordlist") ||
            options.numPositionalArgs() != 1) {
            System.out.println(
                "usage: java BlogPreProcessor [options] <out_file>\n" +
                options.prettyPrint());
            System.exit(1);
        }

        // Load the output file and the word list.
        File outFile = new File(options.getPositionalArg(0));
        File wordFile = new File(options.getStringOption("wordlist"));

        // Create the cleaner, restricting it to the requested time range if
        // one was given.
        long startTime = (options.hasOption("beginTime"))
            ? options.getLongOption("beginTime") : 0;
        long endTime = (options.hasOption("endTime"))
            ? options.getLongOption("endTime") : Long.MAX_VALUE;
        final BlogPreProcessor blogCleaner =
            new BlogPreProcessor(wordFile, outFile, startTime, endTime);

        String[] fileNames = options.getStringOption("docFiles").split(",");

        // Determine how many worker threads to use.
        int numThreads = Runtime.getRuntime().availableProcessors();
        if (options.hasOption("threads"))
            numThreads = options.getIntOption("threads");

        Collection<File> blogFiles = new ArrayDeque<File>();
        for (String fileName : fileNames) {
            blogFiles.add(new File(fileName));
        }

        // Have each worker thread pull files from a single shared iterator.
        // The ArrayDeque iterator is not thread safe, so access to it is
        // synchronized.
        final Iterator<File> fileIter = blogFiles.iterator();
        Collection<Thread> threads = new LinkedList<Thread>();
        for (int i = 0; i < numThreads; ++i) {
            Thread t = new Thread() {
                public void run() {
                    while (true) {
                        File currentFile = null;
                        synchronized (fileIter) {
                            if (fileIter.hasNext())
                                currentFile = fileIter.next();
                        }
                        if (currentFile == null)
                            break;
                        try {
                            LOGGER.info("processing: " +
                                        currentFile.getPath());
                            blogCleaner.processFile(currentFile);
                        } catch (IOException ioe) {
                            ioe.printStackTrace();
                        }
                    }
                }
            };
            threads.add(t);
        }

        for (Thread t : threads)
            t.start();
        for (Thread t : threads)
            t.join();
    }
}