Package com.tamingtext.tagrecommender

Source Code of com.tamingtext.tagrecommender.ExtractStackOverflowData

/*
* Copyright 2008-2011 Grant Ingersoll, Thomas Morton and Drew Farris
*
*    Licensed under the Apache License, Version 2.0 (the "License");
*    you may not use this file except in compliance with the License.
*    You may obtain a copy of the License at
*
*        http://www.apache.org/licenses/LICENSE-2.0
*
*    Unless required by applicable law or agreed to in writing, software
*    distributed under the License is distributed on an "AS IS" BASIS,
*    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*    See the License for the specific language governing permissions and
*    limitations under the License.
* -------------------
* To purchase or learn more about Taming Text, by Grant Ingersoll, Thomas Morton and Drew Farris, visit
* http://www.manning.com/ingersoll
*/

package com.tamingtext.tagrecommender;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;

import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import javax.xml.stream.XMLStreamWriter;
import javax.xml.stream.events.XMLEvent;

import org.apache.commons.cli2.CommandLine;
import org.apache.commons.cli2.Group;
import org.apache.commons.cli2.Option;
import org.apache.commons.cli2.OptionException;
import org.apache.commons.cli2.builder.ArgumentBuilder;
import org.apache.commons.cli2.builder.DefaultOptionBuilder;
import org.apache.commons.cli2.builder.GroupBuilder;
import org.apache.commons.cli2.commandline.Parser;
import org.apache.mahout.common.CommandLineUtil;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class ExtractStackOverflowData {
 
  private static final Logger log = LoggerFactory.getLogger(ExtractStackOverflowData.class);

  File inputFile;
  File trainingOutputFile;
  File testOutputFile;
 
  int trainingDataSize = 100000;
  int testDataSize = 10000;
 
  public static void main(String [] args) {
    ExtractStackOverflowData si = new ExtractStackOverflowData();
    if (si.parseArgs(args)) {
      si.extract();
    }
  }
 
 
  public boolean parseArgs(String[] args) {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();
    Option helpOpt = DefaultOptionCreator.helpOption();
   
    Option inputFileOpt = obuilder.withLongName("inputFile").withRequired(true).withArgument(
        abuilder.withName("inputFile").withMinimum(1).withMaximum(1).create()).withDescription(
        "The input file").withShortName("i").create();
   
    Option trainingOutputOpt = obuilder.withLongName("trainingOutputFile").withRequired(true).withArgument(
        abuilder.withName("trainingOutputFile").withMinimum(1).withMaximum(1).create()).withDescription(
        "The training data output file").withShortName("tr").create();
   
    Option testOutputOpt = obuilder.withLongName("testOutputFile").withRequired(true).withArgument(
        abuilder.withName("testOutputFile").withMinimum(1).withMaximum(1).create()).withDescription(
        "The test data output file").withShortName("te").create();
   
    Option trainingDataSizeOpt = obuilder.withLongName("trainingDataSize").withRequired(false).withArgument(
        abuilder.withName("trainingDataSize").withMinimum(1).withMaximum(1).create()).withDescription(
        "The number of questions to extract for training data").withShortName("trs").create();
   
    Option testDataSizeOpt = obuilder.withLongName("testDataSize").withRequired(false).withArgument(
        abuilder.withName("testDataSize").withMinimum(1).withMaximum(1).create()).withDescription(
        "The number of questions to extract for training data").withShortName("tes").create();

    Group group = gbuilder.withName("Options").withOption(inputFileOpt).withOption(trainingOutputOpt)
         .withOption(testOutputOpt).withOption(trainingDataSizeOpt).withOption(testDataSizeOpt).create();
   
    try {
      Parser parser = new Parser();
      parser.setGroup(group);
      CommandLine cmdLine = parser.parse(args);
     
      if (cmdLine.hasOption(helpOpt)) {
        CommandLineUtil.printHelp(group);
        return false;
      }
     
      inputFile = new File((String) cmdLine.getValue(inputFileOpt));
      trainingOutputFile = new File((String) cmdLine.getValue(trainingOutputOpt));
      testOutputFile = new File((String) cmdLine.getValue(testOutputOpt));

      if (cmdLine.hasOption(trainingDataSizeOpt)) {
        trainingDataSize = Integer.parseInt((String) cmdLine.getValue(trainingDataSizeOpt));
      }
     
      if (cmdLine.hasOption(testDataSizeOpt)) {
        testDataSize = Integer.parseInt((String) cmdLine.getValue(testDataSizeOpt));
      }    
     
    } catch (OptionException e) {
      log.error("Command-line option Exception", e);
      CommandLineUtil.printHelp(group);
      return false;
    }
   
    validate();
    return true;
  }

  protected void validate() {
    if (!inputFile.exists()) {
      throw new IllegalArgumentException("inputFile " + inputFile.getAbsolutePath() + " does not exist");
    }
   
    if (trainingDataSize < 1) {
      throw new IllegalArgumentException("trainingDataSize must be 1 or more");
    }
   
    if (testDataSize < 1) {
      throw new IllegalArgumentException("testDataSize must be 1 or more");
    }
   
    Util.validateFileWritable(trainingOutputFile);
    Util.validateFileWritable(testOutputFile);
  }
 
  public void extract() {
    XMLInputFactory xif = XMLInputFactory.newInstance();
    XMLStreamReader reader = null;
    InputStream is = null;
   
    XMLOutputFactory xof = XMLOutputFactory.newInstance();
    XMLStreamWriter writer = null;
    OutputStream os = null;
   
    try {
      log.info("Reading data from " + inputFile);
     
      is = new FileInputStream(inputFile);
      reader = xif.createXMLStreamReader(is);
     
      os = new FileOutputStream(trainingOutputFile);
      writer = xof.createXMLStreamWriter(os);
      int trainingDataCount = extractXMLData(reader, writer, trainingDataSize);
      os.close();
     
      os = new FileOutputStream(testOutputFile);
      writer = xof.createXMLStreamWriter(os);
      int testDataCount = extractXMLData(reader, writer, testDataSize);
      os.close();
     
      log.info("Extracted " + trainingDataCount + " rows of training data");
      log.info("Extracted " + testDataCount     + " rows of test data");
    }
    catch (XMLStreamException e) {
      e.printStackTrace();
    }
    catch (IOException e) {
      e.printStackTrace();
    }
  }
 
  /** Extract as many as <code>limit</code> questions from the <code>reader</code>
   *  provided, writing them to <code>writer</code>.
   * @param reader
   * @param writer
   * @param limit
   * @return
   * @throws XMLStreamException
   */
  protected int extractXMLData(XMLStreamReader reader, XMLStreamWriter writer, int limit) throws XMLStreamException {

    int questionCount = 0;
    int attrCount;
    boolean copyElement = false;
   
    writer.writeStartDocument();
    writer.writeStartElement("posts");
    writer.writeCharacters("\n");
    while (reader.hasNext() && questionCount < limit) {
      switch (reader.next()) {
      case XMLEvent.START_ELEMENT:
        if (reader.getLocalName().equals("row")) {
          attrCount = reader.getAttributeCount();
          for (int i=0; i < attrCount; i++) {
            // copy only the questions.
            if (reader.getAttributeName(i).getLocalPart().equals("PostTypeId") &&
                reader.getAttributeValue(i).equals("1")) {
              copyElement = true;
              break;
            }  
          }
         
          if (copyElement) {
            writer.writeCharacters("  ");
            writer.writeStartElement("row");
            for (int i=0; i < attrCount; i++) {
              writer.writeAttribute(
                  reader.getAttributeName(i).getLocalPart(),
                  reader.getAttributeValue(i));
            }
            writer.writeEndElement();
            writer.writeCharacters("\n");
            copyElement = false;
            questionCount++;
          }
        }
        break;
      }
    }
    writer.writeEndElement();
    writer.writeEndDocument();
    writer.flush();
    writer.close();
   
    return questionCount;
  }
}
TOP

Related Classes of com.tamingtext.tagrecommender.ExtractStackOverflowData

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.