Package org.apache.mahout.common.iterator

Examples of org.apache.mahout.common.iterator.FileLineIterator


  public DataFileIterator(File dataFile) throws IOException {
    if (dataFile == null || dataFile.isDirectory() || !dataFile.exists()) {
      throw new IllegalArgumentException("Bad data file: " + dataFile);
    }
    lineIterator = new FileLineIterator(dataFile);
  }
View Full Code Here


   
    StringBuilder content = new StringBuilder();
    content.append(header);
    NumberFormat decimalFormatter = new DecimalFormat("0000");
    File dumpFile = new File(dumpFilePath);
    FileLineIterator it;
    if (dumpFilePath.endsWith(".bz2")) {
      // default compression format from http://download.wikimedia.org
      CompressionCodec codec = new BZip2Codec();
      it = new FileLineIterator(codec.createInputStream(new FileInputStream(dumpFile)));
    } else {
      // assume the user has previously de-compressed the dump file
      it = new FileLineIterator(dumpFile);
    }
    int filenumber = 0;
    while (it.hasNext()) {
      String thisLine = it.next();
      if (thisLine.trim().startsWith("<page>")) {
        boolean end = false;
        while (!thisLine.trim().startsWith("</page>")) {
          content.append(thisLine).append('\n');
          if (it.hasNext()) {
            thisLine = it.next();
          } else {
            end = true;
            break;
          }
        }
View Full Code Here

   * <pre>
   * term DocFreq Index
   * </pre>
   */
  private static String[] loadTermDictionary(InputStream is) throws IOException {
    FileLineIterator it = new FileLineIterator(is);
   
    int numEntries = Integer.parseInt(it.next());
    String[] result = new String[numEntries];
   
    while (it.hasNext()) {
      String line = it.next();
      if (line.startsWith("#")) {
        continue;
      }
      String[] tokens = TAB_PATTERN.split(line);
      if (tokens.length < 3) {
View Full Code Here

TOP

Related Classes of org.apache.mahout.common.iterator.FileLineIterator

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.