// Source code of com.cloudera.recordbreaker.analyzer.UnknownTextSchemaDescriptor

/*
 * Copyright (c) 2011, Cloudera, Inc. All Rights Reserved.
 *
 * Cloudera, Inc. licenses this file to you under the Apache License,
 * Version 2.0 (the "License"). You may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied. See the License for
 * the specific language governing permissions and limitations under the
 * License.
 */
package com.cloudera.recordbreaker.analyzer;


import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Random;


import java.io.File;
import java.io.DataOutput;
import java.io.IOException;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.InputStreamReader;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;


import org.apache.avro.Schema;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.generic.GenericContainer;
import org.apache.avro.generic.GenericDatumReader;


import com.cloudera.recordbreaker.learnstructure.InferredType;
import com.cloudera.recordbreaker.learnstructure.LearnStructure;


import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.conf.Configuration;


import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;


/************************************************************************
 * <code>UnknownTextSchemaDescriptor</code> returns schema data that we
 * figure out from the data itself.
 *
 * @author "Michael Cafarella"
 * @version 1.0
 * @since 1.0
 * @see SchemaDescriptor
 *************************************************************************/
public class UnknownTextSchemaDescriptor extends GenericSchemaDescriptor {
  private static final Log LOG = LogFactory.getLog(UnknownTextSchemaDescriptor.class);    
  public static String SCHEMA_ID = "recordbreaker-recovered";
  public static int MAX_LINES = 1000;
  InferredType typeTree;


  public UnknownTextSchemaDescriptor(DataDescriptor dd) throws IOException {
    super(dd);
    // Superclass calls computeSchema()
  }


  public UnknownTextSchemaDescriptor(DataDescriptor dd, String schemaRepr, byte[] miscPayload) throws IOException {
    super(dd, schemaRepr);
    this.randId = new Random().nextInt();
    
    // Deserialize the payload string into the parser
    DataInputStream in = new DataInputStream(new ByteArrayInputStream(miscPayload));
    try {
      this.typeTree = InferredType.readType(in);
    } finally {
      in.close();
    }
  }


  int randId;
  void computeSchema() throws IOException {
    this.randId = new Random().nextInt();    
    LearnStructure ls = new LearnStructure();
    FileSystem fs = FSAnalyzer.getInstance().getFS();
    FileSystem localFS = FileSystem.getLocal(new Configuration());
    Path inputPath = dd.getFilename();


    File workingParserFile = File.createTempFile("textdesc", "typetree", null);
    File workingSchemaFile = File.createTempFile("textdesc", "schema", null);
    
    ls.inferRecordFormat(fs, inputPath, localFS, new Path(workingSchemaFile.getCanonicalPath()), new Path(workingParserFile.getCanonicalPath()), null, null, false, MAX_LINES);


    this.schema = Schema.parse(workingSchemaFile);
    DataInputStream in = new DataInputStream(localFS.open(new Path(workingParserFile.getCanonicalPath())));
    try {
      this.typeTree = InferredType.readType(in);
    } catch(IOException iex) {
      iex.printStackTrace();
      throw iex;
    } finally {
      in.close();
    }
    //System.err.println("Recovered unknowntext schema: " + schema);
  }


  public byte[] getPayload() {
    // Serialize the parser, return the resulting string
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    DataOutputStream out = new DataOutputStream(baos);
    try {
      this.typeTree.write(out);
      out.close();      
    } catch (IOException iex) {
      return new byte[0];
    } finally {
    }
    return baos.toByteArray();
  }


  /**
   * Iterate through Avro-encoded rows of the file
   */
  public Iterator getIterator() {
    return new Iterator() {
      int lineno = 0;
      BufferedReader in = null;
      Object nextElt = null;
      {
        try {
          in = new BufferedReader(new InputStreamReader(dd.getRawBytes()));
          nextElt = lookahead();
        } catch (IOException iex) {
          LOG.info("iex: " + iex.toString());
          nextElt = null;
        }
      }
      public boolean hasNext() {
        return nextElt != null;
      }
      public synchronized Object next() {
        Object toReturn = nextElt;
        nextElt = lookahead();
        return toReturn;
      }
      public void remove() {
        throw new UnsupportedOperationException();
      }
      Object lookahead() {
        try {
          String str = null;
          while ((str = in.readLine()) != null) {
            GenericContainer resultObj = typeTree.parse(str);
            lineno++;
            if (resultObj != null) {
              return resultObj;
            }
          }
          if (in != null) {
            in.close();
            in = null;
          }
        } catch (IOException iex) {
          iex.printStackTrace();
        }
        return null;
      }
    };
  }
  
  /**
   * @return a <code>String</code> that annotates the schema
   */
  public String getSchemaSourceDescription() {
    return SCHEMA_ID;
  }
}
// End of source code of com.cloudera.recordbreaker.analyzer.UnknownTextSchemaDescriptor

// Related classes of com.cloudera.recordbreaker.analyzer.UnknownTextSchemaDescriptor: (none listed)