Package com.cloudera.recordbreaker.analyzer

Source Code of com.cloudera.recordbreaker.analyzer.UnknownTextSchemaDescriptor

/*
* Copyright (c) 2011, Cloudera, Inc. All Rights Reserved.
*
* Cloudera, Inc. licenses this file to you under the Apache License,
* Version 2.0 (the "License"). You may not use this file except in
* compliance with the License. You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
* CONDITIONS OF ANY KIND, either express or implied. See the License for
* the specific language governing permissions and limitations under the
* License.
*/
package com.cloudera.recordbreaker.analyzer;

import java.util.List;
import java.util.Random;
import java.util.Iterator;
import java.util.ArrayList;
import java.util.NoSuchElementException;

import java.io.File;
import java.io.DataOutput;
import java.io.IOException;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.InputStreamReader;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;

import org.apache.avro.Schema;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.generic.GenericContainer;
import org.apache.avro.generic.GenericDatumReader;

import com.cloudera.recordbreaker.learnstructure.InferredType;
import com.cloudera.recordbreaker.learnstructure.LearnStructure;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.conf.Configuration;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/************************************************************************
* <code>UnknownTextSchemaDescriptor</code> returns schema data that we
* figure out from the data itself.
*
* @author "Michael Cafarella"
* @version 1.0
* @since 1.0
* @see SchemaDescriptor
*************************************************************************/
public class UnknownTextSchemaDescriptor extends GenericSchemaDescriptor {
  private static final Log LOG = LogFactory.getLog(UnknownTextSchemaDescriptor.class);   
  public static String SCHEMA_ID = "recordbreaker-recovered";
  public static int MAX_LINES = 1000;
  InferredType typeTree;

  public UnknownTextSchemaDescriptor(DataDescriptor dd) throws IOException {
    super(dd);
    // Superclass calls computeSchema()
  }

  public UnknownTextSchemaDescriptor(DataDescriptor dd, String schemaRepr, byte[] miscPayload) throws IOException {
    super(dd, schemaRepr);
    this.randId = new Random().nextInt();
   
    // Deserialize the payload string into the parser
    DataInputStream in = new DataInputStream(new ByteArrayInputStream(miscPayload));
    try {
      this.typeTree = InferredType.readType(in);
    } finally {
      in.close();
    }
  }

  int randId;
  void computeSchema() throws IOException {
    this.randId = new Random().nextInt();   
    LearnStructure ls = new LearnStructure();
    FileSystem fs = FSAnalyzer.getInstance().getFS();
    FileSystem localFS = FileSystem.getLocal(new Configuration());
    Path inputPath = dd.getFilename();

    File workingParserFile = File.createTempFile("textdesc", "typetree", null);
    File workingSchemaFile = File.createTempFile("textdesc", "schema", null);
   
    ls.inferRecordFormat(fs, inputPath, localFS, new Path(workingSchemaFile.getCanonicalPath()), new Path(workingParserFile.getCanonicalPath()), null, null, false, MAX_LINES);

    this.schema = Schema.parse(workingSchemaFile);
    DataInputStream in = new DataInputStream(localFS.open(new Path(workingParserFile.getCanonicalPath())));
    try {
      this.typeTree = InferredType.readType(in);
    } catch(IOException iex) {
      iex.printStackTrace();
      throw iex;
    } finally {
      in.close();
    }
    //System.err.println("Recovered unknowntext schema: " + schema);
  }

  public byte[] getPayload() {
    // Serialize the parser, return the resulting string
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    DataOutputStream out = new DataOutputStream(baos);
    try {
      this.typeTree.write(out);
      out.close();     
    } catch (IOException iex) {
      return new byte[0];
    } finally {
    }
    return baos.toByteArray();
  }

  /**
   * Iterate through Avro-encoded rows of the file
   */
  public Iterator getIterator() {
    return new Iterator() {
      int lineno = 0;
      BufferedReader in = null;
      Object nextElt = null;
      {
        try {
          in = new BufferedReader(new InputStreamReader(dd.getRawBytes()));
          nextElt = lookahead();
        } catch (IOException iex) {
          LOG.info("iex: " + iex.toString());
          nextElt = null;
        }
      }
      public boolean hasNext() {
        return nextElt != null;
      }
      public synchronized Object next() {
        Object toReturn = nextElt;
        nextElt = lookahead();
        return toReturn;
      }
      public void remove() {
        throw new UnsupportedOperationException();
      }
      Object lookahead() {
        try {
          String str = null;
          while ((str = in.readLine()) != null) {
            GenericContainer resultObj = typeTree.parse(str);
            lineno++;
            if (resultObj != null) {
              return resultObj;
            }
          }
          if (in != null) {
            in.close();
            in = null;
          }
        } catch (IOException iex) {
          iex.printStackTrace();
        }
        return null;
      }
    };
  }
 
  /**
   * @return a <code>String</code> that annotates the schema
   */
  public String getSchemaSourceDescription() {
    return SCHEMA_ID;
  }
}
TOP

Related Classes of com.cloudera.recordbreaker.analyzer.UnknownTextSchemaDescriptor

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.