Package org.apache.avro.file

Examples of org.apache.avro.file.DataFileReader
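
For orientation, here is a minimal, self-contained sketch of the DataFileWriter/DataFileReader round trip that the fragments below exercise (the schema and field name are illustrative, not taken from these tests):

    // imports assumed: org.apache.avro.Schema, org.apache.avro.file.*,
    // org.apache.avro.generic.*, java.io.ByteArrayOutputStream
    Schema schema = new Schema.Parser().parse(
        "{\"type\":\"record\",\"name\":\"Rec\",\"fields\":[{\"name\":\"id\",\"type\":\"int\"}]}");
    GenericData.Record rec = new GenericData.Record(schema);
    rec.put("id", 1);

    // Write an Avro container (header + embedded schema + data blocks) into memory.
    ByteArrayOutputStream bout = new ByteArrayOutputStream();
    DataFileWriter<GenericData.Record> writer =
        new DataFileWriter<GenericData.Record>(new GenericDatumWriter<GenericData.Record>(schema));
    writer.create(schema, bout);
    writer.append(rec);
    writer.close();

    // Read it back; the schema is recovered from the container header.
    DataFileReader<GenericData.Record> reader = new DataFileReader<GenericData.Record>(
        new SeekableByteArrayInput(bout.toByteArray()), new GenericDatumReader<GenericData.Record>());
    while (reader.hasNext()) {
      System.out.println(reader.next());
    }
    reader.close();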


      writer.append(record);
    }
    writer.flush();
    writer.close();

    DataFileReader<GenericData.Record> reader = new DataFileReader<GenericData.Record>(
        new ReadAvroContainerBuilder.ForwardOnlySeekableInputStream(new ByteArrayInputStream(bout.toByteArray())),
        new GenericDatumReader<GenericData.Record>());
    Schema schema2 = reader.getSchema();
    assertEquals(schema, schema2);
    for (GenericData.Record record : records) {
      assertTrue(reader.hasNext());
      GenericData.Record record2 = reader.next();
      assertEquals(record, record2);
    }
    assertFalse(reader.hasNext());
    reader.close();

    Record event = new Record();
    event.getFields().put(Fields.ATTACHMENT_BODY, new ByteArrayInputStream(bout.toByteArray()));
    morphline = createMorphline("test-morphlines/readAvroContainer");
    deleteAllDocuments();
    assertTrue(load(event));
    assertEquals(records.length, queryResultSetSize("*:*"));
       
    GenericDatumWriter<GenericData.Record> datumWriter = new GenericDatumWriter<GenericData.Record>(schema);
    bout = new ByteArrayOutputStream();
    Encoder encoder = EncoderFactory.get().binaryEncoder(bout, null);
    for (GenericData.Record record : records) {
      datumWriter.write(record, encoder);
    }
    encoder.flush();

    Decoder decoder = DecoderFactory.get().binaryDecoder(new ByteArrayInputStream(bout.toByteArray()), null);
    DatumReader<GenericData.Record> datumReader = new GenericDatumReader<GenericData.Record>(schema);
    for (int i = 0; i < records.length; i++) {
      GenericData.Record record3 = datumReader.read(null, decoder);
      assertEquals(records[i], record3);
    }
   
    event = new Record();
    event.getFields().put(Fields.ATTACHMENT_BODY, new ByteArrayInputStream(bout.toByteArray()));
    File tmp = new File("target/tmp-test-schema.avsc");
    try {
      tmp.deleteOnExit();
      Files.write(schema.toString(true), tmp, Charsets.UTF_8);
      morphline = createMorphline("test-morphlines/readAvroWithExternalSchema");
      deleteAllDocuments();   
      assertTrue(load(event));
      assertEquals(records.length, queryResultSetSize("*:*"));
    } finally {
      tmp.delete();
    }
       
    for (GenericData.Record record : records) {
      event = new Record();
      event.getFields().put(Fields.ATTACHMENT_BODY, record);
      morphline = createMorphline("test-morphlines/extractAvroTree");
      deleteAllDocuments();
      assertTrue(load(event));
      assertEquals(1, queryResultSetSize("*:*"));
    }
   
    String[] formats = new String[] {"", "AndSnappy"};
    for (String format : formats) {
      morphline = createMorphline("test-morphlines/writeAvroToByteArrayWithContainer" + format);
      event = new Record();
      event.getFields().putAll(Fields.ATTACHMENT_BODY, Arrays.asList(records));
      deleteAllDocuments();
      assertTrue(load(event));
      assertEquals(1, collector.getFirstRecord().get(Fields.ATTACHMENT_BODY).size());
      byte[] bytes = (byte[]) collector.getFirstRecord().getFirstValue(Fields.ATTACHMENT_BODY);
      assertNotNull(bytes);
      reader = new DataFileReader<GenericData.Record>(
          new ReadAvroContainerBuilder.ForwardOnlySeekableInputStream(new ByteArrayInputStream(bytes)),
          new GenericDatumReader<GenericData.Record>());
      assertEquals("bar", new String(reader.getMeta("foo"), Charsets.UTF_8));
      assertEquals("Nadja", new String(reader.getMeta("firstName"), Charsets.UTF_8));
      assertEquals(schema, reader.getSchema());
      for (GenericData.Record record : records) {
        assertTrue(reader.hasNext());
        GenericData.Record record2 = reader.next();
        assertEquals(record, record2);
      }
      assertFalse(reader.hasNext());
      reader.close();
    }
   
    formats = new String[] {"Binary", "JSON"};
    for (String format : formats) {
      morphline = createMorphline("test-morphlines/writeAvroToByteArrayWithContainerless" + format);
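
The getMeta("foo") and getMeta("firstName") assertions above read user metadata back out of the container header. On the writing side, that metadata (and the Snappy codec used by the "AndSnappy" variant) must be set before create(); a hedged sketch of what the write side looks like, not the morphline command's actual code:

    DataFileWriter<GenericData.Record> writer =
        new DataFileWriter<GenericData.Record>(new GenericDatumWriter<GenericData.Record>(schema));
    writer.setMeta("foo", "bar");                 // stored as UTF-8 bytes in the container header
    writer.setMeta("firstName", "Nadja");
    writer.setCodec(CodecFactory.snappyCodec());  // block compression; must precede create()
    writer.create(schema, bout);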


      Notifications.notifyBeginTransaction(morphline);
      assertTrue(morphline.process(record));
      assertEquals(1, collector.getNumStartEvents());
      assertEquals(2104, collector.getRecords().size());
     
      FileReader<GenericData.Record> reader = new DataFileReader<GenericData.Record>(file, new GenericDatumReader<GenericData.Record>());
      int i = 0;
      while (reader.hasNext()) {
        Record actual = collector.getRecords().get(i);
        GenericData.Record expected = reader.next();
        assertTweetEquals(expected, actual, fieldNames, i);
        i++;
      }   
      assertEquals(collector.getRecords().size(), i);
    }

  }
 
  private void runTweets(String morphlineConfigFile, String[] fieldNames) throws Exception {
    File file = new File(RESOURCES_DIR + "/test-documents/sample-statuses-20120906-141433-medium.avro");
    List<GenericData.Record> expecteds = new ArrayList<GenericData.Record>();
    FileReader<GenericData.Record> reader = new DataFileReader<GenericData.Record>(file, new GenericDatumReader<GenericData.Record>());
    Schema schema = reader.getSchema();
    while (reader.hasNext()) {
      GenericData.Record expected = reader.next();
      expecteds.add(expected);
    }
    reader.close();
    assertEquals(2104, expecteds.size());

    ByteArrayOutputStream bout = new ByteArrayOutputStream();
    Encoder encoder;
    if (morphlineConfigFile.contains("Json")) {
      encoder = EncoderFactory.get().jsonEncoder(schema, bout);
    } else {
      encoder = EncoderFactory.get().binaryEncoder(bout, null);
    }
    GenericDatumWriter<GenericData.Record> datumWriter = new GenericDatumWriter<GenericData.Record>(schema);
    for (GenericData.Record record : expecteds) {
      datumWriter.write(record, encoder);
    }
    encoder.flush();

    morphline = createMorphline(morphlineConfigFile);
    for (int j = 0; j < 3; j++) { // also test reuse of objects and low level avro buffers
      Record record = new Record();
      record.put(Fields.ATTACHMENT_BODY, bout.toByteArray());
      collector.reset();
      startSession();
      Notifications.notifyBeginTransaction(morphline);
      assertTrue(morphline.process(record));
      assertEquals(1, collector.getNumStartEvents());
      assertEquals(2104, collector.getRecords().size());
     
      reader = new DataFileReader<GenericData.Record>(file, new GenericDatumReader<GenericData.Record>());
      int i = 0;
      while (reader.hasNext()) {
        Record actual = collector.getRecords().get(i);
        GenericData.Record expected = reader.next();
        assertTweetEquals(expected, actual, fieldNames, i);
        i++;
      }
      reader.close();
      assertEquals(collector.getRecords().size(), i);
    }
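
For the containerless "JSON" variant above, the bytes produced by jsonEncoder carry no embedded schema, so reading them back requires supplying the writer schema up front. A hedged sketch of the decode side, reusing the schema, bout, and expecteds from the method above:

    Decoder decoder = DecoderFactory.get()
        .jsonDecoder(schema, new ByteArrayInputStream(bout.toByteArray()));
    DatumReader<GenericData.Record> datumReader =
        new GenericDatumReader<GenericData.Record>(schema);
    for (int i = 0; i < expecteds.size(); i++) {
      assertEquals(expecteds.get(i), datumReader.read(null, decoder));
    }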

    byte[] bytes;
    if (morphlineConfigFile.contains("Container")) {
      bytes = Files.toByteArray(file);
    } else {   
      List<GenericData.Record> expecteds = new ArrayList<GenericData.Record>();
      FileReader<GenericData.Record> reader = new DataFileReader<GenericData.Record>(file, new GenericDatumReader<GenericData.Record>());
      Schema schema = reader.getSchema();
      while (reader.hasNext()) {
        GenericData.Record expected = reader.next();
        expecteds.add(expected);
      }   
      assertEquals(2, expecteds.size());
 
      ByteArrayOutputStream bout = new ByteArrayOutputStream();

      DataFileReader<GenericContainer> reader = null;
      try {
        // TODO: for better performance subclass DataFileReader
        // to eliminate expensive SchemaParser.parse() on each new file in DataFileReader.initialize().
        // Instead replace the parse() with a lookup in the byte[] cache map.
        reader = new DataFileReader<GenericContainer>(new ForwardOnlySeekableInputStream(in), datumReader);
       
        byte[] writerSchemaBytes = reader.getMeta(DataFileConstants.SCHEMA);
        Preconditions.checkNotNull(writerSchemaBytes);
        ByteArrayKey writerSchemaKey = new ByteArrayKey(writerSchemaBytes);
        ResolvingDecoder resolver = resolverCache.get(writerSchemaKey); // cache for performance
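
The TODO above proposes caching parsed writer schemas keyed by their raw header bytes instead of re-parsing on every file. A minimal sketch of that idea, assuming a plain HashMap and reusing the ByteArrayKey wrapper and writerSchemaBytes from the surrounding code:

    // Hypothetical cache: raw writer-schema bytes -> parsed Schema, parsed at most once.
    Map<ByteArrayKey, Schema> schemaCache = new HashMap<ByteArrayKey, Schema>();

    Schema writerSchema = schemaCache.get(writerSchemaKey);
    if (writerSchema == null) {
      writerSchema = new Schema.Parser().parse(
          new String(writerSchemaBytes, StandardCharsets.UTF_8));
      schemaCache.put(writerSchemaKey, writerSchema);
    }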

      }     
    });  

    // fetch test input data and sort like solr result set
    List<GenericData.Record> records = new ArrayList<GenericData.Record>();
    FileReader<GenericData.Record> reader = new DataFileReader<GenericData.Record>(file, new GenericDatumReader<GenericData.Record>());
    while (reader.hasNext()) {
      GenericData.Record expected = reader.next();
      records.add(expected);
    }
    assertEquals(collector.getRecords().size(), records.size());   
    Collections.sort(records, new Comparator<GenericData.Record>() {
      @Override

      ls.inferRecordFormat(localFS, new Path(inputData.getCanonicalPath()), localFS, schemaFile, parseTreeFile, jsonDataFile, avroFile, false, lineCount);

      // Test the inferred structure
      // First, load in the avro file and see how many records there are.
      int avroCount = 0;
      DataFileReader<Object> in = new DataFileReader<Object>(new File(avroFile.toString()), new GenericDatumReader<Object>());
      try {
        Iterator<Object> it = in.iterator();
        while (it.hasNext()) {
          avroCount++;
          it.next();
        }
      } finally {
        in.close();
      }

      // Was the synthesized parser able to figure out the file?
      double parseRatio = avroCount / (1.0 * lineCount);
      return (parseRatio > MIN_PARSE_RATIO);

  }
  /**
   * Create the statistical summary object from data.
   */
  public Schema createSummaryFromData(File f) throws IOException {
    DataFileReader<Object> in = new DataFileReader<Object>(f, new GenericDatumReader<Object>());
    try {
      Schema s = in.getSchema();

      //
      // There has to be at least one data element for us to infer anything meaningful
      //
      Iterator<Object> it = in.iterator();
      if (! it.hasNext()) {
        throw new IOException("No contents");
      }

      //
      // We can only infer schemas from top-level records, not Fixeds or Arrays.
      //
      Object firstRecord = it.next();
      if (firstRecord instanceof GenericFixed ||
          firstRecord instanceof GenericArray) {
        throw new IOException("Not a top-level record");
      }

      // We assume the passed-in top-level Schema always represents a Record.
      if (s.getType() != Schema.Type.RECORD) {
        throw new IOException("Passed-in top-level Schema instance must be of type Schema.Type.RECORD");
      }
      this.root = buildStructure(s, "ROOT");

      //
      // Iterate through all records and collect statistics on each Schema field.
      //
      List<Schema.Field> fields = s.getFields();
      GenericRecord cur = (GenericRecord) firstRecord;
      int counter = 0;
      do {
        this.root.addData(cur);
        counter++;
        if (it.hasNext()) {
          cur = (GenericRecord) it.next();
        } else {
          cur = null;
        }
      } while (cur != null);

      this.root.computePreorder(-1);
      return s;
    } finally {
      in.close();
    }
  }
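
Since DataFileReader also implements Iterable, the explicit hasNext()/next() loops used throughout these examples can be written as for-each, and next(reuse) can recycle a single record instance across iterations. A brief sketch; process() is a hypothetical consumer:

    DataFileReader<GenericRecord> in =
        new DataFileReader<GenericRecord>(f, new GenericDatumReader<GenericRecord>());
    try {
      for (GenericRecord rec : in) {   // DataFileReader is Iterable
        process(rec);                  // hypothetical consumer
      }
      // Alternatively, recycle one instance: rec = in.next(rec) inside a hasNext() loop.
    } finally {
      in.close();
    }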

        }));
    args.addAll(Arrays.asList(extraArgs));
    run(args);

    DataFileReader<Object> reader =
      new DataFileReader<Object>(OUT_FILE, new GenericDatumReader<Object>());
   
    Iterator<Object> found = reader.iterator();
    for (Object expected :
           new RandomData(new Schema.Parser().parse(SCHEMA_FILE), Integer.parseInt(COUNT))) {
      assertEquals(expected, found.next());
    }

    reader.close();
  }
