Package org.commoncrawl.protocol.shared

Examples of org.commoncrawl.protocol.shared.ArcFileItem


    List<TestRecord> records = splits.get(splitDataIndex).e1;
   
    int itemIndex = 0;
    // iterate and validate stuff ...
    Text key = new Text();
    ArcFileItem value = new ArcFileItem();
    while (reader.next(key, value)) {
     
      TestRecord testRecord = records.get(itemIndex++);
     
      // get test key bytes as utf-8 bytes ...
      byte[] testKeyBytes = testRecord.url.getBytes(Charset.forName("UTF-8"));
      // compare against raw key bytes to validate key is the same (Text's utf-8 mapping code replaces invalid characters
      // with ?, which causes our test case, which does use invalid characters to form the key, to break).
      Assert.assertTrue(ArcFileReaderTests.compareTo(testKeyBytes,0,testKeyBytes.length,key.getBytes(),0,key.getLength()) == 0);
      // returned bytes represent the header (encoded in utf-8), terminated by a \r\n\r\n. The content follows this terminator
      // we search for this specific byte pattern to locate start of content, then compare it against source ...
      Assert.assertTrue(ArcFileReaderTests.compareTo(testRecord.data,0,testRecord.data.length,value.getContent().getReadOnlyBytes(),value.getContent().getOffset(),value.getContent().getCount()) == 0);
      NIOHttpHeaders headers = ArcFileItemUtils.buildHeaderFromArcFileItemHeaders(value.getHeaderItems());
      // validate metadata
      Assert.assertEquals("text/html",headers.findValue(Constants.ARCFileHeader_ARC_MimeType));
      Assert.assertEquals(value.getArcFilePos(),testRecord.streamPos);
      Assert.assertEquals(value.getArcFileSize(),testRecord.rawSize);
      Assert.assertEquals("test-value", headers.findValue("test"));
      Assert.assertEquals(value.getArcFileName(),((FileSplit)split).getPath().getName());
     
    }
    reader.close();
   
    Assert.assertEquals(itemIndex,ArcFileReaderTests.BASIC_TEST_RECORD_COUNT);
View Full Code Here


    int itemIndex = 0;
    // iterate and validate stuff ...
    while (reader.nextKeyValue()) {
     
      Text key = reader.getCurrentKey();
      ArcFileItem value = reader.getCurrentValue();
     
      TestRecord testRecord = records.get(itemIndex++);
     
      // get test key bytes as utf-8 bytes ...
      byte[] testKeyBytes = testRecord.url.getBytes(Charset.forName("UTF-8"));
      // compare against raw key bytes to validate key is the same (Text's utf-8 mapping code replaces invalid characters
      // with ?, which causes our test case, which does use invalid characters to form the key, to break).
      Assert.assertTrue(ArcFileReaderTests.compareTo(testKeyBytes,0,testKeyBytes.length,key.getBytes(),0,key.getLength()) == 0);
      // returned bytes represent the header (encoded in utf-8), terminated by a \r\n\r\n. The content follows this terminator
      // we search for this specific byte pattern to locate start of content, then compare it against source ...
      Assert.assertTrue(ArcFileReaderTests.compareTo(testRecord.data,0,testRecord.data.length,value.getContent().getReadOnlyBytes(),value.getContent().getOffset(),value.getContent().getCount()) == 0);
      NIOHttpHeaders headers = ArcFileItemUtils.buildHeaderFromArcFileItemHeaders(value.getHeaderItems());
      // validate metadata
      Assert.assertEquals("text/html",headers.findValue(Constants.ARCFileHeader_ARC_MimeType));
      Assert.assertEquals(value.getArcFilePos(),testRecord.streamPos);
      Assert.assertEquals(value.getArcFileSize(),testRecord.rawSize);
      Assert.assertEquals("test-value", headers.findValue("test"));
      Assert.assertEquals(value.getArcFileName(),((FileSplit)split).getPath().getName());
     
    }
    reader.close();
   
    Assert.assertEquals(itemIndex,ArcFileReaderTests.BASIC_TEST_RECORD_COUNT);
View Full Code Here

  // test routines
  // ////////////////////////////////////////////////////////////////////////////////

  public void checkCRLFStateMachine() throws Exception {

    ArcFileItem item = new ArcFileItem();
    ArcFileBuilder builder = new ArcFileBuilder(item);

    Assert.assertFalse(builder.checkForCRLFTerminator((byte) '\r'));
    Assert.assertFalse(builder.checkForCRLFTerminator((byte) '\n'));
    Assert.assertFalse(builder.checkForCRLFTerminator((byte) '\r'));
View Full Code Here

      public void run() {
        try {

          while (hasMoreItems()) {
            ArcFileItem item = new ArcFileItem();

            getNextItem(item);

            LOG.info("GOT Item URL:" + item.getUri() + " StreamPos:"
                + item.getArcFilePos() + " Content Length:"
                + item.getContent().getCount());
            for (ArcFileHeaderItem headerItem : item.getHeaderItems()) {
              if (headerItem.isFieldDirty(ArcFileHeaderItem.Field_ITEMKEY)) {
                // LOG.info("Header Item:" + headerItem.getItemKey() + " :" +
                // headerItem.getItemValue());
              } else {
                // LOG.info("Header Item:" + headerItem.getItemValue());
View Full Code Here

    final ARCFileRecordReader reader = new ARCFileRecordReader();
    reader.initialize(split, context);
   
    return new RecordReader<Text, ArcFileItem>() {

      final ArcFileItem currentValue = new ArcFileItem();
     
      /**
       * Closes the wrapped {@code ARCFileRecordReader}.
       *
       * @throws IOException if the wrapped reader's {@code close()} throws
       */
      @Override
      public void close() throws IOException {
        reader.close();
      }

      /**
       * Returns the current key as reported by the wrapped reader.
       *
       * @return the wrapped reader's current key
       */
      @Override
      public Text getCurrentKey() throws IOException, InterruptedException {
        return reader.getCurrentKey();
      }

      /**
       * Returns the item populated by the most recent successful
       * {@code nextKeyValue()} call.
       *
       * @return the single {@code currentValue} instance held by this reader;
       *         the same object is returned on every call
       */
      @Override
      public ArcFileItem getCurrentValue() throws IOException,
          InterruptedException {
        return currentValue;
      }

      /**
       * Returns the progress value reported by the wrapped reader.
       *
       * @return the wrapped reader's progress
       */
      @Override
      public float getProgress() throws IOException, InterruptedException {
        return reader.getProgress();
      }

      /**
       * No-op. The wrapped reader has already been initialized with the
       * enclosing method's split and context before this reader is created, so
       * the arguments passed here are ignored.
       */
      @Override
      public void initialize(InputSplit split, TaskAttemptContext context)throws IOException, InterruptedException {
        // nothing to do: see reader.initialize(split, context) in the enclosing method
      }

      @Override
      public boolean nextKeyValue() throws IOException, InterruptedException {
        long preReadPos = reader.reader.getPosition();
        if (reader.nextKeyValue()) {
          long postReadPos = reader.reader.getPosition();
        
          // extract item from raw bytes writable
          ArcFileItemUtils.bytesWritableToArcFileItem(reader.getCurrentKey(), reader.getCurrentValue(), currentValue);
          // set up arc file related fields
          currentValue.setArcFileName(((FileSplit)split).getPath().getName());
          currentValue.setArcFilePos((int)preReadPos);
          currentValue.setArcFileSize((int)(postReadPos-preReadPos));
         
          return true;
        }
        return false;
      }
View Full Code Here

      }

      /**
       * Supplies a value object for callers to read records into.
       *
       * @return a newly constructed {@code ArcFileItem}
       */
      @Override
      public ArcFileItem createValue() {
        return new ArcFileItem();
      }

      @Override
      public long getPos() throws IOException {
        return reader.getPos();
View Full Code Here

  /**
   * {@inheritDoc}
   *
   * @return a newly constructed {@code ArcFileItem}
   */
  public ArcFileItem createValue() {
    return new ArcFileItem();
  }
View Full Code Here

TOP

Related Classes of org.commoncrawl.protocol.shared.ArcFileItem

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.