Package org.commoncrawl.io.shared

Examples of org.commoncrawl.io.shared.NIOHttpHeaders$HeaderIterator


      // write the ARC File into memory
      ArcFileReaderTests.writeFirstRecord(os, "test", System.currentTimeMillis());
      long streamPos = os.getPos();
     
      long testAttemptTime = System.currentTimeMillis();
      NIOHttpHeaders testHeaders = new NIOHttpHeaders();
      testHeaders.add("test", "test-value");
     
      for (TestRecord record : recordSet) {
        long preWritePos = os.getPos();
        ArcFileReaderTests.write(os,record.url,"test",1,1,record.data,0,record.data.length,testHeaders,"text/html",MD5Hash.digest(record.data).toString(),12345,testAttemptTime);
        long postWritePos = os.getPos();
View Full Code Here


      // with ?, which causes our test case (which does use invalid characters to form the key) to break.
      Assert.assertTrue(ArcFileReaderTests.compareTo(testKeyBytes,0,testKeyBytes.length,key.getBytes(),0,key.getLength()) == 0);
      // returned bytes represent the header (encoded in utf-8), terminated by a \r\n\r\n. The content follows this terminator
      // we search for this specific byte pattern to locate start of content, then compare it against source ...
      Assert.assertTrue(ArcFileReaderTests.compareTo(testRecord.data,0,testRecord.data.length,value.getContent().getReadOnlyBytes(),value.getContent().getOffset(),value.getContent().getCount()) == 0);
      NIOHttpHeaders headers = ArcFileItemUtils.buildHeaderFromArcFileItemHeaders(value.getHeaderItems());
      // validate metadata
      Assert.assertEquals("text/html",headers.findValue(Constants.ARCFileHeader_ARC_MimeType));
      Assert.assertEquals(value.getArcFilePos(),testRecord.streamPos);
      Assert.assertEquals(value.getArcFileSize(),testRecord.rawSize);
      Assert.assertEquals("test-value", headers.findValue("test"));
      Assert.assertEquals(value.getArcFileName(),((FileSplit)split).getPath().getName());
     
    }
    reader.close();
   
View Full Code Here

      ArcFileReaderTests.writeFirstRecord(os, "test", System.currentTimeMillis());
     
      long testAttemptTime = System.currentTimeMillis();
     
      for (TestRecord record : records) {
        ArcFileReaderTests.write(os,record.url,"test",1,1,record.data,0,record.data.length,new NIOHttpHeaders(),"text/html",MD5Hash.digest(record.data).toString(),12345,testAttemptTime);
      }
      os.flush();
    }
    finally {
      os.close();
View Full Code Here

      // write the ARC File into memory
      ArcFileReaderTests.writeFirstRecord(os, "test", System.currentTimeMillis());
     
      long testAttemptTime = System.currentTimeMillis();
     
      NIOHttpHeaders testHeaders = new NIOHttpHeaders();
      testHeaders.add("test", "test-value");
     
      for (TestRecord record : recordSet) {
        long preWritePos = os.getPos();
        ArcFileReaderTests.write(os,record.url,"test",1,1,record.data,0,record.data.length,testHeaders,"text/html",MD5Hash.digest(record.data).toString(),12345,testAttemptTime);
        long postWritePos = os.getPos();
View Full Code Here

      // with ?, which causes our test case (which does use invalid characters to form the key) to break.
      Assert.assertTrue(ArcFileReaderTests.compareTo(testKeyBytes,0,testKeyBytes.length,key.getBytes(),0,key.getLength()) == 0);
      // returned bytes represent the header (encoded in utf-8), terminated by a \r\n\r\n. The content follows this terminator
      // we search for this specific byte pattern to locate start of content, then compare it against source ...
      Assert.assertTrue(ArcFileReaderTests.compareTo(testRecord.data,0,testRecord.data.length,value.getContent().getReadOnlyBytes(),value.getContent().getOffset(),value.getContent().getCount()) == 0);
      NIOHttpHeaders headers = ArcFileItemUtils.buildHeaderFromArcFileItemHeaders(value.getHeaderItems());
      // validate metadata
      Assert.assertEquals("text/html",headers.findValue(Constants.ARCFileHeader_ARC_MimeType));
      Assert.assertEquals(value.getArcFilePos(),testRecord.streamPos);
      Assert.assertEquals(value.getArcFileSize(),testRecord.rawSize);
      Assert.assertEquals("test-value", headers.findValue("test"));
      Assert.assertEquals(value.getArcFileName(),((FileSplit)split).getPath().getName());
     
    }
    reader.close();
   
View Full Code Here

      writeFirstRecord(os, "test", timestamp);
      List<TestRecord> records = buildTestRecords(BASIC_TEST_RECORD_COUNT);
      long testAttemptTime = System.currentTimeMillis();
     
      for (TestRecord record : records) {
        NIOHttpHeaders headers = new NIOHttpHeaders();
        for (int i=0;i<record.headers.size();++i) {
          headers.set(record.headers.get(i).e0,record.headers.get(i).e1);
        }
       
        write(os,record.url,"test",1,1,record.data,0,record.data.length,headers,"text/html",MD5Hash.digest(record.data).toString(),12345,testAttemptTime);
      }
      os.flush();
      os.close();
     
      final AtomicBoolean streamClosed = new AtomicBoolean();
      // setup ArcFileReader to read the file
      InputStream in = new ByteArrayInputStream(os.getData(),0,os.getLength()) {
       
        public synchronized int read(byte b[], int off, int len) {
          len = 1;
          return super.read(b, off, len);
        }
       
        public void close() throws IOException {
          super.close();
          streamClosed.set(true);
        }
      };
      ARCFileReader reader = new ARCFileReader(in);
      int index = 0;
      Text key = new Text();
      BytesWritable value = new BytesWritable();
     
      // iterate and validate stuff ...
      while (reader.hasMoreItems()) {
        reader.nextKeyValue(key, value);
        TestRecord testRecord = records.get(index++);
        // get test key bytes as utf-8 bytes ...
        byte[] testKeyBytes = testRecord.url.getBytes(Charset.forName("UTF-8"));
        // compare against raw key bytes to validate key is the same (Text's utf-8 mapping code replaces invalid characters
        // with ?, which causes our test case (which does use invalid characters to form the key) to break.
        Assert.assertTrue(compareTo(testKeyBytes,0,testKeyBytes.length,key.getBytes(),0,key.getLength()) == 0);
        // returned bytes represent the header (encoded in utf-8), terminated by a \r\n\r\n. The content follows this terminator
        // we search for this specific byte pattern to locate start of content, then compare it against source ...
        int indexofHeaderTerminator = ByteArrayUtils.indexOf(value.getBytes(), 0, value.getLength(), "\r\n\r\n".getBytes());
        if (indexofHeaderTerminator == -1) {
          throw new IOException("No Header Terminator found in Value!");
        }
        indexofHeaderTerminator += 4;
        // read headers ...
        String headersText = new String(value.getBytes(),0,indexofHeaderTerminator,Charset.forName("UTF-8"));
        NIOHttpHeaders headers = NIOHttpHeaders.parseHttpHeaders(headersText);
        for (int i=0;i<testRecord.headers.size();++i) {
          Pair<String,String> testHeaderRecord = testRecord.headers.get(i);
          Assert.assertNotNull(headers.findValue(testHeaderRecord.e0));
          Assert.assertEquals(testHeaderRecord.e1,headers.findValue(testHeaderRecord.e0));
        }
       
        Assert.assertTrue(compareTo(testRecord.data,0,testRecord.data.length,value.getBytes(),indexofHeaderTerminator,testRecord.data.length) == 0);
      }
      reader.close();
View Full Code Here

      ArcFileReaderTests.writeFirstRecord(os, "test", System.currentTimeMillis());
     
      long testAttemptTime = System.currentTimeMillis();
     
      for (TestRecord record : records) {
        ArcFileReaderTests.write(os,record.url,"test",1,1,record.data,0,record.data.length,new NIOHttpHeaders(),"text/html",MD5Hash.digest(record.data).toString(),12345,testAttemptTime);
      }
      os.flush();
    }
    finally {
      os.close();
View Full Code Here

  private static final Log LOG = LogFactory.getLog(ArcFileItemUtils.class);


  public static NIOHttpHeaders buildHeaderFromArcFileItemHeaders(
      ArrayList<ArcFileHeaderItem> items) {
    NIOHttpHeaders headers = new NIOHttpHeaders();

    for (ArcFileHeaderItem headerItem : items) {
      headers.add(headerItem.getItemKey(), headerItem.getItemValue());
    }

    return headers;
  }
View Full Code Here

    int headerLen = indexOfTrailingCRLF + 4;
    int contentLen = rawArcPayload.getLength() - headerLen;
   
    // parse headers
    String headerStr = new TextBytes(rawArcPayload.getBytes(),0,headerLen,true).toString();
    NIOHttpHeaders headers = NIOHttpHeaders.parseHttpHeaders(headerStr);
   
    // extract appropriate header values to populate ArcFileItem
    arcFileItem.getUriAsTextBytes().set(key, true);
    arcFileItem.setHostIP(headers.findValue(Constants.ARCFileHeader_HostIP));
    try {
      arcFileItem.setTimestamp(TIMESTAMP14.parse(headers.findValue(Constants.ARCFileHeader_ARC_Timestamp)).getTime());
    } catch (Exception e) {
      LOG.error("Invalid Timestamp Encountered in Item Metdata. URL:"
          + arcFileItem.getUri() + " Timestamp:" + headers.findValue(Constants.ARCFileHeader_ARC_Timestamp) );
    }
    arcFileItem.setMimeType(headers.findValue(Constants.ARCFileHeader_ARC_MimeType));
    arcFileItem.setRecordLength(rawArcPayload.getLength());
   
    //populate headers
    for (int i=0;i<headers.getKeyCount();++i) {
      String headerKey = headers.getKey(i);
      String headerValue = headers.getValue(i);
     
      ArcFileHeaderItem headerItem = new ArcFileHeaderItem();
     
      headerItem.setItemKey((headerKey != null) ? headerKey : "");
      headerItem.setItemValue((headerValue != null) ? headerValue : "");
View Full Code Here

      // set up the item as the context for the connection ...
      connection.setContext(item);
      // we don't want to populate default http headers ...
      connection.setPopulateDefaultHeaderItems(false);
      // get at headers object
      NIOHttpHeaders headers = connection.getRequestHeaders();
      // populate http request string
      headers.prepend("GET" + " " + theURL.getFile() +" "  + "HTTP/1.1", null);
      // populate host entry ...
      if (theURL.getPort() != -1 && theURL.getPort() != 80) {
        headers.set("Host",theURL.getHost() +":"+String.valueOf(theURL.getPort()));
      }
      else {
        headers.set("Host",theURL.getHost());
      }
      // create a tree map in parallel (to pass to canonicalization routine for s3 auth)
      Map amazonHeaders = new TreeMap();
     
      // add date ...
      String theDate = httpDate();
     
      headers.set("Date", theDate);
      // and set it in amazon headers ...
      addToAmazonHeader("Date", theDate, amazonHeaders);
      // add requester pays if specified ...
      if (_isRequesterPays) {
        headers.set("x-amz-request-payer", "requester");
        addToAmazonHeader("x-amz-request-payer", "requester",amazonHeaders);
      }

      String canonicalString =  S3Utils.makeCanonicalString("GET", _s3BucketName, item.getKey(), null,amazonHeaders );
      String encodedCanonical = S3Utils.encode(_s3SecretKey, canonicalString, false);
     
      // add auth string to headers ...
      headers.set("Authorization","AWS " + _s3AccessId + ":" + encodedCanonical);
     
      // figure out if this is a continuation ...
      if (item.isContinuation()) {
        // figure out where to start ...
        String rangeString = "bytes=" + item.getLastReadPos() + "-" + item.getContentLength();
        // set the range header ...
        headers.set("Range",rangeString);
        // and if etag is valid ...
        if (item.getLastKnownETag() != null) {
          headers.set("If-match",item.getLastKnownETag());
        }
      }
      // add cache control pragmas ...
      headers.set ("Connection", "close");
      headers.set("Cache-Control", "no-cache");
      headers.set("Pragma", "no-cache");
      headers.remove("Accept-Encoding");
      headers.set("Accept-Encoding","identity");
     
      // set up the listener relationship
      connection.setListener(this);
      // and open the  connection
      connection.open();
View Full Code Here

TOP

Related Classes of org.commoncrawl.io.shared.NIOHttpHeaders$HeaderIterator

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.