// Source Code of org.commoncrawl.util.shared.ArcFileReaderTests$CompressedStream

package org.commoncrawl.util.shared;


/**
* Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * 
 **/


import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.charset.Charset;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.List;
import java.util.Random;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.zip.GZIPOutputStream;


import junit.framework.Assert;


import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.MD5Hash;
import org.apache.hadoop.io.Text;
import org.commoncrawl.crawl.common.shared.Constants;
import org.commoncrawl.io.shared.NIOHttpHeaders;
import org.commoncrawl.protocol.shared.ArcFileItem;
import org.commoncrawl.util.shared.ByteArrayUtils;
import org.commoncrawl.util.shared.CCStringUtils;
import org.commoncrawl.util.shared.GZIPUtils;
import org.commoncrawl.util.shared.GZIPUtils.UnzipResult;
import org.commoncrawl.util.shared.IPAddressUtils;
import org.commoncrawl.util.shared.Tuples.Pair;
import org.junit.Test;


import com.google.common.collect.Lists;


/** 
 * ARCFileReader tests
 * 
 * @author rana
 *
 */
public class ArcFileReaderTests {


  // Commons-logging logger used by the static writer helpers in this class to
  // report rejected records and metadata-line failures.
  private static final Log              LOG                      = LogFactory
      .getLog(ArcFileReaderTests.class);
  
  static String getMetaLine(String uri,String arcFileName, String contentType,
      String hostIP, long fetchBeginTimeStamp, long recordLength)
      throws IOException {


    if (fetchBeginTimeStamp <= 0) {
      throw new IOException("Bogus fetchBeginTimestamp: "
          + Long.toString(fetchBeginTimeStamp));
    }


    return createMetaline(uri, arcFileName,hostIP, TIMESTAMP14.format(new Date(
        fetchBeginTimeStamp)), contentType, Long.toString(recordLength));
  }
  
  // Formatter producing the 14-digit "yyyyMMddHHmmss" timestamps that are
  // written into ARC header lines.
  // NOTE(review): SimpleDateFormat is not thread-safe and this shared instance
  // is non-final; no time zone is set on it here, so it formats in the JVM
  // default zone — confirm both are acceptable for callers.
  static SimpleDateFormat       TIMESTAMP14              = new SimpleDateFormat(
      "yyyyMMddHHmmss");


  // Separator placed between the fields of a record's meta line.
  static final char             HEADER_FIELD_SEPARATOR   = ' ';
  // Charset name used to encode each record's meta line.
  static final String           UTF8                     = "UTF-8";
  // Terminates header lines and is written after each record's content.
  static final char             LINE_SEPARATOR           = '\n';
  // Bytes spliced in right after the 10 fixed GZIP header bytes of the first
  // record. Read against RFC 1952 this looks like XLEN=8 (little endian)
  // followed by one 'L','X' subfield carrying 4 zero bytes — verify against
  // the RFC before changing.
  static final byte[]           ARC_GZIP_EXTRA_FIELD     = { 8, 0, 'L',
    'X', 4, 0, 0, 0, 0, 0                                     };
  // Charset name used to encode the first (file description) record.
  static final String           DEFAULT_ENCODING         = "ISO-8859-1";
  // Prefix of the first record's header line; the ARC file name follows it.
  static final String           ARC_MAGIC_NUMBER         = "filedesc://";


  /**
   * An override so we get access to underlying output stream and offer an end()
   * that does not accompany closing underlying stream.
   * 
   * @author stack
   */
  static class CompressedStream extends GZIPOutputStream {
    public CompressedStream(OutputStream out) throws IOException {
      super(out);
    }


    /**
     * @return Reference to stream being compressed.
     */
    OutputStream getWrappedStream() {
      return this.out;
    }


    /**
     * Release the deflater's native process resources, which otherwise would
     * not occur until either finalization or DeflaterOutputStream.close()
     * (which would also close underlying stream).
     */
    public void end() {
      def.end();
    }
  }
  
  static String createMetaline(String uri,String arcFileName, String hostIP,
      String timeStamp, String mimetype, String recordLength) {
    return uri + HEADER_FIELD_SEPARATOR + hostIP + HEADER_FIELD_SEPARATOR
        + timeStamp + HEADER_FIELD_SEPARATOR + mimetype
        + HEADER_FIELD_SEPARATOR + recordLength + LINE_SEPARATOR;
  }
  
  static byte[] generateARCFileMetaData(String arcFileName,String date) throws IOException {


    String metadataHeaderLinesTwoAndThree = getMetadataHeaderLinesTwoAndThree("1 "
        + "0");
    int recordLength = metadataHeaderLinesTwoAndThree
        .getBytes(DEFAULT_ENCODING).length;
    String metadataHeaderStr = ARC_MAGIC_NUMBER + arcFileName
        + " 0.0.0.0 " + date + " text/plain " + recordLength
        + metadataHeaderLinesTwoAndThree;


    ByteArrayOutputStream metabaos = new ByteArrayOutputStream(recordLength);


    // Write the metadata header.
    metabaos.write(metadataHeaderStr.getBytes(DEFAULT_ENCODING));
    // Write out a LINE_SEPARATORs to end this record.
    metabaos.write(LINE_SEPARATOR);


    // Now get bytes of all just written and compress if flag set.
    byte[] bytes = metabaos.toByteArray();


    // GZIP the header but catch the gzipping into a byte array so we
    // can add the special IA GZIP header to the product. After
    // manipulations, write to the output stream (The JAVA GZIP
    // implementation does not give access to GZIP header. It
    // produces a 'default' header only). We can get away w/ these
    // maniupulations because the GZIP 'default' header doesn't
    // do the 'optional' CRC'ing of the header.


    byte[] gzippedMetaData = gzip(bytes);


    if (gzippedMetaData[3] != 0) {
      throw new IOException("The GZIP FLG header is unexpectedly "
          + " non-zero.  Need to add smarter code that can deal "
          + " when already extant extra GZIP header fields.");
    }


    // Set the GZIP FLG header to '4' which says that the GZIP header
    // has extra fields. Then insert the alex {'L', 'X', '0', '0', '0,
    // '0'} 'extra' field. The IA GZIP header will also set byte
    // 9 (zero-based), the OS byte, to 3 (Unix). We'll do the same.
    gzippedMetaData[3] = 4;
    gzippedMetaData[9] = 3;


    byte[] assemblyBuffer = new byte[gzippedMetaData.length
        + ARC_GZIP_EXTRA_FIELD.length];
    // '10' in the below is a pointer past the following bytes of the
    // GZIP header: ID1 ID2 CM FLG + MTIME(4-bytes) XFL OS. See
    // RFC1952 for explaination of the abbreviations just used.
    System.arraycopy(gzippedMetaData, 0, assemblyBuffer, 0, 10);
    System.arraycopy(ARC_GZIP_EXTRA_FIELD, 0, assemblyBuffer, 10,
        ARC_GZIP_EXTRA_FIELD.length);
    System.arraycopy(gzippedMetaData, 10, assemblyBuffer,
        10 + ARC_GZIP_EXTRA_FIELD.length, gzippedMetaData.length - 10);
    bytes = assemblyBuffer;


    //System.out.println("Header Bytes:" + HexDump.dumpHexString(bytes));
    return bytes;
  }
  
  static String getMetadataHeaderLinesTwoAndThree(String version) {
    StringBuffer buffer = new StringBuffer();
    buffer.append(LINE_SEPARATOR);
    buffer.append(version);
    buffer.append(" CommonCrawl");
    buffer.append(LINE_SEPARATOR);
    buffer.append("URL IP-address Archive-date Content-type Archive-length");
    buffer.append(LINE_SEPARATOR);
    return buffer.toString();
  }
  
  /**
   * Gzip passed bytes. Use only when bytes is small.
   * 
   * @param bytes
   *          What to gzip.
   * @return A gzip member of bytes.
   * @throws IOException
   */
  static byte[] gzip(byte[] bytes) throws IOException {
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    GZIPOutputStream gzipOS = new GZIPOutputStream(baos);
    gzipOS.write(bytes, 0, bytes.length);
    gzipOS.close();
    return baos.toByteArray();
  }
  
  
  public static void writeFirstRecord(final OutputStream os,final String fileName,long ts) throws IOException {
    os.write(generateARCFileMetaData(fileName,TIMESTAMP14.format(new Date(System.currentTimeMillis()))));
  }


  public static boolean write(OutputStream os,String normalizedURL,String arcFileName, int segmentid, int crawlNumber,byte[] crawlData,int crawlDataOffset,int crawlDataLen,NIOHttpHeaders headers, String contentType,
      String signature, int hostIP,long lastAttemptTime) throws IOException {


    String encodedURI = normalizedURL;


    String hostIPStr = IPAddressUtils.IntegerToIPAddressString(hostIP);
    long fetchBeginTimestamp = lastAttemptTime;
    String encoding = headers.findValue("Content-Encoding");
    String truncationFlags = "";




    {


      if (crawlData != null && encoding != null
          && encoding.equalsIgnoreCase("gzip")) {
        int compressedSize = crawlData.length;
        try {
          UnzipResult result = GZIPUtils.unzipBestEffort(crawlData,2 << 20);


          crawlData = result.data;
          crawlDataOffset = 0;
          crawlDataLen = result.data.length;


          if (result.wasTruncated) {
            if (truncationFlags.length() != 0)
              truncationFlags += ",";
            truncationFlags += ArcFileItem.Flags
                .toString(ArcFileItem.Flags.TruncatedInInflate);
          }
        } catch (Exception e) {
          LOG.error("URL:" + normalizedURL
              + " Rejected - GZIP Decompression Failed");
          crawlData = null;
        }
      }


      // content must not be null
      if (crawlData == null) {
        LOG.error("URL:" + normalizedURL + " Rejected - Content is NULL");
      } else {


        // add in our custom headers ...
        headers.add(Constants.ARCFileHeader_ParseSegmentId,
            ((Integer) segmentid).toString());
        headers.add(Constants.ARCFileHeader_OriginalURL, normalizedURL);


        headers.add(Constants.ARCFileHeader_Signature, signature);
        headers.add(Constants.ARCFileHeader_CrawlNumber, Integer
            .toString(crawlNumber));
        headers.add(Constants.ARCFileHeader_FetchTimeStamp, Long
            .toString(fetchBeginTimestamp));
        // headers.add(Environment.ARCFileHeader_CrawlerId,
        // Integer.toString((int)urlItem.get));


        if (truncationFlags.length() != 0) {
          headers
              .add(Constants.ARCFileHeader_ContentTruncated, truncationFlags);
        }


        String headerString = headers.toString() + "\r\n";


        byte[] headerBytes = headerString.getBytes("UTF-8");


        // content is truncated further upstream, so this redundant check /
        // truncation is problematic
        // int contentLength = Math.min(crawlData.length,CONTENT_SIZE_LIMIT);


        // extract metadata line upfront, since if the url exceeds a certain
        // size limit , we are going to reject the entry...
        byte metaDataLine[];


        try {
          metaDataLine = getMetaLine(encodedURI,arcFileName, contentType, hostIPStr,
              fetchBeginTimestamp, crawlDataLen + headerBytes.length).getBytes(
              UTF8);
        } catch (IOException e) {
          LOG.error("Metadata Line Validation FAILED with Exception:"
              + CCStringUtils.stringifyException(e));
          // bail here ...
          return false;
        }


        // get ready to write out a new gziped entry ...
        OutputStream compressedStream = preWriteRecordTasks(os,headerBytes.length, crawlDataLen, contentType);
        try {
          // read to write an entry ...
          compressedStream.write(metaDataLine);


          // write out the headers ...
          compressedStream.write(headerBytes, 0, headerBytes.length);
          // write out the content
          compressedStream.write(crawlData, 0, crawlDataLen);
          // line separator ...
          compressedStream.write(LINE_SEPARATOR);


        } finally {
          // flush the gzip stream...
          postWriteRecordTasks(compressedStream);
        }
      }
      
      return true;
    }
    
  }
  
  static OutputStream preWriteRecordTasks(OutputStream os,int headerBytesLength,
      int contentBytesLength, String contentType) throws IOException {


    // Wrap stream in GZIP Writer.
    // The below construction immediately writes the GZIP 'default'
    // header out on the underlying stream.
    return new CompressedStream(os);
  }


  static OutputStream postWriteRecordTasks(OutputStream os) throws IOException {
    CompressedStream o = (CompressedStream) os;
    o.finish();
    o.flush();
    o.end();
    return o.getWrappedStream();
  }
  static String randomConstrainedString(final Random random,char validChars[],final int minLength, final int maxLength) {
    final int length = random.nextInt(maxLength - minLength) + minLength;
    final char[] chars = new char[length];
    for (int i = 0, x = chars.length; i < x; )
      chars[i++] = validChars[random.nextInt(validChars.length)];
    return new String(chars);
  }
  
  static String randomString(final Random random,
      final int minLength, final int maxLength) {
      final int length = random.nextInt(maxLength - minLength) + minLength;
      final char[] chars = new char[length];
      for (int i = 0, x = chars.length; i < x; )
          do {
          final int cp = random.nextInt(0x10FFFF + 1);
            if (!Character.isDefined(cp))
                continue;
            final char[] chs = Character.toChars(cp);
            if (chs.length > x - i)
                continue;
            for (final char ch : chs) {
              if (!Character.isWhitespace(ch)) { 
                  chars[i++] = ch;
              }
            }
            break;
          
      } while (true);


    return new String(chars);
  }


  // Names of the synthetic HTTP headers attached to every generated test
  // record; each one gets a random value in buildTestRecords.
  static final String[] testHeaderKeys = { 
    "x-cc-test-header-1",
    "x-cc-test-header-2",
    "x-cc-test-header-3"
  };
  
  // Alphabet (ASCII letters and digits) from which the random header values
  // are drawn.
  static final String validHeaderChars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";


  /**
   * Plain holder describing one synthetic record: what gets written into the
   * mock ARC file and what the test later expects to read back.
   */
  public static class TestRecord { 
    // URL of the record; used as the expected key when reading back.
    public String url;
    // Raw content bytes of the record.
    public byte[] data;
    // Header name/value pairs written with the record.
    public List<Pair<String,String>> headers;
    // NOTE(review): not assigned anywhere in this file — presumably the
    // record's position in the written stream; confirm against other users.
    public int streamPos;
    // NOTE(review): not assigned anywhere in this file — presumably the
    // record's raw size in the stream; confirm against other users.
    public int rawSize; 
  }
  
  
  public static List<TestRecord> buildTestRecords(int recordCount) {
    Random random = new Random();
    
    List<TestRecord> records = Lists.newArrayList();
    
    char headerChars[] = validHeaderChars.toCharArray();
    
    for (int i=0;i<recordCount;++i) {
      TestRecord record = new TestRecord();
      // intentionally add a space in the url to mimic malformed headers
      record.url = "http://foo/ " + randomString(random, 5, 100);
      record.data = randomString(random, 1000, 3000).getBytes(Charset.forName("UTF-8"));
      record.headers = Lists.newArrayList();
      for (int j=0;j<testHeaderKeys.length;++j) { 
        record.headers.add(new Pair<String, String>(testHeaderKeys[j],randomConstrainedString(random, headerChars, 100, 200)));
      }
      records.add(record);
    }
    return records;
  }
  
  // Number of records written to, and expected back from, the in-memory ARC
  // file in testReader.
  public static final int BASIC_TEST_RECORD_COUNT = 100;
  
  /** 
   * test basic reader functionality by creating a mock ARCFile in memory and then reading it back and validating the contents... 
   */
  @Test
  public void testReader() {
    DataOutputBuffer os = new DataOutputBuffer();
    long timestamp = System.currentTimeMillis();
    try { 
      // write the ARC File into memory 
      writeFirstRecord(os, "test", timestamp);
      List<TestRecord> records = buildTestRecords(BASIC_TEST_RECORD_COUNT);
      long testAttemptTime = System.currentTimeMillis();
      
      for (TestRecord record : records) { 
        NIOHttpHeaders headers = new NIOHttpHeaders();
        for (int i=0;i<record.headers.size();++i) { 
          headers.set(record.headers.get(i).e0,record.headers.get(i).e1);
        }
        
        write(os,record.url,"test",1,1,record.data,0,record.data.length,headers,"text/html",MD5Hash.digest(record.data).toString(),12345,testAttemptTime);
      }
      os.flush();
      os.close();
      
      final AtomicBoolean streamClosed = new AtomicBoolean();
      // setup ArcFileReader to read the file 
      InputStream in = new ByteArrayInputStream(os.getData(),0,os.getLength()) {
        
        public synchronized int read(byte b[], int off, int len) {
          len = 1;
          return super.read(b, off, len);
        }
        
        public void close() throws IOException {
          super.close();
          streamClosed.set(true);
        }
      };
      ARCFileReader reader = new ARCFileReader(in);
      int index = 0;
      Text key = new Text();
      BytesWritable value = new BytesWritable();
      
      // iterate and validate stuff ... 
      while (reader.hasMoreItems()) {
        reader.nextKeyValue(key, value);
        TestRecord testRecord = records.get(index++);
        // get test key bytes as utf-8 bytes ... 
        byte[] testKeyBytes = testRecord.url.getBytes(Charset.forName("UTF-8"));
        // compare against raw key bytes to validate key is the same (Text's utf-8 mapping code replaces invalid characters 
        // with ?, which causes our test case (which does use invalid characters to from the key, to break.
        Assert.assertTrue(compareTo(testKeyBytes,0,testKeyBytes.length,key.getBytes(),0,key.getLength()) == 0);
        // retured bytes represent the header(encoded in utf-8), terminated by a \r\n\r\n. The content follows this terminator
        // we search for this specific byte pattern to locate start of content, then compare it against source ... 
        int indexofHeaderTerminator = ByteArrayUtils.indexOf(value.getBytes(), 0, value.getLength(), "\r\n\r\n".getBytes());
        if (indexofHeaderTerminator == -1) { 
          throw new IOException("No Header Terminator found in Value!");
        }
        indexofHeaderTerminator += 4;
        // read headers ... 
        String headersText = new String(value.getBytes(),0,indexofHeaderTerminator,Charset.forName("UTF-8"));
        NIOHttpHeaders headers = NIOHttpHeaders.parseHttpHeaders(headersText);
        for (int i=0;i<testRecord.headers.size();++i) { 
          Pair<String,String> testHeaderRecord = testRecord.headers.get(i);
          Assert.assertNotNull(headers.findValue(testHeaderRecord.e0));
          Assert.assertEquals(testHeaderRecord.e1,headers.findValue(testHeaderRecord.e0));
        }
        
        Assert.assertTrue(compareTo(testRecord.data,0,testRecord.data.length,value.getBytes(),indexofHeaderTerminator,testRecord.data.length) == 0);
      }
      reader.close();
      
      Assert.assertEquals(index,BASIC_TEST_RECORD_COUNT);
      Assert.assertTrue(streamClosed.get());
    }
    catch (IOException e) { 
      e.printStackTrace();
      throw new RuntimeException(e);
    }
  }
  
  /** 
   * helper offset based byte array comparator 
   * @param buffer1
   * @param offset1
   * @param length1
   * @param buffer2
   * @param offset2
   * @param length2
   * @return
   */
  public static int compareTo(byte[] buffer1, int offset1, int length1,
      byte[] buffer2, int offset2, int length2) {
    // Short circuit equal case
    if (buffer1 == buffer2 &&
        offset1 == offset2 &&
        length1 == length2) {
      return 0;
    }
    // Bring WritableComparator code local
    int end1 = offset1 + length1;
    int end2 = offset2 + length2;
    for (int i = offset1, j = offset2; i < end1 && j < end2; i++, j++) {
      int a = (buffer1[i] & 0xff);
      int b = (buffer2[j] & 0xff);
      if (a != b) {
        return a - b;
      }
    }
    return length1 - length2;
  }
  
  
}
// Source Code of org.commoncrawl.util.shared.ArcFileReaderTests$CompressedStream
//
// Related Classes of org.commoncrawl.util.shared.ArcFileReaderTests$CompressedStream