Package org.jwat.warc

Source Code of org.jwat.warc.WarcHeader

/**
* Java Web Archive Toolkit - Software to read and validate ARC, WARC
* and GZip files. (http://jwat.org/)
* Copyright 2011-2012 Netarkivet.dk (http://netarkivet.dk/)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.jwat.warc;

import org.jwat.common.ByteCountingPushBackInputStream;
import org.jwat.common.ContentType;
import org.jwat.common.Diagnosis;
import org.jwat.common.DiagnosisType;
import org.jwat.common.Diagnostics;
import org.jwat.common.HeaderLine;
import org.jwat.common.MaxLengthRecordingInputStream;
import org.jwat.common.Uri;
import org.jwat.common.UriProfile;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.net.InetAddress;
import java.text.DateFormat;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

/**
* Central class for working with WARC headers. This class includes support for
* reading and writing WARC headers. Methods are also available to validate
* individual headers and a WARC header as a whole.
*
* @author nicl
*/
public class WarcHeader {

    /** A URI with encapsulating <> characters. */
    public static final boolean URI_LTGT = true;

    /** A URI without encapsulating <> characters. */
    public static final boolean URI_NAKED = false;

    /** Associated WarcReader context.
     *  Must be set prior to calling the various methods. */
    protected WarcReader reader;

    /** Diagnostics used to report diagnoses.
     *  Must be set prior to calling the various methods. */
    protected Diagnostics<Diagnosis> diagnostics;

    /** WARC-Target-URI profile. */
    protected UriProfile warcTargetUriProfile;

    /** URI profile. */
    protected UriProfile uriProfile;

    /** WARC field parser used.
     *  Must be set prior to calling the various methods. */
    protected WarcFieldParsers fieldParsers;

    /** WARC <code>DateFormat</code> as specified by the WARC ISO standard. */
    protected DateFormat warcDateFormat;

    /** WARC record starting offset relative to the source WARC file input
     *  stream. The offset is correct for both compressed and uncompressed streams. */
    protected long startOffset = -1;

    /*
     * Version related fields.
     */

    /** Was "WARC/" identified while looking for the version string. */
    public boolean bMagicIdentified;
    /** Did the version string include between 2 and 4 substrings delimited by ".". */
    public boolean bVersionParsed;
    /** Is the version format valid. */
    public boolean bValidVersionFormat;
    /** Is the version recognized. (0.17, 0.18 or 1.0) */
    public boolean bValidVersion;

    /** Raw version string. */
    public String versionStr;
    /** Array based on the version string split by the "." delimiter and converted to integers. */
    public int[] versionArr;

    /** Major version number from WARC header. */
    public int major = -1;
    /** Minor version number from WARC header. */
    public int minor = -1;

    /*
     * WARC header fields.
     */

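    /** Array used for duplicate header detection.
     *  NOTE(review): indexed directly by the field name index in addHeader; confirm
     *  that WarcConstants.FN_INDEX_OF_LAST exceeds the largest FN_IDX_* value. */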
    protected boolean[] seen = new boolean[WarcConstants.FN_INDEX_OF_LAST];

    /** Is the header missing one of the mandatory headers. */
    public boolean bMandatoryMissing;

    /** WARC-Type field string value. */
    public String warcTypeStr;
    /** WARC-Type converted to an integer id, if identified. */
    public Integer warcTypeIdx;

    /** WARC-Filename field string value.
     *  (warcinfo record type only) */
    public String warcFilename;

    /** WARC-Record-Id field string value. */
    public String warcRecordIdStr;
    /** WARC-Record-Id converted to an <code>Uri</code> object, if valid. */
    public Uri warcRecordIdUri;

    /** WARC-Date field string value. */
    public String warcDateStr;
    /** WARC-Date converted to a <code>Date</code> object, if valid. */
    public Date warcDate;

    /** Content-Length field string value. */
    public String contentLengthStr;
    /** Content-Length converted to a <code>Long</code> object, if valid. */
    public Long contentLength;

    /** Content-Type field string value. */
    public String contentTypeStr;
    /** Content-Type converted to a <code>ContentType</code> object, if valid. */
    public ContentType contentType;

    /** WARC-Truncated field string value. */
    public String warcTruncatedStr;
    /** WARC-Truncated converted to an integer id, if valid. */
    public Integer warcTruncatedIdx;

    /** WARC-IP-Address field string value. */
    public String warcIpAddress;
    /** WARC-IP-Address converted to an <code>InetAddress</code> object, if valid. */
    public InetAddress warcInetAddress;

    /** List of WARC-Concurrent-To field string values and converted <code>Uri</code> objects, if valid. */
    public List<WarcConcurrentTo> warcConcurrentToList = new LinkedList<WarcConcurrentTo>();

    /** WARC-Refers-To field string value. */
    public String warcRefersToStr;
    /** WARC-Refers-To converted to an <code>Uri</code> object, if valid. */
    public Uri warcRefersToUri;

    /** WARC-Target-URI field string value. */
    public String warcTargetUriStr;
    /** WARC-Target-URI converted to an <code>Uri</code> object, if valid. */
    public Uri warcTargetUriUri;

    /** WARC-Warcinfo-Id field string value. */
    public String warcWarcinfoIdStr;
    /** WARC-Warcinfo-Id converted to an <code>Uri</code> object, if valid. */
    public Uri warcWarcinfoIdUri;

    /** WARC-Block-Digest field string value. */
    public String warcBlockDigestStr;
    /** WARC-Block-Digest converted to a <code>WarcDigest</code> object, if valid. */
    public WarcDigest warcBlockDigest;

    /** WARC-Payload-Digest field string value. */
    public String warcPayloadDigestStr;
    /** WARC-Payload-Digest converted to a <code>WarcDigest</code> object, if valid. */
    public WarcDigest warcPayloadDigest;

    /** WARC-Identified-Payload-Type field string value. */
    public String warcIdentifiedPayloadTypeStr;
    /** WARC-Identified-Payload-Type converted to a <code>ContentType</code> object, if valid. */
    public ContentType warcIdentifiedPayloadType;

    /** WARC-Profile field string value.
     *  (revisit record only) */
    public String warcProfileStr;
    /** WARC-Profile field converted to an <code>Uri</code> object, if valid.
     *  (revisit record only) */
    public Uri warcProfileUri;
    /** WARC-Profile converted to an integer id, if valid.
     *  (revisit record only) */
    public Integer warcProfileIdx;

    /** WARC-Segment-Number field string value. */
    public String warcSegmentNumberStr;
    /** WARC-Segment-Number converted to an <code>Integer</code> object, if valid. */
    public Integer warcSegmentNumber;

    /** WARC-Segment-Origin-Id field string value.
     *  (continuation record only) */
    public String warcSegmentOriginIdStr;
    /** WARC-Segment-Origin-Id converted to an <code>Uri</code> object, if valid.
     *  (continuation record only) */
    public Uri warcSegmentOriginIdUrl;

    /** WARC-Segment-Total-Length field string value.
     *  (continuation record only) */
    public String warcSegmentTotalLengthStr;
    /** WARC-Segment-Total-Length converted to a <code>Long</code> object, if valid.
     *  (continuation record only) */
    public Long warcSegmentTotalLength;

    // see https://docs.google.com/document/d/1QyQBA7Ykgxie75V8Jziz_O7hbhwf7PF6_u9O6w6zgp0/edit
    /** WARC-Refers-To-Target-URI field string value. */
    public String warcRefersToTargetUriStr;
    /** WARC-Refers-To-Target-URI converted to an <code>Uri</code> object, if valid. */
    public Uri warcRefersToTargetUriUri;
    /** WARC-Refers-To-Date field string value. */
    public String warcRefersToDateStr;
    /** WARC-Refers-To-Date converted to a <code>Date</code> object, if valid. */
    public Date warcRefersToDate;

    /*
     * WARC header fields collections.
     */

    /** Raw WARC header output stream. */
    protected ByteArrayOutputStream headerBytesOut = new ByteArrayOutputStream();

    /** Raw WARC header byte array. */
    public byte[] headerBytes;

    /** List of parsed header fields. */
    protected List<HeaderLine> headerList = new LinkedList<HeaderLine>();

    /** Map of parsed header fields. */
    protected Map<String, HeaderLine> headerMap = new HashMap<String, HeaderLine>();

    /**
     * Non public constructor to allow unit testing.
     * Performs no initialization of its own; the <code>initHeader</code>
     * factory methods in this class create an instance through this
     * constructor and then assign the fields they need.
     */
    protected WarcHeader() {
    }

    /**
     * Create and initialize a new <code>WarcHeader</code> for writing.
     * @param writer writer which shall be used
     * @param diagnostics diagnostics object used by writer
     * @return a <code>WarcHeader</code> prepared for writing
     */
    public static WarcHeader initHeader(WarcWriter writer, Diagnostics<Diagnosis> diagnostics) {
        WarcHeader header = new WarcHeader();
        // Set default version to "1.0".
        header.major = 1;
        header.minor = 0;
        header.warcTargetUriProfile = writer.warcTargetUriProfile;
        header.uriProfile = writer.uriProfile;
        header.fieldParsers = writer.fieldParsers;
        header.warcDateFormat = writer.warcDateFormat;
        header.diagnostics = diagnostics;
        return header;
    }

    /**
     * Create and initialize a new <code>WarcHeader</code> for reading.
     * @param reader reader which shall be used
     * @param startOffset start offset of header
     * @param diagnostics diagnostics object used by reader
     * @return a <code>WarcHeader</code> prepared for reading
     */
    public static WarcHeader initHeader(WarcReader reader, long startOffset, Diagnostics<Diagnosis> diagnostics) {
        WarcHeader header = new WarcHeader();
        header.reader = reader;
        header.warcTargetUriProfile = reader.warcTargetUriProfile;
        header.uriProfile = reader.uriProfile;
        header.fieldParsers = reader.fieldParsers;
        header.diagnostics = diagnostics;
        // This is only relevant for uncompressed sequentially read records
        header.startOffset = startOffset;
        return header;
    }

    /**
     * Add an error diagnosis of the given type on a specific entity with
     * optional extra information. The information varies according to the
     * diagnosis type.
     * @param type diagnosis type
     * @param entity entity examined
     * @param information optional extra information
     */
    protected void addErrorDiagnosis(DiagnosisType type, String entity, String... information) {
        diagnostics.addError(new Diagnosis(type, entity, information));
    }

    /**
     * Add a warning diagnosis of the given type on a specific entity with
     * optional extra information. The information varies according to the
     * diagnosis type.
     * @param type diagnosis type
     * @param entity entity examined
     * @param information optional extra information
     */
    protected void addWarningDiagnosis(DiagnosisType type, String entity, String... information) {
        diagnostics.addWarning(new Diagnosis(type, entity, information));
    }

    /**
     * Returns the starting offset of the record in the containing WARC.
     * @return the starting offset of the record
     */
    public long getStartOffset() {
        return startOffset;
    }

    /**
     * Try to parse a WARC header and return a boolean indicating the success or
     * failure of this.
     * @param in input stream with WARC data
     * @return boolean indicating whether a header was parsed or not
     * @throws IOException if an i/o exception occurs while parsing for a header
     */
    public boolean parseHeader(ByteCountingPushBackInputStream in) throws IOException {
        if (parseVersion(in)) {
            // debug
            //System.out.println(wr.bMagicIdentified);
            //System.out.println(wr.bVersionParsed);
            //System.out.println(wr.major + "." + wr.minor);
            if (bVersionParsed && versionArr.length == 2) {
                switch (major) {
                case 1:
                    if (minor == 0) {
                        bValidVersion = true;
                    }
                    break;
                case 0:
                    switch (minor) {
                    case 17:
                    case 18:
                        bValidVersion = true;
                        break;
                    }
                    break;
                default:
                    break;
                }
                if (!bValidVersion) {
                    diagnostics.addError(
                            new Diagnosis(DiagnosisType.UNKNOWN,
                                    "Magic version number", versionStr));
                }
            } else {
                diagnostics.addError(
                        new Diagnosis(DiagnosisType.INVALID_DATA,
                                "Magic Version string", versionStr));
            }

            MaxLengthRecordingInputStream mrin = new MaxLengthRecordingInputStream(in, reader.recordHeaderMaxSize);
            ByteCountingPushBackInputStream pbin = new ByteCountingPushBackInputStream(mrin, reader.recordHeaderMaxSize);

            parseHeaders(pbin);
            pbin.close();

            checkFields();

            headerBytes = headerBytesOut.toByteArray();
        }
        return bMagicIdentified;
    }

    /**
     * Looks forward in the input stream for a valid WARC version line.
     * @param in data input stream
     * @return true, if magic WARC header found
     * @throws IOException if an error occurs while reading version data
     */
    protected boolean parseVersion(ByteCountingPushBackInputStream in) throws IOException {
        bMagicIdentified = false;
        bVersionParsed = false;
        boolean bInvalidDataBeforeVersion = false;
        boolean bEmptyLinesBeforeVersion = false;
        HeaderLine line;
        String tmpStr;
        boolean bSeekMagic = true;
        // Loop until when have found something that looks like a version line.
        while (bSeekMagic) {
            // This is only relevant for uncompressed sequentially read records
            startOffset = in.getConsumed();
            line = reader.lineReader.readLine(in);
            if (!reader.lineReader.bEof) {
                switch (line.type) {
                case HeaderLine.HLT_LINE:
                    tmpStr = line.line;
                    // debug
                    //System.out.println(tmpStr);
                    if (tmpStr.length() > 0) {
                        if (tmpStr.toUpperCase().startsWith(WarcConstants.WARC_MAGIC_HEADER)) {
                            bMagicIdentified = true;
                            versionStr = tmpStr.substring(WarcConstants.WARC_MAGIC_HEADER.length());
                            String[] tmpArr = versionStr.split("\\.", -1);        // Not optimal
                            if (tmpArr.length >= 2 && tmpArr.length <= 4) {
                                bVersionParsed = true;
                                bValidVersionFormat = true;
                                versionArr = new int[tmpArr.length];
                                for (int i=0; i<tmpArr.length; ++i) {
                                    try {
                                        versionArr[i] = Integer.parseInt(tmpArr[i]);
                                    } catch (NumberFormatException e) {
                                        versionArr[i] = -1;
                                        bValidVersionFormat = false;
                                    }
                                }
                                major = versionArr[0];
                                minor = versionArr[1];
                            }
                            headerBytesOut.write(line.raw);
                            bSeekMagic = false;
                        } else {
                            // Invalid data aka Gibberish.
                            bInvalidDataBeforeVersion = true;
                        }
                    } else {
                        // Empty line.
                        bEmptyLinesBeforeVersion = true;

                    }
                    break;
                case HeaderLine.HLT_HEADERLINE:
                    // Invalid data - header or binary.
                    bInvalidDataBeforeVersion = true;
                    break;
                }
            } else {
                // EOF.
                bSeekMagic = false;
            }
        }
        if (bInvalidDataBeforeVersion) {
            addErrorDiagnosis(DiagnosisType.INVALID, "Data before WARC version");
        }
        if (bEmptyLinesBeforeVersion) {
            addErrorDiagnosis(DiagnosisType.INVALID, "Empty lines before WARC version");
        }
        return bMagicIdentified;
    }

    /**
     * Reads WARC header lines one line at a time until an empty line is
     * encountered.
     * @param in header input stream
     * @throws IOException if an error occurs while reading the WARC header
     */
    protected void parseHeaders(ByteCountingPushBackInputStream in) throws IOException {
        HeaderLine headerLine;
        boolean bLoop = true;
        while (bLoop) {
            headerLine = reader.headerLineReader.readLine(in);
            if (!reader.headerLineReader.bEof) {
                headerBytesOut.write(headerLine.raw);
                switch (headerLine.type) {
                case HeaderLine.HLT_HEADERLINE:
                    if (headerLine.name != null && headerLine.name.length() > 0) {
                        // debug
                        //System.out.println(headerLine.name);
                        //System.out.println(headerLine.value);
                        addHeader(headerLine);
                    } else {
                        // Empty field name.
                        addWarningDiagnosis(DiagnosisType.EMPTY, "Header line");
                    }
                    break;
                case HeaderLine.HLT_LINE:
                    if (headerLine.line.length() == 0) {
                        // Empty line.
                        bLoop = false;
                    } else {
                        // Unknown header line.
                        addWarningDiagnosis(DiagnosisType.UNKNOWN, "Header line", headerLine.line);
                    }
                    break;
                default:
                    throw new IllegalStateException("Invalid HeaderLine output!");
                }
            } else {
                // EOF.
                bLoop = false;
            }
        }
    }

    /**
     * Identify a (WARC) header name, validate the value and set the header.
     * @param headerLine the headerLine
     */
    protected void addHeader(HeaderLine headerLine) {
        String fieldName = headerLine.name;
        String fieldValue = headerLine.value;
        WarcConcurrentTo warcConcurrentTo;
        Integer fn_idx = WarcConstants.fieldNameIdxMap.get(fieldName.toLowerCase());
        if (fn_idx != null) {
            // WARC field name defined in WARC specification.
            if (!seen[fn_idx] || WarcConstants.fieldNamesRepeatableLookup[fn_idx]) {
                seen[fn_idx] = true;
                switch (fn_idx.intValue()) {
                case WarcConstants.FN_IDX_WARC_TYPE:
                    warcTypeStr = fieldParsers.parseString(fieldValue,
                            WarcConstants.FN_WARC_TYPE);
                    if (warcTypeStr != null) {
                        warcTypeIdx = WarcConstants.recordTypeIdxMap.get(warcTypeStr.toLowerCase());
                    }
                    if (warcTypeIdx == null && warcTypeStr != null && warcTypeStr.length() > 0) {
                        warcTypeIdx = WarcConstants.RT_IDX_UNKNOWN;
                    }
                    break;
                case WarcConstants.FN_IDX_WARC_RECORD_ID:
                    warcRecordIdStr = fieldValue;
                    warcRecordIdUri = fieldParsers.parseUri(fieldValue, URI_LTGT,
                            uriProfile, WarcConstants.FN_WARC_RECORD_ID);
                    break;
                case WarcConstants.FN_IDX_WARC_DATE:
                    warcDateStr = fieldValue;
                    warcDate = fieldParsers.parseDate(fieldValue,
                            WarcConstants.FN_WARC_DATE);
                    break;
                case WarcConstants.FN_IDX_CONTENT_LENGTH:
                    contentLengthStr = fieldValue;
                    contentLength = fieldParsers.parseLong(fieldValue,
                            WarcConstants.FN_CONTENT_LENGTH);
                    break;
                case WarcConstants.FN_IDX_CONTENT_TYPE:
                    contentTypeStr = fieldValue;
                    contentType = fieldParsers.parseContentType(fieldValue,
                            WarcConstants.FN_CONTENT_TYPE);
                    break;
                case WarcConstants.FN_IDX_WARC_CONCURRENT_TO:
                    Uri tmpUri = fieldParsers.parseUri(fieldValue, URI_LTGT,
                            uriProfile, WarcConstants.FN_WARC_CONCURRENT_TO);
                    if (fieldValue != null && fieldValue.trim().length() > 0) {
                        warcConcurrentTo = new WarcConcurrentTo();
                        warcConcurrentTo.warcConcurrentToStr = fieldValue;
                        warcConcurrentTo.warcConcurrentToUri = tmpUri;
                        warcConcurrentToList.add(warcConcurrentTo);
                    }
                    break;
                case WarcConstants.FN_IDX_WARC_BLOCK_DIGEST:
                    warcBlockDigestStr = fieldValue;
                    warcBlockDigest = fieldParsers.parseDigest(fieldValue,
                            WarcConstants.FN_WARC_BLOCK_DIGEST);
                    break;
                case WarcConstants.FN_IDX_WARC_PAYLOAD_DIGEST:
                    warcPayloadDigestStr = fieldValue;
                    warcPayloadDigest = fieldParsers.parseDigest(fieldValue,
                            WarcConstants.FN_WARC_PAYLOAD_DIGEST);
                    break;
                case WarcConstants.FN_IDX_WARC_IP_ADDRESS:
                    warcIpAddress = fieldValue;
                    warcInetAddress = fieldParsers.parseIpAddress(fieldValue,
                            WarcConstants.FN_WARC_IP_ADDRESS);
                    break;
                case WarcConstants.FN_IDX_WARC_REFERS_TO:
                    warcRefersToStr = fieldValue;
                    warcRefersToUri = fieldParsers.parseUri(fieldValue, URI_LTGT,
                            uriProfile, WarcConstants.FN_WARC_REFERS_TO);
                    break;
                case WarcConstants.FN_IDX_WARC_TARGET_URI:
                    warcTargetUriStr = fieldValue;
                    warcTargetUriUri = fieldParsers.parseUri(fieldValue, URI_NAKED,
                            warcTargetUriProfile, WarcConstants.FN_WARC_TARGET_URI);
                    break;
                case WarcConstants.FN_IDX_WARC_TRUNCATED:
                    warcTruncatedStr = fieldParsers.parseString(fieldValue,
                            WarcConstants.FN_WARC_TRUNCATED);
                    if (warcTruncatedStr != null) {
                        warcTruncatedIdx = WarcConstants.truncatedTypeIdxMap.get(warcTruncatedStr.toLowerCase());
                    }
                    if (warcTruncatedIdx == null && warcTruncatedStr != null && warcTruncatedStr.length() > 0) {
                        warcTruncatedIdx = WarcConstants.TT_IDX_FUTURE_REASON;
                    }
                    break;
                case WarcConstants.FN_IDX_WARC_WARCINFO_ID:
                    warcWarcinfoIdStr = fieldValue;
                    warcWarcinfoIdUri = fieldParsers.parseUri(fieldValue, URI_LTGT,
                            uriProfile, WarcConstants.FN_WARC_WARCINFO_ID);
                    break;
                case WarcConstants.FN_IDX_WARC_FILENAME:
                    warcFilename = fieldParsers.parseString(fieldValue,
                            WarcConstants.FN_WARC_FILENAME);
                    break;
                case WarcConstants.FN_IDX_WARC_PROFILE:
                    warcProfileStr = fieldValue;
                    warcProfileUri = fieldParsers.parseUri(fieldValue, URI_NAKED,
                            uriProfile, WarcConstants.FN_WARC_PROFILE);
                    if (warcProfileStr != null) {
                        warcProfileIdx = WarcConstants.profileIdxMap.get(warcProfileStr.toLowerCase());
                    }
                    if (warcProfileIdx == null && warcProfileStr != null && warcProfileStr.length() > 0) {
                        warcProfileIdx = WarcConstants.PROFILE_IDX_UNKNOWN;
                    }
                    break;
                case WarcConstants.FN_IDX_WARC_IDENTIFIED_PAYLOAD_TYPE:
                    warcIdentifiedPayloadTypeStr = fieldValue;
                    warcIdentifiedPayloadType = fieldParsers.parseContentType(fieldValue,
                            WarcConstants.FN_WARC_IDENTIFIED_PAYLOAD_TYPE);
                    break;
                case WarcConstants.FN_IDX_WARC_SEGMENT_ORIGIN_ID:
                    warcSegmentOriginIdStr = fieldValue;
                    warcSegmentOriginIdUrl = fieldParsers.parseUri(fieldValue, URI_LTGT,
                            uriProfile, WarcConstants.FN_WARC_SEGMENT_ORIGIN_ID);
                    break;
                case WarcConstants.FN_IDX_WARC_SEGMENT_NUMBER:
                    warcSegmentNumberStr = fieldValue;
                    warcSegmentNumber = fieldParsers.parseInteger(fieldValue,
                            WarcConstants.FN_WARC_SEGMENT_NUMBER);
                    break;
                case WarcConstants.FN_IDX_WARC_SEGMENT_TOTAL_LENGTH:
                    warcSegmentTotalLengthStr = fieldValue;
                    warcSegmentTotalLength = fieldParsers.parseLong(fieldValue,
                            WarcConstants.FN_WARC_SEGMENT_TOTAL_LENGTH);
                    break;
                case WarcConstants.FN_IDX_WARC_REFERS_TO_TARGET_URI:
                    warcRefersToTargetUriStr = fieldValue;
                    warcRefersToTargetUriUri = fieldParsers.parseUri(fieldValue, URI_NAKED,
                            uriProfile, WarcConstants.FN_WARC_REFERS_TO_TARGET_URI);
                    break;
                case WarcConstants.FN_IDX_WARC_REFERS_TO_DATE:
                    warcRefersToDateStr = fieldValue;
                    warcRefersToDate = fieldParsers.parseDate(fieldValue,
                            WarcConstants.FN_WARC_REFERS_TO_DATE);
                    break;
                }
            } else {
                // Duplicate field.
                addErrorDiagnosis(DiagnosisType.DUPLICATE, "'" + fieldName + "' header", fieldValue);
            }
        }
        HeaderLine tmpLine = headerMap.get(fieldName.toLowerCase());
        if (tmpLine == null) {
            headerMap.put(fieldName.toLowerCase(), headerLine);
        } else {
            tmpLine.lines.add(headerLine);
        }
        headerList.add(headerLine);
    }

    /**
     * Get a <code>List</code> of all the headers found during parsing.
     * @return <code>List</code> of <code>HeaderLine</code>
     */
    public List<HeaderLine> getHeaderList() {
        return Collections.unmodifiableList(headerList);
    }

    /**
     * Get a header line structure or null, if no header line structure is
     * stored with the given header name.
     * @param field header name
     * @return <code>HeaderLine</code> structure or null
     */
    public HeaderLine getHeader(String field) {
        if (field != null && field.length() > 0) {
            return headerMap.get(field.toLowerCase());
        } else {
            return null;
        }
    }

    /**
     * Add a String header using the supplied string and return a
     * <code>HeaderLine</code> object corresponding to how the header would be
     * read.
     * @param fieldName name of field to add
     * @param fieldValue field value string
     * @return <code>HeaderLine</code> object corresponding to what would have been read
     */
    public HeaderLine addHeader(String fieldName, String fieldValue) {
        HeaderLine headerLine = new HeaderLine();
        headerLine.name = fieldName;
        headerLine.value = fieldValue;
        addHeader(headerLine);
        return headerLine;
    }

    /**
     * Add an Integer header using the supplied string and object values and return
     * a <code>HeaderLine</code> object corresponding to how the header would be read.
     * If both string and object values are not null they are used as is.
     * If the string value is null and the object is not null,
     * the object's toString method is called.
     * If the object is null and the string is not null, the string is parsed
     * and validated resulting in an object, if valid.
     * @param fieldName name of field to add
     * @param integerFieldValue <code>Integer</code> field value object
     * @param fieldValueStr Integer field value string
     * @return <code>HeaderLine</code> object corresponding to what would have been read
     */
    public HeaderLine addHeader(String fieldName, Integer integerFieldValue, String fieldValueStr) {
        if (integerFieldValue == null && fieldValueStr != null) {
            integerFieldValue = fieldParsers.parseInteger(fieldValueStr, fieldName);
        } else if (fieldValueStr == null && integerFieldValue != null) {
            fieldValueStr = integerFieldValue.toString();
        }
        return addHeader(fieldName, fieldValueStr, WarcConstants.FDT_INTEGER,
                integerFieldValue, null, null, null, null, null, null);
    }

    /**
     * Add a Long header using the supplied string and object values and return
     * a <code>HeaderLine</code> object corresponding to how the header would be read.
     * If both string and object values are not null they are used as is.
     * If the string value is null and the object is not null,
     * the object's toString method is called.
     * If the object is null and the string is not null, the string is parsed
     * and validated resulting in an object, if valid.
     * @param fieldName name of field to add
     * @param longFieldValue <code>Long</code> field value object
     * @param fieldValueStr Long field value string
     * @return <code>HeaderLine</code> object corresponding to what would have been read
     */
    public HeaderLine addHeader(String fieldName, Long longFieldValue, String fieldValueStr) {
        if (longFieldValue == null && fieldValueStr != null) {
            longFieldValue = fieldParsers.parseLong(fieldValueStr, fieldName);
        } else if (fieldValueStr == null && longFieldValue != null) {
            fieldValueStr = longFieldValue.toString();
        }
        return addHeader(fieldName, fieldValueStr, WarcConstants.FDT_LONG,
                null, longFieldValue, null, null, null, null, null);
    }

    /**
     * Add an Digest header using the supplied string and object values and return
     * a <code>HeaderLine</code> object corresponding to how the header would be read.
     * If both string and object values are not null they are used as is.
     * If the string value is null and the object is not null,
     * the object's toString method is called.
     * If the object is null and the string is not null, the string is parsed
     * and validated resulting in an object, if valid.
     * @param fieldName name of field to add
     * @param digestFieldValue <code>Digest</code> field value object
     * @param fieldValueStr Digest field value string
     * @return <code>HeaderLine</code> object corresponding to what would have been read
     */
    public HeaderLine addHeader(String fieldName, WarcDigest digestFieldValue, String fieldValueStr) {
        if (digestFieldValue == null && fieldValueStr != null) {
            digestFieldValue = fieldParsers.parseDigest(fieldValueStr, fieldName);
        } else if (fieldValueStr == null && digestFieldValue != null) {
            fieldValueStr = digestFieldValue.toString();
        }
        return addHeader(fieldName, fieldValueStr, WarcConstants.FDT_DIGEST,
                null, null, digestFieldValue, null, null, null, null);
    }

    /**
     * Add an Content-Type header using the supplied string and object values and return
     * a <code>HeaderLine</code> object corresponding to how the header would be read.
     * If both string and object values are not null they are used as is.
     * If the string value is null and the object is not null,
     * the object's toString method is called.
     * If the object is null and the string is not null, the string is parsed
     * and validated resulting in an object, if valid.
     * @param fieldName name of field to add
     * @param contentTypeFieldValue <code>ContentType</code> field value object
     * @param fieldValueStr Content-Type field value string
     * @return <code>HeaderLine</code> object corresponding to what would have been read
     */
    public HeaderLine addHeader(String fieldName, ContentType contentTypeFieldValue, String fieldValueStr) {
        if (contentTypeFieldValue == null && fieldValueStr != null) {
            contentTypeFieldValue = fieldParsers.parseContentType(fieldValueStr, fieldName);
        } else if (fieldValueStr == null && contentTypeFieldValue != null) {
            fieldValueStr = contentTypeFieldValue.toString();
        }
        return addHeader(fieldName, fieldValueStr, WarcConstants.FDT_CONTENTTYPE,
                null, null, null, contentTypeFieldValue, null, null, null);
    }

    /**
     * Add an Date header using the supplied string and object values and return
     * a <code>HeaderLine</code> object corresponding to how the header would be read.
     * If both string and object values are not null they are used as is.
     * If the string value is null and the object is not null,
     * the object's toString method is called.
     * If the object is null and the string is not null, the string is parsed
     * and validated resulting in an object, if valid.
     * @param fieldName name of field to add
     * @param dateFieldValue <code>Date</code> field value object
     * @param fieldValueStr Date field value string
     * @return <code>HeaderLine</code> object corresponding to what would have been read
     */
    public HeaderLine addHeader(String fieldName, Date dateFieldValue, String fieldValueStr) {
        if (dateFieldValue == null && fieldValueStr != null) {
            dateFieldValue = fieldParsers.parseDate(fieldValueStr, fieldName);
        } else if (fieldValueStr == null && dateFieldValue != null) {
            fieldValueStr = warcDateFormat.format(dateFieldValue);
        }
        return addHeader(fieldName, fieldValueStr, WarcConstants.FDT_DATE,
                null, null, null, null, dateFieldValue, null, null);
    }

    /**
     * Add an InetAddress header using the supplied string and object values and return
     * a <code>HeaderLine</code> object corresponding to how the header would be read.
     * If both string and object values are not null they are used as is.
     * If the string value is null and the object is not null,
     * the object's toString method is called.
     * If the object is null and the string is not null, the string is parsed
     * and validated resulting in an object, if valid.
     * @param fieldName name of field to add
     * @param inetAddrFieldValue <code>InetAddress</code> field value object
     * @param fieldValueStr IP-Address field value string
     * @return <code>HeaderLine</code> object corresponding to what would have been read
     */
    public HeaderLine addHeader(String fieldName, InetAddress inetAddrFieldValue, String fieldValueStr) {
        if (inetAddrFieldValue == null && fieldValueStr != null) {
            inetAddrFieldValue = fieldParsers.parseIpAddress(fieldValueStr, fieldName);
        } else if (fieldValueStr == null && inetAddrFieldValue != null) {
            fieldValueStr = inetAddrFieldValue.getHostAddress();
        }
        return addHeader(fieldName, fieldValueStr, WarcConstants.FDT_INETADDRESS,
                null, null, null, null, null, inetAddrFieldValue, null);
    }

    /**
     * Add an URI header using the supplied string and object values and return
     * a <code>HeaderLine</code> object corresponding to how the header would be read.
     * If both string and object values are not null they are used as is.
     * If the string value is null and the object is not null,
     * the object's toString method is called.
     * If the object is null and the string is not null, the string is parsed
     * and validated resulting in an object, if valid.
     * @param fieldName name of field to add
     * @param uriFieldValue <code>URI</code> field value object
     * @param fieldValueStr URI field value string
     * @return <code>HeaderLine</code> object corresponding to what would have been read
     */
    public HeaderLine addHeader(String fieldName, Uri uriFieldValue, String fieldValueStr) {
        if (uriFieldValue == null && fieldValueStr != null) {
            if (WarcConstants.FN_WARC_TARGET_URI.equalsIgnoreCase(fieldName)) {
                uriFieldValue = fieldParsers.parseUri(fieldValueStr, URI_NAKED, warcTargetUriProfile, fieldName);
            } else if (WarcConstants.FN_WARC_PROFILE.equalsIgnoreCase(fieldName)) {
                uriFieldValue = fieldParsers.parseUri(fieldValueStr, URI_NAKED, uriProfile, fieldName);
            } else if (WarcConstants.FN_WARC_REFERS_TO_TARGET_URI.equalsIgnoreCase(fieldName)) {
                uriFieldValue = fieldParsers.parseUri(fieldValueStr, URI_NAKED, warcTargetUriProfile, fieldName);
            } else {
                uriFieldValue = fieldParsers.parseUri(fieldValueStr, URI_LTGT, uriProfile, fieldName);
            }
        } else if (fieldValueStr == null && uriFieldValue != null) {
            if (WarcConstants.FN_WARC_TARGET_URI.equalsIgnoreCase(fieldName)
                    || WarcConstants.FN_WARC_PROFILE.equalsIgnoreCase(fieldName)
                    || WarcConstants.FN_WARC_REFERS_TO_TARGET_URI.equalsIgnoreCase(fieldName)) {
                fieldValueStr = uriFieldValue.toString();
            } else {
                fieldValueStr = "<" + uriFieldValue.toString() + ">";
            }
        }
        return addHeader(fieldName, fieldValueStr, WarcConstants.FDT_URI,
                null, null, null, null, null, null, uriFieldValue);
    }

    /**
     * Add a header with the supplied field name, data type and value and
     * return a <code>HeaderLine</code> corresponding to how the header will
     * be read. The data type is validated against the field data type.
     * The values used are the field value string and the parameter
     * corresponding to the data type.
     * @param fieldName header field name
     * @param fieldValueStr field value in string form
     * @param dt data type of the field value string when converted to an object
     * @param integerFieldValue <code>Integer</code> object field value
     * @param longFieldValue <code>Long</code> object field value
     * @param digestFieldValue <code>Digest</code> object field value
     * @param contentTypeFieldValue <code>ContentType</code> object field value
     * @param dateFieldValue <code>Date</code> object field value
     * @param inetAddrFieldValue <code>InetAddress</code> object field value
     * @param uriFieldValue <code>URI</code> object field value
     * @return <code>HeaderLine</code> object corresponding to what would have been read
     */
    public HeaderLine addHeader(String fieldName, String fieldValueStr, int dt,
            Integer integerFieldValue, Long longFieldValue,
            WarcDigest digestFieldValue, ContentType contentTypeFieldValue,
            Date dateFieldValue, InetAddress inetAddrFieldValue,
            Uri uriFieldValue) {
        Integer fn_idx = WarcConstants.fieldNameIdxMap.get(fieldName.toLowerCase());
        if (fn_idx != null) {
            // Implicit cast from integer to long, if needed.
            if (WarcConstants.FN_IDX_DT[fn_idx] == WarcConstants.FDT_LONG
                    && dt == WarcConstants.FDT_INTEGER) {
                longFieldValue = (long)integerFieldValue;
                dt = WarcConstants.FDT_LONG;
            }
            if (dt == WarcConstants.FN_IDX_DT[fn_idx]) {
                // WARC field name defined in WARC specification.
                if (seen[fn_idx] && !WarcConstants.fieldNamesRepeatableLookup[fn_idx]) {
                    // Duplicate field.
                    addErrorDiagnosis(DiagnosisType.DUPLICATE,
                            "'" + fieldName + "' header",
                            fieldValueStr);
                }
                seen[fn_idx] = true;
                switch (fn_idx.intValue()) {
                /*
                 * Integer.
                 */
                case WarcConstants.FN_IDX_WARC_SEGMENT_NUMBER:
                    warcSegmentNumberStr = fieldValueStr;
                    warcSegmentNumber = integerFieldValue;
                    break;
                /*
                 * Long.
                 */
                case WarcConstants.FN_IDX_CONTENT_LENGTH:
                    contentLengthStr = fieldValueStr;
                    contentLength = longFieldValue;
                    break;
                case WarcConstants.FN_IDX_WARC_SEGMENT_TOTAL_LENGTH:
                    warcSegmentTotalLengthStr = fieldValueStr;
                    warcSegmentTotalLength = longFieldValue;
                    break;
                /*
                 * Digest.
                 */
                case WarcConstants.FN_IDX_WARC_BLOCK_DIGEST:
                    warcBlockDigestStr = fieldValueStr;
                    warcBlockDigest = digestFieldValue;
                    break;
                case WarcConstants.FN_IDX_WARC_PAYLOAD_DIGEST:
                    warcPayloadDigestStr = fieldValueStr;
                    warcPayloadDigest = digestFieldValue;
                    break;
                /*
                 * ContentType.
                 */
                case WarcConstants.FN_IDX_CONTENT_TYPE:
                    contentTypeStr = fieldValueStr;
                    contentType = contentTypeFieldValue;
                    break;
                case WarcConstants.FN_IDX_WARC_IDENTIFIED_PAYLOAD_TYPE:
                    warcIdentifiedPayloadTypeStr = fieldValueStr;
                    warcIdentifiedPayloadType = contentTypeFieldValue;
                    break;
                /*
                 * Date.
                 */
                case WarcConstants.FN_IDX_WARC_DATE:
                    warcDateStr = fieldValueStr;
                    warcDate = dateFieldValue;
                    break;
                case WarcConstants.FN_IDX_WARC_REFERS_TO_DATE:
                    warcRefersToDateStr = fieldValueStr;
                    warcRefersToDate = dateFieldValue;
                    break;
                /*
                 * InetAddress.
                 */
                case WarcConstants.FN_IDX_WARC_IP_ADDRESS:
                    warcIpAddress = fieldValueStr;
                    warcInetAddress = inetAddrFieldValue;
                    break;
                /*
                 * URI.
                 */
                case WarcConstants.FN_IDX_WARC_RECORD_ID:
                    warcRecordIdStr = fieldValueStr;
                    warcRecordIdUri = uriFieldValue;
                    break;
                case WarcConstants.FN_IDX_WARC_CONCURRENT_TO:
                    if (fieldValueStr != null || uriFieldValue != null) {
                        WarcConcurrentTo warcConcurrentTo = new WarcConcurrentTo();
                        warcConcurrentTo.warcConcurrentToStr = fieldValueStr;
                        warcConcurrentTo.warcConcurrentToUri = uriFieldValue;
                        warcConcurrentToList.add(warcConcurrentTo);
                    }
                    break;
                case WarcConstants.FN_IDX_WARC_REFERS_TO:
                    warcRefersToStr = fieldValueStr;
                    warcRefersToUri = uriFieldValue;
                    break;
                case WarcConstants.FN_IDX_WARC_TARGET_URI:
                    warcTargetUriStr = fieldValueStr;
                    warcTargetUriUri = uriFieldValue;
                    break;
                case WarcConstants.FN_IDX_WARC_WARCINFO_ID:
                    warcWarcinfoIdStr = fieldValueStr;
                    warcWarcinfoIdUri = uriFieldValue;
                    break;
                case WarcConstants.FN_IDX_WARC_PROFILE:
                    warcProfileStr = fieldValueStr;
                    warcProfileUri = uriFieldValue;
                    if (warcProfileStr != null) {
                        warcProfileIdx = WarcConstants.profileIdxMap.get(warcProfileStr.toLowerCase());
                    }
                    if (warcProfileIdx == null && warcProfileStr != null && warcProfileStr.length() > 0) {
                        warcProfileIdx = WarcConstants.PROFILE_IDX_UNKNOWN;
                    }
                    break;
                case WarcConstants.FN_IDX_WARC_SEGMENT_ORIGIN_ID:
                    warcSegmentOriginIdStr = fieldValueStr;
                    warcSegmentOriginIdUrl = uriFieldValue;
                    break;
                case WarcConstants.FN_IDX_WARC_REFERS_TO_TARGET_URI:
                    warcRefersToTargetUriStr = fieldValueStr;
                    warcRefersToTargetUriUri = uriFieldValue;
                    break;
                default:
                    break;
                }
            } else {
                // Invalid datatype for field.
                addErrorDiagnosis(DiagnosisType.INVALID_EXPECTED,
                        "Invalid datatype for '" + fieldName + "' header",
                        WarcConstants.FDT_IDX_STRINGS[WarcConstants.FN_IDX_DT[fn_idx]],
                        WarcConstants.FDT_IDX_STRINGS[dt]);
                // Consider throwing exception at some point.
            }
        }
        HeaderLine headerLine = new HeaderLine();
        headerLine.name = fieldName;
        headerLine.value = fieldValueStr;
        HeaderLine tmpLine = headerMap.get(fieldName.toLowerCase());
        if (tmpLine == null) {
            headerMap.put(fieldName.toLowerCase(), headerLine);
        } else {
            tmpLine.lines.add(headerLine);
        }
        return headerLine;
    }

    /**
     * Validate the WARC header relative to the WARC-Type and according to the
     * WARC ISO standard.
     */
    protected void checkFields() {
        bMandatoryMissing = false;

        /*
         * Unknown Warc-Type and/or Warc-Profile.
         */

        if (warcTypeIdx != null && warcTypeIdx == WarcConstants.RT_IDX_UNKNOWN) {
            // Warning: Unknown Warc-Type.
            addWarningDiagnosis(DiagnosisType.UNKNOWN, "'" + WarcConstants.FN_WARC_TYPE + "' value", warcTypeStr);
        }

        if (warcProfileIdx != null && warcProfileIdx == WarcConstants.PROFILE_IDX_UNKNOWN) {
            // Warning: Unknown Warc-Profile.
            addWarningDiagnosis(DiagnosisType.UNKNOWN, "'" + WarcConstants.FN_WARC_PROFILE + "' value", warcProfileStr);
        }

        /*
         * Mandatory fields.
         */

        // TODO Required yes, but is it always invalid.
        if (warcTypeIdx == null) {
            // Mandatory valid Warc-Type missing.
            addErrorDiagnosis(DiagnosisType.REQUIRED_INVALID, "'" + WarcConstants.FN_WARC_TYPE + "' header", warcTypeStr);
            bMandatoryMissing = true;
        }
        if (warcRecordIdUri == null) {
            // Mandatory valid Warc-Record-Id missing.
            addErrorDiagnosis(DiagnosisType.REQUIRED_INVALID, "'" + WarcConstants.FN_WARC_RECORD_ID + "' header", warcRecordIdStr);
            bMandatoryMissing = true;
        }
        if (warcDate == null) {
            // Mandatory valid Warc-Date missing.
            addErrorDiagnosis(DiagnosisType.REQUIRED_INVALID, "'" + WarcConstants.FN_WARC_DATE + "' header", warcDateStr);
            bMandatoryMissing = true;
        }
        if (contentLength == null) {
            // Mandatory valid Content-Length missing.
            addErrorDiagnosis(DiagnosisType.REQUIRED_INVALID, "'" + WarcConstants.FN_CONTENT_LENGTH + "' header", contentLengthStr);
            bMandatoryMissing = true;
        }

        /*
         * Content-Type should be present if Content-Length > 0.
         * Except for continuation records.
         */

        if (contentLength != null && contentLength.longValue() > 0L &&
                        (contentTypeStr == null || contentTypeStr.length() == 0)) {
            if (warcTypeIdx == null || warcTypeIdx != WarcConstants.RT_IDX_CONTINUATION) {
                addWarningDiagnosis(DiagnosisType.RECOMMENDED_MISSING,
                        "'" + WarcConstants.FN_CONTENT_TYPE + "' header");
            }
        }

        /*
         * WARC record type dependent policies.
         */

        if (warcTypeIdx != null) {
            /*
             * Warcinfo record should have "application/warc-fields" content-type.
             */

            if (warcTypeIdx == WarcConstants.RT_IDX_WARCINFO) {
                if (contentType != null &&
                        (!contentType.contentType.equals("application")
                        || !contentType.mediaType.equals("warc-fields"))) {
                    addWarningDiagnosis(DiagnosisType.RECOMMENDED,
                            "'" + WarcConstants.FN_CONTENT_TYPE + "' value",
                            WarcConstants.CT_APP_WARC_FIELDS,
                            contentTypeStr);
                }
            }

            if (warcTypeIdx == WarcConstants.RT_IDX_RESPONSE) {
                if (warcSegmentNumber != null && warcSegmentNumber != 1) {
                    addErrorDiagnosis(DiagnosisType.INVALID_EXPECTED,
                            "'" + WarcConstants.FN_WARC_SEGMENT_NUMBER + "' value",
                            warcSegmentNumber.toString(),
                            "1");
                }
            }

            if (warcTypeIdx == WarcConstants.RT_IDX_CONTINUATION) {
                if (warcSegmentNumber != null && warcSegmentNumber < 2) {
                    addErrorDiagnosis(DiagnosisType.INVALID_EXPECTED,
                            "'" + WarcConstants.FN_WARC_SEGMENT_NUMBER + "' value",
                            warcSegmentNumber.toString(),
                            ">1");
                }
            }

            /*
             * Check the policies for each field.
             */

            WarcConcurrentTo warcConcurrentTo;
            if (warcTypeIdx  > 0) {
                checkFieldPolicy(warcTypeIdx, WarcConstants.FN_IDX_CONTENT_TYPE, contentType, contentTypeStr);
                checkFieldPolicy(warcTypeIdx, WarcConstants.FN_IDX_WARC_IP_ADDRESS, warcInetAddress, warcIpAddress);
                for (int i=0; i<warcConcurrentToList.size(); ++i) {
                    warcConcurrentTo = warcConcurrentToList.get(0);
                    checkFieldPolicy(warcTypeIdx, WarcConstants.FN_IDX_WARC_CONCURRENT_TO, warcConcurrentTo.warcConcurrentToUri, warcConcurrentTo.warcConcurrentToStr);
                }
                checkFieldPolicy(warcTypeIdx, WarcConstants.FN_IDX_WARC_REFERS_TO, warcRefersToUri, warcRefersToStr);
                checkFieldPolicy(warcTypeIdx, WarcConstants.FN_IDX_WARC_TARGET_URI, warcTargetUriUri, warcTargetUriStr);
                checkFieldPolicy(warcTypeIdx, WarcConstants.FN_IDX_WARC_TRUNCATED, warcTruncatedIdx, warcTruncatedStr);
                checkFieldPolicy(warcTypeIdx, WarcConstants.FN_IDX_WARC_WARCINFO_ID, warcWarcinfoIdUri, warcWarcinfoIdStr);
                checkFieldPolicy(warcTypeIdx, WarcConstants.FN_IDX_WARC_BLOCK_DIGEST, warcBlockDigest, warcBlockDigestStr);
                checkFieldPolicy(warcTypeIdx, WarcConstants.FN_IDX_WARC_PAYLOAD_DIGEST, warcPayloadDigest, warcPayloadDigestStr);
                checkFieldPolicy(warcTypeIdx, WarcConstants.FN_IDX_WARC_FILENAME, warcFilename, warcFilename);
                // Could also use warcProfileIdx for really strict.
                checkFieldPolicy(warcTypeIdx, WarcConstants.FN_IDX_WARC_PROFILE, warcProfileUri, warcProfileStr);
                checkFieldPolicy(warcTypeIdx, WarcConstants.FN_IDX_WARC_IDENTIFIED_PAYLOAD_TYPE, warcIdentifiedPayloadType, warcIdentifiedPayloadTypeStr);
                checkFieldPolicy(warcTypeIdx, WarcConstants.FN_IDX_WARC_SEGMENT_NUMBER, warcSegmentNumber, warcSegmentNumberStr);
                checkFieldPolicy(warcTypeIdx, WarcConstants.FN_IDX_WARC_SEGMENT_ORIGIN_ID, warcSegmentOriginIdUrl, warcSegmentOriginIdStr);
                checkFieldPolicy(warcTypeIdx, WarcConstants.FN_IDX_WARC_SEGMENT_TOTAL_LENGTH, warcSegmentTotalLength, warcSegmentTotalLengthStr);
                checkFieldPolicy(warcTypeIdx, WarcConstants.FN_IDX_WARC_REFERS_TO_TARGET_URI, warcRefersToTargetUriUri, warcRefersToTargetUriStr);
                checkFieldPolicy(warcTypeIdx, WarcConstants.FN_IDX_WARC_REFERS_TO_DATE, warcRefersToDate, warcRefersToDateStr);
            }
        }
    }

    /**
     * Given a WARC record type and a WARC field looks up the policy in a
     * matrix build from the WARC ISO standard.
     * @param recordType WARC record type id
     * @param fieldType WARC field type id
     * @param fieldObj WARC field
     * @param valueStr WARC raw field value
     */
    protected void checkFieldPolicy(int recordType, int fieldType, Object fieldObj, String valueStr) {
        int policy = WarcConstants.field_policy[recordType][fieldType];
        switch (policy) {
        case WarcConstants.POLICY_MANDATORY:
            if (fieldObj == null) {
                addErrorDiagnosis(DiagnosisType.REQUIRED_INVALID,
                        "'" + WarcConstants.FN_IDX_STRINGS[fieldType] + "' value",
                        valueStr);
            }
            break;
        case WarcConstants.POLICY_SHALL:
            if (fieldObj == null) {
                addErrorDiagnosis(DiagnosisType.REQUIRED_INVALID,
                        "'" + WarcConstants.FN_IDX_STRINGS[fieldType] + "' value",
                        valueStr);
            }
            break;
        case WarcConstants.POLICY_SHALL_NOT:
            if (fieldObj != null) {
                addErrorDiagnosis(DiagnosisType.UNDESIRED_DATA,
                        "'" + WarcConstants.FN_IDX_STRINGS[fieldType] + "' value",
                        valueStr);
            }
            break;
        case WarcConstants.POLICY_MAY_NOT:
            if (fieldObj != null) {
                addWarningDiagnosis(DiagnosisType.UNDESIRED_DATA,
                        "'" + WarcConstants.FN_IDX_STRINGS[fieldType] + "' value",
                        valueStr);
            }
            break;
        case WarcConstants.POLICY_MAY:
        case WarcConstants.POLICY_IGNORE:
        default:
            break;
        }
    }

}
TOP

Related Classes of org.jwat.warc.WarcHeader

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.