try {
parseLeader(ldr, byteArray);
directoryLength = ldr.getBaseAddressOfData() - (24 + 1);
}
catch (IOException e) {
throw new MarcException("error parsing leader with data: "
+ new String(byteArray), e);
}
catch (MarcException e) {
if (permissive)
{
if (recordBuf[recordBuf.length-1] == Constants.RT && recordBuf[recordBuf.length-2] == Constants.FT)
{
errors.addError("unknown", "n/a", "n/a", ErrorHandler.MAJOR_ERROR,
"Error parsing leader, trying to re-read leader either shorter or longer");
// make an attempt to recover record.
int offset = 0;
while (offset < recordBuf.length)
{
if (recordBuf[offset] == Constants.FT)
{
break;
}
offset++;
}
if (offset % 12 == 1)
{
// move one byte from body to leader, make new leader, and try again
errors.addError("unknown", "n/a", "n/a", ErrorHandler.MAJOR_ERROR,
"Leader appears to be too short, moving one byte from record body to leader, and trying again");
byte oldBody[] = recordBuf;
recordBuf = new byte[oldBody.length-1];
System.arraycopy(oldBody, 1, recordBuf, 0, oldBody.length-1);
directoryLength = offset-1;
ldr.setIndicatorCount(2);
ldr.setSubfieldCodeLength(2);
ldr.setImplDefined1((""+(char)byteArray[7]+" ").toCharArray());
ldr.setImplDefined2((""+(char)byteArray[18]+(char)byteArray[19]+(char)byteArray[20]).toCharArray());
ldr.setEntryMap("4500".toCharArray());
if (byteArray[10] == (byte)' ' || byteArray[10] == (byte)'a') // if its ' ' or 'a'
{
ldr.setCharCodingScheme((char)byteArray[10]);
}
}
else if (offset % 12 == 11)
{
errors.addError("unknown", "n/a", "n/a", ErrorHandler.MAJOR_ERROR,
"Leader appears to be too long, moving one byte from leader to record body, and trying again");
byte oldBody[] = recordBuf;
recordBuf = new byte[oldBody.length+1];
System.arraycopy(oldBody, 0, recordBuf, 1, oldBody.length);
recordBuf[0] = (byte)'0';
directoryLength = offset+1;
ldr.setIndicatorCount(2);
ldr.setSubfieldCodeLength(2);
ldr.setImplDefined1((""+(char)byteArray[7]+" ").toCharArray());
ldr.setImplDefined2((""+(char)byteArray[16]+(char)byteArray[17]+(char)byteArray[18]).toCharArray());
ldr.setEntryMap("4500".toCharArray());
if (byteArray[8] == (byte)' ' || byteArray[8] == (byte)'a') // if its ' ' or 'a'
{
ldr.setCharCodingScheme((char)byteArray[10]);
}
if (byteArray[10] == (byte)' ' || byteArray[10] == (byte)'a') // if its ' ' or 'a'
{
ldr.setCharCodingScheme((char)byteArray[10]);
}
}
else
{
errors.addError("unknown", "n/a", "n/a", ErrorHandler.FATAL,
"error parsing leader with data: " + new String(byteArray));
throw new MarcException("error parsing leader with data: "
+ new String(byteArray), e);
}
}
}
else
{
throw new MarcException("error parsing leader with data: "
+ new String(byteArray), e);
}
}
char tmp[] = ldr.getEntryMap();
if (permissive && !(""+ tmp[0]+tmp[1]+tmp[2]+tmp[3]).equals("4500"))
{
if (tmp[0] >= '0' && tmp[0] <= '9' &&
tmp[1] >= '0' && tmp[1] <= '9' &&
tmp[2] >= '0' && tmp[2] <= '9' &&
tmp[3] >= '0' && tmp[3] <= '9')
{
errors.addError("unknown", "n/a", "n/a", ErrorHandler.ERROR_TYPO,
"Unusual character found at end of leader [ "+tmp[0]+tmp[1]+tmp[2]+tmp[3]+" ]");
}
else
{
errors.addError("unknown", "n/a", "n/a", ErrorHandler.ERROR_TYPO,
"Erroneous character found at end of leader [ "+tmp[0]+tmp[1]+tmp[2]+tmp[3]+" ]; changing them to the standard \"4500\"");
ldr.setEntryMap("4500".toCharArray());
}
}
// if MARC 21 then check encoding
switch (ldr.getCharCodingScheme()) {
case 'a':
encoding = "UTF8";
break;
case ' ':
if (convertToUTF8)
encoding = defaultEncoding;
else
encoding = "ISO8859_1";
break;
default:
if (convertToUTF8)
if (permissive)
{
errors.addError("unknown", "n/a", "n/a", ErrorHandler.MINOR_ERROR,
"Record character encoding should be 'a' or ' ' in this record it is '"+ldr.getCharCodingScheme()+"'. Attempting to guess the correct encoding.");
encoding = "BESTGUESS";
}
else
encoding = defaultEncoding;
else
encoding = "ISO8859_1";
break;
}
String utfCheck;
if (encoding.equalsIgnoreCase("BESTGUESS"))
{
try
{
String marc8EscSeqCheck = new String(recordBuf, "ISO-8859-1");
// If record has MARC8 character set selection strings, it must be MARC8 encoded
if (marc8EscSeqCheck.split("\\e[-(,)$bsp]", 2).length > 1)
{
encoding = "MARC8";
}
else
{
boolean hasHighBitChars = false;
for (int i = 0; i < recordBuf.length; i++)
{
if (recordBuf[i] < 0) // the high bit is set
{
hasHighBitChars = true;
break;
}
}
if (!hasHighBitChars)
{
encoding = "ISO8859_1"; // You can choose any encoding you want here, the results will be the same.
}
else
{
utfCheck = new String(recordBuf, "UTF-8");
byte byteCheck[] = utfCheck.getBytes("UTF-8");
encoding = "UTF8";
if (recordBuf.length == byteCheck.length)
{
for (int i = 0; i < recordBuf.length; i++)
{
if (byteCheck[i] != recordBuf[i])
{
encoding = "MARC8-Maybe";
break;
}
}
}
else
{
encoding = "MARC8-Maybe";
}
}
}
}
catch (UnsupportedEncodingException e)
{
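// Should never happen: ISO-8859-1 and UTF-8 are charsets every Java platform must support.
// TODO: report this through the error handler instead of printing a stack trace.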
e.printStackTrace();
}
}
else if (permissive && encoding.equals("UTF8"))
{
try
{
utfCheck = new String(recordBuf, "UTF-8");
byte byteCheck[] = utfCheck.getBytes("UTF-8");
if (recordBuf.length != byteCheck.length)
{
boolean foundESC = false;
for (int i = 0; i < recordBuf.length; i++)
{
if (recordBuf[i] == 0x1B)
{
errors.addError("unknown", "n/a", "n/a", ErrorHandler.MINOR_ERROR,
"Record claims to be UTF-8, but its not. Its probably MARC8.");
encoding = "MARC8-Maybe";
foundESC = true;
break;
}
if (byteCheck[i] != recordBuf[i])
{
encoding = "MARC8-Maybe";
}
}
if (!foundESC)
{
errors.addError("unknown", "n/a", "n/a", ErrorHandler.MINOR_ERROR,
"Record claims to be UTF-8, but its not. It may be MARC8, or maybe UNIMARC, or maybe raw ISO-8859-1 ");
}
}
if (utfCheck.contains("a$1!"))
{
encoding = "MARC8-Broken";
errors.addError("unknown", "n/a", "n/a", ErrorHandler.MAJOR_ERROR,
"Record claims to be UTF-8, but its not. It seems to be MARC8-encoded but with missing escape codes.");
}
}
catch (UnsupportedEncodingException e)
{
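// Should never happen: UTF-8 is a charset every Java platform must support.
// TODO: report this through the error handler instead of printing a stack trace.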
e.printStackTrace();
}
}
else if (permissive && !encoding.equals("UTF8") && convertToUTF8)
{
try
{
utfCheck = new String(recordBuf, "UTF-8");
byte byteCheck[] = utfCheck.getBytes("UTF-8");
if (recordBuf.length == byteCheck.length)
{
for (int i = 0; i < recordBuf.length; i++)
{
// A negative byte value means the high bit is set, because Java's byte type is signed.
if (recordBuf[i] < 0x00 || byteCheck[i] != recordBuf[i])
{
errors.addError("unknown", "n/a", "n/a", ErrorHandler.MINOR_ERROR,
"Record claims not to be UTF-8, but it seems to be.");
encoding = "UTF8-Maybe";
break;
}
}
}
}
catch (UnsupportedEncodingException e)
{
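// Should never happen: UTF-8 is a charset every Java platform must support.
// TODO: report this through the error handler instead of printing a stack trace.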
e.printStackTrace();
}
}
record.setLeader(ldr);
boolean discardOneAtStartOfDirectory = false;
boolean discardOneSomewhereInDirectory = false;
if ((directoryLength % 12) != 0)
{
if (permissive && directoryLength % 12 == 11 && recordBuf[1] != (byte)'0')
{
errors.addError("unknown", "n/a", "n/a", ErrorHandler.MAJOR_ERROR,
"Directory length is not a multiple of 12 bytes long. Prepending a zero and trying to continue.");
byte oldBody[] = recordBuf;
recordBuf = new byte[oldBody.length+1];
System.arraycopy(oldBody, 0, recordBuf, 1, oldBody.length);
recordBuf[0] = (byte)'0';
directoryLength = directoryLength+1;
}
else
{
if (permissive && directoryLength % 12 == 1 && recordBuf[1] == (byte)'0' && recordBuf[2] == (byte)'0')
{
discardOneAtStartOfDirectory = true;
errors.addError("unknown", "n/a", "n/a", ErrorHandler.MAJOR_ERROR,
"Directory length is not a multiple of 12 bytes long. Discarding byte from start of directory and trying to continue.");
}
else if (permissive && directoryLength % 12 == 1 && recordLength > 10000 && recordBuf[0] == (byte)'0' &&
recordBuf[1] == (byte)'0' && recordBuf[2] > (byte)'0' && recordBuf[2] <= (byte)'9')
{
discardOneSomewhereInDirectory = true;
errors.addError("unknown", "n/a", "n/a", ErrorHandler.MAJOR_ERROR,
"Directory length is not a multiple of 12 bytes long. Will look for oversized field and try to work around it.");
}
else
{
if (errors != null)
{
errors.addError("unknown", "n/a", "n/a", ErrorHandler.FATAL,
"Directory length is not a multiple of 12 bytes long. Unable to continue.");
}
throw new MarcException("Directory length is not a multiple of 12 bytes long. Unable to continue.");
}
}
}
DataInputStream inputrec = new DataInputStream(new ByteArrayInputStream(recordBuf));
int size = directoryLength / 12;
String[] tags = new String[size];
int[] lengths = new int[size];
byte[] tag = new byte[3];
byte[] length = new byte[4];
byte[] start = new byte[5];
String tmpStr;
try {
if (discardOneAtStartOfDirectory) inputrec.read();
int totalOffset = 0;
for (int i = 0; i < size; i++)
{
inputrec.readFully(tag);
tmpStr = new String(tag);
tags[i] = tmpStr;
boolean proceedNormally = true;
if (discardOneSomewhereInDirectory)
{
byte lenCheck[] = new byte[10];
inputrec.mark(20);
inputrec.readFully(lenCheck);
if (byteCompare(lenCheck, 4, 5, totalOffset)) // proceed normally
{
proceedNormally = true;
}
else if (byteCompare(lenCheck, 5, 5, totalOffset)) // field length is 5 bytes! Bad Marc record, proceed normally
{
discardOneSomewhereInDirectory = false;
errors.addError("unknown", "n/a", "n/a", ErrorHandler.FATAL,
"Field is longer than 9999 bytes. Writing this record out will result in a bad record.");
proceedNormally = false;
}
else
{
errors.addError("unknown", "n/a", "n/a", ErrorHandler.FATAL,
"Unable to reconcile problems in directory. Unable to continue.");
throw new MarcException("Directory length is not a multiple of 12 bytes long. Unable to continue.");
}
inputrec.reset();
}
if (proceedNormally)
{
inputrec.readFully(length);
tmpStr = new String(length);
lengths[i] = Integer.parseInt(tmpStr);
inputrec.readFully(start);
}
else // length is 5 bytes long
{
inputrec.readFully(start);
tmpStr = new String(start);
lengths[i] = Integer.parseInt(tmpStr);
inputrec.readFully(start);
}
totalOffset += lengths[i];
}
// If we still haven't found the extra byte, throw out the last byte and try to continue.
if (discardOneSomewhereInDirectory) inputrec.read();
if (inputrec.read() != Constants.FT)
{
errors.addError("unknown", "n/a", "n/a", ErrorHandler.FATAL,
"Expected field terminator at end of directory. Unable to continue.");
throw new MarcException("expected field terminator at end of directory");
}
int numBadLengths = 0;
int totalLength = 0;
for (int i = 0; i < size; i++)
{
int fieldLength = getFieldLength(inputrec);
if (fieldLength+1 != lengths[i] && permissive)
{
if (numBadLengths < 3 && (totalLength + fieldLength < recordLength + 26))
{
inputrec.mark(9999);
byteArray = new byte[lengths[i]];
inputrec.readFully(byteArray);
inputrec.reset();
if (fieldLength+1 < lengths[i] && byteArray[lengths[i]-1] == Constants.FT)
{
errors.addError("unknown", "n/a", "n/a", ErrorHandler.MINOR_ERROR,
"Field Terminator character found in the middle of a field.");
}
else
{
numBadLengths++;
lengths[i] = fieldLength+1;
errors.addError("unknown", "n/a", "n/a", ErrorHandler.MINOR_ERROR,
"Field length found in record different from length stated in the directory.");
if (fieldLength+1 > 9999)
{
errors.addError("unknown", "n/a", "n/a", ErrorHandler.FATAL,
"Field length is greater than 9999, record cannot be represented as a binary Marc record.");
}
}
}
}
totalLength += lengths[i];
if (isControlField(tags[i]))
{
byteArray = new byte[lengths[i] - 1];
inputrec.readFully(byteArray);
if (inputrec.read() != Constants.FT)
{
errors.addError("unknown", "n/a", "n/a", ErrorHandler.FATAL,
"Expected field terminator at end of field. Unable to continue.");
throw new MarcException("expected field terminator at end of field");
}
ControlField field = factory.newControlField();
field.setTag(tags[i]);
field.setData(getDataAsString(byteArray));
record.addVariableField(field);
}
else
{
byteArray = new byte[lengths[i]];
inputrec.readFully(byteArray);
try {
record.addVariableField(parseDataField(tags[i], byteArray));
} catch (IOException e) {
throw new MarcException(
"error parsing data field for tag: " + tags[i]
+ " with data: "
+ new String(byteArray), e);
}
}
}
// We've determined that although the record says it is UTF-8, it is not.
// Here we make an attempt to determine the actual encoding of the data in the record.
if (permissive && conversionCheck1.length() > 1 &&
conversionCheck2.length() > 1 && conversionCheck3.length() > 1)
{
guessAndSelectCorrectNonUTF8Encoding();
}
if (inputrec.read() != Constants.RT)
{
errors.addError("unknown", "n/a", "n/a", ErrorHandler.FATAL,
"Expected record terminator at end of record. Unable to continue.");
throw new MarcException("expected record terminator");
}
}
catch (IOException e)
{
errors.addError("unknown", "n/a", "n/a", ErrorHandler.FATAL,
"Error reading from data file. Unable to continue.");
throw new MarcException("an error occured reading input", e);
}
}