// $Id: AnselToUnicode.java,v 1.9 2010/07/08 14:54:18 haschart Exp $
/**
* Copyright (C) 2002 Bas Peters (mail@bpeters.com)
*
* This file is part of MARC4J
*
* MARC4J is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* MARC4J is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with MARC4J; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
package org.marc4j.converter.impl;
import java.io.InputStream;
import java.lang.reflect.Constructor;
import java.util.Vector;
import org.marc4j.ErrorHandler;
import org.marc4j.MarcException;
import org.marc4j.converter.CharConverter;
/**
* <p>
* A utility to convert MARC-8 data to non-precomposed UCS/Unicode.
* </p>
*
* <p>
* The MARC-8 to Unicode mapping used is the version with the March 2005
* revisions.
* </p>
*
* @author Bas Peters
* @author Corey Keith
* @version $Revision: 1.9 $
*/
public class AnselToUnicode extends CharConverter {

    /** Control character that introduces a character set escape sequence. */
    private static final int ESCAPE = 0x1B;

    /** Space; tolerated (and reported) when it turns up inside escape sequences or multibyte characters. */
    private static final int SPACE = 0x20;

    /** G0 set selected at the start of every conversion and restored on fallback ('B'). */
    private static final int DEFAULT_G0 = 0x42;

    /** G1 set selected at the start of every conversion and restored on fallback ('E'). */
    private static final int DEFAULT_G1 = 0x45;

    /** Character set code used to look up three-byte (multibyte) characters ('1'). */
    private static final int MULTIBYTE_SET = 0x31;

    /** Final characters accepted by {@link #set_cdt} as character set codes. */
    private static final String KNOWN_SET_CODES = "34BE1NQS2";

    /** Intermediate characters that are reported and skipped when they appear where a set code is expected. */
    private static final String INTERMEDIATE_CHARS = "(,)-$!";

    /**
     * Minimal first-in-first-out queue on top of {@link Vector}; used by
     * {@link AnselToUnicode#convert(char[])} to hold combining characters until
     * the base character they decorate has been written.
     */
    class Queue extends Vector<Object> {

        /**
         * Puts an item into the queue.
         *
         * @param item
         *            the item to be put into the queue.
         * @return the item that was added
         */
        public Object put(Object item) {
            addElement(item);
            return item;
        }

        /**
         * Removes and returns the item at the front of the queue.
         *
         * @return the former front item
         * @throws ArrayIndexOutOfBoundsException
         *             if the queue is empty
         */
        public Object get() {
            Object front = peek();
            removeElementAt(0);
            return front;
        }

        /**
         * Returns the item at the front of the queue without removing it.
         *
         * @return the front item
         * @throws ArrayIndexOutOfBoundsException
         *             if the queue is empty
         */
        public Object peek() {
            return elementAt(0);
        }

        /**
         * Returns true if the queue is empty.
         */
        public boolean empty() {
            return size() == 0;
        }
    }

    /**
     * Mutable conversion state: the read position in the input and the
     * character sets currently designated as G0 and G1.
     */
    class CodeTracker {
        /** Index of the next input character to process. */
        int offset;
        /** Code of the set used for characters up to 0x7E (see {@code getChar}). */
        int g0;
        /** Code of the set used for characters above 0x7E (see {@code getChar}). */
        int g1;
        /** True while input is being read as three-byte characters. */
        boolean multibyte;

        public String toString() {
            return "Offset: " + offset + " G0: " + Integer.toHexString(g0)
                    + " G1: " + Integer.toHexString(g1) + " Multibyte: "
                    + multibyte;
        }
    }

    protected CodeTableInterface ct;

    protected boolean loadedMultibyte = false;

    /**
     * When non-null, problems in the input are recorded here and conversion
     * continues; when null, most problems raise a {@link MarcException}.
     */
    protected ErrorHandler errorList = null;

    /**
     * Creates a new instance using the MARC4J supplied conversion tables
     * based on the official LC tables. Multibyte mappings are not requested
     * up front; they are loaded on demand the first time a multibyte escape
     * sequence is encountered. Malformed input raises a {@link MarcException}.
     */
    public AnselToUnicode() {
        ct = loadGeneratedTable(false);
    }

    /**
     * Creates a new instance using the MARC4J supplied conversion tables
     * based on the official LC tables. Malformed input raises a
     * {@link MarcException}.
     *
     * @param loadMultibyte
     *            true to load the table that includes the multibyte mappings
     *            immediately rather than on demand
     */
    public AnselToUnicode(boolean loadMultibyte) {
        ct = loadGeneratedTable(loadMultibyte);
    }

    /**
     * Creates a new instance using the MARC4J supplied conversion tables
     * based on the official LC tables, recording problems found in the input
     * instead of throwing. Multibyte mappings are loaded on demand.
     *
     * @param errorList
     *            receives the errors found while converting; if null, most
     *            problems raise a {@link MarcException} instead
     */
    public AnselToUnicode(ErrorHandler errorList) {
        ct = loadGeneratedTable(false);
        this.errorList = errorList;
    }

    /**
     * Creates a new instance using the MARC4J supplied conversion tables
     * based on the official LC tables, recording problems found in the input
     * instead of throwing.
     *
     * @param errorList
     *            receives the errors found while converting; if null, most
     *            problems raise a {@link MarcException} instead
     * @param loadMultibyte
     *            true to load the table that includes the multibyte mappings
     *            immediately rather than on demand
     */
    public AnselToUnicode(ErrorHandler errorList, boolean loadMultibyte) {
        ct = loadGeneratedTable(loadMultibyte);
        this.errorList = errorList;
    }

    /**
     * Obtains the code table. The pre-generated table class is instantiated
     * reflectively when it is available; otherwise the XML resource bundled
     * next to this class is parsed.
     *
     * @param loadMultibyte
     *            selects the XML resource used for the fallback: the full
     *            table when true, the table without CJK mappings when false
     * @return the code table to use for conversion
     */
    private CodeTableInterface loadGeneratedTable(boolean loadMultibyte) {
        try {
            Class<?> generated = Class.forName("org.marc4j.converter.impl.CodeTableGenerated");
            Constructor<?> cons = generated.getConstructor();
            CodeTableInterface table = (CodeTableInterface) cons.newInstance();
            // With the generated table in place the XML multibyte table is never loaded lazily.
            loadedMultibyte = true;
            return table;
        } catch (Exception e) {
            // Deliberate best-effort fallback: any failure to obtain the
            // generated table means the XML definition is parsed instead.
            String resource = loadMultibyte ? "resources/codetables.xml"
                    : "resources/codetablesnocjk.xml";
            CodeTableInterface table = new CodeTable(AnselToUnicode.class.getResourceAsStream(resource));
            loadedMultibyte = loadMultibyte;
            return table;
        }
    }

    /**
     * Constructs an instance with the specified pathname.
     *
     * Use this constructor to create an instance with a customized code table
     * mapping. The mapping file should follow the structure of LC's XML MARC-8
     * to Unicode mapping (see:
     * http://www.loc.gov/marc/specifications/codetables.xml).
     *
     * @param pathname
     *            location of the mapping file, passed on to {@code CodeTable}
     */
    public AnselToUnicode(String pathname) {
        ct = new CodeTable(pathname);
        // A custom table is never replaced by the bundled multibyte table.
        loadedMultibyte = true;
    }

    /**
     * Constructs an instance with the specified input stream.
     *
     * Use this constructor to create an instance with a customized code table
     * mapping. The mapping file should follow the structure of LC's XML MARC-8
     * to Unicode mapping (see:
     * http://www.loc.gov/marc/specifications/codetables.xml).
     *
     * @param in
     *            stream providing the mapping file, passed on to {@code CodeTable}
     */
    public AnselToUnicode(InputStream in) {
        ct = new CodeTable(in);
        // A custom table is never replaced by the bundled multibyte table.
        loadedMultibyte = true;
    }

    /**
     * Loads the entire mapping (including multibyte characters) from the Library
     * of Congress, replacing the current code table.
     */
    private void loadMultibyte() {
        ct = new CodeTable(getClass().getResourceAsStream(
                "resources/codetables.xml"));
    }

    /**
     * Skips the escape character of a sequence whose character set code is not
     * recognised, recording the problem or throwing in strict mode.
     */
    private void discardUnknownEscape(CodeTracker cdt) {
        cdt.offset += 1;
        if (errorList != null) {
            errorList.addError(ErrorHandler.MINOR_ERROR, "Unknown character set code found following escape character. Discarding escape character.");
        } else {
            throw new MarcException("Unknown character set code found following escape character.");
        }
    }

    /**
     * Skips the escape character of a sequence that is cut off by the end of
     * the field, recording the problem or throwing in strict mode.
     */
    private void discardTruncatedEscape(CodeTracker cdt) {
        cdt.offset += 1;
        if (errorList != null) {
            errorList.addError(ErrorHandler.MINOR_ERROR, "Incomplete character set escape sequence found at end of field. Discarding escape character.");
        } else {
            throw new MarcException("Incomplete character set escape sequence found at end of field.");
        }
    }

    /**
     * Consumes every character set escape sequence that starts at
     * {@code cdt.offset}, updating the G0/G1/multibyte state and advancing the
     * offset past each sequence. Returns immediately when the character at the
     * offset is not an escape character.
     *
     * @param data
     *            the field being converted
     * @param cdt
     *            conversion state, updated in place
     * @throws MarcException
     *             if a sequence is malformed and no error handler is set
     */
    private void checkMode(char[] data, CodeTracker cdt) {
        // Stray spaces seen so far inside the sequence currently being parsed:
        // 'extra' directly after the escape character, 'extra2' after "ESC $".
        int extra = 0;
        int extra2 = 0;
        // Stray spaces over all sequences handled by this call (for the report).
        int totalSpaces = 0;
        while (cdt.offset + extra + extra2 < data.length && isEscape(data[cdt.offset])) {
            if (cdt.offset + extra + extra2 + 1 == data.length) {
                cdt.offset += 1;
                if (errorList != null) {
                    errorList.addError(ErrorHandler.MINOR_ERROR, "Escape character found at end of field, discarding it.");
                } else {
                    throw new MarcException("Escape character found at end of field");
                }
                break;
            }
            switch (data[cdt.offset + 1 + extra]) {
            case '(':
            case ',':
                set_cdt(cdt, 0, data, 2 + extra, false);
                break;
            case ')':
            case '-':
                set_cdt(cdt, 1, data, 3 - 1 + extra, false);
                break;
            case '$': {
                int next = cdt.offset + 2 + extra + extra2;
                if (next >= data.length) {
                    // "ESC $" is the last thing in the field: nothing to designate.
                    discardTruncatedEscape(cdt);
                    break;
                }
                if (!loadedMultibyte) {
                    loadMultibyte();
                    loadedMultibyte = true;
                }
                switch (data[next]) {
                case ')':
                case '-':
                    set_cdt(cdt, 1, data, 3 + extra + extra2, true);
                    break;
                case ',':
                    set_cdt(cdt, 0, data, 3 + extra + extra2, true);
                    break;
                case '1':
                    cdt.g0 = data[next];
                    cdt.offset += 3 + extra + extra2;
                    cdt.multibyte = true;
                    break;
                case ' ':
                    // space found in escape code: look ahead and try to proceed
                    extra2++;
                    totalSpaces++;
                    continue;
                default:
                    // unknown code character found: discard escape sequence and return
                    discardUnknownEscape(cdt);
                    break;
                }
                break;
            }
            case 'g':
            case 'b':
            case 'p':
                cdt.g0 = data[cdt.offset + 1 + extra];
                cdt.offset += 2 + extra;
                cdt.multibyte = false;
                break;
            case 's':
                cdt.g0 = DEFAULT_G0;
                cdt.offset += 2 + extra;
                cdt.multibyte = false;
                break;
            case ' ':
                // space found in escape code: look ahead and try to proceed
                if (errorList == null) {
                    throw new MarcException("Extraneous space character found within MARC8 character set escape sequence");
                }
                extra++;
                totalSpaces++;
                continue;
            default:
                // unknown code character found: discard escape sequence and return
                discardUnknownEscape(cdt);
                break;
            }
            // The sequence was consumed or discarded and the offset has moved:
            // the space counts must not leak into the parsing of a following
            // escape sequence.
            extra = 0;
            extra2 = 0;
        }
        if (errorList != null && totalSpaces != 0) {
            errorList.addError(ErrorHandler.ERROR_TYPO, "" + totalSpaces + " extraneous space characters found within MARC8 character set escape sequence");
        }
    }

    /**
     * Reads the character set code of an escape sequence and designates it as
     * G0 or G1. One stray space or intermediate character before the code is
     * reported and skipped. If the code is unknown, or the field ends before
     * the code, only the escape character is discarded and multibyte mode is
     * switched off.
     *
     * @param cdt
     *            conversion state; {@code cdt.offset} points at the escape character
     * @param g0_or_g1
     *            0 to designate G0, anything else to designate G1
     * @param data
     *            the field being converted
     * @param addnlOffset
     *            distance from the escape character to the expected set code
     * @param multibyte
     *            multibyte mode to switch to when the designation succeeds
     * @throws MarcException
     *             if the sequence is malformed and no error handler is set
     */
    private void set_cdt(CodeTracker cdt, int g0_or_g1, char[] data, int addnlOffset, boolean multibyte) {
        int pos = cdt.offset + addnlOffset;
        if (pos >= data.length) {
            cdt.multibyte = false;
            discardTruncatedEscape(cdt);
            return;
        }
        char lead = data[pos];
        if (lead == '!' && pos + 1 < data.length && data[pos + 1] == 'E') {
            // "!E" form: the set code is the character after the '!'.
            pos++;
        } else if (lead == ' ') {
            if (errorList != null) {
                errorList.addError(ErrorHandler.ERROR_TYPO, "Extraneous space character found within MARC8 character set escape sequence. Skipping over space.");
            } else {
                throw new MarcException("Extraneous space character found within MARC8 character set escape sequence");
            }
            pos++;
        } else if (INTERMEDIATE_CHARS.indexOf(lead) != -1) {
            if (errorList != null) {
                errorList.addError(ErrorHandler.MINOR_ERROR, "Extraneaous intermediate character found following escape character. Discarding intermediate character.");
            } else {
                throw new MarcException("Extraneaous intermediate character found following escape character.");
            }
            pos++;
        }
        if (pos >= data.length) {
            // The skipped character was the last one in the field.
            cdt.multibyte = false;
            discardTruncatedEscape(cdt);
            return;
        }
        if (KNOWN_SET_CODES.indexOf(data[pos]) == -1) {
            cdt.multibyte = false;
            discardUnknownEscape(cdt);
        } else {
            // All is well, proceed normally
            if (g0_or_g1 == 0) {
                cdt.g0 = data[pos];
            } else {
                cdt.g1 = data[pos];
            }
            cdt.offset = pos + 1;
            cdt.multibyte = multibyte;
        }
    }

    /** Switches back to single-byte mode with the default G0 and G1 sets. */
    private void resetToDefaultCharset(CodeTracker cdt) {
        cdt.multibyte = false;
        cdt.g0 = DEFAULT_G0;
        cdt.g1 = DEFAULT_G1;
    }

    /**
     * <p>
     * Converts MARC-8 data to UCS/Unicode.
     * </p>
     *
     * <p>
     * Combining characters are written after the character they decorate.
     * Single-byte characters without a mapping are written as
     * {@code <U+xxxx>}, where {@code xxxx} is the input value in hexadecimal.
     * </p>
     *
     * @param data - the MARC-8 data in an array of char
     * @return String - the UCS/Unicode data
     * @throws MarcException
     *             if the data is malformed and no error handler is set
     */
    public String convert(char data[]) {
        StringBuilder sb = new StringBuilder();
        int len = data.length;

        CodeTracker cdt = new CodeTracker();
        cdt.g0 = DEFAULT_G0;
        cdt.g1 = DEFAULT_G1;
        cdt.multibyte = false;
        cdt.offset = 0;

        checkMode(data, cdt);

        Queue diacritics = new Queue();

        while (cdt.offset < len) {
            if (ct.isCombining(data[cdt.offset], cdt.g0, cdt.g1)
                    && hasNext(cdt.offset, len)) {
                // Collect the run of combining characters; they are emitted
                // after the base character that follows them.
                while (cdt.offset < len && ct.isCombining(data[cdt.offset], cdt.g0, cdt.g1)
                        && hasNext(cdt.offset, len)) {
                    char c = getChar(data[cdt.offset], cdt.g0, cdt.g1);
                    if (c != 0) {
                        diacritics.put(Character.valueOf(c));
                    }
                    cdt.offset++;
                    checkMode(data, cdt);
                }
                if (cdt.offset >= len) {
                    // Only escape sequences followed the diacritics, so there
                    // is no base character left to read.
                    if (errorList == null) {
                        throw new MarcException("Diacritic found at the end of field, without the character that it is supposed to decorate");
                    }
                    errorList.addError(ErrorHandler.MINOR_ERROR, "Diacritic found at the end of field, without the character that it is supposed to decorate");
                    break;
                }
                char c2 = getChar(data[cdt.offset], cdt.g0, cdt.g1);
                cdt.offset++;
                checkMode(data, cdt);
                if (c2 != 0) {
                    sb.append(c2);
                }
                while (!diacritics.isEmpty()) {
                    char c1 = ((Character) diacritics.get()).charValue();
                    sb.append(c1);
                }
            } else if (cdt.multibyte) {
                convertMultibyte(data, cdt, sb);
            } else {
                char c = getChar(data[cdt.offset], cdt.g0, cdt.g1);
                if (c != 0) {
                    sb.append(c);
                } else {
                    String val = "0000" + Integer.toHexString((int) (data[cdt.offset]));
                    sb.append("<U+" + (val.substring(val.length() - 4, val.length())) + ">");
                }
                cdt.offset += 1;
            }
            if (hasNext(cdt.offset, len)) {
                checkMode(data, cdt);
            }
        }
        return sb.toString();
    }

    /**
     * Handles the input at {@code cdt.offset} while in multibyte mode: either
     * appends output and advances the offset, or switches back to the default
     * single-byte sets so that the same input is re-read as single-byte data.
     */
    private void convertMultibyte(char[] data, CodeTracker cdt, StringBuilder sb) {
        int pos = cdt.offset;
        if (data[pos] == SPACE) {
            // if a 0x20 byte occurs amidst a sequence of multibyte characters
            // skip over it and output a space.
            sb.append(getChar(data[pos], cdt.g0, cdt.g1));
            cdt.offset += 1;
        } else if (pos + 3 <= data.length
                && (errorList == null || data[pos + 1] != SPACE && data[pos + 2] != SPACE)) {
            char c = getMBChar(makeMultibyte(data[pos], data[pos + 1], data[pos + 2]));
            if (errorList == null || c != 0) {
                sb.append(c);
                cdt.offset += 3;
            }
            // From here on errorList is known to be non-null.
            else if (pos + 6 <= data.length && data[pos + 4] != SPACE && data[pos + 5] != SPACE
                    && getMBChar(makeMultibyte(data[pos + 3], data[pos + 4], data[pos + 5])) != 0) {
                // The following three bytes form a valid character: drop only the bad one.
                errorList.addError(ErrorHandler.MINOR_ERROR, "Erroneous MARC8 multibyte character, Discarding bad character and continuing reading Multibyte characters");
                sb.append("[?]");
                cdt.offset += 3;
            } else if (pos + 4 <= data.length && data[pos] > 0x7f
                    && getMBChar(makeMultibyte(data[pos + 1], data[pos + 2], data[pos + 3])) != 0) {
                // A stray high byte precedes a valid character: copy it using the default sets.
                errorList.addError(ErrorHandler.MINOR_ERROR, "Erroneous character in MARC8 multibyte character, Copying bad character and continuing reading Multibyte characters");
                sb.append(getChar(data[pos], DEFAULT_G0, DEFAULT_G1));
                cdt.offset += 1;
            } else {
                errorList.addError(ErrorHandler.MINOR_ERROR, "Erroneous MARC8 multibyte character, inserting change to default character set");
                resetToDefaultCharset(cdt);
            }
        } else if (errorList != null && pos + 4 <= data.length
                && (data[pos + 1] == SPACE || data[pos + 2] == SPACE)) {
            // A space sits inside the three-byte character: rebuild the
            // character from the non-space bytes and emit the space after it.
            char middle = (data[pos + 1] != SPACE) ? data[pos + 1] : data[pos + 2];
            char c = getMBChar(makeMultibyte(data[pos], middle, data[pos + 3]));
            if (c != 0) {
                errorList.addError(ErrorHandler.ERROR_TYPO, "Extraneous space found within MARC8 multibyte character");
                sb.append(c);
                sb.append(' ');
                cdt.offset += 4;
            } else {
                errorList.addError(ErrorHandler.MINOR_ERROR, "Erroneous MARC8 multibyte character, inserting change to default character set");
                resetToDefaultCharset(cdt);
            }
        } else if (pos + 3 > data.length
                || pos + 3 == data.length && (data[pos + 1] == SPACE || data[pos + 2] == SPACE)) {
            if (errorList != null) {
                errorList.addError(ErrorHandler.MINOR_ERROR, "Partial MARC8 multibyte character, inserting change to default character set");
                resetToDefaultCharset(cdt);
            }
            // if a field ends with an incomplete encoding of a multibyte character
            // simply discard that final partial character.
            else {
                cdt.offset += 3;
            }
        }
    }

    /**
     * Packs the first three characters of {@code data} into one integer.
     *
     * @see #makeMultibyte(char, char, char)
     */
    private int makeMultibyte(char[] data) {
        return makeMultibyte(data[0], data[1], data[2]);
    }

    /**
     * Packs three characters into the single integer used as the lookup key
     * for a multibyte character.
     *
     * @param c1 placed in bits 16 and up
     * @param c2 placed in bits 8 and up
     * @param c3 placed in the low bits
     * @return {@code (c1 << 16) | (c2 << 8) | c3}
     */
    public int makeMultibyte(char c1, char c2, char c3) {
        return (c1 << 16) | (c2 << 8) | c3;
    }

    /**
     * Looks up a single-byte character: values up to 0x7E are resolved in the
     * G0 set, all higher values in the G1 set.
     */
    private char getChar(int ch, int g0, int g1) {
        return ct.getChar(ch, (ch <= 0x7E) ? g0 : g1);
    }

    /**
     * Looks up a multibyte character.
     *
     * @param ch the key produced by {@link #makeMultibyte(char, char, char)}
     * @return the result of the code table lookup; the callers in this class
     *         treat 0 as "no mapping"
     */
    public char getMBChar(int ch) {
        return ct.getChar(ch, MULTIBYTE_SET);
    }

    /** Returns true if at least one more character follows position {@code pos}. */
    private static boolean hasNext(int pos, int len) {
        return pos < (len - 1);
    }

    /** Returns true if {@code i} is the escape character. */
    private static boolean isEscape(int i) {
        return i == ESCAPE;
    }
}