public Document getDocument( final InputStream rawContent, final Reference2ObjectMap<Enum<?>,Object> metadata ) throws IOException {
return new AbstractDocument() {
final DataInputStream rawContentDataInputStream = new DataInputStream( rawContent );
int nextFieldToRead = 0;
final MutableString uri = new MutableString();
{
uri.readSelfDelimUTF8( rawContent ).compact();
}
/** Closes this document and then the underlying raw-content stream.
 *
 * <p>The stream is closed in a <code>finally</code> clause, so it is released
 * even when the superclass <code>close()</code> throws.
 *
 * @throws IOException if closing the superclass or the underlying stream fails.
 */
@Override
public void close() throws IOException {
	try {
		super.close();
	}
	finally {
		// Always release the stream handed to getDocument(), whatever happened above.
		rawContent.close();
	}
}
/** Returns the title of this document, as found in the metadata map.
 *
 * @return the value associated with <code>MetadataKeys.TITLE</code> in the metadata
 * passed at creation time, cast to a character sequence.
 */
public CharSequence title() {
	final Object storedTitle = metadata.get( MetadataKeys.TITLE );
	return (CharSequence)storedTitle;
}
/** Returns the title of this document as a string.
 *
 * <p>When no title is available (<code>title()</code> returns <code>null</code>) the
 * string <code>"null"</code> is returned instead of throwing a
 * <code>NullPointerException</code>, so that this method is always safe to use in
 * log messages.
 *
 * @return the string form of the title, or <code>"null"</code> if there is no title.
 */
@Override
public String toString() {
	final CharSequence documentTitle = title();
	return documentTitle == null ? "null" : documentTitle.toString();
}
/** Returns the URI of this document.
 *
 * @return the URI stored in the <code>uri</code> field, or <code>null</code> if
 * that string is empty.
 */
public CharSequence uri() {
	if ( uri.length() != 0 ) return uri;
	return null;
}
/** Skips until the end of the current field, and increments <code>nextFieldToRead</code>.
 *
 * <p>The way the field is skipped depends on its type:
 * <ul>
 * <li>a text field is read word by word (and nonword by nonword, in exact mode)
 * until the empty terminator is found;
 * <li>a virtual field starts with a fragment count, followed by two
 * self-delimiting strings per fragment, which are skipped;
 * <li>any other field is a serialised object, which is read and discarded.
 * </ul>
 *
 * @throws ClassNotFoundException if the class of the serialised object of a
 * non-text, non-virtual field cannot be found.
 * @throws IOException if reading from the underlying stream fails.
 */
private void skipOneField() throws IOException, ClassNotFoundException {
	switch( fieldType( nextFieldToRead ) ) {
	case TEXT:
		// The strings must actually be decoded: an empty word (and, in exact mode,
		// an empty nonword too) is what marks the end of the field.
		MutableString word = new MutableString();
		MutableString nonWord = new MutableString();
		do {
			word.readSelfDelimUTF8( rawContent );
			if ( exact ) nonWord.readSelfDelimUTF8( rawContent );
		} while ( word.length() > 0 || ( exact && nonWord.length() > 0 ) );
		break;
	case VIRTUAL:
		final int nfrag = rawContentDataInputStream.readInt();
		// Skip the two strings of each fragment pair by pair: computing 2 * nfrag
		// would overflow for large counts and silently skip nothing.
		for ( int i = 0; i < nfrag; i++ ) {
			MutableString.skipSelfDelimUTF8( rawContent );
			MutableString.skipSelfDelimUTF8( rawContent );
		}
		break;
	default: // Non-text and non-virtual
		new ObjectInputStream( rawContent ).readObject();
	}
	nextFieldToRead++;
}
/** Advances the underlying stream up to the start of the given field.
 *
 * <p>Fields between <code>nextFieldToRead</code> (inclusive) and <code>field</code>
 * (exclusive) are skipped one at a time. It is not possible to go backwards.
 *
 * @param field the field to skip to.
 * @throws IllegalStateException if <code>field</code> precedes the next field to be read.
 * @throws IOException if skipping a field fails while reading the stream.
 * @throws ClassNotFoundException if skipping a field fails while deserialising an object.
 */
private void skipToField( final int field ) throws IOException, ClassNotFoundException {
	final int current = nextFieldToRead;
	if ( field < current ) {
		throw new IllegalStateException( "Trying to skip to field " + field + " after " + current );
	}
	while ( field > nextFieldToRead ) {
		skipOneField();
	}
}
/** Returns the content of the given field, skipping any preceding unread field.
 *
 * <p>The raw content is read sequentially, so fields must be requested in
 * increasing order. The kind of object returned depends on the type of the field:
 * <ul>
 * <li>for a virtual field, an <code>ObjectArrayList</code> of
 * <code>AnchorExtractor.Anchor</code> fragments, read eagerly;
 * <li>for a text field, a <code>Reader</code> that decodes the whole text lazily,
 * on its first <code>read()</code> (only then is the field marked as consumed);
 * <li>for any other field, the object deserialised from the stream.
 * </ul>
 *
 * @param field the index of the requested field.
 * @return the content of the field, as described above.
 * @throws IllegalStateException if a field following <code>field</code> has already been reached.
 * @throws RuntimeException wrapping any <code>IOException</code> or
 * <code>ClassNotFoundException</code> raised while reading the stream.
 */
public Object content( final int field ) {
	ensureFieldIndex( field );
	Object result = null;
	if ( DEBUG ) LOGGER.debug( "Called content(" + field + "); nextField:" + nextFieldToRead );
	try {
		skipToField( field );
		if ( fieldType( nextFieldToRead ) == FieldType.VIRTUAL ) {
			final int nfrag = rawContentDataInputStream.readInt();
			MutableString doc = new MutableString();
			MutableString text = new MutableString();
			VirtualDocumentFragment[] fragArray = new VirtualDocumentFragment[ nfrag ];
			for ( int i = 0; i < nfrag; i++ ) {
				doc.readSelfDelimUTF8( rawContent );
				text.readSelfDelimUTF8( rawContent );
				// The two buffers are reused across iterations, hence the copies.
				fragArray[ i ] = new AnchorExtractor.Anchor( doc.copy(), text.copy() );
			}
			result = new ObjectArrayList<VirtualDocumentFragment>( fragArray );
			// The whole field has been consumed: record it, as the other branches do, so
			// that later calls do not try to skip this field again and desynchronise the stream.
			nextFieldToRead++;
		}
		else if ( fieldType( nextFieldToRead ) != FieldType.TEXT ) {
			// NOTE(review): native Java deserialisation; presumably the collection comes
			// from a trusted source -- confirm before using on external data.
			result = new ObjectInputStream( rawContent ).readObject();
			if ( DEBUG ) LOGGER.debug( "Read " + result + " from field " + fieldName( nextFieldToRead ) + " of object " + title() );
			nextFieldToRead++;
		}
		else {
			if ( DEBUG ) LOGGER.debug( "Returning reader for " + field );
			result = new Reader() {
				/** The reader over the decoded text; created on the first read. */
				FastBufferedReader fbr = null;
				/** The field this reader was created for (used for logging only). */
				int f = field;
				@Override
				public void close() {}
				@Override
				public int read( final char[] cbuf, final int off, final int len ) throws IOException {
					if ( fbr == null ) {
						if ( DEBUG ) LOGGER.debug( "Initialising reader for content " + f );
						// Rebuild the text by concatenating words and nonwords (or words
						// separated by spaces, when not in exact mode) up to the terminator.
						MutableString text = new MutableString();
						MutableString word = new MutableString();
						MutableString nonWord = new MutableString();
						do {
							text.append( word.readSelfDelimUTF8( rawContent ) );
							if ( exact ) text.append( nonWord.readSelfDelimUTF8( rawContent ) );
							else text.append( ' ' );
						} while ( word.length() > 0 || ( exact && nonWord.length() > 0 ) );
						fbr = new FastBufferedReader( text );
						// Only now has the field actually been consumed from the stream.
						nextFieldToRead++;
					}
					return fbr.read( cbuf, off, len );
				}
			};
		}
	} catch ( IOException e ) {
		throw new RuntimeException( e );
	} catch (ClassNotFoundException e) {
		throw new RuntimeException( e );
	}
	return result;
}
public WordReader wordReader( final int field ) {
ensureFieldIndex( field );
if ( DEBUG ) LOGGER.debug( "Called wordReader(" + field + ")" );
try {
skipToField( field );
} catch ( Exception e ) {
throw new RuntimeException( e );
}
//logger.debug( "Asked for a new word reader for field " + fieldName( field ) );
switch ( fieldType( field ) ) {
case TEXT:
return new WordReader() {
private static final long serialVersionUID = 1L;
public boolean next( final MutableString word, final MutableString nonWord ) throws IOException {
try {
word.readSelfDelimUTF8( rawContent );
}
catch( EOFException e ) {
return false; // TODO: a bit raw
}
nonWord.length( 0 );
if ( exact ) {
try {
nonWord.readSelfDelimUTF8( rawContent );
}
catch( EOFException e ) {
return true; // TODO: a bit raw
}
}
else nonWord.append( ' ' );
final boolean goOn = word.length() != 0 || ( exact && nonWord.length() != 0 );
if ( DEBUG ) LOGGER.debug( "Got word <" + word + "|" + nonWord + "> exact=" + exact + " returning " + goOn );
if ( ! goOn ) nextFieldToRead++;
return goOn;
}
public WordReader setReader( final Reader reader ) {