package it.unimi.dsi.mg4j.util.parser.callback;
/*
* MG4J: Managing Gigabytes for Java
*
* Copyright (C) 2006-2010 Paolo Boldi
*
* This library is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by the Free
* Software Foundation; either version 3 of the License, or (at your option)
* any later version.
*
* This library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, see <http://www.gnu.org/licenses/>.
*
*/
import it.unimi.dsi.fastutil.objects.ObjectArrayList;
import it.unimi.dsi.fastutil.objects.ObjectList;
import it.unimi.dsi.mg4j.tool.Scan.VirtualDocumentFragment;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.parser.Attribute;
import it.unimi.dsi.parser.BulletParser;
import it.unimi.dsi.parser.Element;
import it.unimi.dsi.parser.callback.DefaultCallback;
import it.unimi.dsi.util.CircularCharArrayBuffer;
import java.util.Map;
import org.apache.log4j.Logger;
/** A callback extracting anchor text. When instantiating the extractor, you can specify the number of characters to
* be considered before the anchor, after the anchor or during the anchor (just the first characters are taken into
* consideration in the last two cases, and just the last ones in the first case).
*
* <p>At the end of parsing, the result (the list of anchors) is available in {@link #anchors}, whose
* elements provide the content of the <samp>href</samp> attribute and
* the text of the anchor and around the anchor; the text is however modified so that fragments of words at the beginning
* of the pre-anchor context, or at the end of the post-anchor context, are cut away.
*
* <p>For example, a fragment like:
*
* <code>
* ...foo fOO FOO FOO &lt;a href="xxx"&gt;ANCHOR TEXT&lt;/a&gt; BAR BAR BAr bar...
* </code>
*
* (where the uppercase part represents the pre- and post-anchor context) generates the element
*
* <code>
* Anchor("xxx", "FOO FOO ANCHOR TEXT BAR BAR")
* </code>
*/
public class AnchorExtractor extends DefaultCallback {
    /** A class representing an anchor. It is used to return the results of parsing. */
    public final static class Anchor implements VirtualDocumentFragment {
        private static final long serialVersionUID = 1L;
        /** The content of the <samp>href</samp> attribute for this anchor. */
        private final MutableString href;
        /** The text of this anchor together with its surrounding context. */
        private final MutableString anchorText;

        /** Creates a new anchor; both arguments are stored by reference (no copy is made).
         *
         * @param href the content of the <samp>href</samp> attribute.
         * @param anchorText the pre-anchor context, anchor text and post-anchor context, concatenated.
         */
        public Anchor( final MutableString href, final MutableString anchorText ) {
            this.href = href;
            this.anchorText = anchorText;
        }

        /** Returns the content of the <samp>href</samp> attribute.
         *
         * @return the content of the <samp>href</samp> attribute.
         */
        public MutableString documentSpecifier() {
            return href;
        }

        /** Returns the anchor text with its context.
         *
         * @return the anchor text with its context.
         */
        public MutableString text() {
            return anchorText;
        }

        public String toString() {
            return "<" + href + ", \"" + anchorText + "\">";
        }
    }

    public static final Logger LOGGER = Logger.getLogger( AnchorExtractor.class );
    public static final boolean DEBUG = false;

    /** The resulting list of {@linkplain Anchor anchors}. */
    public final ObjectList<Anchor> anchors = new ObjectArrayList<Anchor>();
    /** The circular buffer for pre-anchor context. */
    private final CircularCharArrayBuffer preAnchor;
    /** The text of the current anchor (at most {@link #maxAnchor} characters are accumulated). */
    private final MutableString anchor;
    /** The maximum number of characters before the anchor (the capacity requested for {@link #preAnchor}). */
    private final int maxBefore;
    /** The maximum number of characters in the anchor. */
    private final int maxAnchor;
    /** The maximum number of characters after anchor. */
    private final int maxAfter;
    /** The post-anchor context (at most {@link #maxAfter} characters are accumulated). */
    private final MutableString postAnchor;
    /** The URL of the anchor that has been opened but not yet emitted, or <code>null</code> if there is none. */
    private MutableString url;
    /** A scratch buffer used to assemble the resulting string (pre+anchor+post). */
    private final MutableString result;
    /** When an anchor opens, the pre-anchor buffer is copied in this array. */
    private char[] preAnchorArray;

    private enum State {
        BEFORE_ANCHOR, IN_ANCHOR, AFTER_ANCHOR
    }

    private State state;

    /** Creates a new anchor extractor.
     *
     * @param maxBefore maximum number of characters to be considered before the anchor.
     * @param maxAnchor maximum number of characters to be considered inside the anchor.
     * @param maxAfter maximum number of characters to be considered after the anchor.
     */
    public AnchorExtractor( int maxBefore, int maxAnchor, int maxAfter ) {
        preAnchor = new CircularCharArrayBuffer( maxBefore );
        anchor = new MutableString( maxAnchor );
        postAnchor = new MutableString( maxAfter );
        result = new MutableString( maxBefore + maxAnchor + maxAfter );
        this.maxBefore = maxBefore;
        this.maxAfter = maxAfter;
        this.maxAnchor = maxAnchor;
        state = State.BEFORE_ANCHOR;
    }

    /** Asks the parser for tags, attributes (just <samp>href</samp>) and text.
     *
     * @param parser the parser to be configured.
     */
    public void configure( final BulletParser parser ) {
        parser.parseTags( true );
        parser.parseAttributes( true );
        parser.parseText( true );
        parser.parseAttribute( Attribute.HREF );
    }

    /** Resets the extractor: the list of anchors and all buffers are emptied. */
    public void startDocument() {
        state = State.BEFORE_ANCHOR;
        anchors.clear();
        preAnchor.clear();
        anchor.setLength( 0 );
        postAnchor.setLength( 0 );
        url = null;
    }

    /** Emits the anchor that is still pending (if any) when the document ends. */
    public void endDocument() {
        if ( url != null ) {
            emit();
        }
        url = null;
    }

    /** Handles the opening of an <samp>a</samp> element carrying an <samp>href</samp> attribute.
     *
     * <p>If a previous anchor is still collecting its post-anchor context, that anchor is emitted first.
     * Then the pre-anchor buffer is frozen and the new anchor starts. An <samp>a</samp> element
     * opened while another anchor is still open is ignored (its text ends up in the open anchor).
     *
     * @param element the element that was opened.
     * @param attrMap the attributes of the element (possibly <code>null</code>).
     * @return always true (parsing continues).
     */
    public boolean startElement( final Element element, final Map<Attribute,MutableString> attrMap ) {
        if ( element == Element.A && attrMap != null && attrMap.containsKey( Attribute.HREF ) ) {
            if ( state == State.AFTER_ANCHOR ) {
                emit();
                state = State.BEFORE_ANCHOR;
            }
            if ( state == State.BEFORE_ANCHOR ) {
                preAnchorArray = preAnchor.toCharArray();
                preAnchor.clear();
                if ( DEBUG ) System.out.println( "Freezing now pre: <" + new String( preAnchorArray ) + ">" );
                state = State.IN_ANCHOR;
                url = attrMap.get( Attribute.HREF );
                anchor.setLength( 0 );
                postAnchor.setLength( 0 );
            }
        }
        return true;
    }

    /** Handles the closing of an <samp>a</samp> element: from now on text goes to the post-anchor context.
     *
     * @param element the element that was closed.
     * @return always true (parsing continues).
     */
    public boolean endElement( final Element element ) {
        if ( element == Element.A && state == State.IN_ANCHOR ) {
            state = State.AFTER_ANCHOR;
        }
        return true;
    }

    /** Routes a run of text to the buffer selected by the current state.
     *
     * <p>Text after an anchor is added both to the post-anchor context of that anchor and to the
     * pre-anchor buffer (it may become the pre-anchor context of the next anchor). As soon as the
     * post-anchor context reaches <code>maxAfter</code> characters the pending anchor is emitted.
     *
     * @param characters the array containing the text.
     * @param offset the first character of the text.
     * @param length the number of characters of text.
     * @param flowBroken ignored by this callback.
     * @return always true (parsing continues).
     */
    public boolean characters( final char[] characters, final int offset, final int length, final boolean flowBroken ) {
        switch ( state ) {
        case BEFORE_ANCHOR:
            preAnchor.add( characters, offset, length );
            break;
        case IN_ANCHOR:
            // Only the first maxAnchor characters of the anchor are kept.
            anchor.append( characters, offset, Math.min( length, maxAnchor - anchor.length() ) );
            break;
        case AFTER_ANCHOR:
            preAnchor.add( characters, offset, length );
            // Only the first maxAfter characters following the anchor are kept.
            postAnchor.append( characters, offset, Math.min( length, maxAfter - postAnchor.length() ) );
            break;
        }
        if ( state == State.AFTER_ANCHOR && postAnchor.length() == maxAfter && url != null ) {
            emit();
            state = State.BEFORE_ANCHOR;
        }
        return true;
    }

    /** Returns the given length reduced so that the prefix does not end with a letter or digit.
     *
     * @param a a character array.
     * @param length the number of valid characters at the start of <code>a</code>.
     * @return the largest <code>k</code> &le; <code>length</code> such that <code>k</code> is zero or
     * <code>a[k - 1]</code> is not a letter or digit.
     */
    private static int lengthWithoutTrailingWord( final char[] a, int length ) {
        while ( length > 0 && Character.isLetterOrDigit( a[ length - 1 ] ) ) length--;
        return length;
    }

    /** Builds the text for the pending anchor (pre-anchor context, anchor, post-anchor context),
     * adds the resulting {@link Anchor} to {@link #anchors} and clears {@link #url}.
     *
     * <p>A run of letters/digits at the outer boundary of a piece is cut away only when that piece
     * is as long as its limit, that is, only when it may have been truncated in the middle of a word.
     * Pieces shorter than their limit contain all the available text (e.g., because the document
     * ended or another anchor started) and are kept whole.
     */
    private void emit() {
        // Pre-anchor context: skip a leading word fragment, but only if the circular buffer was full.
        // NOTE(review): this assumes CircularCharArrayBuffer.toCharArray() returns just the stored
        // characters (so a shorter array means nothing was discarded) -- confirm against dsiutils.
        int posPre = 0;
        if ( preAnchorArray.length >= maxBefore )
            while ( posPre < preAnchorArray.length && Character.isLetterOrDigit( preAnchorArray[ posPre ] ) ) posPre++;

        // Post-anchor context: drop a trailing word fragment only if the context was cut at maxAfter.
        final char[] postAnchorArray = postAnchor.array();
        int postLength = postAnchor.length();
        if ( postLength == maxAfter ) postLength = lengthWithoutTrailingWord( postAnchorArray, postLength );

        // Anchor text: drop a trailing word fragment only if the text was cut at maxAnchor.
        final char[] anchorArray = anchor.array();
        int anchorLength = anchor.length();
        if ( anchorLength == maxAnchor ) anchorLength = lengthWithoutTrailingWord( anchorArray, anchorLength );

        result.setLength( 0 );
        result.append( preAnchorArray, posPre, preAnchorArray.length - posPre )
            .append( anchorArray, 0, anchorLength )
            .append( postAnchorArray, 0, postLength );
        // The scratch buffer is reused, so the anchor gets its own copy of the text.
        anchors.add( new Anchor( url, result.copy() ) );
        url = null;
    }
}