Package com.digitalpebble.behemoth.tika

Source Code of com.digitalpebble.behemoth.tika.TikaMarkupHandler

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.digitalpebble.behemoth.tika;

import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;

import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;

import com.digitalpebble.behemoth.Annotation;

/*******************************************************************************
* SAX handler which receives events from the Tika parser and creates Behemoth
* annotations accordingly.
*
******************************************************************************/

public class TikaMarkupHandler implements BehemothHandler {

    private StringBuffer textBuffer;

    private List<Annotation> annotationBuffer;

    private LinkedList<Annotation> startedAnnotations;

    public TikaMarkupHandler() {
        textBuffer = new StringBuffer();
        annotationBuffer = new LinkedList<Annotation>();
        startedAnnotations = new LinkedList<Annotation>();
    }

    public void characters(char[] ch, int start, int length)
            throws SAXException {
        textBuffer.append(ch, start, length);
    }

    public void startDocument() throws SAXException {
        textBuffer = new StringBuffer();
        annotationBuffer = new LinkedList<Annotation>();
        startedAnnotations = new LinkedList<Annotation>();
    }

    public void endDocument() throws SAXException {
        // there should be no annotation left at this stage
        if (startedAnnotations.size() != 0) {
            // TODO log + error message
        }
    }

    public void startElement(String uri, String localName, String qName,
            Attributes atts) throws SAXException {
        int startOffset = textBuffer.length();

        Annotation annot = new Annotation();
        annot.setStart(startOffset);
        // use the localname as a type
        annot.setType(localName);
        // convert the attributes into features
        for (int i = 0; i < atts.getLength(); i++) {
            String key = atts.getLocalName(i);
            String value = atts.getValue(i);
            annot.getFeatures().put(key, value);
        }
        this.startedAnnotations.addLast(annot);
    }

    public void endElement(String uri, String localName, String qName)
            throws SAXException {

        int endOffset = textBuffer.length();

        // add a \n after the head if the text is not empty
        // i.e. there is a title
        if (localName.equals("head") && endOffset > 0)
            textBuffer.append("\n");

        // try to get the corresponding annotation
        // we start from the last temporary
        // and go up the stack
        Iterator<Annotation> iter = startedAnnotations.iterator();
        Annotation startedAnnot = null;
        while (iter.hasNext()) {
            Annotation temp = iter.next();
            if (temp.getType().equals(localName)) {
                startedAnnot = temp;
                break;
            }
        }
        // found something?
        if (startedAnnot == null) {
            // TODO log etc...
            return;
        }

        startedAnnot.setEnd(endOffset);
        startedAnnotations.remove(startedAnnot);
        annotationBuffer.add(startedAnnot);
    }

    public void ignorableWhitespace(char[] ch, int start, int length)
            throws SAXException {
        characters(ch, start, length);
    }

    // the following methods are simply ignored

    public void startPrefixMapping(String prefix, String uri)
            throws SAXException {
    }

    public void endPrefixMapping(String prefix) throws SAXException {
    }

    public void setDocumentLocator(Locator locator) {
    }

    public void skippedEntity(String name) throws SAXException {
    }

    public void processingInstruction(String target, String data)
            throws SAXException {
    }

    public String getText() {
        return this.textBuffer.toString();
    }

    public List<Annotation> getAnnotations() {
        return this.annotationBuffer;
    }

}
TOP

Related Classes of com.digitalpebble.behemoth.tika.TikaMarkupHandler

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.