Source Code of bixo.parser.DOMParser

/*
 * Copyright 2009-2013 Scale Unlimited
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
package bixo.parser;


import java.io.InputStream;
import java.io.StringReader;
import java.util.Locale;

import javax.xml.XMLConstants;

import org.ccil.cowan.tagsoup.Parser;
import org.dom4j.Document;
import org.dom4j.io.SAXReader;
import org.hsqldb.lib.StringInputStream;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
import org.xml.sax.helpers.XMLFilterImpl;

import com.scaleunlimited.cascading.NullContext;

import bixo.datum.ParsedDatum;
import bixo.utils.IoUtils;
import cascading.flow.FlowProcess;
import cascading.operation.BaseOperation;
import cascading.operation.Function;
import cascading.operation.FunctionCall;
import cascading.operation.OperationCall;
import cascading.tuple.Fields;
import cascading.tuple.TupleEntryCollector;




@SuppressWarnings({"serial", "rawtypes"})
public abstract class DOMParser extends BaseOperation<NullContext> implements Function<NullContext> {


    /**
     * Lowercase element names, and optionally strip out XML namespace, so that XPath can be easily 
     * used to extract elements.
     *
     */
    private static class DowngradeXmlFilter extends XMLFilterImpl {
        
        private boolean _removeNamespaces;
        
        public DowngradeXmlFilter(boolean removeNamespaces) {
            super();
            
            _removeNamespaces = removeNamespaces;
        }
        
        @Override
        public void startElement(String uri, String localName, String name, Attributes atts) throws SAXException {
            // Always lower-case element names, for easier XPath matching
            String lower = localName.toLowerCase();


            if (_removeNamespaces) {
                AttributesImpl attributes = new AttributesImpl();
                for (int i = 0; i < atts.getLength(); i++) {
                    String local = atts.getLocalName(i);
                    String qname = atts.getQName(i);
                    if (!XMLConstants.NULL_NS_URI.equals(atts.getURI(i).length())
                                    && !local.equals(XMLConstants.XMLNS_ATTRIBUTE)
                                    && !qname.startsWith(XMLConstants.XMLNS_ATTRIBUTE + ":")) {
                        attributes.addAttribute(
                                        atts.getURI(i), local, qname,
                                        atts.getType(i), atts.getValue(i));
                    }
                }


                super.startElement(XMLConstants.NULL_NS_URI, lower, lower, attributes);
            } else {
                super.startElement(uri, lower, lower, atts);
            }
        }


        @Override
        public void endElement(String uri, String localName, String name) throws SAXException {
            String lower = localName.toLowerCase();
            super.endElement(XMLConstants.NULL_NS_URI, lower, lower);
        }
    }
    
    
    private boolean _removeNamespaces;
    
    private transient SAXReader _reader = null;
    private transient ParsedDatum _input;
    
    public DOMParser(Fields outputFields) {
        this(outputFields, true);
    }
    
    public DOMParser(Fields outputFields, boolean removeNamespaces) {
        super(outputFields);
        
        _removeNamespaces = removeNamespaces;
    }
    
    @Override
    public void prepare(FlowProcess process, OperationCall<NullContext> opCall) {
        super.prepare(process, opCall);
        
        _reader = new SAXReader(new Parser());
        _reader.setXMLFilter(new DowngradeXmlFilter(_removeNamespaces));
        _reader.setEncoding("UTF-8");
        _input = new ParsedDatum();
    }
    
    @Override
    public boolean isSafe() {
        // Parsing is computationally intensive, so we don't want to get run
        // multiple times.
        return false;
    }
    
    @Override
    public void operate(FlowProcess process, FunctionCall<NullContext> funcCall) {
        _input.setTupleEntry(funcCall.getArguments());
        InputStream is = new StringInputStream(_input.getParsedText());
        
        try {
            Document parsedContent = _reader.read(is);
            process(_input, parsedContent, funcCall.getOutputCollector(), process);
        } catch (Exception e) {
            handleException(_input, e, funcCall.getOutputCollector());
        } finally {
            IoUtils.safeClose(is);
        }


    }
    
    
    /**
     * The _input ParsedDatum was successfully converted into a Dom4J Document.
     * at this point you would typically emit one or more output tuples (with
     * appropriate fields), using the collector.
     * 
     * @param datum Input datum, which wraps a Cascading Tuple.
     * @param doc Result of converting incoming XML document to a Dom4J Document
     * @param collector Collector to use if you want to emit tuples.
     * @param process The FlowProcess for this operation.
     */
    protected abstract void process(ParsedDatum datum, Document doc, TupleEntryCollector collector, FlowProcess process) throws Exception;
    
    /**
     * An exception occurred while parsing or processing the _input ParsedDatum. Options are to
     * ignore it, emit a tuple (with appropriate fields), or throw a RuntimeException
     * to kill the job.
     * 
     * @param datum Input datum, which wraps a Cascading Tuple.
     * @param e Exception while parsing or processing document
     * @param collector Collector to use if you want to emit a tuple.
     */
    protected abstract void handleException(ParsedDatum datum, Exception e, TupleEntryCollector collector);
}
Source Code of bixo.parser.DOMParser

Related Classes of bixo.parser.DOMParser