Package bixo.parser

Source Code of bixo.parser.DOMParser

/*
* Copyright 2009-2013 Scale Unlimited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package bixo.parser;

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.Locale;

import javax.xml.XMLConstants;

import org.ccil.cowan.tagsoup.Parser;
import org.dom4j.Document;
import org.dom4j.io.SAXReader;
import org.hsqldb.lib.StringInputStream;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
import org.xml.sax.helpers.XMLFilterImpl;

import com.scaleunlimited.cascading.NullContext;

import bixo.datum.ParsedDatum;
import bixo.utils.IoUtils;
import cascading.flow.FlowProcess;
import cascading.operation.BaseOperation;
import cascading.operation.Function;
import cascading.operation.FunctionCall;
import cascading.operation.OperationCall;
import cascading.tuple.Fields;
import cascading.tuple.TupleEntryCollector;


@SuppressWarnings({"serial", "rawtypes"})
public abstract class DOMParser extends BaseOperation<NullContext> implements Function<NullContext> {

    /**
     * Lowercase element names, and optionally strip out XML namespace, so that XPath can be easily
     * used to extract elements.
     *
     */
    private static class DowngradeXmlFilter extends XMLFilterImpl {
       
        private boolean _removeNamespaces;
       
        public DowngradeXmlFilter(boolean removeNamespaces) {
            super();
           
            _removeNamespaces = removeNamespaces;
        }
       
        @Override
        public void startElement(String uri, String localName, String name, Attributes atts) throws SAXException {
            // Always lower-case element names, for easier XPath matching
            String lower = localName.toLowerCase();

            if (_removeNamespaces) {
                AttributesImpl attributes = new AttributesImpl();
                for (int i = 0; i < atts.getLength(); i++) {
                    String local = atts.getLocalName(i);
                    String qname = atts.getQName(i);
                    if (!XMLConstants.NULL_NS_URI.equals(atts.getURI(i).length())
                                    && !local.equals(XMLConstants.XMLNS_ATTRIBUTE)
                                    && !qname.startsWith(XMLConstants.XMLNS_ATTRIBUTE + ":")) {
                        attributes.addAttribute(
                                        atts.getURI(i), local, qname,
                                        atts.getType(i), atts.getValue(i));
                    }
                }

                super.startElement(XMLConstants.NULL_NS_URI, lower, lower, attributes);
            } else {
                super.startElement(uri, lower, lower, atts);
            }
        }

        @Override
        public void endElement(String uri, String localName, String name) throws SAXException {
            String lower = localName.toLowerCase();
            super.endElement(XMLConstants.NULL_NS_URI, lower, lower);
        }
    }
   
   
    private boolean _removeNamespaces;
   
    private transient SAXReader _reader = null;
    private transient ParsedDatum _input;
   
    public DOMParser(Fields outputFields) {
        this(outputFields, true);
    }
   
    public DOMParser(Fields outputFields, boolean removeNamespaces) {
        super(outputFields);
       
        _removeNamespaces = removeNamespaces;
    }
   
    @Override
    public void prepare(FlowProcess process, OperationCall<NullContext> opCall) {
        super.prepare(process, opCall);
       
        _reader = new SAXReader(new Parser());
        _reader.setXMLFilter(new DowngradeXmlFilter(_removeNamespaces));
        _reader.setEncoding("UTF-8");
        _input = new ParsedDatum();
    }
   
    @Override
    public boolean isSafe() {
        // Parsing is computationally intensive, so we don't want to get run
        // multiple times.
        return false;
    }
   
    @Override
    public void operate(FlowProcess process, FunctionCall<NullContext> funcCall) {
        _input.setTupleEntry(funcCall.getArguments());
        InputStream is = new StringInputStream(_input.getParsedText());
       
        try {
            Document parsedContent = _reader.read(is);
            process(_input, parsedContent, funcCall.getOutputCollector(), process);
        } catch (Exception e) {
            handleException(_input, e, funcCall.getOutputCollector());
        } finally {
            IoUtils.safeClose(is);
        }

    }
   
   
    /**
     * The _input ParsedDatum was successfully converted into a Dom4J Document.
     * at this point you would typically emit one or more output tuples (with
     * appropriate fields), using the collector.
     *
     * @param datum Input datum, which wraps a Cascading Tuple.
     * @param doc Result of converting incoming XML document to a Dom4J Document
     * @param collector Collector to use if you want to emit tuples.
     * @param process The FlowProcess for this operation.
     */
    protected abstract void process(ParsedDatum datum, Document doc, TupleEntryCollector collector, FlowProcess process) throws Exception;
   
    /**
     * An exception occurred while parsing or processing the _input ParsedDatum. Options are to
     * ignore it, emit a tuple (with appropriate fields), or throw a RuntimeException
     * to kill the job.
     *
     * @param datum Input datum, which wraps a Cascading Tuple.
     * @param e Exception while parsing or processing document
     * @param collector Collector to use if you want to emit a tuple.
     */
    protected abstract void handleException(ParsedDatum datum, Exception e, TupleEntryCollector collector);
}
TOP

Related Classes of bixo.parser.DOMParser

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.