Source Code of bixo.parser.SimpleParser

/*
 * Copyright 2009-2013 Scale Unlimited
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
package bixo.parser;


import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.io.Serializable;
import java.net.URL;
import java.util.Locale;
import java.util.concurrent.Callable;
import java.util.concurrent.FutureTask;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;


import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.html.HtmlMapper;
import org.apache.tika.parser.html.IdentityHtmlMapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


import bixo.config.ParserPolicy;
import bixo.datum.FetchedDatum;
import bixo.datum.ParsedDatum;
import bixo.utils.IoUtils;


@SuppressWarnings("serial")
public class SimpleParser extends BaseParser {
    private static final Logger LOGGER = LoggerFactory.getLogger(SimpleParser.class);


    /**
     * Fixed version of Tika 1.0's IdentityHtmlMapper
     */
    private static class FixedIdentityHtmlMapper extends IdentityHtmlMapper implements Serializable {


        public static final HtmlMapper INSTANCE = new FixedIdentityHtmlMapper();


        @Override
        public String mapSafeElement(String name) {
            return name.toLowerCase(Locale.ENGLISH);
        }
    }


    private boolean _extractLanguage = true;
    protected BaseContentExtractor _contentExtractor;
    protected BaseLinkExtractor _linkExtractor;
    protected ParseContext _parseContext;
    private transient Parser _parser;
    
    public SimpleParser() {
        this(new ParserPolicy());
    }
    
    public SimpleParser(ParserPolicy parserPolicy) {
        this(new SimpleContentExtractor(), new SimpleLinkExtractor(), parserPolicy, null);
    }
    
    /**
     * @param contentExtractor to use instead of new {@link SimpleContentExtractor}()
     * @param linkExtractor to use instead of new {@link SimpleLinkExtractor}()
     * @param parserPolicy to customize operation of the parser
     * <BR><BR><B>Note:</B> There is no need to construct your own
     * {@link SimpleLinkExtractor} simply to control the set of link tags
     * and attributes it processes. Instead, use {@link ParserPolicy#setLinkTags}
     * and {@link ParserPolicy#setLinkAttributeTypes}, and then pass this policy
     * to {@link SimpleParser#SimpleParser(ParserPolicy)}.
     */
    public SimpleParser(BaseContentExtractor contentExtractor, BaseLinkExtractor linkExtractor, ParserPolicy parserPolicy) {
        this(contentExtractor, linkExtractor, parserPolicy, null);
    }
    
    /**
     * @param parserPolicy to customize operation of the parser
     * @param includeMarkup true if output should be raw HTML, versus extracted text
     * <BR><BR><B>Note:</B> There is no need to construct your own
     * {@link SimpleLinkExtractor} simply to control the set of link tags
     * and attributes it processes. Instead, use {@link ParserPolicy#setLinkTags}
     * and {@link ParserPolicy#setLinkAttributeTypes}, and then pass this policy
     * to {@link SimpleParser#SimpleParser(ParserPolicy)}.
     */
    public SimpleParser(ParserPolicy parserPolicy, boolean includeMarkup) {
        this(includeMarkup ? new HtmlContentExtractor() : new SimpleContentExtractor(),
             new SimpleLinkExtractor(),
             parserPolicy, includeMarkup);
    }


    /**
     * @param parserPolicy to customize operation of the parser
     * @param includeMarkup true if output should be raw HTML, versus extracted text
     * <BR><BR><B>Note:</B> There is no need to construct your own
     * {@link SimpleLinkExtractor} simply to control the set of link tags
     * and attributes it processes. Instead, use {@link ParserPolicy#setLinkTags}
     * and {@link ParserPolicy#setLinkAttributeTypes}, and then pass this policy
     * to {@link SimpleParser#SimpleParser(ParserPolicy)}.
     */
    public SimpleParser(BaseContentExtractor contentExtractor, BaseLinkExtractor linkExtractor, ParserPolicy parserPolicy, boolean includeMarkup) {
        super(parserPolicy);


        _contentExtractor = contentExtractor;
        _linkExtractor = linkExtractor;


        if (includeMarkup) {
            _parseContext = new ParseContext();
            _parseContext.set(HtmlMapper.class, FixedIdentityHtmlMapper.INSTANCE);
        }
    }


    /**
     * @param contentExtractor to use instead of new {@link SimpleContentExtractor}()
     * @param linkExtractor to use instead of new {@link SimpleLinkExtractor}()
     * @param parserPolicy to customize operation of the parser
     * @param parseContext used to pass context info to the parser
     * <BR><BR><B>Note:</B> There is no need to construct your own
     * {@link SimpleLinkExtractor} simply to control the set of link tags
     * and attributes it processes. Instead, use {@link ParserPolicy#setLinkTags}
     * and {@link ParserPolicy#setLinkAttributeTypes}, and then pass this policy
     * to {@link SimpleParser#SimpleParser(ParserPolicy)}.
     */
    public SimpleParser(BaseContentExtractor contentExtractor, BaseLinkExtractor linkExtractor, ParserPolicy parserPolicy, ParseContext parseContext) {
        super(parserPolicy);
        
        _contentExtractor = contentExtractor;
        _linkExtractor = linkExtractor;
        _parseContext = parseContext;
    }


    protected synchronized void init() {
        if (_parser == null) {
            _parser = getTikaParser();
        }
        
        _contentExtractor.reset();
        _linkExtractor.setLinkTags(getParserPolicy().getLinkTags());
        _linkExtractor.setLinkAttributeTypes(getParserPolicy().getLinkAttributeTypes());
        _linkExtractor.reset();
    }


    public Parser getTikaParser() {
        return new AutoDetectParser();
    }


    public void setExtractLanguage(boolean extractLanguage) {
        _extractLanguage = extractLanguage;
    }
    
    public boolean isExtractLanguage() {
        return _extractLanguage;
    }
    
    @Override
    public ParsedDatum parse(FetchedDatum fetchedDatum) throws Exception {
        init();


        if (LOGGER.isTraceEnabled()) {
          LOGGER.trace(String.format("Parsing %s", fetchedDatum.getUrl()));
        }
        
        // Provide clues to the parser about the format of the content.
        Metadata metadata = new Metadata();
        metadata.add(Metadata.RESOURCE_NAME_KEY, fetchedDatum.getUrl());
        metadata.add(Metadata.CONTENT_TYPE, fetchedDatum.getContentType());
        String charset = getCharset(fetchedDatum);
        metadata.add(Metadata.CONTENT_LANGUAGE, getLanguage(fetchedDatum, charset));
        
        InputStream is = new ByteArrayInputStream(fetchedDatum.getContentBytes(), 0, fetchedDatum.getContentLength());


        try {
          URL baseUrl = getContentLocation(fetchedDatum);
          metadata.add(Metadata.CONTENT_LOCATION, baseUrl.toExternalForm());


            Callable<ParsedDatum> c = new TikaCallable(_parser, _contentExtractor, _linkExtractor, is, metadata, isExtractLanguage(), _parseContext);
            FutureTask<ParsedDatum> task = new FutureTask<ParsedDatum>(c);
            Thread t = new Thread(task);
            t.start();
            
            ParsedDatum result;
            try {
                result = task.get(getParserPolicy().getMaxParseDuration(), TimeUnit.MILLISECONDS);
            } catch (TimeoutException e) {
                task.cancel(true);
                t.interrupt();
                throw e;
            } finally {
                t = null;
            }
            
            // TODO KKr Should there be a BaseParser to take care of copying
            // these two fields?
            result.setHostAddress(fetchedDatum.getHostAddress());
            result.setPayload(fetchedDatum.getPayload());
            return result;
        } finally {
            IoUtils.safeClose(is);
        }
    }


}
Source Code of bixo.parser.SimpleParser

Related Classes of bixo.parser.SimpleParser