Package org.apache.stanbol.enhancer.engines.metaxa.core.html

Examples of org.apache.stanbol.enhancer.engines.metaxa.core.html.HtmlTextExtractUtil


        return null;
    }
   
    /**
     * Extracts metadata and full text from an HTML document held in a string and
     * returns them concatenated as plain text.
     *
     * <p>The HTML is converted to bytes, fed to a freshly created
     * {@code HtmlTextExtractUtil}, and the extraction result is collected in a new
     * {@code RDFContainer} created for the URI described by {@code rdf}. Title,
     * author, description, keywords and text are then read back from the resulting
     * model and passed, in that order, to the {@code append} helper.
     *
     * @param string  the HTML markup to process
     * @param charset the charset name handed to the extractor together with the byte stream
     * @param rdf     container whose described URI is reused as the id of the extraction result;
     *                nothing is written to this container within the lines shown here
     * @return the content accumulated in the buffer (metadata followed by the text)
     * @throws ExtractorException if the {@code HtmlTextExtractUtil} cannot be initialized;
     *         presumably also propagated from {@code extractor.extract(...)} - confirm
     *         against that method's signature
     */
    protected String extractTextFromHtml(String string, String charset, RDFContainer rdf) throws ExtractorException {
        // parse the HTML and extract full-text and metadata
        HtmlTextExtractUtil extractor;
        try {
            extractor = new HtmlTextExtractUtil();
        } catch (InitializationException e) {
            // NOTE(review): only the message is carried over; the original exception is
            // not attached as cause, so its stack trace is lost.
            throw new ExtractorException("Could not initialize HtmlExtractor: " + e.getMessage());
        }
        // NOTE(review): getBytes() without an argument encodes with the default charset,
        // while the extractor below is told to use `charset`. If the two differ the
        // extractor may decode the bytes incorrectly - confirm whether this is intended.
        InputStream stream = new ByteArrayInputStream(string.getBytes());
        RDFContainerFactory containerFactory = new RDFContainerFactoryImpl();
        // the result container is created for the same URI that `rdf` describes
        URI id = rdf.getDescribedUri();
        RDFContainer result = containerFactory.getRDFContainer(id);
        extractor.extract(id, charset, stream, result);
        Model meta = result.getModel();
       
        // append metadata and full-text to a string buffer
        // (`append` is defined outside this excerpt; presumably it adds the value followed
        // by the given separator and skips missing values - verify against its definition)
        StringBuilder buffer = new StringBuilder(32 * 1024);
        append(buffer, extractor.getTitle(meta), "\n");
        append(buffer, extractor.getAuthor(meta), "\n");
        append(buffer, extractor.getDescription(meta), "\n");
        // keywords are handed to append one by one with a space, then the line is terminated
        List<String> keywords = extractor.getKeywords(meta);
        for (String kw : keywords) {
            append(buffer, kw, " ");
        }
        buffer.append("\n");
        append(buffer, extractor.getText(meta), " ");
        logger.debug("text extracted:\n{}", buffer);
        // NOTE(review): the model is closed only on the normal path; an exception thrown
        // after getModel() would skip this call.
        meta.close();
       
        // return the buffer's content
        return buffer.toString();
View Full Code Here

TOP

Related Classes of org.apache.stanbol.enhancer.engines.metaxa.core.html.HtmlTextExtractUtil

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by Oracle Inc. Contact coftware#gmail.com.