Package org.apache.tika.sax

Examples of org.apache.tika.sax.LinkContentHandler


            "in the US is <a href=\"http://antoniospizzas.com/\">Antonio's Pizza</a>.</p>" +
            "<p>It is located in Amherst, MA.</p></body></html>";
    //<start id="tika-html"/>
    InputStream input = new ByteArrayInputStream(html.getBytes(Charset.forName("UTF-8")));
    ContentHandler text = new BodyContentHandler();//<co id="html.text.co"/>
    LinkContentHandler links = new LinkContentHandler();//<co id="html.link.co"/>
    ContentHandler handler = new TeeContentHandler(links, text);//<co id="html.merge"/>
    Metadata metadata = new Metadata();//<co id="html.store"/>
    Parser parser = new HtmlParser();//<co id="html.parser"/>
    ParseContext context = new ParseContext();
    parser.parse(input, handler, metadata, context);//<co id="html.parse"/>
    System.out.println("Title: " + metadata.get(Metadata.TITLE));
    System.out.println("Body: " + text.toString());
    System.out.println("Links: " + links.getLinks());
    /*
    <calloutlist>
        <callout arearefs="html.text.co"><para>Construct a ContentHandler that extracts only the text between the body tags.</para></callout>
        <callout arearefs="html.link.co"><para>Construct a ContentHandler that knows about HTML links.</para></callout>
        <callout arearefs="html.merge"><para>Wrap up our ContentHandlers into one</para></callout>
View Full Code Here


    public void testCustomHtmlSchema() throws Exception {
        // Default schema does not allow tables inside anchors
        String test = "<html><body><a><table><tr><td>text</tr></tr></table></a></body></html>";

        Metadata metadata = new Metadata();
        LinkContentHandler linkContentHandler = new LinkContentHandler();

        new HtmlParser().parse (
                new ByteArrayInputStream(test.getBytes("ISO-8859-1")),
                linkContentHandler, metadata, new ParseContext());

        // Expect no anchor text
        assertEquals("", linkContentHandler.getLinks().get(0).getText());

        // We'll change the schema to allow tables inside anchors!
        Schema schema = new HTMLSchema();
        schema.elementType("a", HTMLSchema.M_ANY, 65535, 0);

        ParseContext parseContext = new ParseContext();
        parseContext.set(Schema.class, schema);
        linkContentHandler = new LinkContentHandler();
        new HtmlParser().parse (
                new ByteArrayInputStream(test.getBytes("ISO-8859-1")),
                linkContentHandler, metadata, parseContext);

        // Expect anchor text
        assertEquals("\ttext\n\n", linkContentHandler.getLinks().get(0).getText());
    }
View Full Code Here

        while (scanner.hasNext())
          sb.append(scanner.nextLine());

        HtmlParser parser = new HtmlParser();
        Metadata met = new Metadata();
        LinkContentHandler handler = new LinkContentHandler();

        parser.parse(new ByteArrayInputStream(sb.toString().getBytes()),
            handler, met);
        List<Link> links = handler.getLinks();
        children = new LinkedList<ProtocolFile>();
        for (Link link : links) {
          String href = link.getUri();
          String linkName = link.getTitle();
          String curPath = this.pwd().getProtocolPath().getPathString();
View Full Code Here

        ByteArrayInputStream bais = new ByteArrayInputStream(content);
        Metadata md = new Metadata();

        String text = null;

        LinkContentHandler linkHandler = new LinkContentHandler();
        ContentHandler textHandler = new BodyContentHandler();
        TeeContentHandler teeHandler = new TeeContentHandler(linkHandler,
                textHandler);
        ParseContext parseContext = new ParseContext();
        // parse
        try {
            tika.getParser().parse(bais, teeHandler, md, parseContext);
            text = textHandler.toString();
        } catch (Exception e) {
            LOG.error("Exception while parsing " + url, e.getMessage());
            eventMeters.scope(
                    "error_content_parsing_" + e.getClass().getSimpleName())
                    .mark();
            collector.fail(tuple);
            eventMeters.scope("tuple_fail").mark();
            return;
        } finally {
            try {
                bais.close();
            } catch (IOException e) {
                LOG.error("Exception while closing stream", e);
            }
        }

        long duration = System.currentTimeMillis() - start;

        LOG.info("Parsed " + url + " in " + duration + " msec");

        // get the outlinks and convert them to strings (for now)
        String fromHost;
        URL url_;
        try {
            url_ = new URL(url);
            fromHost = url_.getHost().toLowerCase();
        } catch (MalformedURLException e1) {
            // we would have known by now as previous
            // components check whether the URL is valid
            LOG.error("MalformedURLException on " + url);
            eventMeters.scope(
                    "error_outlinks_parsing_" + e1.getClass().getSimpleName())
                    .mark();
            collector.fail(tuple);
            eventMeters.scope("tuple_fail").mark();
            return;
        }

        List<Link> links = linkHandler.getLinks();
        Set<String> slinks = new HashSet<String>(links.size());
        for (Link l : links) {
            if (StringUtils.isBlank(l.getUri()))
                continue;
            String urlOL = null;
View Full Code Here

    xmlHandler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
    xmlHandler.setResult(new StreamResult(dataBuffer));
   
    BoilerpipeContentHandler mainContentHandler = new BoilerpipeContentHandler(mainContentBuffer);
    BodyContentHandler bodyHandler = new BodyContentHandler(bodyBuffer);
    LinkContentHandler linkHandler = new LinkContentHandler();
   
    TeeContentHandler parallelHandler = new TeeContentHandler(xmlHandler, mainContentHandler, bodyHandler, linkHandler );

    InputStream instream = entity.obtainContent();
    try {
      parser.parse(instream, parallelHandler, metadata, new ParseContext());
     
      ArrayList<Link> extractedTasks = new ArrayList<Link>();
      int depth = task.getDepth() + 1;
      if (task instanceof LinkTask) {
        for(org.apache.tika.sax.Link tikaLink : linkHandler.getLinks()) {
          try {
            URI uri = new URI(tikaLink.getUri());
            // Test to see if the scheme is empty
            // This would indicate a relative URL, so resolve it against the task URI
            if(uri.getScheme() == null) {
View Full Code Here

    xmlHandler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
    xmlHandler.setResult(new StreamResult(dataBuffer));
   
    BoilerpipeContentHandler mainContentHandler = new BoilerpipeContentHandler(mainContentBuffer);
    BodyContentHandler bodyHandler = new BodyContentHandler(bodyBuffer);
    LinkContentHandler linkHandler = new LinkContentHandler();
   
    TeeContentHandler parallelHandler = new TeeContentHandler(xmlHandler, mainContentHandler, bodyHandler, linkHandler );

    InputStream instream = entity.obtainContent();
    try {
      parser.parse(instream, parallelHandler, metadata, new ParseContext());
     
      ArrayList<Link> extractedTasks = new ArrayList<Link>();
      if (task instanceof Link) {
        int depth = task.getDepth() + 1;
        for(org.apache.tika.sax.Link tikaLink : linkHandler.getLinks()) {
          try {
            URI uri = new URI(tikaLink.getUri());
            // Test to see if the scheme is empty
            // This would indicate a relative URL, so resolve it against the task URI
            if(uri.getScheme() == null) {
View Full Code Here

    public void testCustomHtmlSchema() throws Exception {
        // Default schema does not allow tables inside anchors
        String test = "<html><body><a><table><tr><td>text</tr></tr></table></a></body></html>";

        Metadata metadata = new Metadata();
        LinkContentHandler linkContentHandler = new LinkContentHandler();

        new HtmlParser().parse (
                new ByteArrayInputStream(test.getBytes("ISO-8859-1")),
                linkContentHandler, metadata, new ParseContext());

        // Expect no anchor text
        assertEquals("", linkContentHandler.getLinks().get(0).getText());

        // We'll change the schema to allow tables inside anchors!
        Schema schema = new HTMLSchema();
        schema.elementType("a", HTMLSchema.M_ANY, 65535, 0);

        ParseContext parseContext = new ParseContext();
        parseContext.set(Schema.class, schema);
        linkContentHandler = new LinkContentHandler();
        new HtmlParser().parse (
                new ByteArrayInputStream(test.getBytes("ISO-8859-1")),
                linkContentHandler, metadata, parseContext);

        // Expect anchor text
        assertEquals("\ttext\n\n", linkContentHandler.getLinks().get(0).getText());
    }
View Full Code Here

TOP

Related Classes of org.apache.tika.sax.LinkContentHandler

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.