Package org.apache.tika.parser

Examples of org.apache.tika.parser.ParseContext


            + "<title>hello</title>"
            + "</head><body></body></html>";
        Metadata metadata = new Metadata();
        new HtmlParser().parse (
                new ByteArrayInputStream(test1.getBytes("ISO-8859-1")),
                new BodyContentHandler(),  metadata, new ParseContext());
        assertEquals("some description", metadata.get("og:description"));
        assertTrue(metadata.isMultiValued("og:image"));
    }
View Full Code Here


        Metadata metadata = new Metadata();
        LinkContentHandler linkContentHandler = new LinkContentHandler();

        new HtmlParser().parse (
                new ByteArrayInputStream(test.getBytes("ISO-8859-1")),
                linkContentHandler, metadata, new ParseContext());

        // Expect no anchor text
        assertEquals("", linkContentHandler.getLinks().get(0).getText());

        // We'll change the schema to allow tables inside anchors!
        Schema schema = new HTMLSchema();
        schema.elementType("a", HTMLSchema.M_ANY, 65535, 0);

        ParseContext parseContext = new ParseContext();
        parseContext.set(Schema.class, schema);
        linkContentHandler = new LinkContentHandler();
        new HtmlParser().parse (
                new ByteArrayInputStream(test.getBytes("ISO-8859-1")),
                linkContentHandler, metadata, parseContext);
View Full Code Here

                }

                public void skippedEntity(String name) throws SAXException {
                }},
                new Metadata(),
                new ParseContext());

        // The text occurs at line 24 (if lines start at 0) or 25 (if lines start at 1).
        assertEquals(24, textPosition[line]);
        // The column reported seems fuzzy, just test it is close enough.
        assertTrue(Math.abs(textPosition[col]-47) < 10);
View Full Code Here

    @Test
    public void testNullHeaders() throws Exception {
        Parser parser = new AutoDetectParser();
        Metadata metadata = new Metadata();
        ContentHandler handler = new BodyContentHandler();
        ParseContext context = new ParseContext();

        InputStream input = getTestDocument("NullHeader.docx");
        try {
            parser.parse(input, handler, metadata, context);
            assertFalse(handler.toString().length()==0);
View Full Code Here

        Metadata metadata = new Metadata();

        InputStream stream = OOXMLParserTest.class.getResourceAsStream(
                "/test-documents/testWORD_various.docx");
        try {
            new AutoDetectParser().parse(stream, handler, metadata, new ParseContext());
        } finally {
            stream.close();
        }

        String content = handler.toString();
View Full Code Here

        Metadata metadata = new Metadata();

        InputStream stream = OOXMLParserTest.class.getResourceAsStream(
                "/test-documents/testPPT_various.pptx");
        try {
            new AutoDetectParser().parse(stream, handler, metadata, new ParseContext());
        } finally {
            stream.close();
        }

        String content = handler.toString();
View Full Code Here

        Metadata metadata = new Metadata();

        InputStream stream = OOXMLParserTest.class.getResourceAsStream(
                "/test-documents/testPPT_masterFooter.pptx");
        try {
            new AutoDetectParser().parse(stream, handler, metadata, new ParseContext());
        } finally {
            stream.close();
        }

        String content = handler.toString();
View Full Code Here

        Metadata metadata = new Metadata();

        InputStream stream = OOXMLParserTest.class.getResourceAsStream(
                "/test-documents/testWordArt.pptx");
        try {
            new AutoDetectParser().parse(stream, handler, metadata, new ParseContext());
        } finally {
            stream.close();
        }
        String content = handler.toString();
        assertContains("Here is some red word Art", content);
View Full Code Here

             "/test-documents/testEXCEL_custom_props.xlsx");
       Metadata metadata = new Metadata();
      
       try {
          ContentHandler handler = new BodyContentHandler(-1);
          ParseContext context = new ParseContext();
          context.set(Locale.class, Locale.US);
          new OOXMLParser().parse(input, handler, metadata, context);
       } finally {
          input.close();
       }
      
View Full Code Here

             "/test-documents/testWORD_custom_props.docx");
       Metadata metadata = new Metadata();

       try {
          ContentHandler handler = new BodyContentHandler(-1);
          ParseContext context = new ParseContext();
          context.set(Locale.class, Locale.US);
          new OOXMLParser().parse(input, handler, metadata, context);
       } finally {
          input.close();
       }
View Full Code Here

TOP

Related Classes of org.apache.tika.parser.ParseContext

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.