Package org.apache.tika

Examples of org.apache.tika.Tika


     * Test case for TIKA-210
     * @see <a href="https://issues.apache.org/jira/browse/TIKA-210">TIKA-210</a>
     */
    public void testCharactersDirectlyUnderBodyElement() throws Exception {
        String test = "<html><body>test</body></html>";
        String content = new Tika().parseToString(
                new ByteArrayInputStream(test.getBytes("UTF-8")));
        assertEquals("test", content);
    }
View Full Code Here


     * @see <a href="https://issues.apache.org/jira/browse/TIKA-268">TIKA-268</a>
     */
    public void testWhitespaceBetweenTableCells() throws Exception {
        String test =
            "<html><body><table><tr><td>a</td><td>b</td></table></body></html>";
        String content = new Tika().parseToString(
                new ByteArrayInputStream(test.getBytes("UTF-8")));
        assertTrue(content.contains("a"));
        assertTrue(content.contains("b"));
        assertFalse(content.contains("ab"));
    }
View Full Code Here

     *
     * @see <a href="https://issues.apache.org/jira/browse/TIKA-343">TIKA-343</a>
     */
    public void testLineBreak() throws Exception {
        String test = "<html><body><div>foo<br>bar</div>baz</body></html>";
        String text = new Tika().parseToString(
                new ByteArrayInputStream(test.getBytes("US-ASCII")));
        String[] parts = text.trim().split("\\s+");
        assertEquals(3, parts.length);
        assertEquals("foo", parts[0]);
        assertEquals("bar", parts[1]);
View Full Code Here

    /**
     * Test case for TIKA-434 - Pushback buffer overflow in TagSoup
     */
    public void testPushback() throws IOException, TikaException {
        String content = new Tika().parseToString(
                HtmlParserTest.class.getResourceAsStream("/test-documents/tika434.html"), new Metadata());


        assertNotNull(content);
    }
View Full Code Here

    }

    // TIKA-1011
    public void testUserDefinedCharset() throws Exception {
        String content = new Tika().parseToString(
                HtmlParserTest.class.getResourceAsStream("/test-documents/testUserDefinedCharset.mhtml"), new Metadata());
        assertNotNull(content);
    }
View Full Code Here

        out.println("    ports you specify as one or more arguments.");
        out.println();
    }

    private void version() {
        System.out.println(new Tika().toString());
    }
View Full Code Here

    {
      contentType = mimeType.getName();
    }
    else
    {
      contentType = new Tika().detect(dataStoreReference);
    }

    try
    {
      File file = new File(new URI(dataStoreReference));
View Full Code Here

            InputStream is = new FileInputStream(file);

            // extract met from prod using tika
            LOG.fine("Invoking tika extractor on file ["
                    + file.getAbsolutePath() + "]");
            Tika tika = new Tika();
            tika.parse(is, tikaMet); // extract metadata
            tikaMet.add("content", tika.parseToString(file)); // extract content

            LOG.fine("Number of captured tika metadata keys: ["
                    + tikaMet.names().length + "]");

            // copy tika met into oodt met
View Full Code Here

    }

    public static String getMimeTypeWithByteBuffer(java.nio.ByteBuffer buffer) throws IOException {
        byte[] b = buffer.array();

        Tika tika = new Tika();
        return tika.detect(b);
    }
View Full Code Here

        else
            parentFolder = (Folder) session.getObject(folderId);
       
        try {
            File f = new File(fileName);
            Tika tika = new Tika();    
            String mimeType = tika.detect(f);
            LOG.info("Detected MIME type: "+ mimeType);
           
            // extract metadata: first get a parser
            MetadataParser parser = CFG.getParser(mimeType);
            if (null == parser) {
View Full Code Here

TOP

Related Classes of org.apache.tika.Tika

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.