Examples of Tika


Examples of com.dotcms.repackage.org.apache.tika.Tika

    Map<String, String> metaMap = new HashMap<String, String>();

    // store content metadata on disk
        File contentM=APILocator.getFileAssetAPI().getContentMetadataFile(inode);

    Tika t = new Tika();
    Metadata met = new Metadata();
    t.setMaxStringLength(-1);
    Reader fulltext = null;
    InputStream is = null;
    // if the limit is not "unlimited"
    // I can use the faster parseToString
    try {

      if(forceMemory){
        // no worry about the limit and less time to process.
        String content = t.parseToString(new FileInputStream(binFile), met);
        metaMap = new HashMap<String, String>();
        for (int i = 0; i < met.names().length; i++) {
          String name = met.names()[i];
          if (UtilMethods.isSet(name) && met.get(name) != null) {
            // we will want to normalize our metadata for searching
            String[] x = translateKey(name);
            for (String y : x)
              metaMap.put(y, met.get(name));
          }
        }
        metaMap.put(FileAssetAPI.CONTENT_FIELD, content);
      }
      else {


        is = TikaInputStream.get(binFile);
        fulltext = t.parse(is, met);
        metaMap = new HashMap<String, String>();
        for (int i = 0; i < met.names().length; i++) {
          String name = met.names()[i];
          if (UtilMethods.isSet(name) && met.get(name) != null) {
            // we will want to normalize our metadata for searching
View Full Code Here

Examples of org.apache.tika.Tika

  }

  public String getText() {
    if (text == null || lastTextUpdate < getFile().lastModified()) {
      try {
        Tika tika = new Tika();
        tika.setMaxStringLength(-1);
        text = tika.parseToString(getFile()).replaceAll("\n\\s*\n+","\n\n");
      } catch (TikaException e) {
        text = "";
      } catch (IOException e) {
        text = "";
      } finally {
View Full Code Here

Examples of org.apache.tika.Tika

    }

    public static String getMimeTypeWithByteBuffer(java.nio.ByteBuffer buffer) throws IOException {
        byte[] b = buffer.array();

        Tika tika = new Tika();
        return tika.detect(b);
    }
View Full Code Here

Examples of org.apache.tika.Tika

    }

    public static String getMimeTypeWithByteBuffer(java.nio.ByteBuffer buffer) throws IOException {
        byte[] b = buffer.array();

        Tika tika = new Tika();
        return tika.detect(b);
    }
View Full Code Here

Examples of org.apache.tika.Tika

    }

    public void XtestParseUTF8() throws IOException, SAXException, TikaException {
        String path = "/test-documents/testXHTML_utf8.html";
        Metadata metadata = new Metadata();
        String content = new Tika().parseToString(
                HtmlParserTest.class.getResourceAsStream(path), metadata);

        assertTrue("Did not contain expected text:"
                + "Title : Tilte with UTF-8 chars öäå", content
                .contains("Title : Tilte with UTF-8 chars öäå"));
View Full Code Here

Examples of org.apache.tika.Tika

    }

    public void testXhtmlParsing() throws Exception {
        String path = "/test-documents/testXHTML.html";
        Metadata metadata = new Metadata();
        String content = new Tika().parseToString(
                HtmlParserTest.class.getResourceAsStream(path), metadata);

        assertEquals("application/xhtml+xml", metadata.get(Metadata.CONTENT_TYPE));
        assertEquals("XHTML test document", metadata.get(Metadata.TITLE));
View Full Code Here

Examples of org.apache.tika.Tika

     * Test case for TIKA-210
     * @see <a href="https://issues.apache.org/jira/browse/TIKA-210">TIKA-210</a>
     */
    public void testCharactersDirectlyUnderBodyElement() throws Exception {
        String test = "<html><body>test</body></html>";
        String content = new Tika().parseToString(
                new ByteArrayInputStream(test.getBytes("UTF-8")));
        assertEquals("test", content);
    }
View Full Code Here

Examples of org.apache.tika.Tika

     * @see <a href="https://issues.apache.org/jira/browse/TIKA-268">TIKA-268</a>
     */
    public void testWhitespaceBetweenTableCells() throws Exception {
        String test =
            "<html><body><table><tr><td>a</td><td>b</td></table></body></html>";
        String content = new Tika().parseToString(
                new ByteArrayInputStream(test.getBytes("UTF-8")));
        assertTrue(content.contains("a"));
        assertTrue(content.contains("b"));
        assertFalse(content.contains("ab"));
    }
View Full Code Here

Examples of org.apache.tika.Tika

     *
     * @see <a href="https://issues.apache.org/jira/browse/TIKA-343">TIKA-343</a>
     */
    public void testLineBreak() throws Exception {
        String test = "<html><body><div>foo<br>bar</div>baz</body></html>";
        String text = new Tika().parseToString(
                new ByteArrayInputStream(test.getBytes("US-ASCII")));
        String[] parts = text.trim().split("\\s+");
        assertEquals(3, parts.length);
        assertEquals("foo", parts[0]);
        assertEquals("bar", parts[1]);
View Full Code Here

Examples of org.apache.tika.Tika

   * If you need better performance just use {@link com.gentics.cr.file.ResolvableFileBean#getMimeType()}
   * @throws IOException in case Tika detection fails.
   */
  public String getDetectedMimetype() throws IOException {
    if (file != null) {
      return new Tika().detect(file);
    } else {
      return UNKNOWN_MIMETYPE;
    }
  }
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.