Examples of org.apache.tika.metadata.Metadata

org.apache.tika.metadata.Metadata
A multi-valued metadata container.

    public void execute(InputStream in, GraphPropertyWorkData data) throws Exception {
        String mimeType = (String) data.getProperty().getMetadata().get(LumifyProperties.MIME_TYPE.getPropertyName());
        checkNotNull(mimeType, LumifyProperties.MIME_TYPE.getPropertyName() + " is a required metadata field");


        Charset charset = Charset.forName("UTF-8");
        Metadata metadata = new Metadata();
        metadata.set(Metadata.CONTENT_TYPE, mimeType);
        String text = extractText(in, mimeType, metadata);


        ExistingElementMutation<Vertex> m = data.getElement().prepareMutation();


        // TODO set("url", extractUrl(metadata));

View Full Code Here

                return mimeType;
            }
        }


        DefaultDetector detector = new DefaultDetector();
        Metadata metadata = new Metadata();
        MediaType mediaType = detector.detect(new BufferedInputStream(in), metadata);
        mimeType = mediaType.toString();
        if (mimeType != null) {
            return mimeType;
        }

View Full Code Here

        FetchedDatum fetchedDatum = new FetchedDatum(arguments.getTuple());


        if (fetchedDatum.getContentType().startsWith("text/html")) {
          init();


          Metadata metadata = new Metadata();
          InputStream is = new ByteArrayInputStream(fetchedDatum.getContentBytes());
          
          try {
            _parser.parse(is, _handler, metadata, new ParseContext());

View Full Code Here

    
    @Test
    public void testNotTerminating() throws Exception {
        DelayParser parser = new DelayParser(true);
        InputStream is = Mockito.mock(InputStream.class);
        Metadata md = Mockito.mock(Metadata.class);
        
        BaseContentExtractor contentExtractor = Mockito.mock(BaseContentExtractor.class);
        BaseLinkExtractor linkExtractor = Mockito.mock(BaseLinkExtractor.class);
        
        Callable<ParsedDatum> c = new TikaCallable(parser, contentExtractor, linkExtractor, is, md);

View Full Code Here

    
    @Test
    public void testTerminating() throws Exception {
        Parser parser = new DelayParser(false);
        InputStream is = Mockito.mock(InputStream.class);
        Metadata md = new Metadata();
        
        BaseContentExtractor contentExtractor = Mockito.mock(BaseContentExtractor.class);
        BaseLinkExtractor linkExtractor = Mockito.mock(BaseLinkExtractor.class);
        Mockito.when(linkExtractor.getLinks()).thenReturn(new Outlink[0]);

View Full Code Here

        FetchedDatum fetchedDatum = new FetchedDatum(arguments.getTuple());
        
        // Now, if the FetchedDatum mime-type is application/mbox, we want to parse it and
        // output the results
        if (fetchedDatum.getContentType().equals("application/mbox")) {
          Metadata metadata = new Metadata();
          ParseContext context = new ParseContext();
          InputStream is = new ByteArrayInputStream(fetchedDatum.getContentBytes());
          
          try {
            _parser.parse(is, _handler, metadata, context);


            // _content now has all of the body text, and metadata has the header info.
            String messageId = metadata.get(Metadata.IDENTIFIER);
            String emailAddress = metadata.get(Metadata.CREATOR);


            if (emailAddress == null) {
              LOGGER.warn("No email address for message: " + messageId);
              return;
            }


            String address = null;
            String name = null;


            Matcher addressMatcher = FULL_EMAIL_ADDRESS_PATTERN.matcher(emailAddress);
            if (addressMatcher.matches()) {
              name = addressMatcher.group(1);
              address = addressMatcher.group(2);
            } else {
              addressMatcher = SIMPLE_EMAIL_ADDRESS_PATTERN.matcher(emailAddress);
              if (addressMatcher.matches()) {
                address = addressMatcher.group(1);
              } else {
                LOGGER.warn("Email address has invalid format: " + emailAddress);
                return;
              }
            }


            // Now we might need to remap the address, if this user has aliases.
            if (EMAIL_ALIASES.containsKey(address)) {
              address = EMAIL_ALIASES.get(address);
            }


            Tuple tuple = new Tuple(messageId, address, name, 0.0);
            functionCall.getOutputCollector().add(tuple);


            String replyId = metadata.get(Metadata.RELATION);


            if (replyId != null) {
              double score = analyzeReply(_content.toString());
              if (score > 0.0) {
                tuple = new Tuple(replyId, null, null, score);

View Full Code Here

        if (LOGGER.isTraceEnabled()) {
          LOGGER.trace(String.format("Parsing %s", fetchedDatum.getUrl()));
        }
        
        // Provide clues to the parser about the format of the content.
        Metadata metadata = new Metadata();
        metadata.add(Metadata.RESOURCE_NAME_KEY, fetchedDatum.getUrl());
        metadata.add(Metadata.CONTENT_TYPE, fetchedDatum.getContentType());
        String charset = getCharset(fetchedDatum);
        metadata.add(Metadata.CONTENT_LANGUAGE, getLanguage(fetchedDatum, charset));
        
        InputStream is = new ByteArrayInputStream(fetchedDatum.getContentBytes(), 0, fetchedDatum.getContentLength());


        try {
          URL baseUrl = getContentLocation(fetchedDatum);
          metadata.add(Metadata.CONTENT_LOCATION, baseUrl.toExternalForm());


            Callable<ParsedDatum> c = new TikaCallable(_parser, _contentExtractor, _linkExtractor, is, metadata, isExtractLanguage(), _parseContext);
            FutureTask<ParsedDatum> task = new FutureTask<ParsedDatum>(c);
            Thread t = new Thread(task);
            t.start();

View Full Code Here

  public void testTika() throws Exception {
    //<start id="tika"/>
    InputStream input = new FileInputStream(
            new File("src/test/resources/pdfBox-sample.pdf"));//<co id="tika.is"/>
    ContentHandler textHandler = new BodyContentHandler();//<co id="tika.handler"/>
    Metadata metadata = new Metadata();//<co id="tika.metadata"/>
    Parser parser = new AutoDetectParser();//<co id="tika.parser"/>
    ParseContext context = new ParseContext();
    parser.parse(input, textHandler, metadata, context);//<co id="tika.parse"/>
    System.out.println("Title: " + metadata.get(Metadata.TITLE));//<co id="tika.title"/>
    System.out.println("Body: " + textHandler.toString());//<co id="tika.body"/>
    /*
<calloutlist>
    <callout arearefs="tika.is"><para>Create the <classname>InputStream</classname> to read in the content</para></callout>
    <callout arearefs="tika.handler"><para>The <classname>BodyContentHandler</classname> is a Tika-provided <classname>ContentHandler</classname> that extracts just the "body" of the InputStream</para></callout>

View Full Code Here

    //<start id="tika-html"/>
    InputStream input = new ByteArrayInputStream(html.getBytes(Charset.forName("UTF-8")));
    ContentHandler text = new BodyContentHandler();//<co id="html.text.co"/>
    LinkContentHandler links = new LinkContentHandler();//<co id="html.link.co"/>
    ContentHandler handler = new TeeContentHandler(links, text);//<co id="html.merge"/>
    Metadata metadata = new Metadata();//<co id="html.store"/>
    Parser parser = new HtmlParser();//<co id="html.parser"/>
    ParseContext context = new ParseContext();
    parser.parse(input, handler, metadata, context);//<co id="html.parse"/>
    System.out.println("Title: " + metadata.get(Metadata.TITLE));
    System.out.println("Body: " + text.toString());
    System.out.println("Links: " + links.getLinks());
    /*
    <calloutlist>
        <callout arearefs="html.text.co"><para>Construct a ContentHandler that will just extract between the body tags.</para></callout>

View Full Code Here

        assertNotNull("Test stream: ["+urlOrFileName+"] is null!", in);
        if (!in.markSupported()) {
            in = new java.io.BufferedInputStream(in);
        }
        try {
            Metadata metadata = new Metadata();
            String mime = this.mimeTypes.detect(in, metadata).toString();
            assertEquals(urlOrFileName + " is not properly detected: detected.", expected, mime);


            //Add resource name and test again
            metadata.set(Metadata.RESOURCE_NAME_KEY, urlOrFileName);
            mime = this.mimeTypes.detect(in, metadata).toString();
            assertEquals(urlOrFileName + " is not properly detected after adding resource name.", expected, mime);
        } finally {
            in.close();
        }

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of org.apache.tika.metadata.Metadata

bixo.parser.TikaCallableTest

com.tamingtext.tika.TikaTest

cx.fbn.nevernote.threads.IndexRunner

org.apache.any23.mime.TikaMIMETypeDetector

org.apache.chemistry.opencmis.client.parser.MetadataParserTika

org.apache.cxf.jaxrs.ext.search.tika.TikaContentExtractor

org.apache.jackrabbit.core.query.lucene.NodeIndexer

org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexEditor

org.apache.jackrabbit.server.io.DefaultHandler

org.apache.jackrabbit.server.io.ImportContextImpl

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.