Package org.apache.tika.metadata

Examples of org.apache.tika.metadata.Metadata


    public void execute(InputStream in, GraphPropertyWorkData data) throws Exception {
        String mimeType = (String) data.getProperty().getMetadata().get(LumifyProperties.MIME_TYPE.getPropertyName());
        checkNotNull(mimeType, LumifyProperties.MIME_TYPE.getPropertyName() + " is a required metadata field");

        Charset charset = Charset.forName("UTF-8");
        Metadata metadata = new Metadata();
        metadata.set(Metadata.CONTENT_TYPE, mimeType);
        String text = extractText(in, mimeType, metadata);

        ExistingElementMutation<Vertex> m = data.getElement().prepareMutation();

        // TODO set("url", extractUrl(metadata));
View Full Code Here


                return mimeType;
            }
        }

        DefaultDetector detector = new DefaultDetector();
        Metadata metadata = new Metadata();
        MediaType mediaType = detector.detect(new BufferedInputStream(in), metadata);
        mimeType = mediaType.toString();
        if (mimeType != null) {
            return mimeType;
        }
View Full Code Here

        FetchedDatum fetchedDatum = new FetchedDatum(arguments.getTuple());

        if (fetchedDatum.getContentType().startsWith("text/html")) {
          init();

          Metadata metadata = new Metadata();
          InputStream is = new ByteArrayInputStream(fetchedDatum.getContentBytes());
         
          try {
            _parser.parse(is, _handler, metadata, new ParseContext());
View Full Code Here

   
    @Test
    public void testNotTerminating() throws Exception {
        DelayParser parser = new DelayParser(true);
        InputStream is = Mockito.mock(InputStream.class);
        Metadata md = Mockito.mock(Metadata.class);
       
        BaseContentExtractor contentExtractor = Mockito.mock(BaseContentExtractor.class);
        BaseLinkExtractor linkExtractor = Mockito.mock(BaseLinkExtractor.class);
       
        Callable<ParsedDatum> c = new TikaCallable(parser, contentExtractor, linkExtractor, is, md);
View Full Code Here

   
    @Test
    public void testTerminating() throws Exception {
        Parser parser = new DelayParser(false);
        InputStream is = Mockito.mock(InputStream.class);
        Metadata md = new Metadata();
       
        BaseContentExtractor contentExtractor = Mockito.mock(BaseContentExtractor.class);
        BaseLinkExtractor linkExtractor = Mockito.mock(BaseLinkExtractor.class);
        Mockito.when(linkExtractor.getLinks()).thenReturn(new Outlink[0]);
       
View Full Code Here

        FetchedDatum fetchedDatum = new FetchedDatum(arguments.getTuple());
       
        // Now, if the FetchedDatum mime-type is application/mbox, we want to parse it and
        // output the results
        if (fetchedDatum.getContentType().equals("application/mbox")) {
          Metadata metadata = new Metadata();
          ParseContext context = new ParseContext();
          InputStream is = new ByteArrayInputStream(fetchedDatum.getContentBytes());
         
          try {
            _parser.parse(is, _handler, metadata, context);

            // _content now has all of the body text, and metadata has the header info.
            String messageId = metadata.get(Metadata.IDENTIFIER);
            String emailAddress = metadata.get(Metadata.CREATOR);

            if (emailAddress == null) {
              LOGGER.warn("No email address for message: " + messageId);
              return;
            }

            String address = null;
            String name = null;

            Matcher addressMatcher = FULL_EMAIL_ADDRESS_PATTERN.matcher(emailAddress);
            if (addressMatcher.matches()) {
              name = addressMatcher.group(1);
              address = addressMatcher.group(2);
            } else {
              addressMatcher = SIMPLE_EMAIL_ADDRESS_PATTERN.matcher(emailAddress);
              if (addressMatcher.matches()) {
                address = addressMatcher.group(1);
              } else {
                LOGGER.warn("Email address has invalid format: " + emailAddress);
                return;
              }
            }

            // Now we might need to remap the address, if this user has aliases.
            if (EMAIL_ALIASES.containsKey(address)) {
              address = EMAIL_ALIASES.get(address);
            }

            Tuple tuple = new Tuple(messageId, address, name, 0.0);
            functionCall.getOutputCollector().add(tuple);

            String replyId = metadata.get(Metadata.RELATION);

            if (replyId != null) {
              double score = analyzeReply(_content.toString());
              if (score > 0.0) {
                tuple = new Tuple(replyId, null, null, score);
View Full Code Here

        if (LOGGER.isTraceEnabled()) {
          LOGGER.trace(String.format("Parsing %s", fetchedDatum.getUrl()));
        }
       
        // Provide clues to the parser about the format of the content.
        Metadata metadata = new Metadata();
        metadata.add(Metadata.RESOURCE_NAME_KEY, fetchedDatum.getUrl());
        metadata.add(Metadata.CONTENT_TYPE, fetchedDatum.getContentType());
        String charset = getCharset(fetchedDatum);
        metadata.add(Metadata.CONTENT_LANGUAGE, getLanguage(fetchedDatum, charset));
       
        InputStream is = new ByteArrayInputStream(fetchedDatum.getContentBytes(), 0, fetchedDatum.getContentLength());

        try {
          URL baseUrl = getContentLocation(fetchedDatum);
          metadata.add(Metadata.CONTENT_LOCATION, baseUrl.toExternalForm());

            Callable<ParsedDatum> c = new TikaCallable(_parser, _contentExtractor, _linkExtractor, is, metadata, isExtractLanguage(), _parseContext);
            FutureTask<ParsedDatum> task = new FutureTask<ParsedDatum>(c);
            Thread t = new Thread(task);
            t.start();
View Full Code Here

  public void testTika() throws Exception {
    //<start id="tika"/>
    InputStream input = new FileInputStream(
            new File("src/test/resources/pdfBox-sample.pdf"));//<co id="tika.is"/>
    ContentHandler textHandler = new BodyContentHandler();//<co id="tika.handler"/>
    Metadata metadata = new Metadata();//<co id="tika.metadata"/>
    Parser parser = new AutoDetectParser();//<co id="tika.parser"/>
    ParseContext context = new ParseContext();
    parser.parse(input, textHandler, metadata, context);//<co id="tika.parse"/>
    System.out.println("Title: " + metadata.get(Metadata.TITLE));//<co id="tika.title"/>
    System.out.println("Body: " + textHandler.toString());//<co id="tika.body"/>
    /*
<calloutlist>
    <callout arearefs="tika.is"><para>Create the <classname>InputStream</classname> to read in the content</para></callout>
    <callout arearefs="tika.handler"><para>The <classname>BodyContentHandler</classname> is a Tika-provided <classname>ContentHandler</classname> that extracts just the "body" of the InputStream</para></callout>
View Full Code Here

    //<start id="tika-html"/>
    InputStream input = new ByteArrayInputStream(html.getBytes(Charset.forName("UTF-8")));
    ContentHandler text = new BodyContentHandler();//<co id="html.text.co"/>
    LinkContentHandler links = new LinkContentHandler();//<co id="html.link.co"/>
    ContentHandler handler = new TeeContentHandler(links, text);//<co id="html.merge"/>
    Metadata metadata = new Metadata();//<co id="html.store"/>
    Parser parser = new HtmlParser();//<co id="html.parser"/>
    ParseContext context = new ParseContext();
    parser.parse(input, handler, metadata, context);//<co id="html.parse"/>
    System.out.println("Title: " + metadata.get(Metadata.TITLE));
    System.out.println("Body: " + text.toString());
    System.out.println("Links: " + links.getLinks());
    /*
    <calloutlist>
        <callout arearefs="html.text.co"><para>Construct a ContentHandler that will just extract between the body tags.</para></callout>
View Full Code Here

        assertNotNull("Test stream: ["+urlOrFileName+"] is null!", in);
        if (!in.markSupported()) {
            in = new java.io.BufferedInputStream(in);
        }
        try {
            Metadata metadata = new Metadata();
            String mime = this.mimeTypes.detect(in, metadata).toString();
            assertEquals(urlOrFileName + " is not properly detected: detected.", expected, mime);

            //Add resource name and test again
            metadata.set(Metadata.RESOURCE_NAME_KEY, urlOrFileName);
            mime = this.mimeTypes.detect(in, metadata).toString();
            assertEquals(urlOrFileName + " is not properly detected after adding resource name.", expected, mime);
        } finally {
            in.close();
        }       
View Full Code Here

TOP

Related Classes of org.apache.tika.metadata.Metadata

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.