Package org.apache.tika.parser

Examples of org.apache.tika.parser.ParseContext


    private String extractTextWithTika(byte[] textBytes, Metadata metadata) throws TikaException, SAXException, IOException {
        AutoDetectParser parser = new AutoDetectParser(new MimeTypes());
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        OutputStreamWriter writer = new OutputStreamWriter(baos, "UTF-8");
        ContentHandler handler = new BodyContentHandler(writer);
        ParseContext context = new ParseContext();
        context.set(PDFParserConfig.class, new LumifyParserConfig());
        parser.parse(new ByteArrayInputStream(textBytes), handler, metadata, context);
        return IOUtils.toString(baos.toByteArray(), "UTF-8");
    }
View Full Code Here


          Metadata metadata = new Metadata();
          InputStream is = new ByteArrayInputStream(fetchedDatum.getContentBytes());
         
          try {
            _parser.parse(is, _handler, metadata, new ParseContext());

            // _ids now has a list of the mailbox IDs that we use to create URLs.
            for (String id : _ids) {
              String url = String.format("%s/%s.mbox", fetchedDatum.getUrl(), id);
              UrlDatum datum = new UrlDatum(url);
View Full Code Here

       
        // Now, if the FetchedDatum mime-type is application/mbox, we want to parse it and
        // output the results
        if (fetchedDatum.getContentType().equals("application/mbox")) {
          Metadata metadata = new Metadata();
          ParseContext context = new ParseContext();
          InputStream is = new ByteArrayInputStream(fetchedDatum.getContentBytes());
         
          try {
            _parser.parse(is, _handler, metadata, context);
View Full Code Here

            fetcherPolicy.setFetcherMode(FetcherMode.EFFICIENT);
           
            // We only care about mime types that the Tika HTML parser can handle,
            // so restrict the fetcher's valid mime types to that same set.
            Set<String> validMimeTypes = new HashSet<String>();
            Set<MediaType> supportedTypes = new HtmlParser().getSupportedTypes(new ParseContext());
            for (MediaType supportedType : supportedTypes) {
                validMimeTypes.add(String.format("%s/%s", supportedType.getType(), supportedType.getSubtype()));
            }
            fetcherPolicy.setValidMimeTypes(validMimeTypes);
View Full Code Here

        _contentExtractor = contentExtractor;
        _linkExtractor = linkExtractor;

        if (includeMarkup) {
            _parseContext = new ParseContext();
            _parseContext.set(HtmlMapper.class, FixedIdentityHtmlMapper.INSTANCE);
        }
    }
View Full Code Here

     * aren't part of the default set.
     *
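     * @return a new ParseContext; it carries a CustomHtmlMapper when at least one link tag is not mapped as safe by the default HtmlMapper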
     */
    private ParseContext makeParseContext() {
        ParseContext result = new ParseContext();

        Set<String> validTags = _linkExtractor.getLinkTags();
        HtmlMapper defaultMapper = DefaultHtmlMapper.INSTANCE;
        for (String tag : validTags) {
            if (defaultMapper.mapSafeElement(tag) == null) {
                result.set(HtmlMapper.class, new CustomHtmlMapper(validTags, _linkExtractor.getLinkAttributeTypes()));
                break;
            }
        }
       
        return result;
View Full Code Here

    InputStream input = new FileInputStream(
            new File("src/test/resources/pdfBox-sample.pdf"));//<co id="tika.is"/>
    ContentHandler textHandler = new BodyContentHandler();//<co id="tika.handler"/>
    Metadata metadata = new Metadata();//<co id="tika.metadata"/>
    Parser parser = new AutoDetectParser();//<co id="tika.parser"/>
    ParseContext context = new ParseContext();
    parser.parse(input, textHandler, metadata, context);//<co id="tika.parse"/>
    System.out.println("Title: " + metadata.get(Metadata.TITLE));//<co id="tika.title"/>
    System.out.println("Body: " + textHandler.toString());//<co id="tika.body"/>
    /*
<calloutlist>
 
View Full Code Here

    ContentHandler text = new BodyContentHandler();//<co id="html.text.co"/>
    LinkContentHandler links = new LinkContentHandler();//<co id="html.link.co"/>
    ContentHandler handler = new TeeContentHandler(links, text);//<co id="html.merge"/>
    Metadata metadata = new Metadata();//<co id="html.store"/>
    Parser parser = new HtmlParser();//<co id="html.parser"/>
    ParseContext context = new ParseContext();
    parser.parse(input, handler, metadata, context);//<co id="html.parse"/>
    System.out.println("Title: " + metadata.get(Metadata.TITLE));
    System.out.println("Body: " + text.toString());
    System.out.println("Links: " + links.getLinks());
    /*
 
View Full Code Here

    private String encoding = null;

    private boolean pipeMode = true;

    public TikaCLI() throws TransformerConfigurationException {
        context = new ParseContext();
        parser = new AutoDetectParser();
        context.set(Parser.class, parser);
    }
View Full Code Here

     * @deprecated This method will be removed in Apache Tika 1.0.
     */
    public void parse(
            InputStream stream, ContentHandler handler, Metadata metadata)
            throws IOException, SAXException, TikaException {
        parse(stream, handler, metadata, new ParseContext());
    }
View Full Code Here

TOP

Related Classes of org.apache.tika.parser.ParseContext

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.