Package org.apache.tika.detect

Examples of org.apache.tika.detect.AutoDetectReader


  @Override
  public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
      throws IOException, SAXException, TikaException {

    AutoDetectReader reader = new AutoDetectReader(new CloseShieldInputStream(stream), metadata, context.get(ServiceLoader.class, LOADER));

    try {
      Charset charset = reader.getCharset();
      String mediaType = metadata.get(Metadata.CONTENT_TYPE);
      String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
      if (mediaType != null && name != null) {
        MediaType type = MediaType.parse(mediaType);
        metadata.set(Metadata.CONTENT_TYPE, type.toString());
        metadata.set(Metadata.CONTENT_ENCODING, charset.name());

        StringBuilder out = new StringBuilder();
        String line;
        int nbLines =  0;
        while ((line = reader.readLine()) != null) {
            out.append(line);
            String author = parserAuthor(line);
            if (author != null) {
              metadata.add(TikaCoreProperties.CREATOR, author);
            }
            nbLines ++;
        }
        metadata.set("LoC", String.valueOf(nbLines));

        Renderer renderer = getRenderer(type.toString());
        String codeAsHtml = renderer.highlight(name, out.toString(), charset.name(), false);
        char[] charArray = codeAsHtml.toCharArray();
        handler.startDocument();
        handler.characters(charArray, 0, charArray.length);
        handler.endDocument();
      }
    } finally {
      reader.close();
    }

  }
View Full Code Here


    public void parse(
            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        // Automatically detect the character encoding
        AutoDetectReader reader = new AutoDetectReader(
                new CloseShieldInputStream(stream), metadata,
                context.get(ServiceLoader.class, LOADER));
        try {
            Charset charset = reader.getCharset();
            String previous = metadata.get(Metadata.CONTENT_TYPE);
            if (previous == null || previous.startsWith("text/html")) {
                MediaType type = new MediaType(MediaType.TEXT_HTML, charset);
                metadata.set(Metadata.CONTENT_TYPE, type.toString());
            }
            // deprecated, see TIKA-431
            metadata.set(Metadata.CONTENT_ENCODING, charset.name());

            // Get the HTML mapper from the parse context
            HtmlMapper mapper =
                    context.get(HtmlMapper.class, new HtmlParserMapper());

            // Parse the HTML document
            org.ccil.cowan.tagsoup.Parser parser =
                    new org.ccil.cowan.tagsoup.Parser();

            // TIKA-528: Reuse share schema to avoid heavy instantiation
            parser.setProperty(
                    org.ccil.cowan.tagsoup.Parser.schemaProperty, HTML_SCHEMA);
            // TIKA-599: Shared schema is thread-safe only if bogons are ignored
            parser.setFeature(
                    org.ccil.cowan.tagsoup.Parser.ignoreBogonsFeature, true);

            parser.setContentHandler(new XHTMLDowngradeHandler(
                    new HtmlHandler(mapper, handler, metadata)));

            parser.parse(reader.asInputSource());
        } finally {
            reader.close();
        }
    }
View Full Code Here

    public void parse(
            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        // Automatically detect the character encoding
        AutoDetectReader reader = new AutoDetectReader(
                new CloseShieldInputStream(stream), metadata, LOADER);
        try {
            Charset charset = reader.getCharset();
            MediaType type = new MediaType(MediaType.TEXT_PLAIN, charset);
            metadata.set(Metadata.CONTENT_TYPE, type.toString());
            // deprecated, see TIKA-431
            metadata.set(Metadata.CONTENT_ENCODING, charset.name());

            XHTMLContentHandler xhtml =
                    new XHTMLContentHandler(handler, metadata);
            xhtml.startDocument();

            xhtml.startElement("p");
            char[] buffer = new char[4096];
            int n = reader.read(buffer);
            while (n != -1) {
                xhtml.characters(buffer, 0, n);
                n = reader.read(buffer);
            }
            xhtml.endElement("p");

            xhtml.endDocument();
        } finally {
            reader.close();
        }
    }
View Full Code Here

    public void parse(
            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        // Automatically detect the character encoding
        AutoDetectReader reader = new AutoDetectReader(
                new CloseShieldInputStream(stream), metadata, LOADER);
        try {
            Charset charset = reader.getCharset();
            String previous = metadata.get(Metadata.CONTENT_TYPE);
            if (previous == null || previous.startsWith("text/html")) {
                MediaType type = new MediaType(MediaType.TEXT_HTML, charset);
                metadata.set(Metadata.CONTENT_TYPE, type.toString());
            }
            // deprecated, see TIKA-431
            metadata.set(Metadata.CONTENT_ENCODING, charset.name());

            // Get the HTML mapper from the parse context
            HtmlMapper mapper =
                    context.get(HtmlMapper.class, new HtmlParserMapper());

            // Parse the HTML document
            org.ccil.cowan.tagsoup.Parser parser =
                    new org.ccil.cowan.tagsoup.Parser();

            // TIKA-528: Reuse share schema to avoid heavy instantiation
            parser.setProperty(
                    org.ccil.cowan.tagsoup.Parser.schemaProperty, HTML_SCHEMA);
            // TIKA-599: Shared schema is thread-safe only if bogons are ignored
            parser.setFeature(
                    org.ccil.cowan.tagsoup.Parser.ignoreBogonsFeature, true);

            parser.setContentHandler(new XHTMLDowngradeHandler(
                    new HtmlHandler(mapper, handler, metadata)));

            parser.parse(reader.asInputSource());
        } finally {
            reader.close();
        }
    }
View Full Code Here

    public void parse(
            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        // Automatically detect the character encoding
        AutoDetectReader reader = new AutoDetectReader(
                new CloseShieldInputStream(stream), metadata,
                context.get(ServiceLoader.class, LOADER));
        try {
            Charset charset = reader.getCharset();
            MediaType type = new MediaType(MediaType.TEXT_PLAIN, charset);
            metadata.set(Metadata.CONTENT_TYPE, type.toString());
            // deprecated, see TIKA-431
            metadata.set(Metadata.CONTENT_ENCODING, charset.name());

            XHTMLContentHandler xhtml =
                    new XHTMLContentHandler(handler, metadata);
            xhtml.startDocument();

            xhtml.startElement("p");
            char[] buffer = new char[4096];
            int n = reader.read(buffer);
            while (n != -1) {
                xhtml.characters(buffer, 0, n);
                n = reader.read(buffer);
            }
            xhtml.endElement("p");

            xhtml.endDocument();
        } finally {
            reader.close();
        }
    }
View Full Code Here

    public void parse(
            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        // Automatically detect the character encoding
        AutoDetectReader reader = new AutoDetectReader(
                new CloseShieldInputStream(stream), metadata,
                context.get(ServiceLoader.class, LOADER));
        try {
            Charset charset = reader.getCharset();
            String previous = metadata.get(Metadata.CONTENT_TYPE);
            if (previous == null || previous.startsWith("text/html")) {
                MediaType type = new MediaType(MediaType.TEXT_HTML, charset);
                metadata.set(Metadata.CONTENT_TYPE, type.toString());
            }
            // deprecated, see TIKA-431
            metadata.set(Metadata.CONTENT_ENCODING, charset.name());

            // Get the HTML mapper from the parse context
            HtmlMapper mapper =
                    context.get(HtmlMapper.class, new HtmlParserMapper());

            // Parse the HTML document
            org.ccil.cowan.tagsoup.Parser parser =
                    new org.ccil.cowan.tagsoup.Parser();

            // Use schema from context or default
            Schema schema = context.get(Schema.class, HTML_SCHEMA);

            // TIKA-528: Reuse share schema to avoid heavy instantiation
            parser.setProperty(
                    org.ccil.cowan.tagsoup.Parser.schemaProperty, schema);
            // TIKA-599: Shared schema is thread-safe only if bogons are ignored
            parser.setFeature(
                    org.ccil.cowan.tagsoup.Parser.ignoreBogonsFeature, true);

            parser.setContentHandler(new XHTMLDowngradeHandler(
                    new HtmlHandler(mapper, handler, metadata)));

            parser.parse(reader.asInputSource());
        } finally {
            reader.close();
        }
    }
View Full Code Here

  @Override
  public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
      throws IOException, SAXException, TikaException {

    AutoDetectReader reader = new AutoDetectReader(new CloseShieldInputStream(stream), metadata, context.get(ServiceLoader.class, LOADER));

    try {
      Charset charset = reader.getCharset();
      String mediaType = metadata.get(Metadata.CONTENT_TYPE);
      String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
      if (mediaType != null && name != null) {
        MediaType type = MediaType.parse(mediaType);
        metadata.set(Metadata.CONTENT_TYPE, type.toString());
        metadata.set(Metadata.CONTENT_ENCODING, charset.name());

        StringBuilder out = new StringBuilder();
        String line;
        int nbLines =  0;
        while ((line = reader.readLine()) != null) {
            out.append(line + System.getProperty("line.separator"));
            String author = parserAuthor(line);
            if (author != null) {
              metadata.add(TikaCoreProperties.CREATOR, author);
            }
            nbLines ++;
        }
        metadata.set("LoC", String.valueOf(nbLines));
        Renderer renderer = getRenderer(type.toString());
       
        String codeAsHtml = renderer.highlight(name, out.toString(), charset.name(), false);
       
        Schema schema = context.get(Schema.class, HTML_SCHEMA);

        org.ccil.cowan.tagsoup.Parser parser = new org.ccil.cowan.tagsoup.Parser();
        parser.setProperty(org.ccil.cowan.tagsoup.Parser.schemaProperty, schema);
        parser.setContentHandler(handler);
        parser.parse(new InputSource(new StringReader(codeAsHtml)));
      }
    } finally {
      reader.close();
    }

  }
View Full Code Here

        // Only outputting the MIME type as metadata
        metadata.set(Metadata.CONTENT_TYPE, ENVI_MIME_TYPE);

        // The following code was taken from the TXTParser
        // Automatically detect the character encoding
        AutoDetectReader reader = new AutoDetectReader(
                new CloseShieldInputStream(stream), metadata);

        try {
            Charset charset = reader.getCharset();
            MediaType type = new MediaType(MediaType.TEXT_PLAIN, charset);
            // deprecated, see TIKA-431
            metadata.set(Metadata.CONTENT_ENCODING, charset.name());

            XHTMLContentHandler xhtml = new XHTMLContentHandler(handler,
                    metadata);

            xhtml.startDocument();

            // text contents of the xhtml
            String line;
            while ((line = reader.readLine()) != null) {
                xhtml.startElement("p");
                xhtml.characters(line);
                xhtml.endElement("p");
            }
           
            xhtml.endDocument();
        } finally {
            reader.close();
        }
    }
View Full Code Here

TOP

Related Classes of org.apache.tika.detect.AutoDetectReader

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.