Package org.apache.tika.parser.html

Examples of org.apache.tika.parser.html.HtmlParser$HtmlParserMapper


                 data = ((ByteChunk)htmlChunk).getValue();
              } else if(htmlChunk instanceof StringChunk) {
                 data = ((StringChunk)htmlChunk).getRawValue();
              }
              if(data != null) {
                 HtmlParser htmlParser = new HtmlParser();
                 htmlParser.parse(
                       new ByteArrayInputStream(data),
                       new EmbeddedContentHandler(new BodyContentHandler(xhtml)),
                       new Metadata(), new ParseContext()
                 );
                 doneBody = true;
View Full Code Here


  public static AutoDetectParser createParser() {
    final AutoDetectParser parser = new AutoDetectParser();

    Map<MediaType,Parser> parsers = parser.getParsers();
    parsers.put(MediaType.APPLICATION_XML, new HtmlParser());
    parser.setParsers(parsers);

    parser.setFallback(new Parser() {
      public Set<MediaType> getSupportedTypes(ParseContext parseContext) {
        return parser.getSupportedTypes(parseContext);
View Full Code Here

              } else if(htmlChunk instanceof StringChunk) {
                 data = ((StringChunk)htmlChunk).getRawValue();
              }
              if(data != null) {
                  // nocommit same problem here?
                 HtmlParser htmlParser = new HtmlParser();
                 htmlParser.parse(
                       new ByteArrayInputStream(data),
                       new EmbeddedContentHandler(new BodyContentHandler(xhtml)),
                       new Metadata(), new ParseContext()
                 );
                 doneBody = true;
View Full Code Here

  public static AutoDetectParser createParser() {
    final AutoDetectParser parser = new AutoDetectParser();

    Map<MediaType,Parser> parsers = parser.getParsers();
    parsers.put(MediaType.APPLICATION_XML, new HtmlParser());
    parser.setParsers(parsers);

    parser.setFallback(new Parser() {
      public Set<MediaType> getSupportedTypes(ParseContext parseContext) {
        return parser.getSupportedTypes(parseContext);
View Full Code Here

     */
    private String extract(byte[][] byteObject) {// throws IOException
        StringBuilder wBuf = new StringBuilder();
        InputStream stream = null;
        Metadata metadata = new Metadata();
        HtmlParser htmlParser = new HtmlParser();
        BodyContentHandler handler = new BodyContentHandler(-1);// -1
        ParseContext parser = new ParseContext();
        try {
            for (int i = 0; i < byteObject.length; i++) {
                stream = new ByteArrayInputStream(byteObject[i]);
                try {
                    htmlParser.parse(stream, handler, metadata, parser);
                } catch (TikaException e) {
                    wBuf.append(new String(byteObject[i]));
//                    System.err.println("\n"
//                            + CHMDocumentInformation.class.getName()
//                            + " extract " + e.getMessage());
View Full Code Here

  @SuppressWarnings("serial")
  public static AutoDetectParser createParser(TikaConfig tikaConfig) {
    final AutoDetectParser parser = new AutoDetectParser(tikaConfig);

    Map<MediaType,Parser> parsers = parser.getParsers();
    parsers.put(MediaType.APPLICATION_XML, new HtmlParser());
    parser.setParsers(parsers);

    parser.setFallback(new Parser() {
      public Set<MediaType> getSupportedTypes(ParseContext parseContext) {
        return parser.getSupportedTypes(parseContext);
View Full Code Here

                 data = ((ByteChunk)htmlChunk).getValue();
              } else if(htmlChunk instanceof StringChunk) {
                 data = ((StringChunk)htmlChunk).getRawValue();
              }
              if(data != null) {
                 HtmlParser htmlParser = new HtmlParser();
                 htmlParser.parse(
                       new ByteArrayInputStream(data),
                       new EmbeddedContentHandler(new BodyContentHandler(xhtml)),
                       new Metadata(), new ParseContext()
                 );
                 doneBody = true;
View Full Code Here

     */
    private String extract(byte[] byteObject) throws TikaException {// throws IOException
        StringBuilder wBuf = new StringBuilder();
        InputStream stream = null;
        Metadata metadata = new Metadata();
        HtmlParser htmlParser = new HtmlParser();
        BodyContentHandler handler = new BodyContentHandler(-1);// -1
        ParseContext parser = new ParseContext();
        try {
            stream = new ByteArrayInputStream(byteObject);
            htmlParser.parse(stream, handler, metadata, parser);
            wBuf.append(handler.toString()
                    + System.getProperty("line.separator"));
        } catch (SAXException e) {
            throw new RuntimeException(e);
        } catch (IOException e) {
View Full Code Here

TOP

Related Classes of org.apache.tika.parser.html.HtmlParser$HtmlParserMapper

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.