Package org.apache.tika.parser.html

Examples of org.apache.tika.parser.html.HtmlParser$HtmlParserMapper


                 data = ((ByteChunk)htmlChunk).getValue();
              } else if(htmlChunk instanceof StringChunk) {
                 data = ((StringChunk)htmlChunk).getRawValue();
              }
              if(data != null) {
                 HtmlParser htmlParser = new HtmlParser();
                 htmlParser.parse(
                       new ByteArrayInputStream(data),
                       new EmbeddedContentHandler(new BodyContentHandler(xhtml)),
                       new Metadata(), new ParseContext()
                 );
                 doneBody = true;
View Full Code Here


      Multipart mp = (Multipart) p.getContent();
      int count = mp.getCount();
      for (int i = 0; i < count; i++)
        content.append(getContentFromHTML(mp.getBodyPart(i)));
    } else if (p.isMimeType("text/html")) {
      HtmlParser parser = new HtmlParser();
      Metadata met = new Metadata();
      TextContentHandler handler = new TextContentHandler(
          new BodyContentHandler());
      parser.parse(new ByteArrayInputStream(((String) p.getContent())
          .getBytes()), handler, met);
      content.append(handler.toString());
    } else {
      Object obj = p.getContent();
      if (obj instanceof Part)
View Full Code Here

        StringTokenizer tokenizer = new StringTokenizer(classes, ", \t\n\r\f");
        while (tokenizer.hasMoreTokens()) {
            String name = tokenizer.nextToken();
            if (name.equals(
                    "org.apache.jackrabbit.extractor.HTMLTextExtractor")) {
                parsers.put("text/html", new HtmlParser());
            } else if (name.equals(
                    "org.apache.jackrabbit.extractor.MsExcelTextExtractor")) {
                Parser parser = new OfficeParser();
                parsers.put("application/vnd.ms-excel", parser);
                parsers.put("application/msexcel", parser);
View Full Code Here

        StringTokenizer tokenizer = new StringTokenizer(classes, ", \t\n\r\f");
        while (tokenizer.hasMoreTokens()) {
            String name = tokenizer.nextToken();
            if (name.equals(
                    "org.apache.jackrabbit.extractor.HTMLTextExtractor")) {
                parsers.put("text/html", new HtmlParser());
            } else if (name.equals(
                    "org.apache.jackrabbit.extractor.MsExcelTextExtractor")) {
                Parser parser = new OfficeParser();
                parsers.put("application/vnd.ms-excel", parser);
                parsers.put("application/msexcel", parser);
View Full Code Here

      Multipart mp = (Multipart) p.getContent();
      int count = mp.getCount();
      for (int i = 0; i < count; i++)
        content.append(getContentFromHTML(mp.getBodyPart(i)));
    } else if (p.isMimeType("text/html")) {
      HtmlParser parser = new HtmlParser();
      Metadata met = new Metadata();
      TextContentHandler handler = new TextContentHandler(
          new BodyContentHandler());
      parser.parse(new ByteArrayInputStream(((String) p.getContent())
          .getBytes()), handler, met);
      content.append(handler.toString());
    } else {
      Object obj = p.getContent();
      if (obj instanceof Part)
View Full Code Here

  private HtmlParser htmlParser;
  private ParseContext parseContext;

  public Parser(CrawlConfig config) {
    super(config);
    htmlParser = new HtmlParser();
    parseContext = new ParseContext();
  }
View Full Code Here

                 data = ((ByteChunk)htmlChunk).getValue();
              } else if(htmlChunk instanceof StringChunk) {
                 data = ((StringChunk)htmlChunk).getRawValue();
              }
              if(data != null) {
                 HtmlParser htmlParser = new HtmlParser();
                 htmlParser.parse(
                       new ByteArrayInputStream(data),
                       new BodyContentHandler(xhtml),
                       new Metadata(), new ParseContext()
                 );
                 doneBody = true;
View Full Code Here

                 data = ((ByteChunk)htmlChunk).getValue();
              } else if(htmlChunk instanceof StringChunk) {
                 data = ((StringChunk)htmlChunk).getRawValue();
              }
              if(data != null) {
                 HtmlParser htmlParser = new HtmlParser();
                 htmlParser.parse(
                       new ByteArrayInputStream(data),
                       new EmbeddedContentHandler(new BodyContentHandler(xhtml)),
                       new Metadata(), new ParseContext()
                 );
                 doneBody = true;
View Full Code Here

  private HtmlParser htmlParser;
  private ParseContext parseContext;

  public Parser(CrawlConfig config) {
    super(config);
    htmlParser = new HtmlParser();
    parseContext = new ParseContext();
  }
View Full Code Here

                 data = ((ByteChunk)htmlChunk).getValue();
              } else if(htmlChunk instanceof StringChunk) {
                 data = ((StringChunk)htmlChunk).getRawValue();
              }
              if(data != null) {
                 HtmlParser htmlParser = new HtmlParser();
                 htmlParser.parse(
                       new ByteArrayInputStream(data),
                       new BodyContentHandler(xhtml),
                       new Metadata(), new ParseContext()
                 );
                 doneBody = true;
View Full Code Here

TOP

Related Classes of org.apache.tika.parser.html.HtmlParser$HtmlParserMapper

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.