Examples of org.apache.tika.parser.html.HtmlParser$HtmlParserMapper

org.apache.tika.parser.html.HtmlParser
Adapter class that maintains backwards compatibility with the protected HtmlParser methods. Making HtmlParser implement HtmlMapper directly would require those methods to be public, which would break backwards compatibility with subclasses. @deprecated Use the {@link HtmlMapper} mechanism to customize the HTML mapping. This class will be removed in Tika 1.0.

                 data = ((ByteChunk)htmlChunk).getValue();
              } else if(htmlChunk instanceof StringChunk) {
                 data = ((StringChunk)htmlChunk).getRawValue();
              }
              if(data != null) {
                 HtmlParser htmlParser = new HtmlParser();
                 htmlParser.parse(
                       new ByteArrayInputStream(data),
                       new EmbeddedContentHandler(new BodyContentHandler(xhtml)), 
                       new Metadata(), new ParseContext()
                 );
                 doneBody = true;

View Full Code Here

      Multipart mp = (Multipart) p.getContent();
      int count = mp.getCount();
      for (int i = 0; i < count; i++)
        content.append(getContentFromHTML(mp.getBodyPart(i)));
    } else if (p.isMimeType("text/html")) {
      HtmlParser parser = new HtmlParser();
      Metadata met = new Metadata();
      TextContentHandler handler = new TextContentHandler(
          new BodyContentHandler());
      parser.parse(new ByteArrayInputStream(((String) p.getContent())
          .getBytes()), handler, met);
      content.append(handler.toString());
    } else {
      Object obj = p.getContent();
      if (obj instanceof Part)

View Full Code Here

        StringTokenizer tokenizer = new StringTokenizer(classes, ", \t\n\r\f");
        while (tokenizer.hasMoreTokens()) {
            String name = tokenizer.nextToken();
            if (name.equals(
                    "org.apache.jackrabbit.extractor.HTMLTextExtractor")) {
                parsers.put("text/html", new HtmlParser());
            } else if (name.equals(
                    "org.apache.jackrabbit.extractor.MsExcelTextExtractor")) {
                Parser parser = new OfficeParser();
                parsers.put("application/vnd.ms-excel", parser);
                parsers.put("application/msexcel", parser);

View Full Code Here

        StringTokenizer tokenizer = new StringTokenizer(classes, ", \t\n\r\f");
        while (tokenizer.hasMoreTokens()) {
            String name = tokenizer.nextToken();
            if (name.equals(
                    "org.apache.jackrabbit.extractor.HTMLTextExtractor")) {
                parsers.put("text/html", new HtmlParser());
            } else if (name.equals(
                    "org.apache.jackrabbit.extractor.MsExcelTextExtractor")) {
                Parser parser = new OfficeParser();
                parsers.put("application/vnd.ms-excel", parser);
                parsers.put("application/msexcel", parser);

View Full Code Here

      Multipart mp = (Multipart) p.getContent();
      int count = mp.getCount();
      for (int i = 0; i < count; i++)
        content.append(getContentFromHTML(mp.getBodyPart(i)));
    } else if (p.isMimeType("text/html")) {
      HtmlParser parser = new HtmlParser();
      Metadata met = new Metadata();
      TextContentHandler handler = new TextContentHandler(
          new BodyContentHandler());
      parser.parse(new ByteArrayInputStream(((String) p.getContent())
          .getBytes()), handler, met);
      content.append(handler.toString());
    } else {
      Object obj = p.getContent();
      if (obj instanceof Part)

View Full Code Here

  private HtmlParser htmlParser;
  private ParseContext parseContext;


  public Parser(CrawlConfig config) {
    super(config);
    htmlParser = new HtmlParser();
    parseContext = new ParseContext();
  }

View Full Code Here

                 data = ((ByteChunk)htmlChunk).getValue();
              } else if(htmlChunk instanceof StringChunk) {
                 data = ((StringChunk)htmlChunk).getRawValue();
              }
              if(data != null) {
                 HtmlParser htmlParser = new HtmlParser();
                 htmlParser.parse(
                       new ByteArrayInputStream(data),
                       new BodyContentHandler(xhtml), 
                       new Metadata(), new ParseContext()
                 );
                 doneBody = true;

View Full Code Here

                 data = ((ByteChunk)htmlChunk).getValue();
              } else if(htmlChunk instanceof StringChunk) {
                 data = ((StringChunk)htmlChunk).getRawValue();
              }
              if(data != null) {
                 HtmlParser htmlParser = new HtmlParser();
                 htmlParser.parse(
                       new ByteArrayInputStream(data),
                       new EmbeddedContentHandler(new BodyContentHandler(xhtml)), 
                       new Metadata(), new ParseContext()
                 );
                 doneBody = true;

View Full Code Here

  private HtmlParser htmlParser;
  private ParseContext parseContext;


  public Parser(CrawlConfig config) {
    super(config);
    htmlParser = new HtmlParser();
    parseContext = new ParseContext();
  }

View Full Code Here

                 data = ((ByteChunk)htmlChunk).getValue();
              } else if(htmlChunk instanceof StringChunk) {
                 data = ((StringChunk)htmlChunk).getRawValue();
              }
              if(data != null) {
                 HtmlParser htmlParser = new HtmlParser();
                 htmlParser.parse(
                       new ByteArrayInputStream(data),
                       new BodyContentHandler(xhtml), 
                       new Metadata(), new ParseContext()
                 );
                 doneBody = true;

View Full Code Here

0 1 2

TOP

Related Classes of org.apache.tika.parser.html.HtmlParser$HtmlParserMapper

bixo.examples.webmining.DemoWebMiningTool

com.scaleunlimited.helpful.operations.ParseModMboxPageFunction

com.tamingtext.tika.TikaTest

edu.uci.ics.crawler4j.parser.Parser

org.apache.commons.io.input.CloseShieldInputStream

org.apache.jackrabbit.core.query.lucene.JackrabbitParser

org.apache.oodt.cas.protocol.imaps.ImapsProtocol

org.apache.oodt.cas.pushpull.protocol.http.HttpClient

org.apache.oodt.cas.pushpull.protocol.imaps.ImapsClient

org.apache.tika.detect.AutoDetectReader

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.