Package org.apache.tika.parser.html

Examples of org.apache.tika.parser.html.HtmlParser$TitleExtractingContentHandler


    super(UrlDatum.FIELDS);
  }

  private synchronized void init() {
    if (_parser == null) {
      _parser = new HtmlParser();
    }
   
    if (_handler == null) {
      _handler = new DefaultHandler() {
View Full Code Here


            fetcherPolicy.setFetcherMode(FetcherMode.EFFICIENT);
           
            // We only care about mime types that the Tika HTML parser can handle,
            // so restrict it to the same.
            Set<String> validMimeTypes = new HashSet<String>();
            Set<MediaType> supportedTypes = new HtmlParser().getSupportedTypes(new ParseContext());
            for (MediaType supportedType : supportedTypes) {
                validMimeTypes.add(String.format("%s/%s", supportedType.getType(), supportedType.getSubtype()));
            }
            fetcherPolicy.setValidMimeTypes(validMimeTypes);
View Full Code Here

    InputStream input = new ByteArrayInputStream(html.getBytes(Charset.forName("UTF-8")));
    ContentHandler text = new BodyContentHandler();//<co id="html.text.co"/>
    LinkContentHandler links = new LinkContentHandler();//<co id="html.link.co"/>
    ContentHandler handler = new TeeContentHandler(links, text);//<co id="html.merge"/>
    Metadata metadata = new Metadata();//<co id="html.store"/>
    Parser parser = new HtmlParser();//<co id="html.parser"/>
    ParseContext context = new ParseContext();
    parser.parse(input, handler, metadata, context);//<co id="html.parse"/>
    System.out.println("Title: " + metadata.get(Metadata.TITLE));
    System.out.println("Body: " + text.toString());
    System.out.println("Links: " + links.getLinks());
    /*
    <calloutlist>
View Full Code Here

     */
    private String extract(byte[] byteObject) throws TikaException {// throws IOException
        StringBuilder wBuf = new StringBuilder();
        InputStream stream = null;
        Metadata metadata = new Metadata();
        HtmlParser htmlParser = new HtmlParser();
        BodyContentHandler handler = new BodyContentHandler(-1);// -1
        ParseContext parser = new ParseContext();
        try {
            stream = new ByteArrayInputStream(byteObject);
            htmlParser.parse(stream, handler, metadata, parser);
            wBuf.append(handler.toString()
                    + System.getProperty("line.separator"));
        } catch (SAXException e) {
            throw new RuntimeException(e);
        } catch (IOException e) {
View Full Code Here

  @SuppressWarnings("serial")
  public static AutoDetectParser createParser() {
    final AutoDetectParser parser = new AutoDetectParser();

    Map<MediaType,Parser> parsers = parser.getParsers();
    parsers.put(MediaType.APPLICATION_XML, new HtmlParser());
    parser.setParsers(parsers);

    parser.setFallback(new Parser() {
      public Set<MediaType> getSupportedTypes(ParseContext parseContext) {
        return parser.getSupportedTypes(parseContext);
View Full Code Here

        StringTokenizer tokenizer = new StringTokenizer(classes, ", \t\n\r\f");
        while (tokenizer.hasMoreTokens()) {
            String name = tokenizer.nextToken();
            if (name.equals(
                    "org.apache.jackrabbit.extractor.HTMLTextExtractor")) {
                parsers.put(MediaType.text("html"), new HtmlParser());
            } else if (name.equals("org.apache.jackrabbit.extractor.MsExcelTextExtractor")) {
                Parser parser = new OfficeParser();
                parsers.put(MediaType.application("vnd.ms-excel"), parser);
                parsers.put(MediaType.application("msexcel"), parser);
                parsers.put(MediaType.application("excel"), parser);
View Full Code Here

        StringTokenizer tokenizer = new StringTokenizer(classes, ", \t\n\r\f");
        while (tokenizer.hasMoreTokens()) {
            String name = tokenizer.nextToken();
            if (name.equals(
                    "org.apache.jackrabbit.extractor.HTMLTextExtractor")) {
                parsers.put("text/html", new HtmlParser());
            } else if (name.equals(
                    "org.apache.jackrabbit.extractor.MsExcelTextExtractor")) {
                Parser parser = new OfficeParser();
                parsers.put("application/vnd.ms-excel", parser);
                parsers.put("application/msexcel", parser);
View Full Code Here

      Multipart mp = (Multipart) p.getContent();
      int count = mp.getCount();
      for (int i = 0; i < count; i++)
        content.append(getContentFromHTML(mp.getBodyPart(i)));
    } else if (p.isMimeType("text/html")) {
      HtmlParser parser = new HtmlParser();
      Metadata met = new Metadata();
      TextContentHandler handler = new TextContentHandler(
          new BodyContentHandler());
      parser.parse(new ByteArrayInputStream(((String) p.getContent())
          .getBytes()), handler, met);
      content.append(handler.toString());
    } else {
      Object obj = p.getContent();
      if (obj instanceof Part)
View Full Code Here

        // Read in link
        StringBuffer sb = new StringBuffer("");
        while (scanner.hasNext())
          sb.append(scanner.nextLine());

        HtmlParser parser = new HtmlParser();
        Metadata met = new Metadata();
        LinkContentHandler handler = new LinkContentHandler();

        parser.parse(new ByteArrayInputStream(sb.toString().getBytes()),
            handler, met);
        List<Link> links = handler.getLinks();
        children = new LinkedList<ProtocolFile>();
        for (Link link : links) {
          String href = link.getUri();
View Full Code Here

  public static AutoDetectParser createParser() {
    final AutoDetectParser parser = new AutoDetectParser();

    Map<MediaType,Parser> parsers = parser.getParsers();
    parsers.put(MediaType.APPLICATION_XML, new HtmlParser());
    parser.setParsers(parsers);

    parser.setFallback(new Parser() {
      public Set<MediaType> getSupportedTypes(ParseContext parseContext) {
        return parser.getSupportedTypes(parseContext);
View Full Code Here

TOP

Related Classes of org.apache.tika.parser.html.HtmlParser$TitleExtractingContentHandler

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.