Package org.cyberneko.html

Examples of org.cyberneko.html.HTMLScanner$PlaybackInputStream


   * @return a document handler containing the parsed source
   */
  private DocumentHandler parseHtmlImpl(String source) throws IOException {
    HTMLConfiguration config = newConfiguration();

    HTMLScanner htmlScanner = new HTMLScanner();
    HTMLTagBalancer tagBalancer = new HTMLTagBalancer();

    DocumentHandler handler = newDocumentHandler(source, htmlScanner);

    if (config.getFeature("http://xml.org/sax/features/namespaces")) {
      NamespaceBinder namespaceBinder = new NamespaceBinder();
      namespaceBinder.setDocumentHandler(handler);
      namespaceBinder.setDocumentSource(tagBalancer);
      namespaceBinder.reset(config);
      tagBalancer.setDocumentHandler(namespaceBinder);
    } else {
      tagBalancer.setDocumentHandler(handler);
    }

    tagBalancer.setDocumentSource(htmlScanner);
    htmlScanner.setDocumentHandler(tagBalancer);

    tagBalancer.reset(config);
    htmlScanner.reset(config);

    XMLInputSource inputSource = new XMLInputSource(null, null, null);
    inputSource.setEncoding("UTF-8");
    inputSource.setCharacterStream(new StringReader(source));
    htmlScanner.setInputSource(inputSource);
    htmlScanner.scanDocument(true);
    return handler;
  }
View Full Code Here


   */
  private DocumentHandler parseHtmlImpl(String source, HTMLConfiguration config,
      NormalizingTagBalancer tagBalancer)
      throws IOException {

    HTMLScanner htmlScanner = new HTMLScanner();
    tagBalancer.setScanner(htmlScanner);

    DocumentHandler handler = newDocumentHandler(source);

    NamespaceBinder namespaceBinder = new NamespaceBinder();
    namespaceBinder.setDocumentHandler(handler);
    namespaceBinder.setDocumentSource(tagBalancer);
    namespaceBinder.reset(config);
    tagBalancer.setDocumentHandler(namespaceBinder);

    // Order of filter is Scanner -> OSMLFilter -> Tag Balancer
    tagBalancer.setDocumentSource(htmlScanner);
    htmlScanner.setDocumentHandler(tagBalancer);

    tagBalancer.reset(config);
    htmlScanner.reset(config);

    XMLInputSource inputSource = new XMLInputSource(null, null, null);
    inputSource.setEncoding("UTF-8");
    inputSource.setCharacterStream(new StringReader(source));
    htmlScanner.setInputSource(inputSource);
    htmlScanner.scanDocument(true);
    return handler;
  }
View Full Code Here

   */
  private DocumentHandler parseHtmlImpl(String source, HTMLConfiguration config,
      NormalizingTagBalancer tagBalancer)
      throws IOException {

    HTMLScanner htmlScanner = new HTMLScanner();
    tagBalancer.setScanner(htmlScanner);

    DocumentHandler handler = newDocumentHandler(source);

    NamespaceBinder namespaceBinder = new NamespaceBinder();
    namespaceBinder.setDocumentHandler(handler);
    namespaceBinder.setDocumentSource(tagBalancer);
    namespaceBinder.reset(config);
    tagBalancer.setDocumentHandler(namespaceBinder);

    // Order of filter is Scanner -> OSMLFilter -> Tag Balancer
    tagBalancer.setDocumentSource(htmlScanner);
    htmlScanner.setDocumentHandler(tagBalancer);

    tagBalancer.reset(config);
    htmlScanner.reset(config);

    XMLInputSource inputSource = new XMLInputSource(null, null, null);
    inputSource.setEncoding("UTF-8");
    inputSource.setCharacterStream(new StringReader(source));
    htmlScanner.setInputSource(inputSource);
    htmlScanner.scanDocument(true);
    return handler;
  }
View Full Code Here

    this.documentFactory = documentFactory;
  }

  @Override
  protected Document parseDomImpl(String source) {
    HTMLScanner htmlScanner = new HTMLScanner();
    HTMLTagBalancer tagBalancer = new HTMLTagBalancer();
    DocumentHandler handler = new DocumentHandler(source);
    tagBalancer.setDocumentHandler(handler);
    htmlScanner.setDocumentHandler(tagBalancer);

    HTMLConfiguration config = new HTMLConfiguration();
    // Maintain original case for elements and attributes
    config.setProperty("http://cyberneko.org/html/properties/names/elems", "match");
    config.setProperty("http://cyberneko.org/html/properties/names/attrs", "no-change");
    // Parse as fragment.
    config.setFeature("http://cyberneko.org/html/features/balance-tags/document-fragment", true);
    // Get notified of entity and character references
    config.setFeature("http://apache.org/xml/features/scanner/notify-char-refs", true);
    config.setFeature("http://cyberneko.org/html/features/scanner/notify-builtin-refs", true);
    tagBalancer.reset(config);
    htmlScanner.reset(config);
    XMLInputSource inputSource = new XMLInputSource(null, null, null);
    inputSource.setEncoding("UTF-8");
    inputSource.setCharacterStream(new StringReader(source));
    try {
      htmlScanner.setInputSource(inputSource);
      htmlScanner.scanDocument(true);
      Document document = handler.getDocument();
      DocumentFragment fragment = handler.getFragment();
      normalizeFragment(document, fragment);
      HtmlSerializer.attach(document, new NekoSerializer(), source);
      return document;
View Full Code Here

   */
  private DocumentHandler parseHtmlImpl(String source, HTMLConfiguration config,
      NormalizingTagBalancer tagBalancer)
      throws IOException {

    HTMLScanner htmlScanner = new HTMLScanner();
    tagBalancer.setScanner(htmlScanner);

    DocumentHandler handler = newDocumentHandler(source);

    NamespaceBinder namespaceBinder = new NamespaceBinder();
    namespaceBinder.setDocumentHandler(handler);
    namespaceBinder.setDocumentSource(tagBalancer);
    namespaceBinder.reset(config);
    tagBalancer.setDocumentHandler(namespaceBinder);

    // Order of filter is Scanner -> OSMLFilter -> Tag Balancer
    tagBalancer.setDocumentSource(htmlScanner);
    htmlScanner.setDocumentHandler(tagBalancer);

    tagBalancer.reset(config);
    htmlScanner.reset(config);

    XMLInputSource inputSource = new XMLInputSource(null, null, null);
    inputSource.setEncoding("UTF-8");
    inputSource.setCharacterStream(new StringReader(source));
    htmlScanner.setInputSource(inputSource);
    htmlScanner.scanDocument(true);
    return handler;
  }
View Full Code Here

TOP

Related Classes of org.cyberneko.html.HTMLScanner$PlaybackInputStream

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.