Package org.cyberneko.html.parsers

Examples of org.cyberneko.html.parsers.SAXParser


        return; // this test makes sense only for more recent Xerces versions
      }
     
      final String content = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<html></html>";
        ByteArrayInputStream input = new ByteArrayInputStream(content.getBytes("UTF-8"));
        SAXParser parser = new SAXParser();
       
        final Locator[] locators = { null };
       
        // Minimal ContentHandler: every SAX callback is a deliberate no-op except
        // setDocumentLocator, which stores the Locator handed over by the parser
        // in locators[0] so it can be inspected after parsing.
        final ContentHandler contentHandler = new ContentHandler() {
      public void startPrefixMapping(String prefix, String uri) throws SAXException {
      }
     
      public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
      }
     
      public void startDocument() throws SAXException {
      }
     
      public void skippedEntity(String name) throws SAXException {
      }
     
      // The only callback with an effect: remember the locator in the
      // single-element array (an array is used so the anonymous class can
      // write to a final local).
      public void setDocumentLocator(Locator locator) {
        locators[0] = locator;
      }
     
      public void processingInstruction(String target, String data)
          throws SAXException {
      }
     
      public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
      }
     
      public void endPrefixMapping(String prefix) throws SAXException {
      }
     
      public void endElement(String uri, String localName, String qName) throws SAXException {
      }
     
      public void endDocument() throws SAXException {
      }
     
      public void characters(char[] ch, int start, int length) throws SAXException {
      }
    };
        parser.setContentHandler(contentHandler);
        parser.parse(new InputSource(input));
        assertEquals("UTF8", ((Locator2) locators[0]).getEncoding());
    }
View Full Code Here


    @Override
    public void setup(Map<String, Object> parameters) {
        super.setup(parameters);

        this.saxParser = new SAXParser();
        try {
            this.saxParser.setFeature(BALANCE_TAGS_URI, true);
            this.saxParser.setProperty(ELEMS_URI, "lower");
            this.saxParser.setFeature(OVERRIDE_NAMESPACES_URI, true);
            this.saxParser.setFeature(INSERT_NAMESPACES_URI, true);
View Full Code Here

     */
    public void rewrite(Rewriter rewriter, java.io.Reader reader, java.io.Writer writer)
            throws RewriterException
    {
        // use a cyberneko SAXParser
        SAXParser parser = new SAXParser() ;

        // setup filter chain
        XMLDocumentFilter[] filters = {
            new Purifier(),                                                                                  // [1] standard neko purifications (tag balancing, etc)
            new CallbackElementRemover( rewriter ),                                                          // [2] accept / reject tags based on advice from rewriter
            writer != null ? new org.cyberneko.html.filters.Writer( writer, null ) : new DefaultFilter()     // [3] propagate results to specified writer (or do nothing -- Default -- when writer is null)
        };
       
        String filtersPropName = "http://cyberneko.org/html/properties/filters";
  
        try
        {
            parser.setProperty(filtersPropName, filters);
        }
        catch (SAXException e)
        {
            // either no longer supported (SAXNotSupportedException), or no longer recognized (SAXNotRecognizedException)
            log.error(filtersPropName + " is, unexpectedly, no longer defined for the cyberneko HTML parser",e);
            throw new RewriterException("cyberneko parser version not supported",e);
        }

        try
        {
            // parse from reader
            parser.parse(new XMLInputSource( null, null, null, reader, null )) ;
        }
        catch (IOException e)
        {
            String msg = "cyberneko HTML parsing failure";
            log.error(msg,e);
View Full Code Here

    @Override
    public void setup(Map<String, Object> parameters) {
        super.setup(parameters);

        this.saxParser = new SAXParser();
        try {
            this.saxParser.setFeature("http://cyberneko.org/html/features/balance-tags", true);
            this.saxParser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
            this.saxParser.setFeature("http://cyberneko.org/html/features/override-namespaces", true);
            this.saxParser.setFeature("http://cyberneko.org/html/features/insert-namespaces", true);
View Full Code Here

        }
*/       
    }

    protected void parseHTML(String text) throws SAXException {
        SAXParser parser = new SAXParser();
        parser.setProperty(
            "http://cyberneko.org/html/properties/names/elems",
            "lower"
        );
        parser.setProperty(
            "http://cyberneko.org/html/properties/names/attrs",
            "lower"
        );
        parser.setContentHandler(
            new DefaultHandler() {
                public void startElement(String namespaceURI, String localName, String qName, Attributes atts) throws SAXException {
                    if ( validDocElementName( localName ) ) {
                        cm.startElement(namespaceURI, localName, qName, atts);
                    }
                }
                public void endElement(String namespaceURI, String localName, String qName) throws SAXException {
                    if ( validDocElementName( localName ) ) {
                        cm.endElement(namespaceURI, localName, qName);
                    }
                }
                public void characters(char[] ch, int start, int length) throws SAXException {
                    cm.characters(ch, start, length);
                }
            }
        );
        try {
            parser.parse( new InputSource(new StringReader( text )) );
        }
        catch (IOException e) {
            System.err.println( "This should never happen!" + e );
        }
    }
View Full Code Here

                new MatchingContentHandler(getTitleHandler(metadata), title),
                new MatchingContentHandler(getMetaHandler(metadata), meta));

        // Parse the HTML document
        xhtml.startDocument();
        SAXParser parser = new SAXParser();
        parser.setContentHandler(new XHTMLDowngradeHandler(handler));
        parser.parse(new InputSource(Utils.getUTF8Reader(stream, metadata)));
        xhtml.endDocument();
    }
View Full Code Here

                new MatchingContentHandler(getBodyHandler(xhtml), body),
                new MatchingContentHandler(getTitleHandler(metadata), title),
                new MatchingContentHandler(getMetaHandler(metadata), meta));

        // Parse the HTML document
        SAXParser parser = new SAXParser();
        parser.setContentHandler(new XHTMLDowngradeHandler(handler));
        parser.parse(new InputSource(Utils.getUTF8Reader(stream, metadata)));
    }
View Full Code Here

     */
    public void rewrite(Rewriter rewriter, java.io.Reader reader, java.io.Writer writer)
            throws RewriterException
    {
        // use a cyberneko SAXParser
        SAXParser parser = new SAXParser() ;

        // setup filter chain
        XMLDocumentFilter[] filters = {
            new Purifier(),                                                                                  // [1] standard neko purifications (tag balancing, etc)
            new CallbackElementRemover( rewriter ),                                                          // [2] accept / reject tags based on advice from rewriter
            writer != null ? new org.cyberneko.html.filters.Writer( writer, null ) : new DefaultFilter()     // [3] propagate results to specified writer (or do nothing -- Default -- when writer is null)
        };
       
        String filtersPropName = "http://cyberneko.org/html/properties/filters";
  
        try
        {
            parser.setProperty(filtersPropName, filters);
        }
        catch (SAXException e)
        {
            // either no longer supported (SAXNotSupportedException), or no longer recognized (SAXNotRecognizedException)
            log.error(filtersPropName + " is, unexpectedly, no longer defined for the cyberneko HTML parser",e);
            throw new RewriterException("cyberneko parser version not supported",e);
        }

        try
        {
            // parse from reader
            parser.parse(new XMLInputSource( null, null, null, reader, null )) ;
        }
        catch (IOException e)
        {
            String msg = "cyberneko HTML parsing failure";
            log.error(msg,e);
View Full Code Here

   * Initializes a CyberNeko parser configured to return lower-case element names.
   *
   * @return
   */
  private SAXParser initParser() {
    parser = new SAXParser();
    try {
      parser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
      parser.setFeature("http://cyberneko.org/html/features/balance-tags/ignore-outside-content", false);
      parser.setFeature("http://cyberneko.org/html/features/balance-tags/document-fragment", true);
      parser.setFeature("http://cyberneko.org/html/features/report-errors", false);
View Full Code Here

  @Override
  public Parse parse(ContentEntity entity, Task newLink) throws DroidsException, IOException {
    // setup filter chain
    XMLDocumentFilter[] filters = { getRemover() };
    // create HTML parser
    SAXParser parser = getParser(filters);
    LinkExtractor linkExtractor = new LinkExtractor((Link)newLink, elements);
    parser.setContentHandler(linkExtractor);
    InputStream instream = entity.obtainContent();
    try {
      parser.parse(new InputSource(instream));
    } catch (SAXException ex) {
      throw new ContentFormatViolationException("Failure parsing HTML content", ex);
    } finally {
      instream.close();
    }
View Full Code Here

TOP

Related Classes of org.cyberneko.html.parsers.SAXParser

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.