Package org.ccil.cowan.tagsoup

Examples of org.ccil.cowan.tagsoup.Parser


        }
    }

    private static HTMLDocument getHtmlDocument(Reader reader)
            throws IOException, SAXException {
        XMLReaderAdapter parser = new XMLReaderAdapter(new Parser());
        HTMLBuilder builder = new HTMLBuilder();
        parser.setDocumentHandler(builder);
        parser.parse(new InputSource(reader));
        return builder.getHTMLDocument();
    }
View Full Code Here


    /**
     * @see org.apache.sling.commons.html.HtmlParser#parse(java.io.InputStream, java.lang.String, org.xml.sax.ContentHandler)
     */
    public void parse(InputStream stream, String encoding, ContentHandler ch)
    throws SAXException {
        final Parser parser = new Parser();
        if ( ch instanceof LexicalHandler ) {
            parser.setProperty("http://xml.org/sax/properties/lexical-handler", ch);
        }
        parser.setContentHandler(ch);
        final InputSource source = new InputSource(stream);
        source.setEncoding(encoding);
        try {
            parser.parse(source);
        } catch (IOException ioe) {
            throw new SAXException(ioe);
        }
    }
View Full Code Here

    /**
     * @see org.apache.sling.commons.html.HtmlParser#parse(java.lang.String, java.io.InputStream, java.lang.String)
     */
    public Document parse(String systemId, InputStream stream, String encoding) throws IOException {
        final Parser parser = new Parser();

        final DOMBuilder builder = new DOMBuilder();

        final InputSource source = new InputSource(stream);
        source.setEncoding(encoding);
        source.setSystemId(systemId);

        try {
            parser.setProperty("http://xml.org/sax/properties/lexical-handler", builder);
            parser.setContentHandler(builder);
            parser.parse(source);
        } catch (SAXException se) {
            if ( se.getCause() instanceof IOException ) {
                throw (IOException) se.getCause();
            }
            throw (IOException) new IOException("Unable to parse xml.").initCause(se);
View Full Code Here

        // Call parser.parse
        SimpleParser parser = new SimpleParser(new ParserPolicy(), true);
        ParsedDatum parsedDatum = parser.parse(fetchedDatum);
       
        // Now take the resulting HTML, process it using Dom4J
        SAXReader reader = new SAXReader(new Parser());
        reader.setEncoding("UTF-8");
        String htmlWithMarkup = parsedDatum.getParsedText();
        Document doc = reader.read(new StringInputStream(htmlWithMarkup));
       
        // We have to do helicopter stunts since HTML has a global namespace on it, set
View Full Code Here

   
    @Override
    public void prepare(FlowProcess process, OperationCall<NullContext> opCall) {
        super.prepare(process, opCall);
       
        _reader = new SAXReader(new Parser());
        _reader.setXMLFilter(new DowngradeXmlFilter(_removeNamespaces));
        _reader.setEncoding("UTF-8");
        _input = new ParsedDatum();
    }
View Full Code Here

  if (in == null) {
      res.put (resultPN, null);
      return;
  }

  Parser parser = new Parser ();
  SAXEventBufferImpl sb = new SAXEventBufferImpl ();
  parser.setContentHandler(sb);
  try {
      parser.parse(new InputSource (in));
  } catch (IOException e) {
      throw new CannotExecuteException
    ("Cannot parse response body: " + e.getMessage ());
  } catch (SAXException e) {
      throw new CannotExecuteException
View Full Code Here

        n = doc.createElement("description");
        itemNode.appendChild(n);
   
        // for now we always assume html: see Rome bug #26
  //      if (sc.getType().equals("text/html")){
          Parser p = new Parser();
          try {
           
            SaferHTMLHandler c = new SaferHTMLHandler(doc,n);
            p.setContentHandler(c);
            p.parse(new InputSource(new StringReader(text)));
           
          } catch (IOException e) {
            throw new RuntimeException(e);
          } catch (SAXException e) {
            throw new RuntimeException(e);
View Full Code Here

     *
     * @return
     * @throws CamelException
     */
    protected XMLReader createTagSoupParser() throws CamelException {
        XMLReader reader = new Parser();
        try {
            reader.setFeature(Parser.namespacesFeature, false);
            reader.setFeature(Parser.namespacePrefixesFeature, false);

            /*
             * set each parser feature that the user may have supplied.
             * http://www.saxproject.org/apidoc/org/xml/sax/package-summary.html
             * http://home.ccil.org/~cowan/XML/tagsoup/#properties
             */

            if (getParserFeatures() != null) {
                for (Entry<String, Boolean> e : getParserFeatures().entrySet()) {
                    reader.setFeature(e.getKey(), e.getValue());
                }
            }

            /*
             * set each parser feature that the user may have supplied. {@link
             * http://home.ccil.org/~cowan/XML/tagsoup/#properties}
             */

            if (getParserPropeties() != null) {
                for (Entry<String, Object> e : getParserPropeties().entrySet()) {
                    reader.setProperty(e.getKey(), e.getValue());
                }
            }

            /*
             * default the schema to HTML
             */
            if (this.getParsingSchema() != null) {
                reader.setProperty(Parser.schemaProperty, getParsingSchema());
            }

        } catch (Exception e) {
            throw new IllegalArgumentException("Problem configuring the parser", e);
        }
View Full Code Here

    byte[] bytes;
    try {
      String xhtml = XHTML_START + badHTMLSnippet + XHTML_END;
      bytes = xhtml.getBytes("UTF-8");
      InputStream in = new ByteArrayInputStream(bytes);
      Parser parser = new Parser();
      HTMLSchema schema = new HTMLSchema();
      parser.setProperty(Parser.schemaProperty, schema);
      Writer w = new StringWriter();
      XMLWriter x = new XMLWriter(w);
      x.setOutputProperty(XMLWriter.METHOD, "xml");
      x.setOutputProperty(XMLWriter.OMIT_XML_DECLARATION, "yes");
      x.setPrefix(schema.getURI(), "");

      parser.setFeature(Parser.namespacesFeature, false);
      parser.setFeature(Parser.defaultAttributesFeature, true);
      parser.setContentHandler(x);

      InputSource is = new InputSource(in);
      is.setEncoding("UTF-8");
      parser.parse(is);
      return w.toString();
    } catch (IOException e) {
      e.printStackTrace();
    } catch (SAXException e) {
      e.printStackTrace();
View Full Code Here


public class PrincipalIefInfoParseUm {
  public static void main(String args[]){
   
    Parser parser = new Parser();
    FileInputStream fis = null;
    try {
      fis = new FileInputStream("C:\\Users\\Einstein\\workspace\\TG6_Parse\\src\\iefExemplo.html");
      //fis = new FileInputStream("D:\\Iefs\\1000.txt");//sem pagina
      //fis = new FileInputStream("D:\\Iefs\\270.txt"); //sem sigla
      //fis = new FileInputStream("D:\\Iefs\\1849.txt"); // Cidade com hifen e sem bairro
      //fis = new FileInputStream("D:\\Iefs\\2089.txt");//sem homepage @@
      //fis = new FileInputStream("D:\\Iefs\\2092.txt");//sem cep
      //fis = new FileInputStream("D:\\Iefs\\2081.txt");//sem cep //sem homepage
      //fis = new FileInputStream("D:\\Iefs\\450.txt");// cidade com - (Ji-parana) // nome do ief com -
      //fis = new FileInputStream("D:\\Iefs\\19.txt");// sigla com - (PUC-Campinas) // nome do ief com -
     
    } catch (FileNotFoundException e1) {
      e1.printStackTrace();
    }
    InputStream is = fis;
    IefSAXHandler handler = new IefSAXHandler();
    parser.setContentHandler(handler);
    InputSource input = new InputSource(is);
    try {
      parser.parse(input);
    } catch (IOException e) {
      e.printStackTrace();
    } catch (SAXException e) { 
      //e.printStackTrace();
    }
View Full Code Here

TOP

Related Classes of org.ccil.cowan.tagsoup.Parser

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.