Package org.ccil.cowan.tagsoup

Examples of org.ccil.cowan.tagsoup.Parser


        URL url = new URL(page);
        File file = new File(targetDir, ".manualCache-" + url.getFile().substring(1));
       
        try {
            HttpURLConnection con = (HttpURLConnection)url.openConnection();
            XMLReader parser = new Parser();
            parser.setFeature(Parser.namespacesFeature, false);
            parser.setFeature(Parser.namespacePrefixesFeature, false);
            // Install a customised schema on the parser: an anonymous HTMLSchema subclass
            // whose instance initializer re-registers the "ul" element type.
            parser.setProperty(Parser.schemaProperty, new org.ccil.cowan.tagsoup.HTMLSchema() {
                {
                    // Works around a problem with the nested lists that the Confluence {toc} macro creates.
                    // NOTE(review): presumably the arguments are (name, content model, member-of, flags),
                    // so adding M_LI to the member-of mask would allow a <ul> directly inside another
                    // <ul> -- confirm against the TagSoup Schema.elementType documentation.
                    elementType("ul", M_LI, M_BLOCK | M_LI, 0);
                }
            });
           
            StringWriter w = new StringWriter();
            XMLWriter xmlWriter = new XMLWriter(w) {
                int inDiv = Integer.MAX_VALUE;
                int count;
                public void characters(char ch[], int start, int len)
                    throws SAXException {
                    if (inDiv <= count) {
                        super.characters(ch, start, len);
                    }
                }
                public void startElement(String uri, String localName, String qName, Attributes atts)
                    throws SAXException {
                    count++;
                    if ("div".equalsIgnoreCase(qName)
                        && "wiki-content maincontent".equalsIgnoreCase(atts.getValue("class"))) {
                        inDiv = count;
                    }
                    if (inDiv <= count) {
                        super.startElement(uri, localName, qName, atts);
                    }
                }
                public void endElement(String uri, String localName, String qName) throws SAXException {
                    if (inDiv <= count) {
                        super.endElement(uri, localName, qName);
                    }
                    count--;
                    if (inDiv > count) {
                        inDiv = Integer.MAX_VALUE;
                    }
                }
            };
            xmlWriter.setOutputProperty(XMLWriter.OMIT_XML_DECLARATION, "yes");
            xmlWriter.setOutputProperty(XMLWriter.METHOD, "html");
            parser.setContentHandler(xmlWriter);
            long date = con.getLastModified();
            parser.parse(new InputSource(new BufferedInputStream(con.getInputStream())));

           
            FileWriter writer = new FileWriter(file);
            writer.write(Long.toString(date));
            writer.close();
View Full Code Here


    }

    private XdmNode tagSoup(String text) {
        StringReader inputStream = new StringReader(text);
        InputSource source = new InputSource(inputStream);
        Parser parser = new Parser();
        parser.setEntityResolver(runtime.getResolver());
        SAXSource saxSource = new SAXSource(parser, source);
        DocumentBuilder builder = runtime.getProcessor().newDocumentBuilder();
        try {
            XdmNode doc = builder.build(saxSource);
            return doc;
View Full Code Here

          parser.setEntityResolver(new XMLEntityResolverDefaultHandler(validator));
          parser.parse(xml);
          return parser.getDocument();*/
        }
       
        XMLReader reader = new Parser();
            reader.setFeature(Parser.namespacesFeature, true);
            reader.setFeature(Parser.namespacePrefixesFeature, true);
       
        try {
            Transformer transformer = TransformerFactory.newInstance().newTransformer();
           
            DOMResult result = new DOMResult();
View Full Code Here

 
    public ConvertHTML(CommandBuilder builder, Config config, Command parent, Command child, MorphlineContext context) throws SAXNotRecognizedException, SAXNotSupportedException {
      super(builder, config, parent, child, context);
      this.charset = getConfigs().getCharset(config, "charset", null);
      this.omitXMLDeclaration = getConfigs().getBoolean(config, "omitXMLDeclaration", false);     
      this.xmlReader = new Parser(); // no reuse?
      xmlReader.setProperty(Parser.schemaProperty, htmlSchema);
      xmlReader.setFeature(Parser.CDATAElementsFeature, getConfigs().getBoolean(config, "noCDATA", false));
      xmlReader.setFeature(Parser.namespacesFeature, !getConfigs().getBoolean(config, "noNamespaces", true));
      xmlReader.setFeature(Parser.ignoreBogonsFeature, getConfigs().getBoolean(config, "noBogons", false)); // also see TIKA-599
      xmlReader.setFeature(Parser.bogonsEmptyFeature, getConfigs().getBoolean(config, "emptyBogons", false));
View Full Code Here

 
    public ConvertHTML(CommandBuilder builder, Config config, Command parent, Command child, MorphlineContext context) throws SAXNotRecognizedException, SAXNotSupportedException {
      super(builder, config, parent, child, context);
      this.charset = getConfigs().getCharset(config, "charset", null);
      this.omitXMLDeclaration = getConfigs().getBoolean(config, "omitXMLDeclaration", false);     
      this.xmlReader = new Parser(); // no reuse?
      xmlReader.setProperty(Parser.schemaProperty, htmlSchema);
      xmlReader.setFeature(Parser.CDATAElementsFeature, getConfigs().getBoolean(config, "noCDATA", false));
      xmlReader.setFeature(Parser.namespacesFeature, !getConfigs().getBoolean(config, "noNamespaces", true));
      xmlReader.setFeature(Parser.ignoreBogonsFeature, getConfigs().getBoolean(config, "noBogons", false)); // also see TIKA-599
      xmlReader.setFeature(Parser.bogonsEmptyFeature, getConfigs().getBoolean(config, "emptyBogons", false));
View Full Code Here

import fueltrack.server.motormouth.model.FuelPriceRecord;

public class HTMLParser {
  public static List<FuelPriceRecord> parseHTML(Reader htmlInputReader) throws IOException, SAXException, TransformerException, TransformerFactoryConfigurationError {
    SimpleDateFormat format = new SimpleDateFormat("dd/MM/yy h:mm a");
    XMLReader reader = new Parser();
   
    Source s = new SAXSource(reader, new InputSource(htmlInputReader));
    DOMResult r = new DOMResult();
   
    TransformerFactory.newInstance().newTransformer(new StreamSource(HTMLParser.class.getClassLoader().getResourceAsStream("fueltrack/server/motormouth/TransformHTML.xsl"))).transform(s, r);
View Full Code Here

      return null;
    }
   
    final StringBuilder builder = new StringBuilder();
   
    XMLReader reader = new Parser();
   
    reader.setContentHandler(new DefaultHandler() {
      @Override
      public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
        super.startElement(uri, localName, qName, attributes);
       
        if ("input".equalsIgnoreCase(localName)) {
          if (tagName.equalsIgnoreCase(attributes.getValue("name"))) {
            builder.append(attributes.getValue("value"));
            throw new SAXTerminateProcessing();
          }
        }
      }
    });
   
    try {
      reader.parse(new InputSource(htmlInputReader));
    } catch (SAXTerminateProcessing e) {}
   
    return builder.toString();
  }
View Full Code Here

  private Parser getParser() throws SAXNotSupportedException, SAXNotRecognizedException
    {
    if( parser != null )
      return parser;

    parser = new Parser();
    parser.setProperty( Parser.schemaProperty, getSchema() );

    if( features != null )
      {
      for( Map.Entry<String, Boolean> entry : features.entrySet() )
View Full Code Here

                    builderFactory.setIgnoringElementContentWhitespace(true);
                    DocumentBuilder builder = builderFactory.newDocumentBuilder();
                    document = builder.parse(is);
                } else if (contentType.matches("text/html.*")) {
                    DOMHandler domHandler = new DOMHandler();
                    Parser parser = new Parser();
                    parser.setContentHandler(domHandler);
                    parser.parse(new InputSource(is));
                    document = domHandler.getDocument();
                }
                if (document != null) {
                    document.getDocumentElement().normalize();
                    rootElement = document.getDocumentElement();
View Full Code Here

TOP

Related Classes of org.ccil.cowan.tagsoup.Parser

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.