Package org.ccil.cowan.tagsoup

Examples of org.ccil.cowan.tagsoup.Parser


import org.xml.sax.SAXException;


public class PrincipalLattesParseUm {
  public static void main(String[] args){
    Parser parser = new Parser();
    FileInputStream fis = null;
    try {
      fis = new FileInputStream("C:\\Users\\Einstein\\workspace\\TG6_Parse\\src\\lattesExemplo.html");
      //fis = new FileInputStream("C:\\Users\\Einstein\\workspace\\TG6_Parse\\src\\lattesExemplo2.html");
      //fis = new FileInputStream("D:\\Lattes\\1234.html");
      //fis = new FileInputStream("D:\\Lattes\\3455.html");
      //fis = new FileInputStream("D:\\Lattes\\3454.html");
      //fis = new FileInputStream("D:\\Lattes\\3453.html");
      //fis = new FileInputStream("D:\\Lattes\\3550.html");
      //fis = new FileInputStream("D:\\Lattes\\3544.html");
      //fis = new FileInputStream("D:\\Lattes\\6.html");
      //fis = new FileInputStream("D:\\Lattes\\3442.html");//sem endereco
     
    } catch (FileNotFoundException e1) {
      e1.printStackTrace();
    }
    InputStream is = fis;
    LattesSAXHandler handler = new LattesSAXHandler();
    parser.setContentHandler(handler);
    InputSource input = new InputSource(is);
    try {
      parser.parse(input);
    } catch (IOException e) {
      e.printStackTrace();
    } catch (SAXException e) { 
      //e.printStackTrace();
    }
View Full Code Here


      String xhtml = HTML_START + xhtmlArtifact + XHTML_END;

      bytes = xhtml.getBytes("UTF-8");

      InputStream in = new ByteArrayInputStream(bytes);
      Parser parser = new Parser();
      HTMLSchema schema = new HTMLSchema();
      parser.setProperty(Parser.schemaProperty, schema);
      Writer w = new StringWriter();
      XMLWriter x = new XMLWriter(w);
      x.setOutputProperty(XMLWriter.METHOD, "xml");
      x.setOutputProperty(XMLWriter.OMIT_XML_DECLARATION, "yes");
      // x.setPrefix(schema.getURI(), "");

      parser.setFeature(Parser.namespacesFeature, false);
      parser.setFeature(Parser.defaultAttributesFeature, true);
      parser.setContentHandler(x);
      InputSource is = new InputSource(in);
      is.setEncoding("UTF-8");
      parser.parse(is);
      XhtmlValidator validator = new XhtmlValidator();
      xhtml = w.toString();
      bytes = (XHTML_START + xhtml).getBytes("UTF-8");
      in = new ByteArrayInputStream(bytes);
      validator.isValid(in);
View Full Code Here

  @Override
  public void run() {
   
    int iefCount = controle.getIefCount();
   
    Parser parser = new Parser();
    FileInputStream fis = null;
       
    while (iefCount!=-1){
        try {
          fis = new FileInputStream(myIefDir+iefCount+".txt");
                   
        InputStream is = fis;
        IefSAXHandler handler = new IefSAXHandler();
        parser.setContentHandler(handler);
        InputSource input = new InputSource(is);
       
        parser.parse(input);
        if(!handler.isErrorPageFromSite()) iefInfoTelaDAO.insert(handler.getCorrente());
        System.out.println("Thread "+numero+": Inserido ief "+iefCount);
       
      }catch (ApplicationException e) {
        e.printStackTrace();
View Full Code Here

  @Override
  public void run() {
   
    int iefCount = controle.getIefCount();
   
    Parser parser = new Parser();
    FileInputStream fis = null;
       
    while (iefCount!=-1){
        try {
          fis = new FileInputStream(myIefDir+iefCount+".txt");
                   
        InputStream is = fis;
        IefSAXHandler handler = new IefSAXHandler();
        parser.setContentHandler(handler);
        InputSource input = new InputSource(is);
       
        parser.parse(input);
        if(!handler.isErrorPageFromSite()) {
          IefInfoTela aux = new IefInfoTela();
          aux.setNum(handler.getCorrente().getNum());
          aux.setNome("");
          aux.setSigla("");
View Full Code Here

  @Override
  public void run() {
   
    List<String> resultList = controle.getUrlLattes();
   
    Parser parser = new Parser();
    FileInputStream fis = null;
       
    while (resultList!=null){
        try {
         
          File f = new File(myLattesDir+resultList.get(0)+".html");
          if(f.exists()){
            fis = new FileInputStream(myLattesDir+resultList.get(0)+".html");
                     
          InputStream is = fis;
          LattesSAXHandler handler = new LattesSAXHandler();
          parser.setContentHandler(handler);
          InputSource input = new InputSource(is);
         
          parser.parse(input);
          if(!handler.isErrorPageFromSite()) professorTelaDAO.insert(handler.getCorrente());
          System.out.println("Thread "+numero+": Inserido professor "+resultList.get(0));
          }
          else{
            System.out.println("Sem arquivo"+ myLattesDir+resultList.get(0)+".html");
View Full Code Here

        URL url = new URL(page);
        File file = new File(targetDir, ".manualCache-" + url.getFile().substring(1));
       
        try {
            HttpURLConnection con = (HttpURLConnection)url.openConnection();
            XMLReader parser = new Parser();
            parser.setFeature(Parser.namespacesFeature, false);
            parser.setFeature(Parser.namespacePrefixesFeature, false);
            parser.setProperty(Parser.schemaProperty, new org.ccil.cowan.tagsoup.HTMLSchema() {
                {
                    //problem with nested lists that the confluence {toc} macro creates
                    elementType("ul", M_LI, M_BLOCK | M_LI, 0);
                }
            });
           
            StringWriter w = new StringWriter();
            XMLWriter xmlWriter = new XMLWriter(w) {
                int inDiv = Integer.MAX_VALUE;
                int count;
                public void characters(char ch[], int start, int len)
                    throws SAXException {
                    if (inDiv <= count) {
                        super.characters(ch, start, len);
                    }
                }
                public void startElement(String uri, String localName, String qName, Attributes atts)
                    throws SAXException {
                    count++;
                    if ("div".equalsIgnoreCase(qName)
                        && "wiki-content maincontent".equalsIgnoreCase(atts.getValue("class"))) {
                        inDiv = count;
                    }
                    if (inDiv <= count) {
                        super.startElement(uri, localName, qName, atts);
                    }
                }
                public void endElement(String uri, String localName, String qName) throws SAXException {
                    if (inDiv <= count) {
                        super.endElement(uri, localName, qName);
                    }
                    count--;
                    if (inDiv > count) {
                        inDiv = Integer.MAX_VALUE;
                    }
                }
            };
            xmlWriter.setOutputProperty(XMLWriter.OMIT_XML_DECLARATION, "yes");
            xmlWriter.setOutputProperty(XMLWriter.METHOD, "html");
            parser.setContentHandler(xmlWriter);
            long date = con.getLastModified();
            parser.parse(new InputSource(new BufferedInputStream(con.getInputStream())));

           
            FileWriter writer = new FileWriter(file);
            writer.write(Long.toString(date));
            writer.close();
View Full Code Here

     *
     * @return
     * @throws CamelException
     */
    protected XMLReader createTagSoupParser() throws CamelException {
        XMLReader reader = new Parser();
        try {
            reader.setFeature(Parser.namespacesFeature, false);
            reader.setFeature(Parser.namespacePrefixesFeature, false);

            /*
             * set each parser feature that the user may have supplied.
             * http://www.saxproject.org/apidoc/org/xml/sax/package-summary.html
             * http://home.ccil.org/~cowan/XML/tagsoup/#properties
             */

            if (getParserFeatures() != null) {
                for (Entry<String, Boolean> e : getParserFeatures().entrySet()) {
                    reader.setFeature(e.getKey(), e.getValue());
                }
            }

            /*
             * Set each parser property that the user may have supplied. See
             * http://home.ccil.org/~cowan/XML/tagsoup/#properties
             */

            if (getParserPropeties() != null) {
                for (Entry<String, Object> e : getParserPropeties().entrySet()) {
                    reader.setProperty(e.getKey(), e.getValue());
                }
            }

            /*
             * default the schema to HTML
             */
            if (this.getParsingSchema() != null) {
                reader.setProperty(Parser.schemaProperty, getParsingSchema());
            }

        } catch (Exception e) {
            throw new IllegalArgumentException("Problem configuring the parser", e);
        }
View Full Code Here

     *
     * @return
     * @throws CamelException
     */
    protected XMLReader createTagSoupParser() throws CamelException {
        XMLReader reader = new Parser();
        try {
            reader.setFeature(Parser.namespacesFeature, false);
            reader.setFeature(Parser.namespacePrefixesFeature, false);

            /*
             * set each parser feature that the user may have supplied.
             * http://www.saxproject.org/apidoc/org/xml/sax/package-summary.html
             * http://home.ccil.org/~cowan/XML/tagsoup/#properties
             */

            if (getParserFeatures() != null) {
                for (Entry<String, Boolean> e : getParserFeatures().entrySet()) {
                    reader.setFeature(e.getKey(), e.getValue());
                }
            }

            /*
             * Set each parser property that the user may have supplied. See
             * http://home.ccil.org/~cowan/XML/tagsoup/#properties
             */

            if (getParserPropeties() != null) {
                for (Entry<String, Object> e : getParserPropeties().entrySet()) {
                    reader.setProperty(e.getKey(), e.getValue());
                }
            }

            /*
             * default the schema to HTML
             */
            if (this.getParsingSchema() != null) {
                reader.setProperty(Parser.schemaProperty, getParsingSchema());
            }

        } catch (Exception e) {
            throw new IllegalArgumentException("Problem configuring the parser", e);
        }
View Full Code Here

     *
     * @return
     * @throws CamelException
     */
    protected XMLReader createTagSoupParser() throws CamelException {
        XMLReader reader = new Parser();
        try {
            reader.setFeature(Parser.namespacesFeature, false);
            reader.setFeature(Parser.namespacePrefixesFeature, false);

            /*
             * set each parser feature that the user may have supplied.
             * http://www.saxproject.org/apidoc/org/xml/sax/package-summary.html
             * http://home.ccil.org/~cowan/XML/tagsoup/#properties
             */

            if (getParserFeatures() != null) {
                for (Entry<String, Boolean> e : getParserFeatures().entrySet()) {
                    reader.setFeature(e.getKey(), e.getValue());
                }
            }

            /*
             * Set each parser property that the user may have supplied. See
             * http://home.ccil.org/~cowan/XML/tagsoup/#properties
             */

            if (getParserPropeties() != null) {
                for (Entry<String, Object> e : getParserPropeties().entrySet()) {
                    reader.setProperty(e.getKey(), e.getValue());
                }
            }

            /*
             * default the schema to HTML
             */
            if (this.getParsingSchema() != null) {
                reader.setProperty(Parser.schemaProperty, getParsingSchema());
            }

        } catch (Exception e) {
            throw new IllegalArgumentException("Problem configuring the parser", e);
        }
View Full Code Here

     *
     * @return
     * @throws CamelException
     */
    protected XMLReader createTagSoupParser() throws CamelException {
        XMLReader reader = new Parser();
        try {
            reader.setFeature(Parser.namespacesFeature, false);
            reader.setFeature(Parser.namespacePrefixesFeature, false);

            /*
             * set each parser feature that the user may have supplied.
             * http://www.saxproject.org/apidoc/org/xml/sax/package-summary.html
             * http://home.ccil.org/~cowan/XML/tagsoup/#properties
             */

            if (getParserFeatures() != null) {
                for (Entry<String, Boolean> e : getParserFeatures().entrySet()) {
                    reader.setFeature(e.getKey(), e.getValue());
                }
            }

            /*
             * Set each parser property that the user may have supplied. See
             * http://home.ccil.org/~cowan/XML/tagsoup/#properties
             */

            if (getParserPropeties() != null) {
                for (Entry<String, Object> e : getParserPropeties().entrySet()) {
                    reader.setProperty(e.getKey(), e.getValue());
                }
            }

            /*
             * default the schema to HTML
             */
            if (this.getParsingSchema() != null) {
                reader.setProperty(Parser.schemaProperty, getParsingSchema());
            }

        } catch (Exception e) {
            throw new IllegalArgumentException("Problem configuring the parser", e);
        }
View Full Code Here

TOP

Related Classes of org.ccil.cowan.tagsoup.Parser

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.