Package org.htmlcleaner

Examples of org.htmlcleaner.HtmlCleaner


    props.setTranslateSpecialEntities(true);
    props.setTransResCharsToNCR(true);
    props.setTransSpecialEntitiesToNCR(true);
    props.setOmitComments(true);
     
    HtmlCleaner cleaner = new HtmlCleaner(props);

    TagNode node = cleaner.clean(new URL(lien));
   
    for (Object o : node.evaluateXPath("//div[@id='retour_accueil']/a/img"))
    {
      //System.out.println(((TagNode)(o)).getAllChildren());
      lien_logo = ((TagNode)(o)).getAttributeByName("src");
View Full Code Here


    props.setTranslateSpecialEntities(true);
    props.setTransResCharsToNCR(true);
    props.setTransSpecialEntitiesToNCR(true);
    props.setOmitComments(true);
     
    HtmlCleaner cleaner = new HtmlCleaner(props);
     
    int i=0;
    TagNode node = cleaner.clean(new URL(lien));
   
    for (Object o : node.evaluateXPath("//div[@class='encadre_fiche firstencadre']/div/div/a"))
    {
      lien_site_spe = ((TagNode)(o)).getAttributeByName("href");
      //System.out.println("lien spe "+lien_site_spe);
View Full Code Here

  }
 
 
  public boolean recherche(String mot, String lien) throws MalformedURLException, IOException, XPatherException
  {
    HtmlCleaner cleaner = new HtmlCleaner();
    TagNode node = cleaner.clean(new URL(lien));
    boolean bool = false;

    String[] decoupage = mot.split(" ");
    if(decoupage.length == 1)
    {
View Full Code Here

    }
    return new Template(mapping);
  }
 
  private static HtmlCleaner getCleaner() {
    HtmlCleaner cleaner = new HtmlCleaner();
    cleaner.getProperties().setTranslateSpecialEntities(false);
    cleaner.getProperties().setRecognizeUnicodeChars(false);
    cleaner.getProperties().setUseEmptyElementTags(false);
    return cleaner;
  }
View Full Code Here

    return getError(response);
  }
 
  private static String getError(String response) throws IOException{
    String error = null;
    HtmlCleaner cleaner = new HtmlCleaner();
    TagNode html = cleaner.clean(response);
    TagNode errortag = html.findElementByAttValue("id", "error", true, true);
    if (errortag != null){
      error = errortag.getAttributeByName("title");
    }
    return error;
View Full Code Here

    private CleanerProperties parserProps;
    private DomSerializer2 domCreator;


    public HtmlParser() {
        this.htmlToXmlParser = new HtmlCleaner();
        this.parserProps = this.htmlToXmlParser.getProperties();
        this.parserProps.setRecognizeUnicodeChars(true);
        this.parserProps.setUseEmptyElementTags(true);
        this.parserProps.setAdvancedXmlEscape(true);
        this.parserProps.setTranslateSpecialEntities(true);
View Full Code Here

      String charset = get.getRequestCharSet();

      //
      // Check for charset overrides in the HTML start page
      //
      HtmlCleaner cleaner = new HtmlCleaner();
      TagNode httpEquivNode = cleaner.clean(get.getResponseBodyAsStream()).findElementByAttValue("http-equiv", "content-type", true, false);
      if (httpEquivNode != null && httpEquivNode.hasAttribute("content")){
        String value = httpEquivNode.getAttributeByName("content");
        int offset = value.indexOf("charset=");
        if (offset >= -1){
            charset = value.substring(offset+8).toUpperCase();
View Full Code Here

    }

    @Override
    public String select(String text) {
        try {
            HtmlCleaner htmlCleaner = new HtmlCleaner();
            TagNode tagNode = htmlCleaner.clean(text);
            Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode);
            Object result;
            try {
                result = xPathExpression.evaluate(document, XPathConstants.NODESET);
            } catch (XPathExpressionException e) {
View Full Code Here

    @Override
    public List<String> selectList(String text) {
        List<String> results = new ArrayList<String>();
        try {
            HtmlCleaner htmlCleaner = new HtmlCleaner();
            TagNode tagNode = htmlCleaner.clean(text);
            Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode);
            Object result;
            try {
                result = xPathExpression.evaluate(document, XPathConstants.NODESET);
            } catch (XPathExpressionException e) {
View Full Code Here

    @Ignore("take long time")
    @Test
    public void parserPerformanceTest() throws XPatherException {
        System.out.println(html.length());

        HtmlCleaner htmlCleaner = new HtmlCleaner();
        TagNode tagNode = htmlCleaner.clean(html);
        Document document = Jsoup.parse(html);

        long time =System.currentTimeMillis();
        for (int i = 0; i < 2000; i++) {
            htmlCleaner.clean(html);
        }
        System.out.println(System.currentTimeMillis()-time);

        time =System.currentTimeMillis();
        for (int i = 0; i < 2000; i++) {
            tagNode.evaluateXPath("//a");
        }
        System.out.println(System.currentTimeMillis()-time);

        System.out.println("=============");

        time =System.currentTimeMillis();
        for (int i = 0; i < 2000; i++) {
            Jsoup.parse(html);
        }
        System.out.println(System.currentTimeMillis()-time);

        time =System.currentTimeMillis();
        for (int i = 0; i < 2000; i++) {
            document.select("a");
        }
        System.out.println(System.currentTimeMillis()-time);

        System.out.println("=============");

        time =System.currentTimeMillis();
        for (int i = 0; i < 2000; i++) {
            htmlCleaner.clean(html);
        }
        System.out.println(System.currentTimeMillis()-time);

        time =System.currentTimeMillis();
        for (int i = 0; i < 2000; i++) {
View Full Code Here

TOP

Related Classes of org.htmlcleaner.HtmlCleaner

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.