Package org.htmlcleaner

Examples of org.htmlcleaner.HtmlCleaner.clean()


      //
      // Check for charset overrides in the HTML start page
      //
      HtmlCleaner cleaner = new HtmlCleaner();
      TagNode httpEquivNode = cleaner.clean(get.getResponseBodyAsStream()).findElementByAttValue("http-equiv", "content-type", true, false);
      if (httpEquivNode != null && httpEquivNode.hasAttribute("content")){
        String value = httpEquivNode.getAttributeByName("content");
        int offset = value.indexOf("charset=");
        if (offset >= -1){
            charset = value.substring(offset+8).toUpperCase();
View Full Code Here


    @Override
    public String select(String text) {
        try {
            HtmlCleaner htmlCleaner = new HtmlCleaner();
            TagNode tagNode = htmlCleaner.clean(text);
            Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode);
            Object result;
            try {
                result = xPathExpression.evaluate(document, XPathConstants.NODESET);
            } catch (XPathExpressionException e) {
View Full Code Here

    @Override
    public List<String> selectList(String text) {
        List<String> results = new ArrayList<String>();
        try {
            HtmlCleaner htmlCleaner = new HtmlCleaner();
            TagNode tagNode = htmlCleaner.clean(text);
            Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode);
            Object result;
            try {
                result = xPathExpression.evaluate(document, XPathConstants.NODESET);
            } catch (XPathExpressionException e) {
View Full Code Here

    @Test
    public void parserPerformanceTest() throws XPatherException {
        System.out.println(html.length());

        HtmlCleaner htmlCleaner = new HtmlCleaner();
        TagNode tagNode = htmlCleaner.clean(html);
        Document document = Jsoup.parse(html);

        long time =System.currentTimeMillis();
        for (int i = 0; i < 2000; i++) {
            htmlCleaner.clean(html);
View Full Code Here

        TagNode tagNode = htmlCleaner.clean(html);
        Document document = Jsoup.parse(html);

        long time =System.currentTimeMillis();
        for (int i = 0; i < 2000; i++) {
            htmlCleaner.clean(html);
        }
        System.out.println(System.currentTimeMillis()-time);

        time =System.currentTimeMillis();
        for (int i = 0; i < 2000; i++) {
View Full Code Here

        System.out.println("=============");

        time =System.currentTimeMillis();
        for (int i = 0; i < 2000; i++) {
            htmlCleaner.clean(html);
        }
        System.out.println(System.currentTimeMillis()-time);

        time =System.currentTimeMillis();
        for (int i = 0; i < 2000; i++) {
View Full Code Here

  public static final String DEFAULT_HTML_INPUT_ENCODING = "Windows-1251";
 
  public static List<TOCReference> parseHhc(InputStream hhcFile, Resources resources) throws IOException, ParserConfigurationException,  XPathExpressionException {
    HtmlCleaner htmlCleaner = new HtmlCleaner();
    CleanerProperties props = htmlCleaner.getProperties();
    TagNode node = htmlCleaner.clean(hhcFile);
    Document hhcDocument = new DomSerializer(props).createDOM(node);
    XPath xpath = XPathFactory.newInstance().newXPath();
    Node ulNode = (Node) xpath.evaluate("body/ul", hhcDocument
        .getDocumentElement(), XPathConstants.NODE);
    List<TOCReference> sections = processUlNode(ulNode, resources);
View Full Code Here

      if (statusCode != 200) {
        throw new RuntimeException("Failed to get page: " + statusCode);
      }
      String response = method.getResponseBodyAsString();
      HtmlCleaner html = new HtmlCleaner();
      TagNode content = html.clean(response).findElementByAttValue("id", "content", true, false);
      Object[] rows = content.evaluateXPath("table/tbody/tr[@id]");
      for (Object row: rows) {
        if (!(row instanceof TagNode)) {
          continue;
        }
View Full Code Here

      //
      // Check for charset overrides in the HTML start page
      //
      HtmlCleaner cleaner = new HtmlCleaner();
      TagNode httpEquivNode = cleaner.clean(get.getResponseBodyAsStream()).findElementByAttValue("http-equiv", "content-type", true, false);
      if (httpEquivNode != null && httpEquivNode.hasAttribute("content")){
        String value = httpEquivNode.getAttributeByName("content");
        int offset = value.indexOf("charset=");
        if (offset >= -1){
            charset = value.substring(offset+8).toUpperCase();
View Full Code Here

    props.setAllowHtmlInsideAttributes( false );
    props.setPruneTags( "style, script" );

    HtmlCleaner cleaner = new HtmlCleaner( props );
    try {
      TagNode node = cleaner.clean( new StringReader( content ) );

      TagNode[] nodes = node.getElementsByName( "a", true );
      for (TagNode tagnode : nodes) {
        tagnode.removeAttribute( "target" );
        tagnode.addAttribute( "target", "_blank" );
View Full Code Here

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.