Package org.htmlcleaner

Examples of org.htmlcleaner.HtmlCleaner


   * @param htmlText
   * @return
   */
  public static String convertHtml2PlainText( String htmlText ) {

    HtmlCleaner cleaner = new HtmlCleaner( CLEANER_PROPERTIES );

    try {
      TagNode rootNode = cleaner.clean( new StringReader( htmlText ) );
      return rootNode.getText().toString();
    }
    catch (IOException e) {
      log.error( e.getMessage(), e );
    }
View Full Code Here


      String charset = get.getRequestCharSet();

      //
      // Check for charset overrides in the HTML start page
      //
      HtmlCleaner cleaner = new HtmlCleaner();
      TagNode httpEquivNode = cleaner.clean(get.getResponseBodyAsStream()).findElementByAttValue("http-equiv", "content-type", true, false);
      if (httpEquivNode != null && httpEquivNode.hasAttribute("content")){
        String value = httpEquivNode.getAttributeByName("content");
        int offset = value.indexOf("charset=");
        if (offset >= -1){
            charset = value.substring(offset+8).toUpperCase();
View Full Code Here

    
    props.setTranslateSpecialEntities(true);
    props.setTransResCharsToNCR(true);
    props.setOmitComments(true);
    
    final TagNode tagNode = new HtmlCleaner(props).clean(xml);
    final String cleansData = new CompactXmlSerializer(props).getAsString(tagNode);
    
    final SAXParserFactory spf = SAXParserFactory.newInstance();
      spf.setNamespaceAware(true);
     
View Full Code Here

    public static Node markupAsDOM(WicketTester tester) throws ParserConfigurationException
    {
        CleanerProperties props = new CleanerProperties();
        props.setNamespacesAware(false);
       
        HtmlCleaner cleaner = new HtmlCleaner(props);
        return new DomSerializer(props, true).createDOM(cleaner.clean(document(tester)));
    }
View Full Code Here

        studentCandidacy.setSummaryFile(new CandidacySummaryFile(studentNumber + ".pdf", pdfByteArray, studentCandidacy));
    }

    private String clean(String dirtyHtml) {
        try {
            HtmlCleaner cleaner = new HtmlCleaner();

            TagNode root = cleaner.clean(dirtyHtml);

            return new SimpleHtmlSerializer(cleaner.getProperties()).getAsString(root);
        } catch (IOException e) {
            logger.error(e.getMessage(), e);
        }
        return StringUtils.EMPTY;
    }
View Full Code Here

            if (source == null)
                source = "";
        }
        String rawData = StringEscapeUtils.unescapeHtml4(source);
           
        HtmlCleaner cleaner = new HtmlCleaner();
        //CleanerProperties props = cleaner.getProperties();        
        //props.setXXX(...);
        TagNode node = cleaner.clean(rawData);
        TagNode[] myNodes;
       
        myNodes = node.getElementsByName("a", true);
        for (int i=0;i<myNodes.length;i++)
        {
View Full Code Here

  public static HashMap<String, String> extractMetas(String rawPage) throws IOException {

    final HashMap<String, String> m = new HashMap<String, String>();

    HtmlCleaner cleaner = new HtmlCleaner();
    //CleanerProperties props = cleaner.getProperties();    
    //props.setXXX(...);
    TagNode node = cleaner.clean(rawPage);
    TagNode[] myNodes;

    // <meta name="..." content="..." />
    // <meta http-equiv="refresh" content=
    myNodes = node.getElementsByName("meta", true);
View Full Code Here

  }

  public static String getBaseHref(String rawPage) throws IOException {
    if (rawPage==null || !StringUtils.containsIgnoreCase(rawPage, "<base")) return null;

    HtmlCleaner cleaner = new HtmlCleaner();
    //CleanerProperties props = cleaner.getProperties();    
    //props.setXXX(...);
    TagNode node = cleaner.clean(rawPage);
    TagNode[] myNodes = node.getElementsByName("base", true);
    if (myNodes==null || myNodes.length==0) return null;
    String href = myNodes[0].getAttributeByName("href");
    if (href!=null) return href;
    return null;
View Full Code Here

   */
  public static List<String> extractLinks(String rawPage, int depth) throws IOException {

    final ArrayList<String> list = new ArrayList<String>();

    HtmlCleaner cleaner = new HtmlCleaner();
    //CleanerProperties props = cleaner.getProperties();    
    //props.setXXX(...);
    TagNode node = cleaner.clean(rawPage);

    TagNode[] myNodes;

    if (depth==1 || depth==2) {
      // <a href=
View Full Code Here

            }
            headContents.append(inputLine + "\r\n");
        }

        String headContentsStr = headContents.toString();
        HtmlCleaner cleaner = new HtmlCleaner();
        // parse the string HTML
        TagNode pageData = cleaner.clean(headContentsStr);

    // read in the declared namespaces
    boolean hasOGspec = false;
    TagNode headElement = pageData.findElementByName("head", true);
    if (headElement.hasAttribute("prefix"))
View Full Code Here

TOP

Related Classes of org.htmlcleaner.HtmlCleaner

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.