Package org.htmlcleaner

Examples of org.htmlcleaner.HtmlCleaner


  }
 
  public static Collection<String> findAllLinkHref(String html, String hostUrl) throws Exception{
    Collection<String> urls = new ArrayList<String>();
   
    HtmlCleaner cleaner = new HtmlCleaner();
    TagNode node = cleaner.clean(html);
    Object[] ns = node.evaluateXPath("//a[@href]");
    for (Object object : ns) {
      TagNode node2=(TagNode) object;
      String href = node2.getAttributeByName("href");
      if (href == null || href.trim().length() == 0)
View Full Code Here


   
    return map;
  }
 
  private List<Map<String, Object>> parseHtml(Page page) throws Exception{
    HtmlCleaner cleaner = new HtmlCleaner();
    cleaner.getProperties().setTreatUnknownTagsAsContent(true);
    String html = page.getContent();
    TagNode rootNode = cleaner.clean(html);
    fel.getContext().set("$page_content", html);
        final List<Field> fields = target.getModel().getField();
    String isModelArray = target.getModel().getIsArray();
    String modelXpath = target.getModel().getXpath();
    List<Map<String, Object>> list = new ArrayList<Map<String, Object>>();
View Full Code Here

    System.out.println(objs);
  }
 
  public static Object evalXpath(String html, String xpath, String attribute){
    List<Object> result = new ArrayList<Object>();
    HtmlCleaner cleaner = new HtmlCleaner();
    try {
      TagNode tagNode = cleaner.clean(html);
      Object[] nodeVals = tagNode.evaluateXPath(xpath);
      for (Object tag : nodeVals){
        TagNode _tag = (TagNode)tag;
        Object val = null;
        if (attribute != null)
View Full Code Here

        xml = CommonUtil.toXml((Node)node, keepHeader);
        return CommonUtil.toHTML(xml);
      }else if (node instanceof TagNode){
        StringWriter sw = new StringWriter();
        //TODO load this CleanerProperties from the configuration file
        CleanerProperties prop = new HtmlCleaner().getProperties();
        SimpleXmlSerializer ser = new SimpleXmlSerializer(prop);
        ser.write((TagNode)node, sw, "UTF-8");
          String html = sw.getBuffer().toString();
          if (keepHeader)
            xml = html;
View Full Code Here

    {
      for( int i=0 ; i<lElements.size() ; i++ )
      {
        try
        {
          HtmlCleaner cleaner = new HtmlCleaner();
          TagNode rootNode = cleaner.clean(new URL(url));
          TagNode tagElements[] = rootNode.getElementsByName( lElements.get(i).getTag(), true );
          for( int j=0 ; j<tagElements.length ; j++ )
            if( lElements.get(i).getClas().equalsIgnoreCase("*") || tagElements[j].getAttributeByName("class").equals(lElements.get(i).getClas()) )
              lElements.get(i).add( tagElements[j].getText().toString() );
        }
View Full Code Here

  public synchronized StringBuffer format(StringBuffer sb)
  {
    StringBuffer sbResult = new StringBuffer();
    try
    {
      HtmlCleaner cleaner = new HtmlCleaner();
     
      CleanerProperties props = cleaner.getProperties();
      props.setUseEmptyElementTags(false);
     
      TagNode node = cleaner.clean(sb.toString());
      Document myJDom = new JDomSerializer(props, true).createJDom(node);
      XMLOutputter outputter = new XMLOutputter(Format.getPrettyFormat());
      sbResult.append(outputter.outputString(myJDom));
    }
    catch (IOException e) {logger.error(e);}
View Full Code Here

  public HtmlUtils(String html) {
    this.html = html;

    // create an instance of HtmlCleaner
    cleaner = new HtmlCleaner();
    // take default cleaner properties
    CleanerProperties props = cleaner.getProperties();

    // customize cleaner's behaviour with property setters
    // props.setXXX(...);
View Full Code Here

 
  public HtmlUtils(URL url) throws IOException {
    this.url = url;

    // create an instance of HtmlCleaner
    cleaner = new HtmlCleaner();
    // take default cleaner properties
    CleanerProperties props = cleaner.getProperties();

    // customize cleaner's behaviour with property setters
    // props.setXXX(...);
View Full Code Here

  public static void printTagNodes( Object[] nodes) throws Exception
  {
    // serialize a node to a file, output stream, DOM, JDom...
    System.out.println( "\n\n\nINICIO");

    HtmlCleaner cleaner = new HtmlCleaner();
    CleanerProperties props = cleaner.getProperties();
    PrettyXmlSerializer serializer = new PrettyXmlSerializer( props);
 
    for( int i=0 ; i<nodes.length ; i++) {
      System.out.println( "######################## Va uno " + i);
View Full Code Here

  public static void main(String[] args) throws Exception
  {
      Translator.load( "es");
     
      // create an instance of HtmlCleaner
    HtmlCleaner cleaner = new HtmlCleaner();

    // take default cleaner properties
    CleanerProperties props = cleaner.getProperties();

    // customize cleaner's behaviour with property setters
//    props.setXXX(...);

    // Clean HTML taken from simple string, file, URL, input stream,
    // input source or reader. Result is root node of created
    // tree-like structure. Single cleaner instance may be safely used
    // multiple times.
//    TagNode node = cleaner.clean( new File("tests/accept.html"));

    TagNode node = cleaner.clean( new File("tests/rallypoint.htm"));


    Object[] myNodes;
       
View Full Code Here

TOP

Related Classes of org.htmlcleaner.HtmlCleaner

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.