Examples of org.htmlparser.Parser

org.htmlparser.Parser

ahoo.com",new DefaultHTMLParserFeedback()); // In this example, we are registering all the common scanners parser.registerScanners(); for (NodeIterator i = parser.elements();i.hasMoreNodes();) { Node node = i.nextNode(); node.print(); } Below is some sample code to parse Yahoo.com and print only the text information. This scanning will run faster, as there are no scanners registered here.

 Parser parser = new Parser("http://www.yahoo.com",new DefaultHTMLParserFeedback()); // In this example, none of the scanners need to be registered // as a string node is not a tag to be scanned for. for (NodeIterator i = parser.elements();i.hasMoreNodes();) { Node node = i.nextNode(); if (node instanceof StringNode) { StringNode stringNode = (StringNode)node; System.out.println(stringNode.getText()); } }

The above snippet will print out only the text contents in the html document.
Here's another snippet that will only print out the link urls in a document. This is an example of adding a link scanner.

 Parser parser = new Parser("http://www.yahoo.com",new DefaultHTMLParserFeedback()); parser.addScanner(new LinkScanner("-l")); for (NodeIterator i = parser.elements();i.hasMoreNodes();) { Node node = i.nextNode(); if (node instanceof LinkTag) { LinkTag linkTag = (LinkTag)node; System.out.println(linkTag.getLink()); } }

@see Parser#elements()

  if (sHtml.length()==0) throw new FileNotFoundException(sHTMLPath);


    PatternMatcher oMatcher = new Perl5Matcher();
    PatternCompiler oCompiler = new Perl5Compiler();


    Parser parser = Parser.createParser(sHtml, null);


    StringBuffer oRetVal = new StringBuffer(sHtml.length());


    try {
      for (NodeIterator i = parser.elements(); i.hasMoreNodes(); ) {
        Node oNode = i.nextNode();
    oRetVal.append(parseNode(oNode, oCompiler, oMatcher));
      } // next
    }
    catch (ParserException pe) {

View Full Code Here

      String rt = createRightTag(htmlTag);
      List<String> t2hl = createText2Highlight(textList);    
      
      PrototypicalNodeFactory factory = new PrototypicalNodeFactory();
      factory.setTextPrototype(new HighlightTextNode("",t2hl,lt,rt));
      Parser htmlParser = new Parser();
      htmlParser.setNodeFactory(factory);
      Parser.createParser(content,"UTF-8");
      htmlParser.setInputHTML(content);
      NodeList nodeList = htmlParser.parse(null);
      content = nodeList.toHtml();
    }catch(Exception e){
      logger.error(e);
    }
    return content;

View Full Code Here

  try 
  {
   //*-- Read the file into a string
   String htmlcontents = Files.readFromFile(new File(ifile)); htmlcontents = StringTools.filterChars(htmlcontents);
   if (htmlcontents.length() == 0) return;
   Parser parser = new Parser(); 


   //*-- Extract the title text
   logger.info("Extracting title from HTML file " + ifile);
   parser.setInputHTML(htmlcontents);


   NodeList nodelist1 = parser.parse(new TagNameFilter ("TITLE")); 
   if (nodelist1.elementAt(0) != null)
   { String title = nodelist1.elementAt(0).toPlainTextString();
   doc.setTitle( cleanHTML(title) );
   }


   //*-- Extract information from the meta tags
   logger.info("Extracting METADATA from html file " + ifile);
   parser.setInputHTML(htmlcontents);
   NodeList nodelist2 = parser.parse(new TagNameFilter("META") );
   if (nodelist2 != null)
   {
    String metadata = ""; String author = "";
    for (int i = 0; i < nodelist2.size(); i++)
    { if (nodelist2.elementAt(i) == null) continue;
    String meta = nodelist2.elementAt(i).getText(); meta = StringTools.filterChars(meta);
    MetaTag mtag = new MetaTag(); mtag.setText("<" + meta + ">");
    String tagName = mtag.getMetaTagName();
    if (tagName == null) continue;
    if (tagName.equalsIgnoreCase("keywords")) 
    { metadata += mtag.getMetaContent(); }
    if (tagName.equalsIgnoreCase("authors") || 
      tagName.equalsIgnoreCase("author") )
    { author += mtag.getMetaContent(); }
    } // end of for
    doc.setAuthor(author); doc.setMetadata(metadata);
   } // eod of if


   //*-- Populate the contents of the contents with the entire text from the  web page
   logger.info("Extracting text from body of html file " + ifile);
   StringExtractor st = new StringExtractor(ifile); 


   //*-- string extractor does not input form values -- handle separately
   parser.setInputHTML(htmlcontents); StringBuffer inputVal = new StringBuffer();
   NodeList nodelist3 = parser.parse(new TagNameFilter ("INPUT"));
   for (int i = 0; i < nodelist3.size(); i++)
   { InputTag itag = (InputTag) nodelist3.elementAt(i); 
   if ((itag != null) && (itag.getAttribute("value") != null) )
   { inputVal.append(" "); inputVal.append( itag.getAttribute("value") ); }
   }

View Full Code Here


        if (log.isDebugEnabled()) {
            log.debug("Parsing html of: " + baseUrl);
        }


        Parser htmlParser = null;
        try {
            String contents = new String(html); // TODO - charset?
            htmlParser = new Parser();
            htmlParser.setInputHTML(contents);
        } catch (Exception e) {
            throw new HTMLParseException(e);
        }


        // Now parse the DOM tree
        try {
            // we start to iterate through the elements
            parseNodes(htmlParser.elements(), new URLPointer(baseUrl), urls);
            log.debug("End   : parseNodes");
        } catch (ParserException e) {
            throw new HTMLParseException(e);
        }

View Full Code Here

            throws HTMLParseException {
        if (log.isDebugEnabled()) {
            log.debug("Parsing html of: " + html);
        }


        Parser htmlParser = null;
        try {
            htmlParser = new Parser();
            htmlParser.setInputHTML(html);
        } catch (Exception e) {
            throw new HTMLParseException(e);
        }


        // Now parse the DOM tree
        try {
            // we start to iterate through the elements
            parseNodes(htmlParser.elements(), formEncodings, pageEncoding);
            log.debug("End   : parseNodes");
        } catch (ParserException e) {
            throw new HTMLParseException(e);
        }
    }

View Full Code Here

    SortedSet<HtmlConverterPSpan> visibleSpansSoFar = new TreeSet<HtmlConverterPSpan>();
    SortedSet<HtmlConverterPSpan> linebreaksFromHtmlTags = new TreeSet<HtmlConverterPSpan>();


    // process
    try {
      Parser parser = new Parser(documentText);
      NodeList list = parser.parse(null);
      HtmlConverterVisitor visitor = new HtmlConverterVisitor(newlineInducingTags);
      list.visitAllNodesWith(visitor);
      visibleSpansSoFar = visitor.getTextSpans();
      linebreaksFromHtmlTags = visitor.getLinebreaksFromHtmlTags();
    } catch (ParserException e) {

View Full Code Here

   * @return
   * @throws Exception
   */
  public String getHtmlTagsContent(String urlStr) throws Exception {


    Parser parser = new Parser();


    String html = Common.getInputHtmlGBK(urlStr);
    parser.setInputHTML(html);
    String filerStr = "li";
    NodeFilter filter = new TagNameFilter(filerStr);
    // 取得页面内容中标签为"dl"
    NodeList nodeList = parser.extractAllNodesThatMatch(filter);


    Tag tag = (Tag) nodeList.elementAt(0);


    return tag.toHtml();
  }

View Full Code Here

   *            属性应取的值
   * @return 标签列表
   */
  public static <T extends TagNode> List<T> parseTags(String html, final Class<T> tagType, final String attrbuteName, final String attrbutValue) {
    try {
      Parser parser = new Parser();
      parser.setInputHTML(html);
      NodeList list = parser.parse(new NodeFilter() {


        public boolean accept(Node node) {
          if (node.getClass() == tagType) {
            T tagNode = (T) node;
            if (attrbuteName == null) {

View Full Code Here

    protected String toString(Object o,
            ResourcePropertyMapping resourcePropertyMapping,
            MarshallingContext context) {
        String str = (String) o;
        Lexer l = new Lexer(str);
        Parser parser = new Parser(l);
        StringBean sb = new StringBean();


        try {
            parser.visitAllNodesWith(sb);
        } catch (ParserException e) {
            log.warn("RETURNING ORIG VAL: " + str);
            return str;
        }
        String ret = sb.getStrings();

View Full Code Here

            while ((s = br.readLine()) != null) {
                stringBuilder.append(s);
            }


            Lexer l = new Lexer(stringBuilder.toString());
            Parser parser = new Parser(l);
            StringBean sb = new StringBean();


            parser.visitAllNodesWith(sb);


            String ret = sb.getStrings();
            return new StringReader(ret);
        } catch (ParserException e) {
            log.warn("Conversion Exception: " + e);

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of org.htmlparser.Parser

cn.edu.pku.dr.requirement.elicitation.tools.HtmlTransformer

com.gnizr.core.util.FormatUtil

com.knowgate.hipermail.DBMimeMessage

com.knowgate.hipermail.HtmlMimeBodyPart

com.lanyuan.util.HttpClientUtils

com.vgo.movie.thread.DetailFilmThread

com.waxayaz.TomcatMI.core.utils.repoManager.TomcatRepositoryManager

com.wordpress.util.StringUtil

fitnesse.fixtures.PageDriver

fitnesse.slim.converters.MapEditor

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle. Contact coftware#gmail.com.