Package org.htmlparser

Examples of org.htmlparser.Parser


  if (sHtml.length()==0) throw new FileNotFoundException(sHTMLPath);

    PatternMatcher oMatcher = new Perl5Matcher();
    PatternCompiler oCompiler = new Perl5Compiler();

    Parser parser = Parser.createParser(sHtml, null);

    StringBuffer oRetVal = new StringBuffer(sHtml.length());

    try {
      for (NodeIterator i = parser.elements(); i.hasMoreNodes(); ) {
        Node oNode = i.nextNode();
    oRetVal.append(parseNode(oNode, oCompiler, oMatcher));
      } // next
    }
    catch (ParserException pe) {
View Full Code Here


      String rt = createRightTag(htmlTag);
      List<String> t2hl = createText2Highlight(textList);   
     
      PrototypicalNodeFactory factory = new PrototypicalNodeFactory();
      factory.setTextPrototype(new HighlightTextNode("",t2hl,lt,rt));
      Parser htmlParser = new Parser();
      htmlParser.setNodeFactory(factory);
      Parser.createParser(content,"UTF-8");
      htmlParser.setInputHTML(content);
      NodeList nodeList = htmlParser.parse(null);
      content = nodeList.toHtml();
    }catch(Exception e){
      logger.error(e);
    }
    return content;
View Full Code Here

  try
  {
   //*-- Read the file into a string
   String htmlcontents = Files.readFromFile(new File(ifile)); htmlcontents = StringTools.filterChars(htmlcontents);
   if (htmlcontents.length() == 0) return;
   Parser parser = new Parser();

   //*-- Extract the title text
   logger.info("Extracting title from HTML file " + ifile);
   parser.setInputHTML(htmlcontents);

   NodeList nodelist1 = parser.parse(new TagNameFilter ("TITLE"));
   if (nodelist1.elementAt(0) != null)
   { String title = nodelist1.elementAt(0).toPlainTextString();
   doc.setTitle( cleanHTML(title) );
   }

   //*-- Extract information from the meta tags
   logger.info("Extracting METADATA from html file " + ifile);
   parser.setInputHTML(htmlcontents);
   NodeList nodelist2 = parser.parse(new TagNameFilter("META") );
   if (nodelist2 != null)
   {
    String metadata = ""; String author = "";
    for (int i = 0; i < nodelist2.size(); i++)
    { if (nodelist2.elementAt(i) == null) continue;
    String meta = nodelist2.elementAt(i).getText(); meta = StringTools.filterChars(meta);
    MetaTag mtag = new MetaTag(); mtag.setText("<" + meta + ">");
    String tagName = mtag.getMetaTagName();
    if (tagName == null) continue;
    if (tagName.equalsIgnoreCase("keywords"))
    { metadata += mtag.getMetaContent(); }
    if (tagName.equalsIgnoreCase("authors") ||
      tagName.equalsIgnoreCase("author") )
    { author += mtag.getMetaContent(); }
    } // end of for
    doc.setAuthor(author); doc.setMetadata(metadata);
   } // eod of if

   //*-- Populate the contents with the entire text from the web page
   logger.info("Extracting text from body of html file " + ifile);
   StringExtractor st = new StringExtractor(ifile);

   //*-- string extractor does not input form values -- handle separately
   parser.setInputHTML(htmlcontents); StringBuffer inputVal = new StringBuffer();
   NodeList nodelist3 = parser.parse(new TagNameFilter ("INPUT"));
   for (int i = 0; i < nodelist3.size(); i++)
   { InputTag itag = (InputTag) nodelist3.elementAt(i);
   if ((itag != null) && (itag.getAttribute("value") != null) )
   { inputVal.append(" "); inputVal.append( itag.getAttribute("value") ); }
   }
View Full Code Here

        if (log.isDebugEnabled()) {
            log.debug("Parsing html of: " + baseUrl);
        }

        Parser htmlParser = null;
        try {
            String contents = new String(html); // TODO - charset?
            htmlParser = new Parser();
            htmlParser.setInputHTML(contents);
        } catch (Exception e) {
            throw new HTMLParseException(e);
        }

        // Now parse the DOM tree
        try {
            // we start to iterate through the elements
            parseNodes(htmlParser.elements(), new URLPointer(baseUrl), urls);
            log.debug("End   : parseNodes");
        } catch (ParserException e) {
            throw new HTMLParseException(e);
        }
View Full Code Here

            throws HTMLParseException {
        if (log.isDebugEnabled()) {
            log.debug("Parsing html of: " + html);
        }

        Parser htmlParser = null;
        try {
            htmlParser = new Parser();
            htmlParser.setInputHTML(html);
        } catch (Exception e) {
            throw new HTMLParseException(e);
        }

        // Now parse the DOM tree
        try {
            // we start to iterate through the elements
            parseNodes(htmlParser.elements(), formEncodings, pageEncoding);
            log.debug("End   : parseNodes");
        } catch (ParserException e) {
            throw new HTMLParseException(e);
        }
    }
View Full Code Here

    SortedSet<HtmlConverterPSpan> visibleSpansSoFar = new TreeSet<HtmlConverterPSpan>();
    SortedSet<HtmlConverterPSpan> linebreaksFromHtmlTags = new TreeSet<HtmlConverterPSpan>();

    // process
    try {
      Parser parser = new Parser(documentText);
      NodeList list = parser.parse(null);
      HtmlConverterVisitor visitor = new HtmlConverterVisitor(newlineInducingTags);
      list.visitAllNodesWith(visitor);
      visibleSpansSoFar = visitor.getTextSpans();
      linebreaksFromHtmlTags = visitor.getLinebreaksFromHtmlTags();
    } catch (ParserException e) {
View Full Code Here

   * @return
   * @throws Exception
   */
  public String getHtmlTagsContent(String urlStr) throws Exception {

    Parser parser = new Parser();

    String html = Common.getInputHtmlGBK(urlStr);
    parser.setInputHTML(html);
    String filerStr = "li";
    NodeFilter filter = new TagNameFilter(filerStr);
    // 取得页面内容中标签为"dl"
    NodeList nodeList = parser.extractAllNodesThatMatch(filter);

    Tag tag = (Tag) nodeList.elementAt(0);

    return tag.toHtml();
  }
View Full Code Here

   *            属性应取的值
   * @return 标签列表
   */
  public static <T extends TagNode> List<T> parseTags(String html, final Class<T> tagType, final String attrbuteName, final String attrbutValue) {
    try {
      Parser parser = new Parser();
      parser.setInputHTML(html);
      NodeList list = parser.parse(new NodeFilter() {

        public boolean accept(Node node) {
          if (node.getClass() == tagType) {
            T tagNode = (T) node;
            if (attrbuteName == null) {
View Full Code Here

    protected String toString(Object o,
            ResourcePropertyMapping resourcePropertyMapping,
            MarshallingContext context) {
        String str = (String) o;
        Lexer l = new Lexer(str);
        Parser parser = new Parser(l);
        StringBean sb = new StringBean();

        try {
            parser.visitAllNodesWith(sb);
        } catch (ParserException e) {
            log.warn("RETURNING ORIG VAL: " + str);
            return str;
        }
        String ret = sb.getStrings();
View Full Code Here

            while ((s = br.readLine()) != null) {
                stringBuilder.append(s);
            }

            Lexer l = new Lexer(stringBuilder.toString());
            Parser parser = new Parser(l);
            StringBean sb = new StringBean();

            parser.visitAllNodesWith(sb);

            String ret = sb.getStrings();
            return new StringReader(ret);
        } catch (ParserException e) {
            log.warn("Conversion Exception: " + e);
View Full Code Here

TOP

Related Classes of org.htmlparser.Parser

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.