Package org.htmlparser

Examples of org.htmlparser.Parser


         }
         bos.close();

         String html = new String(bos.toByteArray());

         Parser parser = Parser.createParser(html, null);
         StringBean sb = new StringBean();

         // read links or not
         // sb.setLinks(true); //TODO make this configurable

         // extract text
         parser.visitAllNodesWith(sb);

         String text = sb.getStrings();
         refined_text = (text != null) ? text : ""; // delete(text);

      }
View Full Code Here


         }
         bos.close();

         String html = new String(bos.toByteArray());

         Parser parser = Parser.createParser(html, null);
         StringBean sb = new StringBean();

         // read links or not
         // sb.setLinks(true);

         // extract text
         parser.visitAllNodesWith(sb);

         String text = sb.getStrings();
         refined_text = (text != null) ? text : ""; // delete(text);

      }
View Full Code Here

            bos.write(buffer, 0, len);
         bos.close();

         String html = new String(bos.toByteArray());

         Parser parser = Parser.createParser(html, null);
         StringBean sb = new StringBean();

         // read links or not
         // sb.setLinks(true); //TODO make this configurable

         // extract text
         parser.visitAllNodesWith(sb);

         String text = sb.getStrings();
         refined_text = (text != null) ? text : ""; // delete(text);

      }
View Full Code Here

                                   String loginUrl, String contentType) throws IOException,
                                                           ParserException {
        logger.debug("Processing an HTML document");

        String stream = null;
        Parser parser = null;
        NodeVisitor visitor = null;

        // Retrieve HTML stream
        stream =
                readFully(new InputStreamReader(method.getResponseBodyAsStream()));

        // Protection
        if (stream != null) {
            logger.debug("Stream content size: " + stream.length());
            // Parse HTML stream to replace any links to include the path to the valve
            parser = Parser.createParser(stream, null);

            // Instantiate visitor
            visitor = new HTTPVisitor(url, loginUrl);
            // Parse nodes
            parser.visitAllNodesWith(visitor);

            // Get writer
            PrintWriter out = response.getWriter();

            // Push HTML content
View Full Code Here

        if (log.isDebugEnabled()) {
            log.debug("Parsing html of: " + baseUrl);
        }

        Parser htmlParser = null;
        try {
            String contents = new String(html,encoding);
            htmlParser = new Parser();
            htmlParser.setInputHTML(contents);
        } catch (Exception e) {
            throw new HTMLParseException(e);
        }

        // Now parse the DOM tree
        try {
            // we start to iterate through the elements
            parseNodes(htmlParser.elements(), new URLPointer(baseUrl), urls);
            log.debug("End   : parseNodes");
        } catch (ParserException e) {
            throw new HTMLParseException(e);
        }
View Full Code Here

      e.printStackTrace();
    }*/
    List<Map<String,String>> result = new ArrayList<Map<String,String>>();
    int count = 0;
    try {
      Parser parser = new Parser();
      parser.setInputHTML(inputHTML);
      parser.setEncoding("UTF-8");   
      NodeList nl = parser.parse(null);
      NodeList trs = nl.extractAllNodesThatMatch(new TagNameFilter("tr"),true);
      String regex = "([a-z]+) *= *\"?((?:(?! [a-z]+ *=|/? *>|\").)+)";
        Pattern p = Pattern.compile(regex, Pattern.DOTALL);
        for(int i=0;i<trs.size();i++) {
          NodeList nodes = trs.elementAt(i).getChildren();
View Full Code Here

       
        if (log.isDebugEnabled()) {
            log.debug("Parsing html of: " + baseUrl);
        }
       
        Parser htmlParser = null;
    try {
      String contents = new String(html);
      htmlParser = new Parser();
            htmlParser.setInputHTML(contents);
    } catch (Exception e) {
      throw new HTMLParseException(e);
    }

    // Now parse the DOM tree
    try {
      // we start to iterate through the elements
      parseNodes(htmlParser.elements(), new URLPointer(baseUrl), urls);
      log.debug("End   : parseNodes");
    } catch (ParserException e) {
      throw new HTMLParseException(e);
    }
View Full Code Here

    SortedSet<HtmlConverterPSpan> visibleSpansSoFar = new TreeSet<HtmlConverterPSpan>();
    SortedSet<HtmlConverterPSpan> linebreaksFromHtmlTags = new TreeSet<HtmlConverterPSpan>();

    // process
    try {
      Parser parser = new Parser(documentText);
      NodeList list = parser.parse(null);
      HtmlConverterVisitor visitor = new HtmlConverterVisitor(newlineInducingTags, skipWhitespaces);
      list.visitAllNodesWith(visitor);
      visibleSpansSoFar = visitor.getTextSpans();
      linebreaksFromHtmlTags = visitor.getLinebreaksFromHtmlTags();
    } catch (ParserException e) {
View Full Code Here

  public void process(JCas jcas) throws AnalysisEngineProcessException {
    String documentText = jcas.getDocumentText();
    List<AnnotationFS> annotations = new ArrayList<AnnotationFS>();
    List<AnnotationFS> annotationStack = new ArrayList<AnnotationFS>();
    try {
      Parser parser = new Parser(documentText);
      NodeList list = parser.parse(null);
      HtmlVisitor visitor = new HtmlVisitor(jcas, onlyContent);
      list.visitAllNodesWith(visitor);
      annotations = visitor.getAnnotations();
      annotationStack = visitor.getAnnotationStack();
    } catch (ParserException e) {
View Full Code Here

        sb.append(line + "\n");
      }

      String document = sb.toString();

      Parser parser = new Parser(document);
      NodeList list = parser.parse(null);
      HtmlDocumentationVisitor visitor = new HtmlDocumentationVisitor(document);
      list.visitAllNodesWith(visitor);
      map.putAll(visitor.getMap());
    } catch (Exception e) {
      RutaIdeUIPlugin.error(e);
View Full Code Here

TOP

Related Classes of org.htmlparser.Parser

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.