Package net.htmlparser.jericho

Examples of net.htmlparser.jericho.Source


   * @param html
   * @return
   */
  public static String html2text(String html) {

    Renderer renderer = new Renderer(new Source(html)) {

      {
        setIncludeHyperlinkURLs(true);
      }

View Full Code Here


                }
                outputStream.close();
                inputStream.close();
                final byte[] responseBodyAsBytes = outputStream.toByteArray();
                String responseBody = new String(responseBodyAsBytes, "US-ASCII");
                Source source = new Source(responseBody);
                List list = source.getAllStartTags(HTMLElementName.META);
                for (Object aList : list) {
                    StartTag startTag = (StartTag) aList;
                    Attributes attributes = startTag.getAttributes();
                    final Attribute attribute = attributes.get("http-equiv");
                    if (attribute != null && attribute.getValue().equalsIgnoreCase("content-type")) {
View Full Code Here

            for (String s : searchEnginesList) {
                try {
                    URL url = new URL(s + URLEncoder.encode(site + "/sitemap.xml", "UTF-8"));
                    logger.debug("Calling " + url.toExternalForm());
                    URLConnection urlConnection = url.openConnection();
                    Source source = new Source(urlConnection);
                    logger.debug(source.getTextExtractor().toString());
                } catch (MalformedURLException e) {
                    logger.error(e.getMessage(), e);
                } catch (UnsupportedEncodingException e) {
                    logger.error(e.getMessage(), e);
                } catch (IOException e) {
View Full Code Here

                        }
                        m.put(field.getName(), renderer);
                    }
                } else if (field.isNodeType("jnt:htmlInput")) {
                    String html = field.getProperty("html").getString();
                    Source source = new Source(html);
                    List<StartTag> inputTags = source.getAllStartTags();
                    for (StartTag inputTag : inputTags) {
                        if ((inputTag.getName().equalsIgnoreCase("input") || inputTag.getName().equalsIgnoreCase("select") || inputTag.getName().equalsIgnoreCase("textarea"))
                                && inputTag.getAttributeValue("name") != null) {
                            m.put(inputTag.getAttributeValue("name"), null);
                        }
View Full Code Here

    }
    return output.toString();
  }

  private String cleanHTML(String input) {
    Renderer renderer=new Source(input).getRenderer();

    // We don't want it to be wrapped
    renderer.setMaxLineLength(Integer.MAX_VALUE);
    renderer.setIncludeHyperlinkURLs(false);
   
View Full Code Here

            //            request unless it can be confirmed by the user, since this might
            //            change the conditions under which the request was issued.

            httpclient.setRedirectStrategy(new LaxRedirectStrategy());
           
            Source source = new Source(EntityUtils.toString(entity));
            List <NameValuePair> nvps = new ArrayList <NameValuePair>();
            FormFields formFields = source.getFormFields();
           
            List<Element> forms = source.getAllElements(HTMLElementName.FORM);
            Assert.assertEquals("Only one form expected but got " + forms.size(), 1, forms.size());
            String postUrl = forms.get(0).getAttributeValue("action");
           
            Assert.assertNotNull("Form field 'wa' not found", formFields.get("wa"));
            Assert.assertNotNull("Form field 'wresult' not found", formFields.get("wresult"));
View Full Code Here

            //            request unless it can be confirmed by the user, since this might
            //            change the conditions under which the request was issued.

            httpclient.setRedirectStrategy(new LaxRedirectStrategy());
           
            Source source = new Source(EntityUtils.toString(entity));
            List <NameValuePair> nvps = new ArrayList <NameValuePair>();
            FormFields formFields = source.getFormFields();
           
            List<Element> forms = source.getAllElements(HTMLElementName.FORM);
            Assert.assertEquals("Only one form expected but got " + forms.size(), 1, forms.size());
            String postUrl = forms.get(0).getAttributeValue("action");
           
            Assert.assertNotNull("Form field 'wa' not found", formFields.get("wa"));
            Assert.assertNotNull("Form field 'wresult' not found", formFields.get("wresult"));
View Full Code Here

    public static String getIndented(String inHTML) {
        String formated_html = null;
        try {
            StringWriter writer = new StringWriter();
            new SourceFormatter(new Source(inHTML)).setIndentString("    ").setTidyTags(true).setCollapseWhiteSpace(true).writeTo(writer);
            formated_html = writer.toString();
        }
        catch (IOException e) {
            LOG.log(Level.SEVERE, null, e);
        }
View Full Code Here

          logger.error("해당 collection에 존재하지 않는 필드입니다. [{}]", fieldName);
          throw new IllegalStateException("해당 collection에 존재하지 않는 필드입니다. ["+fieldName+"]");
        }
       
        if (crescentCollectionField.isRemoveHtmlTag()) {
          Source source = new Source(value);
          value = source.getTextExtractor().toString();
        }
       
        IndexableField indexableField = luceneFieldBuilder.create(fieldsByName.get(fieldName), value);
        document.add(indexableField);
       
View Full Code Here

     * links are inserted after the text of the link.
     * @param html the HTML markup to convert
     * @return the plain-text rendering of the given markup
     */
    public static String htmlToText(String html) {
        Source htmlSource = new Source(html);
        Segment htmlSeg = new Segment(htmlSource, 0, htmlSource.length());
        Renderer htmlRend = new Renderer(htmlSeg);
        return htmlRend.toString();
    }
View Full Code Here

TOP

Related Classes of net.htmlparser.jericho.Source

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.