Package net.htmlparser.jericho

Examples of net.htmlparser.jericho.Source


            //            request unless it can be confirmed by the user, since this might
            //            change the conditions under which the request was issued.

            httpclient.setRedirectStrategy(new LaxRedirectStrategy());
           
            Source source = new Source(EntityUtils.toString(entity));
            List <NameValuePair> nvps = new ArrayList <NameValuePair>();
            FormFields formFields = source.getFormFields();
           
            List<Element> forms = source.getAllElements(HTMLElementName.FORM);
            Assert.assertEquals("Only one form expected but got " + forms.size(), 1, forms.size());
            String postUrl = forms.get(0).getAttributeValue("action");
           
            Assert.assertNotNull("Form field 'wa' not found", formFields.get("wa"));
            Assert.assertNotNull("Form field 'wresult' not found", formFields.get("wresult"));
View Full Code Here


            //            request unless it can be confirmed by the user, since this might
            //            change the conditions under which the request was issued.

            httpclient.setRedirectStrategy(new LaxRedirectStrategy());
           
            Source source = new Source(EntityUtils.toString(entity));
            List <NameValuePair> nvps = new ArrayList <NameValuePair>();
            FormFields formFields = source.getFormFields();
           
            List<Element> forms = source.getAllElements(HTMLElementName.FORM);
            Assert.assertEquals("Only one form expected but got " + forms.size(), 1, forms.size());
            String postUrl = forms.get(0).getAttributeValue("action");
           
            Assert.assertNotNull("Form field 'wa' not found", formFields.get("wa"));
            Assert.assertNotNull("Form field 'wresult' not found", formFields.get("wresult"));
View Full Code Here

        return response;
    }

    public List<DBpediaResource> parse(String html) throws AnnotationException {

        Source parser;
        String wikiUrl;
        String surfaceForm;
        List<DBpediaResource> entities = new ArrayList<DBpediaResource>();

        try {
            InputStream is = new ByteArrayInputStream(html.getBytes("UTF-8"));
            parser = new Source(is);
            parser.fullSequentialParse();
            parser.getElementById("div");
        } catch (IOException e) {
            throw new AnnotationException("Error reading output from WikiMachine ",e);
        }
        List<Element>KeywordElements=parser.getAllElementsByClass("keywords");

        if (KeywordElements!=null && !KeywordElements.isEmpty()){
            Element keywordElement= KeywordElements.get(0);
            for (Element linkElement : keywordElement.getAllElements()) {
                wikiUrl="";
View Full Code Here

    /**
     * Builds the rule that recognises a closing HTML tag for the given tag name.
     *
     * <p>The rule is a sequence of: {@code Newline()}, a literal {@code '<'}, {@code Spn1()},
     * a literal {@code '/'}, one or more {@code Alphanumeric()} matches, a check that
     * {@code match()} equals {@code tagName.get()}, {@code Spn1()} again, and a literal {@code '>'}.
     *
     * @param tagName variable holding the tag name that the closing tag is compared against
     * @return the composed sequence rule
     */
    public Rule IndentHtmlBlockClose(StringVar tagName) {
        // NOTE(review): match() presumably yields the text consumed by the preceding
        // OneOrMore(Alphanumeric()) rule, making this an exact name comparison - confirm
        // against the parser library's documentation.
        return Sequence(Newline(), '<', Spn1(), '/', OneOrMore(Alphanumeric()), match().equals(tagName.get()), Spn1(), '>');
    }

    public Node createHtmlBlockNode(String text) {
        Source source = new Source(text);
        return createMarkdownInsideHtmlBlockNode(text, 0, source, new SuperNode());
    }
View Full Code Here

     * @param lineSeparator line separator
     * @return plain text
     */
    public static String getPlainText ( final String html, final String lineSeparator )
    {
        final Source source = new Source ( html );
        final Tag[] tags = source.fullSequentialParse ();
        if ( tags.length > 0 )
        {
            final Renderer renderer = source.getRenderer ();
            renderer.setIncludeHyperlinkURLs ( false );
            renderer.setIncludeAlternateText ( false );
            renderer.setDecorateFontStyles ( false );
            renderer.setMaxLineLength ( Integer.MAX_VALUE );
            renderer.setBlockIndentSize ( 4 );
View Full Code Here

     * @param text text to process
     * @return true if the specified text contains HTML tags, false otherwise
     */
    public static boolean hasTags ( final String text )
    {
        return text != null && text.trim ().length () > 0 && new Source ( text ).fullSequentialParse ().length > 0;
    }
View Full Code Here

     */
    public static boolean hasTag ( final String text, final String tag )
    {
        if ( text != null && text.trim ().length () > 0 )
        {
            final Source source = new Source ( text );
            source.fullSequentialParse ();
            return source.getFirstElement ( tag ) != null;
        }
        else
        {
            return false;
        }
View Full Code Here

    private void loadFirstResource ( final List<ResourceFile> resources, final List<String> xmlContent, final List<String> xmlNames,
                                     final List<ResourceFile> xmlFiles ) throws IOException
    {
        final ResourceFile rf = resources.get ( 0 );
        final Source xmlSource = new Source ( ReflectUtils.getClassSafely ( rf.getClassName () ).getResource ( rf.getSource () ) );
        xmlSource.fullSequentialParse ();

        final Element baseClassTag = xmlSource.getFirstElement ( SkinInfoConverter.CLASS_NODE );
        final String baseClass = baseClassTag != null ? baseClassTag.getContent ().toString () : null;

        for ( final Element includeTag : xmlSource.getAllElements ( SkinInfoConverter.INCLUDE_NODE ) )
        {
            final String includeClass = includeTag.getAttributeValue ( SkinInfoConverter.NEAR_CLASS_ATTRIBUTE );
            final String finalClass = includeClass != null ? includeClass : baseClass;
            final String src = includeTag.getContent ().toString ();
            resources.add ( new ResourceFile ( ResourceLocation.nearClass, src, finalClass ) );
        }

        xmlContent.add ( xmlSource.toString () );
        xmlNames.add ( new File ( rf.getSource () ).getName () );
        xmlFiles.add ( rf );

        resources.remove ( 0 );
    }
View Full Code Here

    // <frame src= (handled with the Jericho parser because HTML Cleaner fails)
    MicrosoftConditionalCommentTagTypes.register();
    PHPTagTypes.register();
    PHPTagTypes.PHP_SHORT.deregister(); // remove PHP short tags for this example otherwise they override processing instructions
    MasonTagTypes.register();
    Source source=new Source(rawPage);
    source.fullSequentialParse();

    if (depth==0 || depth==2) {
      List<Element> linkElements=source.getAllElements(HTMLElementName.FRAME);
      for (Element linkElement : linkElements) {
        String link=linkElement.getAttributeValue("src");
        if (link!=null && !"".equals(link))
          if (isValidUrl(link))
            if (!list.contains(link))
              list.add(link);
      }
    }
    if (depth==1 || depth==2) {

      List<Element> linkElements=source.getAllElements(HTMLElementName.A);
      for (Element linkElement : linkElements) {
        String link=linkElement.getAttributeValue("href");
        if (link!=null && !"".equals(link))
          if (isValidUrl(link))
            if (!list.contains(link))
View Full Code Here

public abstract class HtmlUtils {

    public static String extractText(String html) {
        //
        Source source = new Source(html);
        return source.getTextExtractor().toString();
    }
View Full Code Here

TOP

Related Classes of net.htmlparser.jericho.Source

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.