Package org.htmlparser.lexer

Examples of org.htmlparser.lexer.Page


      }
    }

    // Using HTMLParser to extract the content
    String cleanedContent = null;
    Page htmlPage = new Page(cuttedContent, "UTF-8");
    Parser parser = new Parser(new Lexer(htmlPage));
    StringBean stringBean = new StringBean();

    // replace multiple whitespace with one whitespace
    stringBean.setCollapse(true);
    // Do not extract URLs
    stringBean.setLinks(false);
    // replace non-breaking spaces (&nbsp;) with ordinary whitespace
    stringBean.setReplaceNonBreakingSpaces(true);

    try {
      // Parse the content
      parser.visitAllNodesWith(stringBean);
      cleanedContent = stringBean.getStrings();

    } catch (ParserException ex) {
      throw new RegainException("Error while parsing content: ", ex);
    }

    // The result of parsing the html-content
    setCleanedContent(cleanedContent);

    // Extract links
    LinkVisitor linkVisitor = new LinkVisitor();
    if (isContentCutted) {
      // This means a new parser run, which is expensive but necessary
      htmlPage = new Page(rawDocument.getContentAsString(), "UTF-8");
      parser = new Parser(new Lexer(htmlPage));
    } else {
      parser.reset();
    }

    try {
      // Parse the content
      parser.visitAllNodesWith(linkVisitor);
      ArrayList<Tag> links = linkVisitor.getLinks();
      htmlPage.setBaseUrl(rawDocument.getUrl());

      // Iterate over all links found
      Iterator linksIter = links.iterator();
      while (linksIter.hasNext()) {
        LinkTag currTag = ((LinkTag) linksIter.next());
View Full Code Here


        final File loadFile = new File(directory.getParentFile(), filePath);
        final String loadPath = loadFile.getPath().replace('\\', '/');
        LOG.debug("loading template '" + loadPath + "'");
        final InputStream in = context.openStream(loadPath);

        Page page;
        try {
            page = new Page(in, null);
        } catch (final UnsupportedEncodingException e) {
            throw new ScimpiException(e);
        }
        final Lexer lexer = new Lexer(page);
View Full Code Here

  private static List<TagNode> getHTMLFileTags(IFile htmlFile, String tagName) throws Exception {
    // find nodes
    Node[] tags;
    {
      String htmlContents = IOUtils2.readString(htmlFile);
      Lexer lexer = new Lexer(new Page(htmlContents));
      Parser parser = new Parser(lexer, new DefaultParserFeedback(DefaultParserFeedback.QUIET));
      TagFindingVisitor visitor = new TagFindingVisitor(new String[]{tagName});
      parser.visitAllNodesWith(visitor);
      tags = visitor.getTags(0);
    }
View Full Code Here

        if (null != mContentHandler)
            try
            {
                mParser = new Parser (
                    new Lexer (
                        new Page (
                            input.getByteStream (),
                            input.getEncoding ())));
                locator = new Locator (mParser);
                if (null != mErrorHandler)
                    feedback = new Feedback (mErrorHandler, locator);
View Full Code Here

     * This sets the base URL to use for the rest of the page.
     * @exception ParserException If setting the base URL fails.
     */
    public void doSemanticAction () throws ParserException
    {
        Page page;
       
        page = getPage ();
        if (null != page)
            page.setBaseUrl (getBaseUrl ());
    }
View Full Code Here

        final File loadFile = new File(directory.getParentFile(), filePath);
        final String loadPath = loadFile.getPath().replace('\\', '/');
        LOG.debug("loading template '" + loadPath + "'");
        final InputStream in = context.openStream(loadPath);

        Page page;
        try {
            page = new Page(in, null);
        } catch (final UnsupportedEncodingException e) {
            throw new ScimpiException(e);
        }
        final Lexer lexer = new Lexer(page);
View Full Code Here

        final File loadFile = new File(directory.getParentFile(), filePath);
        final String loadPath = loadFile.getPath().replace('\\', '/');
        LOG.debug("loading template '" + loadPath + "'");
        final InputStream in = context.openStream(loadPath);

        Page page;
        try {
            page = new Page(in, null);
        } catch (final UnsupportedEncodingException e) {
            throw new ScimpiException(e);
        }
        final Lexer lexer = new Lexer(page);
View Full Code Here

        if (null != mContentHandler)
            try
            {
                mParser = new Parser (
                    new Lexer (
                        new Page (
                            input.getByteStream (),
                            input.getEncoding ())));
                locator = new Locator (mParser);
                if (null != mErrorHandler)
                    feedback = new Feedback (mErrorHandler, locator);
View Full Code Here

     * @param file The file to edit.
     */
    protected void edit (final File file)
    {
        FileInputStream in;
        Page page;
        Cursor cursor;
        int position;
        int expected;
        boolean modified;
        char ch;
        int last;
        StringBuffer buffer;
        FileOutputStream out;

        try
        {
            in = new FileInputStream (file);
            buffer = new StringBuffer (in.available ());
            try
            {
                page = new Page (in, null);
                cursor = new Cursor (page, 0);
                position = 0;
                modified = false;
                expected = 0;
                last = -1;
                while (Page.EOF != (ch = page.getCharacter (cursor)))
                {
                    if (++expected != cursor.getPosition ())
                    {
                        modified = true;
                        expected = cursor.getPosition ();
View Full Code Here

    {
        Parser ret;

        if (null == html)
            throw new IllegalArgumentException ("html cannot be null");
        ret = new Parser (new Lexer (new Page (html, charset)));

        return (ret);
    }
View Full Code Here

TOP

Related Classes of org.htmlparser.lexer.Page

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.