Examples of org.htmlparser.lexer.Page

org.htmlparser.lexer.Page
Represents the contents of an HTML page. Contains the source of characters and an index of positions of line separators (actually the first character position on the next line).

      }
    }


    // Using HTMLParser to extract the content
    String cleanedContent = null;
    Page htmlPage = new Page(cuttedContent, "UTF-8");
    Parser parser = new Parser(new Lexer(htmlPage));
    StringBean stringBean = new StringBean();


    // replace multiple whitespace with one whitespace
    stringBean.setCollapse(true);
    // Do not extract URLs
    stringBean.setLinks(false);
    // replace &nbsp; with whitespace
    stringBean.setReplaceNonBreakingSpaces(true);


    try {
      // Parse the content
      parser.visitAllNodesWith(stringBean);
      cleanedContent = stringBean.getStrings();


    } catch (ParserException ex) {
      throw new RegainException("Error while parsing content: ", ex);
    }


    // The result of parsing the html-content
    setCleanedContent(cleanedContent);


    // Extract links
    LinkVisitor linkVisitor = new LinkVisitor();
    if (isContentCutted) {
      // This means a new parser run which is expensive but necessary
      htmlPage = new Page(rawDocument.getContentAsString(), "UTF-8");
      parser = new Parser(new Lexer(htmlPage));
    } else {
      parser.reset();
    }


    try {
      // Parse the content
      parser.visitAllNodesWith(linkVisitor);
      ArrayList<Tag> links = linkVisitor.getLinks();
      htmlPage.setBaseUrl(rawDocument.getUrl());


      // Iterate over all links found
      Iterator linksIter = links.iterator();
      while (linksIter.hasNext()) {
        LinkTag currTag = ((LinkTag) linksIter.next());

View Full Code Here

        final File loadFile = new File(directory.getParentFile(), filePath);
        final String loadPath = loadFile.getPath().replace('\\', '/');
        LOG.debug("loading template '" + loadPath + "'");
        final InputStream in = context.openStream(loadPath);


        Page page;
        try {
            page = new Page(in, null);
        } catch (final UnsupportedEncodingException e) {
            throw new ScimpiException(e);
        }
        final Lexer lexer = new Lexer(page);

View Full Code Here

  private static List<TagNode> getHTMLFileTags(IFile htmlFile, String tagName) throws Exception {
    // find nodes
    Node[] tags;
    {
      String htmlContents = IOUtils2.readString(htmlFile);
      Lexer lexer = new Lexer(new Page(htmlContents));
      Parser parser = new Parser(lexer, new DefaultParserFeedback(DefaultParserFeedback.QUIET));
      TagFindingVisitor visitor = new TagFindingVisitor(new String[]{tagName});
      parser.visitAllNodesWith(visitor);
      tags = visitor.getTags(0);
    }

View Full Code Here

        if (null != mContentHandler)
            try
            {
                mParser = new Parser (
                    new Lexer (
                        new Page (
                            input.getByteStream (),
                            input.getEncoding ())));
                locator = new Locator (mParser);
                if (null != mErrorHandler)
                    feedback = new Feedback (mErrorHandler, locator);

View Full Code Here

     * This sets the base URL to use for the rest of the page.
     * @exception ParserException If setting the base URL fails.
     */
    public void doSemanticAction () throws ParserException
    {
        Page page;
        
        page = getPage ();
        if (null != page)
            page.setBaseUrl (getBaseUrl ());
    }

View Full Code Here

        final File loadFile = new File(directory.getParentFile(), filePath);
        final String loadPath = loadFile.getPath().replace('\\', '/');
        LOG.debug("loading template '" + loadPath + "'");
        final InputStream in = context.openStream(loadPath);


        Page page;
        try {
            page = new Page(in, null);
        } catch (final UnsupportedEncodingException e) {
            throw new ScimpiException(e);
        }
        final Lexer lexer = new Lexer(page);

View Full Code Here

        final File loadFile = new File(directory.getParentFile(), filePath);
        final String loadPath = loadFile.getPath().replace('\\', '/');
        LOG.debug("loading template '" + loadPath + "'");
        final InputStream in = context.openStream(loadPath);


        Page page;
        try {
            page = new Page(in, null);
        } catch (final UnsupportedEncodingException e) {
            throw new ScimpiException(e);
        }
        final Lexer lexer = new Lexer(page);

View Full Code Here

        if (null != mContentHandler)
            try
            {
                mParser = new Parser (
                    new Lexer (
                        new Page (
                            input.getByteStream (),
                            input.getEncoding ())));
                locator = new Locator (mParser);
                if (null != mErrorHandler)
                    feedback = new Feedback (mErrorHandler, locator);

View Full Code Here

     * @param file The file to edit.
     */
    protected void edit (final File file)
    {
        FileInputStream in;
        Page page;
        Cursor cursor;
        int position;
        int expected;
        boolean modified;
        char ch;
        int last;
        StringBuffer buffer;
        FileOutputStream out;


        try
        {
            in = new FileInputStream (file);
            buffer = new StringBuffer (in.available ());
            try
            {
                page = new Page (in, null);
                cursor = new Cursor (page, 0);
                position = 0;
                modified = false;
                expected = 0;
                last = -1;
                while (Page.EOF != (ch = page.getCharacter (cursor)))
                {
                    if (++expected != cursor.getPosition ())
                    {
                        modified = true;
                        expected = cursor.getPosition ();

View Full Code Here

    {
        Parser ret;


        if (null == html)
            throw new IllegalArgumentException ("html cannot be null");
        ret = new Parser (new Lexer (new Page (html, charset)));


        return (ret);
    }

View Full Code Here

0 1 2 3

TOP

Related Classes of org.htmlparser.lexer.Page

com.google.gdt.eclipse.designer.util.Utils

fitnesse.fixtures.PageDriver

fitnesse.testsystems.slim.HtmlSlimTestSystem

fitnesse.testsystems.slim.HtmlTableScanner

fitnesse.util.HtmlParserToolsTest

net.sf.regain.crawler.preparator.HtmlPreparator

org.apache.isis.viewer.scimpi.dispatcher.processor.HtmlFileParser

org.archive.wayback.archivalurl.ArchivalUrlSAXRewriteReplayRenderer

org.archive.wayback.archivalurl.FastArchivalUrlReplayParseEventHandlerTest

org.archive.wayback.resourcestore.indexer.HTTPRecordAnnotater

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.