Package org.htmlparser.lexer

Examples of org.htmlparser.lexer.Lexer


    }

    // Using HTMLParser to extract the content
    String cleanedContent = null;
    Page htmlPage = new Page(cuttedContent, "UTF-8");
    Parser parser = new Parser(new Lexer(htmlPage));
    StringBean stringBean = new StringBean();

    // replace multiple whitespace with one whitespace
    stringBean.setCollapse(true);
    // Do not extract URLs
    stringBean.setLinks(false);
    // replace   with whitespace
    stringBean.setReplaceNonBreakingSpaces(true);

    try {
      // Parse the content
      parser.visitAllNodesWith(stringBean);
      cleanedContent = stringBean.getStrings();

    } catch (ParserException ex) {
      throw new RegainException("Error while parsing content: ", ex);
    }

    // The result of parsing the html-content
    setCleanedContent(cleanedContent);

    // Extract links
    LinkVisitor linkVisitor = new LinkVisitor();
    if (isContentCutted) {
      // This means a new parser run which is expensive but neccessary
      htmlPage = new Page(rawDocument.getContentAsString(), "UTF-8");
      parser = new Parser(new Lexer(htmlPage));
    } else {
      parser.reset();
    }

    try {
View Full Code Here


    @Override
    protected String toString(Object o,
            ResourcePropertyMapping resourcePropertyMapping,
            MarshallingContext context) {
        String str = (String) o;
        Lexer l = new Lexer(str);
        Parser parser = new Parser(l);
        StringBean sb = new StringBean();

        try {
            parser.visitAllNodesWith(sb);
View Full Code Here

            String s;
            while ((s = br.readLine()) != null) {
                stringBuilder.append(s);
            }

            Lexer l = new Lexer(stringBuilder.toString());
            Parser parser = new Parser(l);
            StringBean sb = new StringBean();

            parser.visitAllNodesWith(sb);
View Full Code Here

        try {
            page = new Page(in, null);
        } catch (final UnsupportedEncodingException e) {
            throw new ScimpiException(e);
        }
        final Lexer lexer = new Lexer(page);

        Node node = null;
        try {
            Stack<Snippet> tags = allTags;
            String lineNumbers = "1";
            String template = null;
            tags.push(new HtmlSnippet(lineNumbers, filePath));

            // NOTE done like this the tags can be cached for faster processing
            while ((node = lexer.nextNode()) != null) {
                if (node instanceof Remark) {
                    // TODO need to pick up on comments within tags; at the
                    // moment this splits a tag into two causing a
                    // failure later
                    continue;
View Full Code Here

        this("http://www.textsfromlastnight.com/Random-Texts-From-Last-Night.html");
    }

    private void populateTexts() throws InterruptedException {
        HttpURLConnection connection;
        Lexer lexer;

        try {
            connection = (HttpURLConnection) this.url.openConnection();
            lexer = new Lexer(connection);
        } catch (IOException e) {
            log.warn("could not open connection", e);
            this.errorMessage.put("Could not establish a connection to textsfromlastnight.com");
            return;

        } catch (ParserException e) {
            log.error("parse error", e);
            this.errorMessage.put("Parser error");
            return;
        }

        Node node = null;

        while (true) {
            try {
                node = lexer.nextNode();
                if (node == null) {
                    break;
                }

                if (node instanceof Tag) {
                    Tag tag = (Tag) node;
                    String tagReadonly = tag.getAttribute("readonly");
                    String tagName = tag.getRawTagName();

                    if (tagName.equals("textarea") && tagReadonly != null && tagReadonly.equals("readonly")) {
                        node = lexer.nextNode();
                        if (node instanceof Text) {
                            String text = ((Text) node).getText();
                            text = Translate.decode(text);
                            this.texts.put(text);
                        }
View Full Code Here

  private static List<TagNode> getHTMLFileTags(IFile htmlFile, String tagName) throws Exception {
    // find nodes
    Node[] tags;
    {
      String htmlContents = IOUtils2.readString(htmlFile);
      Lexer lexer = new Lexer(new Page(htmlContents));
      Parser parser = new Parser(lexer, new DefaultParserFeedback(DefaultParserFeedback.QUIET));
      TagFindingVisitor visitor = new TagFindingVisitor(new String[]{tagName});
      parser.visitAllNodesWith(visitor);
      tags = visitor.getTags(0);
    }
View Full Code Here

     * @return The line number, or -1 if none is available.
     * @see #getColumnNumber
     */
    public int getLineNumber ()
    {
        Lexer lexer;
       
        lexer = mParser.getLexer ();
        return (lexer.getPage ().row (lexer.getCursor ()));
    }
View Full Code Here

     * @return The column number, or -1 if none is available.
     * @see #getLineNumber
     */
    public int getColumnNumber ()
    {
        Lexer lexer;
       
        lexer = mParser.getLexer ();
        return (lexer.getPage ().column (lexer.getCursor ()));
    }
View Full Code Here

        if (null != mContentHandler)
            try
            {
                mParser = new Parser (
                    new Lexer (
                        new Page (
                            input.getByteStream (),
                            input.getEncoding ())));
                locator = new Locator (mParser);
                if (null != mErrorHandler)
View Full Code Here

        try {
            page = new Page(in, null);
        } catch (final UnsupportedEncodingException e) {
            throw new ScimpiException(e);
        }
        final Lexer lexer = new Lexer(page);

        Node node = null;
        try {
            Stack<Snippet> tags = allTags;
            String lineNumbers = "1";
            String template = null;
            tags.push(new HtmlSnippet(lineNumbers, filePath));

            // NOTE done like this the tags can be cached for faster processing
            while ((node = lexer.nextNode()) != null) {
                if (node instanceof Remark) {
                    // TODO need to pick up on comments within tags; at the
                    // moment this splits a tag into two causing a
                    // failure later
                    continue;
View Full Code Here

TOP

Related Classes of org.htmlparser.lexer.Lexer

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.