Package org.htmlparser.util

Examples of org.htmlparser.util.NodeList


    SortedSet<HtmlConverterPSpan> linebreaksFromHtmlTags = new TreeSet<HtmlConverterPSpan>();

    // process
    try {
      Parser parser = new Parser(documentText);
      NodeList list = parser.parse(null);
      HtmlConverterVisitor visitor = new HtmlConverterVisitor(newlineInducingTags, skipWhitespaces);
      list.visitAllNodesWith(visitor);
      visibleSpansSoFar = visitor.getTextSpans();
      linebreaksFromHtmlTags = visitor.getLinebreaksFromHtmlTags();
    } catch (ParserException e) {
      throw new AnalysisEngineProcessException(e);
    }
View Full Code Here


    String documentText = jcas.getDocumentText();
    List<AnnotationFS> annotations = new ArrayList<AnnotationFS>();
    List<AnnotationFS> annotationStack = new ArrayList<AnnotationFS>();
    try {
      Parser parser = new Parser(documentText);
      NodeList list = parser.parse(null);
      HtmlVisitor visitor = new HtmlVisitor(jcas, onlyContent);
      list.visitAllNodesWith(visitor);
      annotations = visitor.getAnnotations();
      annotationStack = visitor.getAnnotationStack();
    } catch (ParserException e) {
      throw new AnalysisEngineProcessException(e);
    }
View Full Code Here

      }

      String document = sb.toString();

      Parser parser = new Parser(document);
      NodeList list = parser.parse(null);
      HtmlDocumentationVisitor visitor = new HtmlDocumentationVisitor(document);
      list.visitAllNodesWith(visitor);
      map.putAll(visitor.getMap());
    } catch (Exception e) {
      RutaIdeUIPlugin.error(e);
    }
View Full Code Here

        try{

            htmlBuffer = "<html>" + htmlBuffer + "</html>";
            Parser parser = new Parser();
            parser.setInputHTML(htmlBuffer);
            NodeList nodelist = parser.parse(null);

            NodeList tableList = nodelist.extractAllNodesThatMatch(new TagNameFilter("TABLE"), true);
            NodeList  headList = tableList.extractAllNodesThatMatch(new TagNameFilter("THEAD"), true);
            NodeList  footList = tableList.extractAllNodesThatMatch(new TagNameFilter("TFOOT"), true);
            NodeList  rowList = tableList.extractAllNodesThatMatch(new TagNameFilter("TR"), true);
           
            //Create a ParserUtils var
            ParserUtils pu = new ParserUtils();
            //Set rowCount to size of rowList
            rowCount = rowList.size();
           
           
            HSSFFont boldFont = wb.createFont();
            boldFont.setBoldweight(HSSFFont.BOLDWEIGHT_BOLD);
            HSSFCellStyle boldStyle = wb.createCellStyle();
            boldStyle.setFont(boldFont);
            boldStyle.setWrapText(true);
           
            //Loop through excel 'Rows'
            for ( int i = 0; i < rowList.size(); i++ ) {
                HSSFRow row;
                String htmlRow = rowList.elementAt(i).toHtml().trim();
                String[] splitHtmlRow;
                List<String> elements = new ArrayList<String>();
               
                if(i == 0 && headList.size() == 1){
                    row = sheet.createRow((short) i);
                    htmlRow = htmlRow.replace("<th></th>","<th> </th>");
                    htmlRow = htmlRow.replace("<TH></TH>","<th> </th>");
                    htmlRow = htmlRow.replace("</span>", " </span>");
                    elements.add("tr");
                    elements.add("th");
                    if (htmlRow.indexOf("<a href=") > -1) {
                      elements.add("a");
                    }
                    if (htmlRow.indexOf("<span ") > -1) {
                      elements.add("span");
                    }
                } else if(i == 1 && footList.size() == 1){
                    row = sheet.createRow((short) rowList.size() - 1);
                    htmlRow = pu.trimTags(htmlRow, new String[]{"span"},false,false);
                    htmlRow = htmlRow.replace("<td></td>","<td> </td>");
                    htmlRow = htmlRow.replace("<TD></TD>","<td> </td>");
                    elements.add("tr");
                    elements.add("td");
View Full Code Here

 
  protected Node getElementById(Parser parser, String id) {
    parser.reset();
    NodeFilter filter = new CssSelectorNodeFilter("#" + id);
    try {
      NodeList list = parser.extractAllNodesThatMatch(filter);
      if (list.size() > 0) {
        return list.elementAt(0);
      }
    } catch (ParserException e) {
      // TODO Auto-generated catch block
      e.printStackTrace();
    }
View Full Code Here

    protected static Node getElementById(Parser parser, String id) {
        parser.reset();
        NodeFilter filter = new CssSelectorNodeFilter("#" + id);
        try {
            NodeList list = parser.extractAllNodesThatMatch(filter);
            if (list.size() > 0) {
                return list.elementAt(0);
            }
        } catch (ParserException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
View Full Code Here

        try
        {
            Parser parser = Parser.createParser(htmlData, null);

            NodeList heads = parser.parse(new TagNameFilter("HEAD"));
            if (heads.size() != 1)
                throw new DiscoveryException(
                        "HTML response must have exactly one HEAD element, " +
                                "found " + heads.size() + " : " + heads.toHtml());
            Node head = heads.elementAt(0);
            for (NodeIterator i = head.getChildren().elements();
                 i.hasMoreNodes();)
            {
                Node node = i.nextNode();
                if (node instanceof TagNode)
View Full Code Here

      return fetchCount;
   }

   private int fetchBoards(File rootDir, Parser parser, OrFilter filter, int bingoIdx) throws Exception {
      int fetchCount = 0;
      NodeList list = parser.extractAllNodesThatMatch(filter);
      SimpleNodeIterator simpleNodeIterator = list.elements();
      while (simpleNodeIterator.hasMoreNodes()) {
         Node node = simpleNodeIterator.nextNode();
         if (node instanceof ImageTag) {
            ImageTag img = (ImageTag) node;
            String attribute = img.getAttribute("src");
View Full Code Here

 

  private void trouverLiens() throws Exception{
    Parser parser;
    NodeList listUrlNode;
    NodeList listTitleNode;
    String tmpString;
    String tmpLien;
    String tmpValeur;
    LinkedList<HashMap> listUrl = new LinkedList<HashMap>();
    HashMap<String, String> tmpHashMap;
    String tmpRecherche;
    int i=0;

    try {
      tmpRecherche = "http://www.imdb.com/find?s=tt&q=" + localParent.getJValeurNom().getText().replaceAll(" ", "+");

      parser = new Parser(tmpRecherche);

      listTitleNode = parser.extractAllNodesThatMatch(new NodeClassFilter (TitleTag.class));
      //On commence par détecter le titre
      for(i = 0;i < listTitleNode.size(); i++){
        tmpString = listTitleNode.elementAt(i).toHtml();
        if(tmpString.indexOf("IMDb Title  Search") <= -1){
          urlFilm = tmpRecherche;
        }
      }
View Full Code Here

  }

  private void recupererInfoFilm() throws Exception{
    Parser divParser;
    NodeList listDiv;
    Node tmpNode; 

    boolean acteursVu = false;
    boolean titreVu = false;
    boolean realisateurVu = false;
    boolean paysVu = false;
    boolean tempsVu = false;
    boolean genreVu = false;
    boolean awardsVu = false;
    boolean dateSortieVue = false;
    boolean imageVues = false;
    int indexDeb = 0;
    int indexFin = 0;


    try{
      divParser = new Parser(urlFilm);
      listDiv = divParser.extractAllNodesThatMatch (new NodeClassFilter (Div.class));
      for (int i = 0; i < listDiv.size(); i++){
        NodeList tmpDivNodeContent = listDiv.elementAt(i).getChildren();

        if(tmpDivNodeContent != null){
          for(int j = 0; j< tmpDivNodeContent.size(); j++){
            tmpNode = tmpDivNodeContent.elementAt(j);

            if(tmpNode.toHtml().indexOf("alt=\"Cast\"") > -1 && !acteursVu){
              LinkedList<Acteurs> tmpListActeur = new LinkedList<Acteurs>();
              Acteurs tmpActeur = null;
View Full Code Here

TOP

Related Classes of org.htmlparser.util.NodeList

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.