// Source code of org.olat.core.util.SimpleHtmlParser

/**
* OLAT - Online Learning and Training<br>
* http://www.olat.org
* <p>
* Licensed under the Apache License, Version 2.0 (the "License"); <br>
* you may not use this file except in compliance with the License.<br>
* You may obtain a copy of the License at
* <p>
* http://www.apache.org/licenses/LICENSE-2.0
* <p>
* Unless required by applicable law or agreed to in writing,<br>
* software distributed under the License is distributed on an "AS IS" BASIS, <br>
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. <br>
* See the License for the specific language governing permissions and <br>
* limitations under the License.
* <p>
* Copyright (c) 1999-2006 at Multimedia- & E-Learning Services (MELS),<br>
* University of Zurich, Switzerland.
* <p>
*/ 


package org.olat.core.util;


import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


import org.olat.core.logging.OLog;
import org.olat.core.logging.Tracing;
import org.olat.core.util.vfs.VFSLeaf;


/**
 * Simple HTML parser that extracts the doctype, the head (CSS, JavaScript etc.), the body tag,
 * the body onload handler and the body content of a document. Not very tolerant, but very fast.
 * <p>
 * Initial Date:  11.07.2004
 *
 * @author Mike Stock
 * @author guido
 */
public class SimpleHtmlParser {


  private static final Pattern COMMENT_OR_LINK = Pattern.compile("<(!--.*--|link[^>]*)>", Pattern.DOTALL | Pattern.CASE_INSENSITIVE
      | Pattern.MULTILINE);


  private static final Pattern LINK_REL = Pattern.compile("rel\\s*=\\s*\"?(home|next|prev(ious)?|start|up)\"?",
      Pattern.MULTILINE | Pattern.CASE_INSENSITIVE);
  
  // use extra characters that the StringBuilder doesn't grow
  // the optimistic length is 5 (navigation links) * 7 (comment tags) per header
  private static final int EXTRA_HEADER_LENGTH = 64;


  private static final String CHARSET = "charset=";
  private static final String ISO_8859_1 = "ISO-8859-1";


  private static final OLog log = Tracing.createLoggerFor(SimpleHtmlParser.class);
  
  private String htmlDocType;
  private String xhtmlNamespaces;
  private String htmlHead;
  private String jsOnLoad;
  private String bodyTag;
  private String htmlContent;
  private String charsetName;
  private boolean validHtml;
  //does the parsed content has it's own css classes inline or a css file defined
  private boolean ownCss;


  
  /**
   * @param htmlString
   * @param lookForCharset
   */
  public SimpleHtmlParser(String htmlString) {
    parseContent(htmlString);
  }
  
  /**
   * 
   * @param file
   */
  public static String extractHTMLCharset(VFSLeaf sourceFile) {
    
    ByteArrayOutputStream target = new ByteArrayOutputStream(8192);
    InputStream in = sourceFile.getInputStream();
    FileUtils.copy(in, target);
    FileUtils.closeSafely(in);


    byte[] bytes = target.toByteArray();
    if (bytes.length == 0) return ISO_8859_1; //empty file


    // we assume that, in order to indicate an encoding, the html meta element
    // is used.
    // if none is specified, we assume iso-8859-1
    // 1. parse the body assuming iso-8859-1 (so us-ascii is ok and we do not
    // needs a reparse if not iso-8859-1


    String parse = null;
    try {
      parse = new String(bytes, ISO_8859_1);
    } catch (UnsupportedEncodingException e1) {
      //won't happen
    }
    
    // Parse HTML header
    // look for body start tag - some people forget to use proper header tags, also support those
    int bspos = lookForTag(parse, 0, "<body");
    
    String charset = null;
    if (bspos > 0)  charset = checkForCharset(parse.substring(0, bspos));
    //if nothing found return default
    return StringHelper.containsNonWhitespace(charset) ? charset :ISO_8859_1;
  }
  
  /**
   * Parses the given string and looks for head, onload functions and the page body
   * the parsed data is stored in the class variables
   * <ul>
   * <li />htmlHead
   * <li />jsOnLoad
   * <li />htmlContent
   * </ul>
   * @param cont
   * @param lookForCharset
   */
  private void parseContent(String cont) {
    //TODO: measure time
    //long t1 = System.currentTimeMillis();
    
    // check for doctype
    int docTypePos = cont.indexOf("<!DOCTYPE");
    if (docTypePos != -1 ) {
      int endOfhtmlDocTypePos = cont.indexOf(">", docTypePos);
      htmlDocType = cont.substring(docTypePos, endOfhtmlDocTypePos+1);
    }
    
    // Check for <html>: lowercase, uppercase and mixed
    int spos = cont.indexOf("<html");
    if (spos == -1) spos = cont.indexOf("<HTML");
    if (spos == -1) spos = cont.toLowerCase().indexOf("<html");
    if (spos == -1) {
      // This is not valid HTML - assume whole file is the content
      htmlContent = cont;
      if (log.isDebug()) log.debug("Could not detect proper HTML content, no HTML tag found.", cont);
      return; 
    }
    
    // find positions of body part
    int bodypos = cont.indexOf("<body");
    if (bodypos == -1) bodypos = cont.indexOf("<BODY");
    if (bodypos == -1) bodypos = cont.toLowerCase().indexOf("<body");
    if (bodypos == -1) {
      // This is not valid HTML - assume whole file is the content
      htmlContent = cont;
      if (log.isDebug()) log.debug("Could not detect proper HTML content, no BODY tag found.", cont);
      return; 
    }
    
    // Parse HTML header
    int hspos = cont.indexOf("<head>");
    if (hspos == -1) hspos = cont.indexOf("<HEAD"); // be tolerant, try also capitals
    if (hspos == -1) hspos = cont.toLowerCase().indexOf("<HEAD"); // be tolerant, try also mixed
    if (hspos != -1) { // look for end tag if start tag found
      int hepos = lookForTag(cont, hspos, "</head>");
      if ( hepos > hspos )htmlHead = cont.substring(hspos + 6, hepos);
      // extract xhtml namespaces
      if (htmlDocType != null && htmlDocType.indexOf("XHTML") > 0) {
        xhtmlNamespaces = cont.substring(spos, hspos);
      }
    } else {
      // no head tag found - use everything between HTML and BODY tag to support those crippled pages as well
      htmlHead = cont.substring((cont.indexOf(">", spos))+1, bodypos);
    }
    if (htmlHead != null) {
      // Filter out base tag
      int bsPos = htmlHead.indexOf("<base ");
      if (bsPos == -1) bsPos = htmlHead.indexOf("<BASE ");
      if (bsPos == -1) bsPos = htmlHead.toLowerCase().indexOf("<BASE ");
      if (bsPos != -1) {
        int bePos = htmlHead.indexOf('>', bsPos + 6);
        if (bePos > -1) {
          // remove base tag from head
          String beforeBase = htmlHead.substring(0, bsPos);
          htmlHead = beforeBase + htmlHead.substring(bePos);
        }
      }
      // Filter the navigation links
      // olat and firefox problem 
      htmlHead = filterHeader(htmlHead);
      // Filter out CSS definitions from HEAD
      if (htmlHead.indexOf("text/css") > 0) ownCss = true;
      // Filter out character set
      charsetName = checkForCharset(htmlHead);
    }


    
    // Parse body onLoad java script function
    int jspos = cont.indexOf("onload=", bodypos);
    if (jspos == -1) jspos = cont.indexOf("onLoad=", bodypos);
    if (jspos == -1) jspos = cont.toLowerCase().indexOf("onload=", bodypos);
    int jepos = cont.indexOf(">",jspos);
    if (jspos == -1 || jepos == -1) {
      jsOnLoad = null; // no match
    } else {
      jsOnLoad = cont.substring(jspos+7, jepos);
      // trimm whitespace
      jsOnLoad = jsOnLoad.trim();
      // trimm ' or "
      if (jsOnLoad.indexOf("'") == 0 || jsOnLoad.indexOf("\"") == 0) {
        jsOnLoad = jsOnLoad.substring(1, jsOnLoad.length()-1);
      }
      // else assume no ' or " are used, should be fine for us
      // check if commands end with ;. This is necessary when doing inline integration (OLAT-3340)
      if (jsOnLoad.length() > 0 && !jsOnLoad.endsWith(";")) {
        jsOnLoad = jsOnLoad + ";";
      }
    }
    
    // FIXME gs: add support for other body attributes as well, see 
    // http://de.selfhtml.org/html/referenz/attribute.htm#body
    // http://de.selfhtml.org/html/referenz/attribute.htm#universalattribute
    // Maybe it's safer to reuse the whole body tag as it comes and only inject 
    // the custom onload stuff or add the unload method if no such method is there


    //look for class attribute of body tag
    int endOfBodyTagPos = cont.indexOf(">", bodypos);


    // get entire body tag
    bodyTag = cont.substring(bodypos, endOfBodyTagPos+1);
    
    int bepos = -1;
    if (bodypos != -1) { // look for end tag is start tag found
      bepos = lookForTag(cont, endOfBodyTagPos, "</body>");
      if (bepos == -1) {
        // use end of html as fallback
        bepos = lookForTag(cont, endOfBodyTagPos, "</html>");
        if (bepos == -1) {
          // use end of content as fallback
          bepos = cont.length()-1;
        }
      }
    }
    
    if (bodypos == -1 || bepos == -1) {
      htmlContent = cont; // no match, return the string unmodified
    } else {
      String res = cont.substring(endOfBodyTagPos+1, bepos);
      htmlContent = res;
    }
    validHtml = true;
    
    if (log.isDebug()) {      
      log.debug("original content::", cont);
      log.debug("xhtmlNamespaces::", xhtmlNamespaces);
      log.debug("charsetName::", charsetName);
      log.debug("htmlHead::", htmlHead);
      log.debug("jsOnLoad::", jsOnLoad);
      log.debug("bodyTag::", bodyTag);
      log.debug("htmlContent::", htmlContent);
    }
  }


  private static String checkForCharset(String hdr) {
    // we assume that, in order to indicate an encoding, the html meta element
    // is used
    //<html><head>
    //<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
    //<title>OLAT - Online Learning And Training (Version 3)</title>
    //..
                
    /*
     * from http://www.w3.org/TR/html401/charset.html#doc-char-set 
     * <META http-equiv="Content-Type" content="text/html; charset=EUC-JP"> 
     * The META
     * declaration must only be used when the character encoding is organized
     * such that ASCII-valued bytes stand for ASCII characters (at least until
     * the META element is parsed). META declarations should appear as early
     * as possible in the HEAD element.
     */
    int charsetPos = hdr.indexOf(CHARSET);
    if (charsetPos == -1) { 
      // try it lowercase
      charsetPos = hdr.toLowerCase().indexOf(CHARSET);
    }
    if (charsetPos != -1) { // found!
      int endofmetastop = hdr.indexOf('>', charsetPos + CHARSET.length());
      if (endofmetastop != -1) {
        // found meta tag stop character, continue searching for closing quotes
        int endofcs = hdr.indexOf('"', charsetPos + CHARSET.length());
        if (endofcs == -1 || endofcs > endofmetastop)
          endofcs = hdr.indexOf('\'', charsetPos + CHARSET.length());
        if (endofcs != -1 && endofcs < endofmetastop)
          return hdr.substring(charsetPos + CHARSET.length(), endofcs).trim(); 
      }
    }
    return null;
  }


  /**
   * Looks for a tag (both lower and upper case). Returns -1 if not present,
   * tag position within content, otherwise.
   * 
   * @param tag
   * @return -1 if not present, tag position within content, otherwise.
   */
  private static int lookForTag(String content, int offset, String tag) {
    int tagpos = offset;
    while (true) {
      tagpos = content.indexOf(tag, tagpos);
      if (tagpos == -1) {
        tagpos = content.indexOf(tag.toUpperCase(), tagpos); // be tolerant, try also capitals
      }
      if (tagpos == -1) {
        tagpos = content.toLowerCase().indexOf(tag, tagpos); // be tolerant, try also mixed
      }
      if (tagpos == -1) break;
      if(!isWithinComment(content, offset, tagpos)) break;
      else {
        tagpos = tagpos + tag.length(); // offset new search position by found </body> tag (7 chrs)
        if (content.indexOf(tag, tagpos) == -1) {
          // only found within the comment which is equal to not found. break to prevent endless loop
          tagpos = -1;
          break;
        }
      }
    }
    return tagpos;
  }
  
  /**
   * Check if source beginning at startPos and ending at endPos has a comment
   * start tag which is not (yet) closed
   * @param startPos
   * @param endPos
   * @return true if content has unclosed comment start tag
   */
  private static boolean isWithinComment(String content, int startPos, int endPos) {
    if (startPos == -1 || endPos == -1) return false;
    int commentTagS = -1;
    while ((commentTagS = content.indexOf("<!--", startPos)) != -1) {
      if (commentTagS > endPos) return false; // this is not within range of interest
      int commentTagE = content.indexOf("-->", commentTagS); // look for end tag
      if (commentTagE == -1) return false; // no end tag -> break
      if (commentTagE < endPos) {
        startPos = commentTagE;
        continue; // comment is closed within start/end -> not of interest
      }
      else return true; // this is within a comment.
    }
    return false;
  }


  
  
  private String filterHeader(String in) {
    if (in == null) return null;
    StringBuffer out = new StringBuffer(in.length() + EXTRA_HEADER_LENGTH);
    Matcher m = COMMENT_OR_LINK.matcher(in);
    String match;
    while (m.find()) {
      match = m.group();
      if (isComment(match)) {
        // ignore comment
        continue;
      } else if (isNavigationLink(match)) {
        // comment the navigation link out
        m.appendReplacement(out, "<!--");
        out.append(match).append("-->");
      }
    }
    m.appendTail(out);
    return out.toString();
  }


  private boolean isComment(String tag) {
    // is it "<!--"
    return tag.charAt(1) == '!';
  }
  
  private boolean isNavigationLink(String tag) {
    // looking for rel="next" ...
    // start searching after "<link " 
    return LINK_REL.matcher(tag).find(6);
  }
  
  private String extractMatches(String in, Pattern pattern){
    if (in == null) return null;
    else {
      StringBuilder out = new StringBuilder(128);
      in = removeLineTerminators(in);
      Matcher m = pattern.matcher(in);
      while (m.find()) {
        out.append(m.group());
      }
      return out.toString();
    }
  }
  
  public String removeLineTerminators(String in) {
//    String patternStr = "$^|[\\r\\n]+\\z";
//    String replaceStr = " ";
//    Pattern pattern = Pattern.compile(patternStr, Pattern.MULTILINE);
//    Matcher matcher = pattern.matcher(inputStr);
//    System.out.println(matcher.replaceAll(replaceStr));
//    return matcher.replaceAll(replaceStr);
    //the above does not work, grrr??? but it should remove all line terminators like the win and mac ones
    return in.replaceAll("\\n", "");
}
  
  
  
  /**
   * @return Returns the htmlContent.
   */
  public String getHtmlContent() {
    return htmlContent;
  }
  /**
   * @return Returns the htmlHead.
   */
  public String getHtmlHead() {
    return htmlHead;
  }
  /**
   * @return Returns the jsOnLoad.
   */
  public String getJsOnLoad() {
    return jsOnLoad;
  }
  /**
   * @return Returns the body tag including all onload, onclick, class, style etc.
   */
  public String getBodyTag() {
    return bodyTag;
  }
  /**
   * @return Returns the validHtml.
   */
  public boolean isValidHtml() {
    return validHtml;
  }
  /**
   * @return Returns the charsetName.
   */
  public String getCharsetName() {
    return charsetName;
  }


  public boolean hasOwnCss() {
    return ownCss;
  }


  public String getHtmlDocType() {
    return htmlDocType;
  }


  public String getXhtmlNamespaces() {
    return xhtmlNamespaces;
  }
}
// Source code of org.olat.core.util.SimpleHtmlParser

// Related classes of org.olat.core.util.SimpleHtmlParser