// Source code of org.sf.mustru.filters.HtmlHandler

package org.sf.mustru.filters;


import java.io.File;
import java.io.IOException;


import org.apache.log4j.Logger;
import org.htmlparser.Parser;
import org.htmlparser.parserapplications.StringExtractor;
import org.htmlparser.tags.InputTag;
import org.htmlparser.tags.MetaTag;
import org.htmlparser.util.*;
import org.htmlparser.filters.*;


import org.sf.mustru.docs.IndexableDoc;
import org.sf.mustru.utils.*;


import com.aliasi.util.Files;


/**
 * Extract text and metadata from a HTML file using htmlparser.
 *
 */
public class HtmlHandler implements HandlerInterface 
{


 static Logger logger = Logger.getLogger(HtmlHandler.class.getName());


 /**
  * Empty constructor
  */
 public HtmlHandler() { super(); }


 /**
  * Extract the body and title tags from a HTML file 
  */
 public void getDocument(String ifile, IndexableDoc doc) 
 {
  doc.setFileType("webpage");
  try 
  {
   //*-- Read the file into a string
   String htmlcontents = Files.readFromFile(new File(ifile)); htmlcontents = StringTools.filterChars(htmlcontents);
   if (htmlcontents.length() == 0) return;
   Parser parser = new Parser(); 


   //*-- Extract the title text
   logger.info("Extracting title from HTML file " + ifile);
   parser.setInputHTML(htmlcontents);


   NodeList nodelist1 = parser.parse(new TagNameFilter ("TITLE")); 
   if (nodelist1.elementAt(0) != null)
   { String title = nodelist1.elementAt(0).toPlainTextString();
   doc.setTitle( cleanHTML(title) );
   }


   //*-- Extract information from the meta tags
   logger.info("Extracting METADATA from html file " + ifile);
   parser.setInputHTML(htmlcontents);
   NodeList nodelist2 = parser.parse(new TagNameFilter("META") );
   if (nodelist2 != null)
   {
    String metadata = ""; String author = "";
    for (int i = 0; i < nodelist2.size(); i++)
    { if (nodelist2.elementAt(i) == null) continue;
    String meta = nodelist2.elementAt(i).getText(); meta = StringTools.filterChars(meta);
    MetaTag mtag = new MetaTag(); mtag.setText("<" + meta + ">");
    String tagName = mtag.getMetaTagName();
    if (tagName == null) continue;
    if (tagName.equalsIgnoreCase("keywords")) 
    { metadata += mtag.getMetaContent(); }
    if (tagName.equalsIgnoreCase("authors") || 
      tagName.equalsIgnoreCase("author") )
    { author += mtag.getMetaContent(); }
    } // end of for
    doc.setAuthor(author); doc.setMetadata(metadata);
   } // eod of if


   //*-- Populate the contents of the contents with the entire text from the  web page
   logger.info("Extracting text from body of html file " + ifile);
   StringExtractor st = new StringExtractor(ifile); 


   //*-- string extractor does not input form values -- handle separately
   parser.setInputHTML(htmlcontents); StringBuffer inputVal = new StringBuffer();
   NodeList nodelist3 = parser.parse(new TagNameFilter ("INPUT"));
   for (int i = 0; i < nodelist3.size(); i++)
   { InputTag itag = (InputTag) nodelist3.elementAt(i); 
   if ((itag != null) && (itag.getAttribute("value") != null) )
   { inputVal.append(" "); inputVal.append( itag.getAttribute("value") ); }
   }


   //*-- finally set the contents of the document
   doc.setContents( new StringBuffer(cleanHTML( st.extractStrings(false)) + " " + inputVal) ); 
   doc.setFileName(ifile);


  } //*-- end of try block
  catch (OutOfMemoryError exc) 
  { logger.error("Ran out of memory for " + ifile + " or could be corrupt file " + exc.getMessage()); }
  catch (ParserException exc) 
  { logger.error("Parser error " + ifile + " or could be corrupt file " + exc.getMessage()); }
  catch (IOException e) 
  { logger.error("IO Error for file: " + ifile + " " +  e.getMessage()); }
  catch (RuntimeException e) 
  { logger.error("Could not extract text from the HTML document: may contain frames " + ifile + " " +  e.getMessage()); }


  return;
 } //*-- end of getDocument


 //*-- remove extraneous characters from extracted text
 private String cleanHTML(String in)
 { String out = StringTools.removeAmpersandStrings(in); 
 out = StringTools.filterChars(out);
 return(out);
 }


}
// Source code of org.sf.mustru.filters.HtmlHandler

// Related classes of org.sf.mustru.filters.HtmlHandler