package org.sf.mustru.filters;
import java.io.File;
import java.io.IOException;
import org.apache.log4j.Logger;
import org.htmlparser.Parser;
import org.htmlparser.parserapplications.StringExtractor;
import org.htmlparser.tags.InputTag;
import org.htmlparser.tags.MetaTag;
import org.htmlparser.util.*;
import org.htmlparser.filters.*;
import org.sf.mustru.docs.IndexableDoc;
import org.sf.mustru.utils.*;
import com.aliasi.util.Files;
/**
* Extract text and metadata from an HTML file using htmlparser.
*
*/
public class HtmlHandler implements HandlerInterface
{
 static Logger logger = Logger.getLogger(HtmlHandler.class.getName());

 /**
  * Empty constructor
  */
 public HtmlHandler() { super(); }

 /**
  * Populate an indexable document from an HTML file.
  *
  * The file type is always set to "webpage". The file is then read and passed
  * through StringTools.filterChars; if nothing is left the method returns
  * without setting any other field. Otherwise the title (first TITLE tag),
  * the keywords and author(s) from the META tags, and the page text plus the
  * values of INPUT tags are stored in the document, followed by the file name.
  *
  * Parser, I/O, runtime and out-of-memory failures are logged and not
  * rethrown; fields set before the failure keep their values.
  *
  * @param ifile name of the HTML file to read
  * @param doc   document that receives the extracted fields
  */
 public void getDocument(String ifile, IndexableDoc doc)
 {
  doc.setFileType("webpage");
  try
  {
   //*-- Read the file into a string
   String htmlcontents = StringTools.filterChars( Files.readFromFile(new File(ifile)) );
   if (htmlcontents.length() == 0) return;
   Parser parser = new Parser();

   //*-- Extract the title text
   logger.info("Extracting title from HTML file " + ifile);
   extractTitle(parser, htmlcontents, doc);

   //*-- Extract information from the meta tags
   logger.info("Extracting METADATA from html file " + ifile);
   extractMetaTags(parser, htmlcontents, doc);

   //*-- Populate the contents of the document with the entire text from the web page.
   //*-- NOTE(review): the extractor is given the file name, not the filtered string,
   //*-- so it presumably reads the file again itself -- confirm against htmlparser docs
   logger.info("Extracting text from body of html file " + ifile);
   StringExtractor st = new StringExtractor(ifile);

   //*-- string extractor does not input form values -- handle separately
   String inputVal = extractInputValues(parser, htmlcontents);

   //*-- finally set the contents of the document
   doc.setContents( new StringBuffer(cleanHTML( st.extractStrings(false)) + " " + inputVal) );
   doc.setFileName(ifile);
  } //*-- end of try block
  catch (OutOfMemoryError exc)
  {
   //*-- only the message is logged here to keep allocation low while memory is exhausted
   logger.error("Ran out of memory for " + ifile + " or could be corrupt file " + exc.getMessage());
  }
  catch (ParserException exc)
  { logger.error("Parser error " + ifile + " or could be corrupt file " + exc.getMessage(), exc); }
  catch (IOException e)
  { logger.error("IO Error for file: " + ifile + " " + e.getMessage(), e); }
  catch (RuntimeException e)
  { logger.error("Could not extract text from the HTML document: may contain frames " + ifile + " " + e.getMessage(), e); }
 } //*-- end of getDocument

 //*-- set the document title from the first TITLE tag, if the page has one
 private void extractTitle(Parser parser, String htmlcontents, IndexableDoc doc) throws ParserException
 {
  parser.setInputHTML(htmlcontents);
  NodeList titleNodes = parser.parse(new TagNameFilter("TITLE"));
  if (titleNodes.elementAt(0) != null)
  {
   String title = titleNodes.elementAt(0).toPlainTextString();
   doc.setTitle( cleanHTML(title) );
  }
 }

 //*-- set the document metadata (keywords) and author from the META tags
 private void extractMetaTags(Parser parser, String htmlcontents, IndexableDoc doc) throws ParserException
 {
  parser.setInputHTML(htmlcontents);
  NodeList metaNodes = parser.parse(new TagNameFilter("META"));
  if (metaNodes == null) return;

  StringBuilder keywords = new StringBuilder();
  StringBuilder authors = new StringBuilder();
  for (int i = 0; i < metaNodes.size(); i++)
  {
   if (metaNodes.elementAt(i) == null) continue;

   //*-- rebuild the tag from the filtered text so that the name and content
   //*-- are read from the filtered attribute values
   String meta = StringTools.filterChars( metaNodes.elementAt(i).getText() );
   MetaTag mtag = new MetaTag();
   mtag.setText("<" + meta + ">");

   String tagName = mtag.getMetaTagName();
   if (tagName == null) continue;

   if (tagName.equalsIgnoreCase("keywords"))
   { appendValue(keywords, mtag.getMetaContent()); }
   else if (tagName.equalsIgnoreCase("authors") || tagName.equalsIgnoreCase("author"))
   { appendValue(authors, mtag.getMetaContent()); }
  } // end of for

  doc.setAuthor(authors.toString());
  doc.setMetadata(keywords.toString());
 }

 //*-- collect the value attribute of every INPUT tag, each preceded by a single space
 private String extractInputValues(Parser parser, String htmlcontents) throws ParserException
 {
  parser.setInputHTML(htmlcontents);
  NodeList inputNodes = parser.parse(new TagNameFilter("INPUT"));
  StringBuilder values = new StringBuilder();
  for (int i = 0; i < inputNodes.size(); i++)
  {
   InputTag itag = (InputTag) inputNodes.elementAt(i);
   if (itag == null) continue;
   String value = itag.getAttribute("value");
   if (value != null)
   { values.append(" ").append(value); }
  }
  return values.toString();
 }

 //*-- append a meta tag value to the target, separated from earlier values by a space;
 //*-- a missing (null) or empty value is skipped so that the text "null" is never stored
 private static void appendValue(StringBuilder target, String value)
 {
  if (value == null || value.length() == 0) return;
  if (target.length() > 0) target.append(' ');
  target.append(value);
 }

 //*-- remove extraneous characters from extracted text
 private String cleanHTML(String in)
 {
  String out = StringTools.removeAmpersandStrings(in);
  out = StringTools.filterChars(out);
  return out;
 }
}