try
{
//*-- Read the file into a string
String htmlcontents = Files.readFromFile(new File(ifile)); htmlcontents = StringTools.filterChars(htmlcontents);
if (htmlcontents.length() == 0) return;
Parser parser = new Parser();
//*-- Extract the title text
logger.info("Extracting title from HTML file " + ifile);
parser.setInputHTML(htmlcontents);
NodeList nodelist1 = parser.parse(new TagNameFilter ("TITLE"));
if (nodelist1.elementAt(0) != null)
{ String title = nodelist1.elementAt(0).toPlainTextString();
doc.setTitle( cleanHTML(title) );
}
//*-- Extract information from the meta tags
logger.info("Extracting METADATA from html file " + ifile);
parser.setInputHTML(htmlcontents);
NodeList nodelist2 = parser.parse(new TagNameFilter("META") );
if (nodelist2 != null)
{
String metadata = ""; String author = "";
for (int i = 0; i < nodelist2.size(); i++)
{ if (nodelist2.elementAt(i) == null) continue;
String meta = nodelist2.elementAt(i).getText(); meta = StringTools.filterChars(meta);
MetaTag mtag = new MetaTag(); mtag.setText("<" + meta + ">");
String tagName = mtag.getMetaTagName();
if (tagName == null) continue;
if (tagName.equalsIgnoreCase("keywords"))
{ metadata += mtag.getMetaContent(); }
if (tagName.equalsIgnoreCase("authors") ||
tagName.equalsIgnoreCase("author") )
{ author += mtag.getMetaContent(); }
} // end of for
doc.setAuthor(author); doc.setMetadata(metadata);
} // end of if
//*-- Populate the contents with the entire text from the web page
logger.info("Extracting text from body of html file " + ifile);
StringExtractor st = new StringExtractor(ifile);
//*-- StringExtractor does not extract the values of INPUT form fields -- handle them separately
parser.setInputHTML(htmlcontents); StringBuffer inputVal = new StringBuffer();
NodeList nodelist3 = parser.parse(new TagNameFilter ("INPUT"));
for (int i = 0; i < nodelist3.size(); i++)
{ InputTag itag = (InputTag) nodelist3.elementAt(i);
if ((itag != null) && (itag.getAttribute("value") != null) )
{ inputVal.append(" "); inputVal.append( itag.getAttribute("value") ); }
}