Package org.sf.mustru.filters

Source Code of org.sf.mustru.filters.HtmlHandler

package org.sf.mustru.filters;

import java.io.File;
import java.io.IOException;

import org.apache.log4j.Logger;
import org.htmlparser.Parser;
import org.htmlparser.parserapplications.StringExtractor;
import org.htmlparser.tags.InputTag;
import org.htmlparser.tags.MetaTag;
import org.htmlparser.util.*;
import org.htmlparser.filters.*;

import org.sf.mustru.docs.IndexableDoc;
import org.sf.mustru.utils.*;

import com.aliasi.util.Files;

/**
* Extract text and metadata from a HTML file using htmlparser.
*
*/
public class HtmlHandler implements HandlerInterface
{

static Logger logger = Logger.getLogger(HtmlHandler.class.getName());

/**
  * Empty constructor
  */
public HtmlHandler() { super(); }

/**
  * Extract the body and title tags from a HTML file
  */
public void getDocument(String ifile, IndexableDoc doc)
{
  doc.setFileType("webpage");
  try
  {
   //*-- Read the file into a string
   String htmlcontents = Files.readFromFile(new File(ifile)); htmlcontents = StringTools.filterChars(htmlcontents);
   if (htmlcontents.length() == 0) return;
   Parser parser = new Parser();

   //*-- Extract the title text
   logger.info("Extracting title from HTML file " + ifile);
   parser.setInputHTML(htmlcontents);

   NodeList nodelist1 = parser.parse(new TagNameFilter ("TITLE"));
   if (nodelist1.elementAt(0) != null)
   { String title = nodelist1.elementAt(0).toPlainTextString();
   doc.setTitle( cleanHTML(title) );
   }

   //*-- Extract information from the meta tags
   logger.info("Extracting METADATA from html file " + ifile);
   parser.setInputHTML(htmlcontents);
   NodeList nodelist2 = parser.parse(new TagNameFilter("META") );
   if (nodelist2 != null)
   {
    String metadata = ""; String author = "";
    for (int i = 0; i < nodelist2.size(); i++)
    { if (nodelist2.elementAt(i) == null) continue;
    String meta = nodelist2.elementAt(i).getText(); meta = StringTools.filterChars(meta);
    MetaTag mtag = new MetaTag(); mtag.setText("<" + meta + ">");
    String tagName = mtag.getMetaTagName();
    if (tagName == null) continue;
    if (tagName.equalsIgnoreCase("keywords"))
    { metadata += mtag.getMetaContent(); }
    if (tagName.equalsIgnoreCase("authors") ||
      tagName.equalsIgnoreCase("author") )
    { author += mtag.getMetaContent(); }
    } // end of for
    doc.setAuthor(author); doc.setMetadata(metadata);
   } // eod of if

   //*-- Populate the contents of the contents with the entire text from the  web page
   logger.info("Extracting text from body of html file " + ifile);
   StringExtractor st = new StringExtractor(ifile);

   //*-- string extractor does not input form values -- handle separately
   parser.setInputHTML(htmlcontents); StringBuffer inputVal = new StringBuffer();
   NodeList nodelist3 = parser.parse(new TagNameFilter ("INPUT"));
   for (int i = 0; i < nodelist3.size(); i++)
   { InputTag itag = (InputTag) nodelist3.elementAt(i);
   if ((itag != null) && (itag.getAttribute("value") != null) )
   { inputVal.append(" "); inputVal.append( itag.getAttribute("value") ); }
   }

   //*-- finally set the contents of the document
   doc.setContents( new StringBuffer(cleanHTML( st.extractStrings(false)) + " " + inputVal) );
   doc.setFileName(ifile);

  } //*-- end of try block
  catch (OutOfMemoryError exc)
  { logger.error("Ran out of memory for " + ifile + " or could be corrupt file " + exc.getMessage()); }
  catch (ParserException exc)
  { logger.error("Parser error " + ifile + " or could be corrupt file " + exc.getMessage()); }
  catch (IOException e)
  { logger.error("IO Error for file: " + ifile + " " +  e.getMessage()); }
  catch (RuntimeException e)
  { logger.error("Could not extract text from the HTML document: may contain frames " + ifile + " " +  e.getMessage()); }

  return;
} //*-- end of getDocument

//*-- remove extraneous characters from extracted text
private String cleanHTML(String in)
{ String out = StringTools.removeAmpersandStrings(in);
out = StringTools.filterChars(out);
return(out);
}

}
TOP

Related Classes of org.sf.mustru.filters.HtmlHandler

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.