*
* @param filename filename of webpage
* @return author, title, and summary of the web page specified by filename
*/
public WebPageMetaData getWebPageMetaData(String filename) {
WebPageMetaData tempWpmd = new WebPageMetaData();
tempWpmd.setFilename(filename);
BufferedReader reader = null;
PrintWriter writer = null;
try {
boolean inTag = false;
boolean foundSummary = false;
boolean inBody = false;
boolean inScript = false;
boolean inTitle = false;
StringBuffer tagBuf = new StringBuffer();
StringBuffer titleBuf = new StringBuffer();
StringBuffer summaryBuf = new StringBuffer();
// open reader and writer
File origFile = new File(filename);
reader = new BufferedReader(new InputStreamReader(new FileInputStream(origFile)));
writer = new PrintWriter(new FileWriter(ds.htmlTextFile));
StringBuffer nonTagTextf = new StringBuffer();
int curBodyNonTagTextNum = 0;
int sumMaxSize = 220;
int ch;
// step during html source
while ((ch = reader.read()) > -1) {
char curChar = (char) ch;
if (curChar == '>') {
inTag = false;
//
tagBuf.append(curChar);
String realTag = tagBuf.toString();
String lowerTag = realTag.toLowerCase();
if (lowerTag.startsWith(META_TAG)) {
String tempMetaName = Utils.getTagString("name=", lowerTag);
if (tempMetaName.startsWith("description")) {
String tempMetaContent = Utils.getTagString("content=", realTag);
if (! tempMetaContent.trim().equals("")) {
tempWpmd.setDescription(tempMetaContent);
foundSummary = true;
}
}
else if (tempMetaName.startsWith("summary")) {
String tempMetaContent = Utils.getTagString("content=", realTag);
if (! tempMetaContent.trim().equals("")) {
tempWpmd.setDescription(tempMetaContent);
foundSummary = true;
}
}
else if (tempMetaName.startsWith("author") || tempMetaName.indexOf("webmaster") != -1) {
String tempMetaContent = Utils.getTagString("content=", realTag);
tempWpmd.setAuthor(tempMetaContent);
}
}
else if (lowerTag.startsWith(SCRIPT_TAG)) {
if (!lowerTag.endsWith("/>")) {
inScript = true;
}
}
else if (lowerTag.startsWith(SCRIPT_TAG_END)) {
inScript = false;
}
else if (lowerTag.startsWith(BODY_TAG)) {
inBody = true;
}
else if (lowerTag.startsWith(BODY_TAG_END)) {
inBody = false;
}
else if (lowerTag.startsWith(TITLE_TAG)) {
inTitle = true;
}
else if (lowerTag.startsWith(TITLE_TAG_END)) {
inTitle = false;
tempWpmd.setTitle(titleBuf.toString());
}
// reset our buffers
tagBuf = new StringBuffer();
nonTagTextf = new StringBuffer();
}
else if (curChar == '<') {
inTag = true;
tagBuf = new StringBuffer();
String t = nonTagTextf.toString().trim();
int tSize = t.length();
if (tSize > 0) {
if (! inScript && inBody) {
writer.println(t);
}
}
nonTagTextf = new StringBuffer();
//
if (! foundSummary) {
//
if (inBody) {
curBodyNonTagTextNum += tSize;
summaryBuf.append(' ');
summaryBuf.append(t);
summaryBuf.append(' ');
if (curBodyNonTagTextNum >= sumMaxSize) {
tempWpmd.setDescription(Utils.concatStrToEnd(summaryBuf.toString(), sumMaxSize));
foundSummary = true;
}
}
}
//
} // end for the beginning of a tag
if (inTitle && curChar != '>' && ! inTag) {
titleBuf.append(curChar);
}
else if (! inTag && curChar != '>') {
nonTagTextf.append(curChar);
}
else if (inTag) {
tagBuf.append(curChar);
}
} // end for while reading
if (! foundSummary && curBodyNonTagTextNum > 0) {
tempWpmd.setDescription(summaryBuf.toString());
}
}
catch (IOException ioe) {
logger.error("getWebPageMetaData() failed", ioe);
ds.setStatus(I18n.getString("error") + " : " + ioe.toString());