/*
* This software and supporting documentation were developed by
*
* Siemens Corporate Technology
* Competence Center Knowledge Management and Business Transformation
* D-81730 Munich, Germany
*
* Authors (representing a really great team ;-) )
* Stefan B. Augustin, Thorbjörn Hansen, Manfred Langen
*
* This software is Open Source under GNU General Public License (GPL).
* Read the text of this license in LICENSE.TXT
* or look at www.opensource.org/licenses/
*
* Once more we emphasize, that:
* THIS SOFTWARE IS MADE AVAILABLE, AS IS, WITHOUT ANY WARRANTY
* REGARDING THE SOFTWARE, ITS PERFORMANCE OR
* FITNESS FOR ANY PARTICULAR USE, FREEDOM FROM ANY COMPUTER DISEASES OR
* ITS CONFORMITY TO ANY SPECIFICATION. THE ENTIRE RISK AS TO QUALITY AND
* PERFORMANCE OF THE SOFTWARE IS WITH THE USER.
*
*/
// ************ package ****************************************************
package appl.Portal.Utils.LinkSearch;
// ************ imports ******************************************************
import java.io.*;
import java.util.*;
import java.net.*;
import sun.net.www.http.HttpClient;
import appl.Portal.Utils.LinkSearch.ResultItem;
import appl.Portal.Utils.LinkSearch.ResultSet;
//import appl.Portal.Utils.XML.XMLwithDOMBuilder;
//import appl.Portal.Utils.XML.HTMLwithXSLBuilder;
//import appl.Portal.Utils.LinkSearch.SearchEngineResultSet;
import appl.Portal.Utils.LinkSearch.Item;
import appl.Portal.Utils.LinkSearch.HtmlParser;
import KFM.HTML.HtmlLoader2;
import KFM.log.*;
/**
* This class is the superclass of all Searchengine specific subclasses (e. g. AltaVista)
* and delivers a ResultSet.
*
* The method buildSearchUrl() is declared here, which has to be
* implemented by all subclasses in their own way
*
* See interface "ResultSet.java" for full Documentation of the methods
* defined in the interface implemented by this class
*/
public abstract class SearchEngineResultSet implements ResultSet
{
// ************************************************************
// Constants
// ************************************************************
// ************************************************************
// Variables
// ************************************************************
/** The proxy host to be used. Defaults to "proxy" and is replaced by the
* value handed to the two-argument constructor.
* We need it, as all Internet content can only be accessed via a proxy.
* startSearch() copies it into the "proxyHost" system property when its proxy flag is set.
*
* @see #mProxyPort
*/
private String mProxyHost = "proxy";
/** The proxy port to be used. Defaults to "80" and is replaced by the
* value handed to the two-argument constructor.
* We need it, as all Internet content can only be accessed via a proxy.
* startSearch() copies it into the "proxyPort" system property when its proxy flag is set.
*
* @see #mProxyHost
*/
private String mProxyPort = "80";
/** The search engines usually have the ability to split up the results
* in ranges (0-10, 11-20, 21-30, ...).
*
* mRangeDelta: how many results are to be retrieved from the search engine
* at one time. startSearch() advances its range start by this amount for every page.
*/
private int mRangeDelta = 10;
/** Indicates the maximum number of items which shall be retrieved from a search engine.
* It bounds the paging loop of startSearch(); with the default of 0 no page is requested
* until setMaxAmount() has been called.
* Note: Yahoo never delivers more than 200.
*/
private int mMaxAmount = 0;
/** Date associated with this result set; only accessed through getDate() and setDate(). */
private Date mDateOfResultSet;
/** Stores the name of the search engine. */
protected String mOrigin;
/** Stores the search term. startSearch() sets it to the first of the given search words. */
protected String mSearchTerm;
/** The search URL without the range part; startSearch() appends buildIndexPart() to it.
* NOTE(review): presumably filled by the subclass' buildSearchUrl() - confirm.
*/
protected String mSearchUrlString;
/** Some links found in the searched web page are relative to the search URL.
* This base is prepended by SearchEngineResultItem.setURL() to links that start with "/".
* The field is static, i.e. shared by all result sets.
*/
protected static String mAbsoluteBase;
/** Some links found in the searched web page are relative to the search URL.
* This base is prepended by SearchEngineResultItem.setURL() to links that do not start with "/".
* The field is static, i.e. shared by all result sets.
*/
protected static String mRelativeBase;
/** Stores the content of the most recently loaded result page. */
private String mContent = null;
/** Fetches the result-html-page of a search engine query.
*
* For detailed information see the documentation at
* file:///O|/KFM/www-docs/protected/developer/Tutorial/UtilityKlassen.htm#HTML_Loader
*
*/
private HtmlLoader2 mHtmlLoader = new HtmlLoader2();
/** Cursor into the parsed items: the index handed to the HtmlParser by
* nextItem() and previousItem().
*/
private int mCnt = 0;
/** Stores a regular expression.
*
* Input: content of a result-html-page.
*
* This string stores a regular expression which matches the dynamic content
* of a result-html-page of a search engine query.
* startSearch() hands it to the HtmlParser via setRegExpFrame().
* Usage:
* Normally, a 'RegExpFrame' pattern would look like this:
*
* </font><dl><dt><b>[0-9]+. </b>(.*?)</dl><font face=Arial size=-1>
*
* Now, please take a look at an example of a string which shall be matched:
*
* </font><dl><dt><b>1. </b>Here come the resultitems e. g. from 1 to 10</dl><font face=Arial size=-1>
*
* The regular expression will match the string.
*
* The whole match is always called group(0). group(0) is of type string
* and contains everything: the matching conditions
* ("</font><dl><dt><b>1. </b>" and "</dl><font face=Arial size=-1>")
* and the characters matched by a '(.*?)'.
*
* The parenthesized '(.*?)' delivers a group bigger than 0, also of type string, e. g. group(1),
* which contains the matched string without(!) the matching conditions.
*
* In the example above: "Here come the resultitems e. g. from 1 to 10".
*
* group(0) is passed on to RegExpItemSet.
*
* For information about OROMatcher and regular expressions see the documentation at
* $/KFM/www-docs/protected/developer/appl/Portal/MetaSearch/SearchEngineWrapper.html
*/
protected String mRegExpFrame;
/** Stores a regular expression.
*
* Input: group(0) from 'RegExpFrame'
*
* This string stores a regular expression which matches one item of the dynamic content
* of a search engine query.
* startSearch() hands it to the HtmlParser via setRegExpItemSet().
* Usage:
* Normally, a 'RegExpItemSet' pattern would look like this:
*
* <b>[0-9]+. </b>(.*?)<font color="#808080">(.*?)</font><br>
*
* Now, please take a look at an example of a string which shall be matched:
*
* <b>1. </b><a href="http://www.vetmed.uni-muenchen.de/">
* <font color="#808080">www.vetmed.uni-muenchen.de/</font><br>
*
*
* The regular expression will match the string.
*
* The whole match is always called group(0). group(0) is of type string
* and contains everything: the matching conditions
* ("<b>1. </b>" and "<font color="#808080">" and "</font><br>")
* and the characters matched by a '(.*?)'.
*
* The parenthesized '(.*?)' delivers a group bigger than 0, also of type string, e. g. group(1),
* which contains the matched string without(!) the matching conditions.
*
* In the example above: "<a href="http://www.vetmed.uni-muenchen.de/">" and
* "www.vetmed.uni-muenchen.de/".
*
*
* group(0) is passed on to RegExpItem.
*
*
* For information about OROMatcher and regular expressions see the documentation at
* $/KFM/www-docs/protected/developer/appl/Portal/MetaSearch/SearchEngineWrapper.html
*/
protected String mRegExpItemSet;
/** Stores a regular expression.
*
* Input: group(0) from 'RegExpItemSet'.
*
* This string stores a regular expression which matches the attributes of one item of the dynamic content
* of a search engine query.
* startSearch() hands it to the HtmlParser via setRegExpItem().
*
* Usage:
* Normally, a 'RegExpItem' pattern would look like this:
*
* <b>[0-9]+\. </b><a href="(.+?)">\s*<b>(.+?)</b></a>\s*</dt>\s*<dd>(.+?)</dd>\s*<br><b>URL:</b>
* <font color="#808080">(.+?)</font>\s*<br><font color="#808080">(.+?)</font><br>\s*
*
* Now, please take a look at an example of a string which shall be matched:
*
* <b>2. </b><a href="http://www.lodging-germany.com/munchen/hotels.htm">
* <b>Hotels Germany. Hôtels Allemagne. München. Hotel Munchen, hôtels Munich.</b></a></dt>
* <dd>Munchen, Münich a large selection of hotels in the inner city in all price categories....</dd>
* <br><b>URL:</b>\s*<font color="#808080">www.lodging-germany.com/munchen/hotels.htm</font>
* <br><font color="#808080">Last modified on: 17-Feb-2000 - 9K bytes - in English</font><br>
*
*
* The regular expression will match the string.
*
* The whole match is always called group(0). group(0) is of type string
* and contains everything: the matching conditions
* ("<b>2. </b>" and "<font color="#808080">" and "</font><br>" etc.)
* and the characters matched by the parenthesized parts.
*
* Each parenthesized part delivers a group bigger than 0, also of type string, e. g. group(1),
* which contains the matched string without(!) the matching conditions.
*
* In the example above:
*
* group(0) = contains whole example string
* group(1) = "Url" = "http://www.lodging-germany.com/munchen/hotels.htm"
* group(2) = "Description" = "Hotels Germany. Hôtels Allemagne. München. Hotel Munchen, hôtels Munich."
* group(3) = "Summary" = "Munchen, Münich a large selection of hotels in the inner city in all price categories...."
* group(4) = "HighlightedUrl" = "www.lodging-germany.com/munchen/hotels.htm"
* group(5) = other details = "Last modified on: 17-Feb-2000 - 9K bytes - in English"
*
* For information about OROMatcher and regular expressions see the documentation at
* $/KFM/www-docs/protected/developer/appl/Portal/MetaSearch/SearchEngineWrapper.html
*/
protected String mRegExpItem;
/** Names under which the attributes of one item are stored in a Hashtable.
* startSearch() hands the array to the HtmlParser via setNames().
*
* For information on groups and Hashtable referencing see the documentation at
* $/KFM/www-docs/protected/developer/appl/Portal/MetaSearch/SearchEngineWrapper.html
*
*
* As "mRegExpItem" describes one item of a result set and delivers its item attributes
* as subgroups (e. g. group(1) - group(5)), we could store these
* attributes in a string array or a Vector.
* But for better readability of the code, we store the attributes in a
* Hashtable and reference the entries of the Hashtable by the names given in mNames;
* example:
* mNames may contain { "Url", "Description" }
* we have the following subgroups: group(1) and group(2)
* So, after instantiating a Hashtable we can
* reference group(1) with "Url"
* reference group(2) with "Description"
*
* nextItem() and previousItem() look up the names "Url", "Description", "Title", "Author",
* "DocLanguage", "DocSize", "ModifiedDate" and "Score".
*/
protected String[] mNames;
/** Parser that holds the items extracted from the result pages loaded by startSearch().
 *
 * Kept per instance: as a static field a single parser was shared by every result set,
 * so starting a search on one result set replaced the items of all others.
 * Stays null until startSearch() has successfully built a search URL.
 */
private HtmlParser mHtmlParser;
// ************************************************************
// Methods
// ************************************************************
/** Creates a result set that keeps the default proxy settings (host "proxy", port "80"). */
public SearchEngineResultSet ()
{
    super();
}
/** Creates a result set that uses the given proxy for loading the result pages.
 *
 * @param aProxyHost Host name of the proxy.
 * @param aProxyPort Port of the proxy, as a string.
 */
public SearchEngineResultSet (String aProxyHost, String aProxyPort)
{
    super();
    this.mProxyHost = aProxyHost;
    this.mProxyPort = aProxyPort;
}
/** Concatenates the different parts of the Url:
* the search engine's address + specific parameters (language, etc).
*
* startSearch() calls this method with the search words it was given and only
* loads result pages when true is returned; otherwise it just logs that the Url
* could not be built.
* NOTE(review): startSearch() afterwards reads the Url through getSearchUrlString(),
* so implementations presumably have to store it in mSearchUrlString - confirm.
*
* @param someSearchWords The words to search for.
* @return true if the search Url could be built, false otherwise.
*/
abstract boolean buildSearchUrl(String[] someSearchWords);
/** Forms the index part of the Url for the given range start.
*
* Has to be search engine specific because of the different
* types of parameters needed here.
* startSearch() appends the returned string to the search Url; it passes range
* starts beginning at 1 and growing by the range delta for every further page.
*
* @param aRangeStart Start of range.
* @return The Url part that selects the range starting at aRangeStart.
*/
abstract String buildIndexPart(int aRangeStart);
/** Tells whether nextItem() can deliver another item.
 *
 * @return true if the cursor has not yet reached the number of parsed items;
 *         false otherwise, and also when no search has created a parser yet
 *         (instead of failing with a NullPointerException).
 */
public boolean hasMoreItems()
{
    if(mHtmlParser == null) {
        // No search has been run, so there is nothing to iterate over.
        return false;
    }
    return mCnt < mHtmlParser.getNumberOfItems();
}
/** Returns the item in front of the cursor and moves the cursor one step back.
 *
 * The cursor (mCnt) always points at the item nextItem() would deliver next, so the
 * previous item is the one at index mCnt - 1. As with java.util.ListIterator, calling
 * nextItem() and then previousItem() delivers the same item twice.
 *
 * @return The previous item, or null if the cursor is at the start, if no search has
 *         been run yet, or if the parser delivers no data for that index.
 */
public ResultItem previousItem ()
{
    if(mHtmlParser == null) {
        return null;
    }
    // The cursor may be stale (e.g. left over from an earlier, longer result list);
    // never read behind the last parsed item.
    int tNumberOfItems = mHtmlParser.getNumberOfItems();
    if(mCnt > tNumberOfItems) {
        mCnt = tNumberOfItems;
    }
    if(mCnt <= 0) {
        return null;
    }
    // Step back BEFORE reading: reading at mCnt and decrementing afterwards would
    // deliver the wrong item and read behind the end after a complete iteration.
    --mCnt;
    return toResultItem(mHtmlParser.getItem(mCnt), "previousItem");
}
/** Returns the item at the cursor and moves the cursor one step forward.
 *
 * @return The next item, or null if all items have been delivered, if no search has
 *         been run yet, or if the parser delivers no data for that index.
 */
public ResultItem nextItem()
{
    if(mHtmlParser == null || mCnt >= mHtmlParser.getNumberOfItems()) {
        return null;
    }
    // get Item at index from HtmlParser
    Hashtable tHash = mHtmlParser.getItem(mCnt);
    // increment counter
    ++mCnt;
    return toResultItem(tHash, "nextItem");
}
/** Converts an item stored in a Hashtable into a "true" ResultItem.
 *
 * Attributes missing from the Hashtable are left unset. Values that cannot be
 * converted to the type expected by the item are logged and skipped, so one bad
 * attribute no longer makes the whole item fail with a ClassCastException.
 *
 * @param aHash   Attributes of one item, keyed by name; may be null.
 * @param aCaller Name of the calling method, used in log messages only.
 * @return The filled item, or null if aHash is null.
 */
private SearchEngineResultItem toResultItem(Hashtable aHash, String aCaller)
{
    if(aHash == null) {
        return null;
    }
    SearchEngineResultItem tItem = new SearchEngineResultItem();
    Object tUrl = aHash.get("Url");
    if(tUrl != null) {
        try {
            tItem.setURL(tUrl.toString());
        } catch(MalformedURLException e) {
            KFMSystem.log.error("SearchEngineResultSet:: " + aCaller + ": Got a MalformedURLException", e);
        }
    }
    // set description, which will be "Summary" in the DOM Document
    Object tDescription = aHash.get("Description");
    if(tDescription != null) {
        tItem.setDescription(tDescription.toString());
    }
    Object tTitle = aHash.get("Title");
    if(tTitle != null) {
        tItem.setTitle(tTitle.toString());
    }
    Object tAuthor = aHash.get("Author");
    if(tAuthor != null) {
        tItem.setAuthor(tAuthor.toString());
    }
    Object tLanguage = aHash.get("DocLanguage");
    if(tLanguage != null) {
        tItem.setLanguage(tLanguage.toString());
    }
    Long tSize = toLongAttribute(aHash.get("DocSize"), "DocSize");
    if(tSize != null) {
        tItem.setSize(tSize);
    }
    Object tModified = aHash.get("ModifiedDate");
    if(tModified instanceof Date) {
        tItem.setLastModified((Date) tModified);
    } else if(tModified != null) {
        // The textual date format is search engine specific and not known here.
        KFMSystem.log.info("SearchEngineResultSet:: " + aCaller
            + ": Ignoring attribute ModifiedDate, not a Date: " + tModified);
    }
    Double tScore = toDoubleAttribute(aHash.get("Score"), "Score");
    if(tScore != null) {
        tItem.setHitScore(tScore);
    }
    return tItem;
}
/** Converts an attribute value to a Long; returns null if it is absent or not numeric. */
private Long toLongAttribute(Object aValue, String aName)
{
    if(aValue == null) {
        return null;
    }
    if(aValue instanceof Long) {
        return (Long) aValue;
    }
    try {
        return Long.valueOf(aValue.toString().trim());
    } catch(NumberFormatException e) {
        KFMSystem.log.info("SearchEngineResultSet:: Ignoring attribute " + aName
            + ", not a whole number: " + aValue);
        return null;
    }
}
/** Converts an attribute value to a Double; returns null if it is absent or not numeric. */
private Double toDoubleAttribute(Object aValue, String aName)
{
    if(aValue == null) {
        return null;
    }
    if(aValue instanceof Double) {
        return (Double) aValue;
    }
    try {
        return Double.valueOf(aValue.toString().trim());
    } catch(NumberFormatException e) {
        KFMSystem.log.info("SearchEngineResultSet:: Ignoring attribute " + aName
            + ", not a number: " + aValue);
        return null;
    }
}
/** Returns the origin of this result set, i.e. the name of the search engine (e.g. AltaVista).
 *
 * @return The origin, or null if none has been set.
 */
public String getOrigin ()
{
    return mOrigin;
}
/** Stores the maximum number of items to be retrieved.
 *
 * @param aAmount Upper bound used by the paging loop of startSearch().
 */
public void setMaxAmount (int aAmount)
{
    this.mMaxAmount = aAmount;
}
/** Stores how many results are requested from the search engine per page.
 *
 * @param rangeDelta Amount by which startSearch() advances its range start for
 *                   every page; expected to be positive.
 */
public void setRangeDelta(int rangeDelta)
{
    this.mRangeDelta = rangeDelta;
}
/** Stores the origin of this result set, i.e. the name of the search engine (e.g. AltaVista).
 *
 * @param aOrigin Name of the search engine.
 */
public void setOrigin(String aOrigin)
{
    this.mOrigin = aOrigin;
}
/** Returns the date stored for this result set.
 *
 * @return The date handed to setDate(), or null if none has been set.
 */
public Date getDate()
{
    return mDateOfResultSet;
}
/** Stores the date of this result set.
 *
 * @param aDateOfResultSet The date to be returned by getDate().
 */
public void setDate(Date aDateOfResultSet)
{
    this.mDateOfResultSet = aDateOfResultSet;
}
/** Returns the search term; startSearch() sets it to the first of its search words. */
public String getSearchTerm()
{
    return this.mSearchTerm;
}
/** Stores the search term.
 *
 * @param aSearchTerm The term to be returned by getSearchTerm().
 */
public void setSearchTerm(String aSearchTerm)
{
    this.mSearchTerm = aSearchTerm;
}
/** Returns the search Url to which startSearch() appends the index part of each page. */
public String getSearchUrlString()
{
    return this.mSearchUrlString;
}
/** Stores the search Url (without the index part).
 *
 * @param aSearchUrlString The Url to be returned by getSearchUrlString().
 */
public void setSearchUrlString(String aSearchUrlString)
{
    this.mSearchUrlString = aSearchUrlString;
}
/** Runs the search: builds the search Url, loads the result pages and parses them.
 *
 * The first search word is stored as search term. If buildSearchUrl() succeeds, a fresh
 * HtmlParser is configured with the search engine specific expressions, the item cursor
 * is reset and one page is loaded for every range start 1, 1 + delta, 1 + 2 * delta, ...
 * up to and including the maximum amount. Pages that cannot be loaded are logged and
 * skipped; they are neither parsed nor replaced by the content of an earlier page.
 *
 * @param someSearchWords The words to search for; if null or empty, nothing is searched.
 * @param proxyFlag       If true, the proxy host and port of this result set are written
 *                        to the system properties "proxyHost" and "proxyPort" before loading.
 */
public void startSearch (String[] someSearchWords, boolean proxyFlag)
{
    if(someSearchWords == null || someSearchWords.length == 0) {
        KFMSystem.log.info("SearchEngineResultSet:: startSearch: No search words given.");
        return;
    }
    // store searchTerm
    this.setSearchTerm(someSearchWords[0]);
    if(!this.buildSearchUrl(someSearchWords)) {
        // @@@
        KFMSystem.log.info("Url could not be built at SearchEngineResultSet.buildSearchUrl()");
        return;
    }
    // create a new instance of the mHtmlParser with
    // the searchenginespecific parameters
    mHtmlParser = new HtmlParser();
    mHtmlParser.setRegExpFrame(mRegExpFrame);
    mHtmlParser.setRegExpItemSet(mRegExpItemSet);
    mHtmlParser.setRegExpItem(mRegExpItem);
    mHtmlParser.setNames(mNames);
    // The new parser starts empty, so a cursor left over from an earlier search is invalid.
    mCnt = 0;
    if(mRangeDelta <= 0) {
        // The loop below would never advance.
        KFMSystem.log.info("SearchEngineResultSet:: startSearch: Range delta must be positive but is "
            + mRangeDelta + ".");
        return;
    }
    // Range starts are 1-based, so the page starting at mMaxAmount still has to be
    // loaded ("<=") to reach the item number mMaxAmount.
    for(int i = 1; i <= this.mMaxAmount; i += mRangeDelta) {
        String tSearchUrlString = this.getSearchUrlString() + this.buildIndexPart(i);
        // Forget the previous page so that a failed load is not parsed as a duplicate of it.
        mContent = null;
        // * Load page (with proxy)
        try {
            // ** Set proxy
            if(proxyFlag) {
                System.getProperties().put("proxyHost", mProxyHost);
                // proxyport
                System.getProperties().put("proxyPort", mProxyPort);
                // Reset the properties otherwise the HTMLLoader won't load any longer.
                HttpClient.resetProperties();
            }
            KFMSystem.log.detail("SearchEngineReusltSet::StartSearch: "
                + "Loading URL " + tSearchUrlString);
            boolean tSuccessfulLoaded = mHtmlLoader.get(new URL(tSearchUrlString), 0);
            // Is page really loaded?
            if(tSuccessfulLoaded) {
                // Then return content as a string.
                // You could also take mHtmlLoader.getBody()
                // which would deliver all between <body> and </body>.
                mContent = mHtmlLoader.getContent();
            } else {
                // @@@
                KFMSystem.log.info("SearchEngineReusltSet::StartSearch: "
                    + "Load error with searchstring: " + tSearchUrlString + ".");
            }
        } catch(IOException e) {
            /// @@@
            KFMSystem.log.error("SearchEngineResultSet:: startSearch: Got an IOException", e);
        }
        // Parse document, but only if this page was really loaded.
        if(mContent != null) {
            mHtmlParser.parse(mContent);
        }
    }
}
/** This inner class stores the attributes ("Url", "Description", "Date") of a ResultItem.
*
* I don't want to document it all, because I think you can quite guess what its all about.
*
* Just keep in mind that a Description may turn out to be a summary
* in the XML document etc.
*
* See interface "ResultItem.java" for full Documentation
*/
static public class SearchEngineResultItem extends ResultItemAdapter
{
private URL mUrl;
private String mDescription;
private String mTitle;
private String mAuthor;
private String mLanguage;
private Long mFileSize;
private Hashtable mCategories[];
private Double mHitScore;
private Date mLastModified;
public String getURL() { return mUrl.toString(); }
// public void setURL(URL aURL) { mUrl = aURL; }
public void setURL(String aURL)
throws MalformedURLException
{
try {
mUrl = new URL(aURL);
} catch(MalformedURLException e) {
// Something went wrong with the Url. Let's try it with adding a base:
try {
if(aURL.startsWith("/")) {
aURL = mAbsoluteBase + aURL;
} else {
aURL = mRelativeBase + aURL;
}
// KFMSystem.log.info("Changed Url to " + aURL);
mUrl = new URL(aURL);
} catch (MalformedURLException ex) {
KFMSystem.log.error("SearchEngineResultSet:: setURL: Got a MalformedURLException", e);
}
}
}
public String getTitle() { return mTitle; }
public void setTitle(String aTitle) { mTitle = aTitle; }
public String getAuthor() { return mAuthor; }
public void setAuthor(String aAuthor) { mAuthor = aAuthor; }
public Long getSize() { return mFileSize; }
public void setSize(Long aFileSize) { mFileSize = aFileSize; }
public Date getLastModified() { return mLastModified; }
public void setLastModified(Date aLastModifiedDate) { mLastModified = aLastModifiedDate; }
public Hashtable[] getCategories() { return mCategories; }
public void setCategories(Hashtable[] aCategories) { mCategories = aCategories; }
public void setDescription(String newDescription) { mDescription = newDescription; }
public String getDescription() { return mDescription; }
public Double getHitScore() { return mHitScore; }
public void setHitScore(Double aHitScore) { mHitScore = aHitScore; }
public String getLanguage() { return mLanguage; }
public void setLanguage(String aLanguage) { mLanguage = aLanguage; }
}
}