/*
* This software and supporting documentation were developed by
*
* Siemens Corporate Technology
* Competence Center Knowledge Management and Business Transformation
* D-81730 Munich, Germany
*
* Authors (representing a really great team ;-) )
* Stefan B. Augustin, Thorbj�rn Hansen, Manfred Langen
*
* This software is Open Source under GNU General Public License (GPL).
* Read the text of this license in LICENSE.TXT
* or look at www.opensource.org/licenses/
*
* Once more we emphasize, that:
* THIS SOFTWARE IS MADE AVAILABLE, AS IS, WITHOUT ANY WARRANTY
* REGARDING THE SOFTWARE, ITS PERFORMANCE OR
* FITNESS FOR ANY PARTICULAR USE, FREEDOM FROM ANY COMPUTER DISEASES OR
* ITS CONFORMITY TO ANY SPECIFICATION. THE ENTIRE RISK AS TO QUALITY AND
* PERFORMANCE OF THE SOFTWARE IS WITH THE USER.
*
*/
// ************ package ****************************************************
package appl.Portal.Utils.LinkSearch;
// ************ imports ******************************************************
import appl.Portal.Utils.LinkSearch.Item;
// KFM
import KFM.Exceptions.ProgrammerException;
import KFM.log.*;
// OROMatcher packages
import com.oroinc.text.regex.*;
// java packages
import java.util.Hashtable;
import java.util.Vector;
/**
 * Extracts result items from an HTML document with three regular expressions.
 *
 * <ol>
 *   <li>The "frame" expression is matched once against the whole document;
 *       its capture group 1 is the region that is searched further.</li>
 *   <li>The "item set" expression is matched repeatedly inside that region;
 *       every match is the raw text of one result item.</li>
 *   <li>The "item" expression and the names array are handed, together with
 *       that raw text, to {@link Item}, which produces a Hashtable.</li>
 * </ol>
 *
 * The resulting Hashtables are stored internally and are retrieved one at a
 * time with {@link #getItem(int)}.
 */
public class HtmlParser
{
    /**
     * Vector that stores the result set locally.
     *
     * Remember: the whole result set is never returned by this class;
     * items are handed out one by one (as Hashtables) through getItem.
     */
    private Vector mResultSet = new Vector();

    /** Offset from which the next call to match( String, String, boolean, int ) starts. */
    private int mOffset;

    /** The HTML document to be parsed is contained in this string. */
    private String mHTMLFile;

    /** Debug flag. Not read anywhere inside this class. */
    boolean mDebug = false;

    /** Names passed on to every Item that is created (see class "Item.java"). */
    private String[] mNames;

    /** Expression matched once against the whole document; group 1 is parsed further. */
    private String mRegExpFrame;

    /** Expression matched repeatedly inside the frame; each match is one raw item. */
    private String mRegExpItemSet;

    /** Expression passed on to every Item that is created (see class "Item.java"). */
    private String mRegExpItem;

    /** Variable that holds the current item ( see class "Item.java" ). */
    private Item mItem;

    /** Constructor. */
    public HtmlParser ()
    {
    }

    public String getRegExpFrame()
    {
        return mRegExpFrame;
    }

    public void setRegExpFrame( String aRegExpFrame )
    {
        mRegExpFrame = aRegExpFrame;
    }

    public String getRegExpItemSet()
    {
        return mRegExpItemSet;
    }

    public void setRegExpItemSet( String aRegExpItemSet )
    {
        mRegExpItemSet = aRegExpItemSet;
    }

    public String getRegExpItem()
    {
        return mRegExpItem;
    }

    public void setRegExpItem( String aRegExpItem )
    {
        mRegExpItem = aRegExpItem;
    }

    public String[] getNames()
    {
        return mNames;
    }

    public void setNames ( String[] aNames )
    {
        mNames = aNames;
    }

    /**
     * Parses an HTML document and appends every non-empty item to the internal result set.
     *
     * The frame expression must define capture group 1, because only that group
     * is searched for item sets.
     * New since 2002-12-12: Access group(1) of the frame match instead of group(0),
     * which was a mistake.
     *
     * Note: the result set is not cleared here, so items found by several calls
     * accumulate in the same Vector.
     *
     * @param aHTMLFile the HTML document to parse.
     */
    public void parse ( String aHTMLFile )
    {
        mHTMLFile = aHTMLFile;
        mOffset = 0;

        MatchResult tFrameMatch = this.match( mRegExpFrame, mHTMLFile, true, mOffset );
        if( tFrameMatch != null )
        {
            KFMSystem.log.debug("Matched EXPFRAME");
            // Continue on the (shorter) frame content only, for performance.
            String tCutString = tFrameMatch.group(1);
            KFMSystem.log.debug("Extracted String is:");
            KFMSystem.log.debug(tCutString);

            // Item sets are searched from the beginning of the cut string.
            mOffset = 0;
            // The bound is only reached after stepping over an empty match at
            // the very end of the cut string (see below).
            while( mOffset <= tCutString.length() )
            {
                MatchResult tItemSetMatch =
                    this.match( mRegExpItemSet, tCutString, true, mOffset );
                if( tItemSetMatch == null )
                {
                    KFMSystem.log.debug("NO ITEMSET MATCH");
                    break;
                }

                // Resume after this match. endOffset( 0 ) is one greater than the
                // offset of the last matched character. An empty match would leave
                // the offset where it is and repeat forever, so step over one
                // character in that case.
                int tEnd = tItemSetMatch.endOffset( 0 );
                mOffset = ( tItemSetMatch.length() == 0 ) ? tEnd + 1 : tEnd;

                // group(0) contains the whole matched string.
                String tItemSetText = tItemSetMatch.group(0);
                KFMSystem.log.debug("Matched ITEMSET");
                KFMSystem.log.debug("Extracted String is:");
                KFMSystem.log.debug(tItemSetText);

                // See SearchEngineResultSet.java for explanation of
                // 'mRegExpItem' and 'mNames'.
                mItem = new Item( mRegExpItem, tItemSetText, mNames );

                // Note: 'mItem.getItem()' returns a Hashtable; empty ones are skipped.
                Hashtable tItem = mItem.getItem();
                if( !tItem.isEmpty() )
                {
                    mResultSet.addElement( tItem );
                }
            }
        }
        KFMSystem.log.debug("parsing done");
    }

    /**
     * @return the number of items currently held in the result set.
     */
    public int getNumberOfItems()
    {
        return mResultSet.size();
    }

    /**
     * Returns one result item.
     *
     * @param aIndex index into the result set, from 0 to getNumberOfItems() - 1.
     * @return the Hashtable stored at that index.
     */
    public Hashtable getItem( int aIndex )
    {
        return (Hashtable)mResultSet.elementAt(aIndex);
    }

    /**
     * Method which matches a pattern string with a content string.
     * Here the regular expression metacharacter '.' matches
     * everything, even new lines ('\n'), because the pattern is compiled
     * with Perl5Compiler.SINGLELINE_MASK.
     *
     * Note: the pattern is always compiled with Perl5Compiler.CASE_INSENSITIVE_MASK
     * as well; the caseSensitive argument is currently not evaluated.
     *
     * For more information about the implementation of regular expressions by OROMatcher see:
     * $/KFM/www-docs/protected/developer/external-docu/OROMatcher-1.0.7/doc/index.html
     *
     * @param patternString the Perl5 regular expression.
     * @param text the content to search.
     * @param caseSensitive not evaluated, see the note above.
     * @param Offset offset in text at which the search starts.
     * @return the first match found from Offset on, or null if there is none.
     * @throws ProgrammerException if patternString is not a valid pattern.
     */
    public MatchResult match (
        String patternString,
        String text,
        boolean caseSensitive,
        int Offset)
    {
        PatternMatcher matcher = new Perl5Matcher();
        PatternCompiler compiler = new Perl5Compiler();
        Pattern pattern = null;

        // When you set the Perl5Compiler.SINGLELINE_MASK option
        // the content string is treated as a single line, even if there
        // are some '\n' in it.
        try {
            pattern = compiler.compile(patternString,
                Perl5Compiler.SINGLELINE_MASK | Perl5Compiler.CASE_INSENSITIVE_MASK);
        } catch(MalformedPatternException e) {
            String tMessage =
                "LinkSearch.HtmlParser.match: Bad pattern: `" + e.getMessage() + "'.";
            System.err.println(tMessage);
            //@@@ Make this cleaner some day.
            throw new ProgrammerException(tMessage);
        }

        PatternMatcherInput input = new PatternMatcherInput(text);
        // Set the current offset to prevent the matcher from starting
        // again at the beginning of the string.
        input.setCurrentOffset( Offset );

        if(matcher.contains(input, pattern)) {
            return matcher.getMatch();
        }
        return null;
    }
}