/*************************************************************************
*
* $RCSfile: XmlIndexBuilder.java,v $
*
* $Revision: 1.1 $
*
* last change: $Author: abi $ $Date: 2000/11/30 18:03:48 $
*
* The Contents of this file are made available subject to the terms of
* either of the following licenses
*
* - GNU Lesser General Public License Version 2.1
* - Sun Industry Standards Source License Version 1.1
*
* Sun Microsystems Inc., October, 2000
*
* GNU Lesser General Public License Version 2.1
* =============================================
* Copyright 2000 by Sun Microsystems, Inc.
* 901 San Antonio Road, Palo Alto, CA 94303, USA
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License version 2.1, as published by the Free Software Foundation.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston,
* MA 02111-1307 USA
*
*
* Sun Industry Standards Source License Version 1.1
* =================================================
* The contents of this file are subject to the Sun Industry Standards
* Source License Version 1.1 (the "License"); You may not use this file
* except in compliance with the License. You may obtain a copy of the
* License at http://www.openoffice.org/license.html.
*
* Software provided under this License is provided on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING,
* WITHOUT LIMITATION, WARRANTIES THAT THE SOFTWARE IS FREE OF DEFECTS,
* MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE, OR NON-INFRINGING.
* See the License for the specific provisions governing your rights and
* obligations concerning the Software.
*
* The Initial Developer of the Original Code is: Sun Microsystems, Inc.
*
* Copyright: 2000 by Sun Microsystems, Inc.
*
* All Rights Reserved.
*
* Contributor(s): _______________________________________
*
*
************************************************************************/
package com.sun.xmlsearch.xml.indexer;
import java.io.*;
import java.util.Hashtable;
import java.util.Vector;
import java.util.Enumeration;
import java.net.URL;
import org.xml.sax.InputSource;
import org.xml.sax.HandlerBase;
import com.sun.xml.parser.Resolver;
import com.sun.xml.tree.XmlDocument;
import com.sun.xmlsearch.tree.*;
import com.sun.xmlsearch.util.*;
import com.sun.xmlsearch.db.*;
import com.sun.xmlsearch.xml.XmlIndex;
import com.sun.xml.parser.Parser;
import com.sun.xml.parser.ValidatingParser;
import com.jclark.xsl.om.*;
import com.jclark.xsl.sax.*;
import com.jclark.xsl.tr.Result;
import com.jclark.xsl.tr.OutputMethod;
import com.jclark.xsl.tr.LoadContext;
import com.jclark.xsl.dom.Transform;
import com.jclark.xsl.dom.TransformEngine;
import com.jclark.xsl.dom.TransformException;
import com.jclark.xsl.dom.XSLTransformEngine;
public final class XmlIndexBuilder {
/**
 * Transform engine whose document loading is delegated to the enclosing
 * builder's parseTargetDocument, with progress traced on standard output.
 */
final class MyXslEngine extends XSLTransformEngine {
    /**
     * Loads the document at the given URL by parsing it through the
     * enclosing builder. Any failure is rethrown wrapped in an XSLException.
     */
    public Node load(URL url,
                     int documentIndex,
                     LoadContext context,
                     NameTable nameTable) throws XSLException {
        final String location = url.toString();
        System.out.println("loading for indexing " + location);
        try {
            System.out.println("parsing");
            final Node parsedRoot = parseTargetDocument(url);
            System.out.println("parsed");
            return parsedRoot;
        }
        catch (Exception failure) {
            throw new XSLException(failure);
        }
    }
} // end of MyXslEngine
// Base URL prepended to "<stylesheet>.xsl" by getTransform when no local
// transform location is in use.
// Earlier values, kept for reference:
// private final String Http = "http://localhost:8089/";
// GTM: this needs to be parameterized but for now,
// GTM: replace 'file:///home/jacek/docs/'
//private final String Http = "file:///home/jacek/docs/";
private final String Http =
"http://localhost:8084/";
// locations array data: _locations is a growable buffer of capacity _size,
// of which the first _free slots are in use (doubled when full)
private static int InitSize = 4096;
private int _size = InitSize;
private int _free = 0;
private ConceptLocation[] _locations = new ConceptLocation[_size];
// the index being built
private XmlIndex _index;
// interned id of the document currently open; 0 means "nothing open"
private int _currentDocID = 0;
// NOTE(review): _indexers is not referenced elsewhere in the visible code
private Hashtable _indexers = new Hashtable();
// stop words are the keys; the stored value is irrelevant
private Hashtable _stoplist = new Hashtable();
// link name -> Integer code, and the link names in the order they were added
private Hashtable _linkCodes = new Hashtable();
private Vector _linknames = new Vector();
// indexing state
// counter associated with link-code allocation (name is spelled "Curren");
// NOTE(review): it is static while _linkCodes/_linknames are per instance
private static int CurrenMaxLinkCode = 0;
// next context number available for assignment
private int _availContextNumber;
// running word position within the current document (stop words count too)
private int _lastWordNumber;
// word position at which the text run being indexed started; -1 between runs
private int _firstWord;
// whether any location has been stored since the flag was last cleared
private boolean _anyLocationsStored = false;
private final class IndexAdapter extends ResultAdapter {
// namespace in which all of the index:* names below are created
private static final String IndexNS = "http://sun.com/2000/XMLSearch";
// class name of the tokenizer used when none has been selected
private static final String DefTok =
"com.sun.xmlsearch.util.SimpleTokenizer";
// fixed capacity of the three stacks below
// NOTE(review): pushes are not bounds-checked against this size
private static final int StackSize = 64;
// names of indexing elements and attributes
private final Name _indexText_Name;
private final Name _indexElement_Name;
private final Name _indexAttribute_Name;
private final Name _nodeID_Name;
private final Name _tokenizer_Name;
private final Name _attributeName_Name;
// nodes whose text produced at least one stored location, in indexing order
private Vector _textNodes = new Vector(512);
// cache of tokenizer instances keyed by class name
private Hashtable _tokenizers = new Hashtable();
private Tokenizer _defaultTokenizer;
// node -> Integer context number, filled in by finish()
private Hashtable _numberedNodes = new Hashtable(1024*4);
// stack pushed on index:element start and popped on its end; _sp is the
// top index (-1 when empty)
private boolean[] _indexOnOffStack = new boolean[StackSize];
private int _sp;
// stack of tokenizers selected via the index:tokenizer attribute; _tsp is
// the top index (-1 when empty)
private Tokenizer[] _tokenizerStack = new Tokenizer[StackSize];
private int _tsp;
// stack of "element<attribute<value" strings for open index:attribute
// elements; _attrSP is the top index (-1 when empty)
private String[] _attributeStack = new String[StackSize];
private int _attrSP;
// source node most recently selected via the index:nodeID attribute
private Node _currentNode;
/**
 * Creates the adapter, registering the index:* element and attribute names
 * in the given name table and instantiating the default tokenizer.
 *
 * @param nameTable table in which the indexing names are created
 */
public IndexAdapter(NameTable nameTable) {
    // element names
    this._indexText_Name = nameTable.createName("index:text", IndexNS);
    this._indexElement_Name = nameTable.createName("index:element", IndexNS);
    this._indexAttribute_Name = nameTable.createName("index:attribute", IndexNS);
    // attribute names
    this._nodeID_Name = nameTable.createName("index:nodeID", IndexNS);
    this._tokenizer_Name = nameTable.createName("index:tokenizer", IndexNS);
    this._attributeName_Name =
        nameTable.createName("index:attributeName", IndexNS);
    // fallback tokenizer used when no tokenizer attribute is in effect
    this._defaultTokenizer = getTokenizer(DefTok);
}
/**
 * Resets the per-document state before a transform run: empties the three
 * stacks, restarts word and context numbering, discards buffered locations
 * and clears the table of initial words.
 */
public void init() throws XSLException {
    // empty stacks
    _attrSP = -1;
    _tsp = -1;
    _sp = -1;
    // numbering and location buffer start over
    _free = 0;
    _lastWordNumber = 0;
    _availContextNumber = 0;
    _anyLocationsStored = false;
    // all the contexts' tables
    _initialWords.clear();
}
/**
 * Builds the context tables (_dests, _seqNumbers, _links) for the text
 * nodes collected during the transform.
 *
 * Entry k of _dests/_seqNumbers for k &lt; nTextNodes describes the k-th
 * collected text node; the entries after that describe element ancestors in
 * the order they were first encountered. _links has one entry per element
 * ancestor only. The root element's _dests entry is finally pointed at the
 * first unused context number.
 */
public void finish() throws XSLException {
// start from empty tables (_initialWords was filled while indexing text)
_numberedNodes.clear();
_dests.clear();
_seqNumbers.clear();
_links.clear();
final int nTextNodes = _textNodes.size();
// context numbers below nTextNodes belong to the text nodes themselves,
// so element contexts are numbered from nTextNodes upwards
_availContextNumber = nTextNodes;
// vector to hold parents of text nodes
Vector parents = new Vector(nTextNodes * 2);
/*****
for each of the text nodes its sequence number is stored
as well as the index of its parent (in _dests)
_link is not stored as it is always "text()"
_availContextNumber only used to number parent element contexts
******/
for (int i = 0; i < nTextNodes; i++) {
final Node node = (Node)_textNodes.elementAt(i);
final Node parent = node.getParent();
// find this text node's seq number: 1 + number of TEXT siblings before it
// NOTE(review): the loop has no null check; it relies on node being
// reached while iterating its parent's children
final SafeNodeIterator siblings = parent.getChildren();
Node sibling;
int counter = 1;
while ((sibling = siblings.next()) != node) {
if (sibling.getType() == Node.TEXT)
++counter;
}
_seqNumbers.add(counter);
// check whether parent already encountered
Object number = _numberedNodes.get(parent);
if (number == null) { // not yet seen
final int newContext = _availContextNumber++;
_numberedNodes.put(parent, new Integer(newContext));
_dests.add(newContext);
// enqueue parent: its parent will need a number too
parents.addElement(parent);
// System.out.println(parent.getName().toString() +
// " -> " + newContext);
}
else {
_dests.add(((Integer)number).intValue());
}
}// end for
_textNodes.setSize(0);
// store info about element ancestry of the above text nodes
// grandparents are added to the end of the vector
// (parents grows while it is being traversed, hence the size() re-check)
int rootElementPos = 0;
for (int i = 0; i < parents.size(); i++) {
final Node node = (Node)parents.elementAt(i);
final Name name = node.getName();
final Node parent = node.getParent();
_links.add(getLinkCode(name.toString()));
if (parent.getType() == Node.ELEMENT) { // not ROOT
// find sequence number: 1 + number of preceding siblings with the same
// name (names are compared by identity)
final SafeNodeIterator siblings = parent.getChildren();
Node sibling;
int counter = 1;
while ((sibling = siblings.next()) != node) {
if (sibling.getName() == name)
++counter;
}
_seqNumbers.add(counter);
// check whether parent already known
Object number = _numberedNodes.get(parent);
if (number == null) {
final int newContext = _availContextNumber++;
_numberedNodes.put(parent, new Integer(newContext));
_dests.add(newContext);
// enqueue parent: its parent will need a number too
parents.addElement(parent);
//System.out.println(parent.getName().toString() +
// " -> " + newContext);
}
else {
_dests.add(((Integer)number).intValue());
}
}
else {
// this element's parent is not an element, i.e. it is the root element
_dests.add(0); // placeholder
_seqNumbers.add(1);
// its slot in _dests: the nTextNodes text entries come first, followed by
// one entry per element in 'parents'
rootElementPos = i + nTextNodes;
// System.out.println("rootElementPos = " + i);
}
} // end for
// index to sentinel: the root element points at the first unused context
// NOTE(review): when no text nodes were collected _dests is empty and
// rootElementPos is 0 -- confirm IntegerArray.set tolerates that
_dests.set(rootElementPos, _availContextNumber);
/******
Disabled consistency check and statistics dump, kept for reference:
_dests.add(-1);
final int card = _dests.cardinality();
boolean failed = false;
for (int k = 0; k < card && !failed; k++) {
int counter = 0;
for (int context = _dests.at(k);
context != -1;
context = _dests.at(context))
if (++counter > 2*card) {
System.err.println("test failed at " + k);
failed = true;
break;
}
}
if (failed) {
System.err.println("nTextNodes = " + nTextNodes);
for (int k = 0; k < card; k++) {
System.err.println(k+":"+_dests.at(k));
}
System.exit(1);
}
_dests.pop();
System.err.println("nTextNodes = " + nTextNodes);
System.out.println("|_initialWords| " +
_initialWords.cardinality());
System.out.println("|_dests| " + _dests.cardinality());
System.out.println("|_seqNumbers| " +
_seqNumbers.cardinality());
System.out.println("|_links| " + _links.cardinality());
******/
} // end public void finish
/**
 * Receives character data from the transform result. The text is indexed
 * only while the top of the on/off stack is true, using the tokenizer on
 * top of the tokenizer stack or, if that stack is empty, the default one.
 *
 * @param str the character data
 * @throws XSLException wrapping any failure raised while indexing
 */
public void characters(String str) throws XSLException {
    final boolean indexingActive = _sp >= 0 && _indexOnOffStack[_sp];
    if (!indexingActive) {
        return;
    }
    try {
        final Tokenizer chosen;
        if (_tsp == -1) {
            chosen = _defaultTokenizer;
        }
        else {
            chosen = _tokenizerStack[_tsp];
        }
        indexText(str, chosen);
    }
    catch (Exception failure) {
        throw new XSLException(failure);
    }
}
/**
 * Handles the start of a result element. index:element switches indexing
 * on and pops one tokenizer (if any); index:attribute opens a new slot on
 * the attribute stack; index:text and all other elements are ignored.
 */
public void startElement(Name elementType, NamespacePrefixMap nsMap)
    throws XSLException {
    if (elementType == _indexElement_Name) {
        _sp += 1;
        _indexOnOffStack[_sp] = true;
        // pop Tokenizer stack
        // following attribute can push selected Tokenizer
        if (_tsp != -1) {
            _tsp -= 1;
        }
        return;
    }
    if (elementType == _indexText_Name) {
        // nothing to do for index:text
        return;
    }
    if (elementType == _indexAttribute_Name) {
        _attrSP += 1;
    }
}
/**
 * Handles an attribute of a result element.
 * index:nodeID selects the current source node by generated id;
 * index:tokenizer pushes the named tokenizer;
 * index:attributeName looks the named attribute up on the current node,
 * records "element&lt;attribute&lt;value" on the attribute stack and stores a
 * "+&lt;..." location for it.
 *
 * @throws XSLException wrapping any failure during the attribute lookup
 *         or while storing the location
 */
public void attribute(Name name, String value) throws XSLException {
    if (name == _nodeID_Name) {
        _currentNode = (Node)_nodes.get(value);
        return;
    }
    if (name == _tokenizer_Name) {
        _tsp += 1;
        _tokenizerStack[_tsp] = getTokenizer(value);
        return;
    }
    if (name != _attributeName_Name) {
        return;
    }
    try {
        final Name expandedName = _currentNode.getNamespacePrefixMap()
            .expandAttributeName(value, _currentNode);
        final String attributeValue =
            _currentNode.getAttributeValue(expandedName);
        final String entry = _currentNode.getName().toString()
            + '<' + value + '<' + attributeValue;
        _attributeStack[_attrSP] = entry;
        storeLocation("+<" + entry);
    }
    catch (Exception failure) {
        throw new XSLException(failure);
    }
}
/**
 * Handles the end of a result element. index:element pops the on/off
 * stack; index:attribute pops the attribute stack and stores a "-&lt;..."
 * location for the popped entry; everything else is ignored.
 *
 * @throws XSLException wrapping any failure while storing the location
 */
public void endElement(Name elementType) throws XSLException {
    if (elementType == _indexElement_Name) {
        _sp -= 1;
        return;
    }
    if (elementType == _indexText_Name) {
        // nothing to reset for index:text
        return;
    }
    if (elementType == _indexAttribute_Name) {
        try {
            final String closedEntry = _attributeStack[_attrSP--];
            storeLocation("-<" + closedEntry);
        }
        catch (Exception failure) {
            throw new XSLException(failure);
        }
    }
}
/**
 * Appends a location for the given token to the shared locations buffer,
 * doubling the buffer first when it is full. The position is used as both
 * position arguments of the ConceptLocation.
 *
 * @param token  text to intern as the concept
 * @param number word position of the token
 */
private void storeLocation(String token, int number) throws Exception {
    if (_free == _size) {
        _size *= 2;
        final ConceptLocation[] grown = new ConceptLocation[_size];
        System.arraycopy(_locations, 0, grown, 0, _free);
        _locations = grown;
    }
    // the slot is claimed before the token is interned
    final ConceptLocation[] buffer = _locations;
    final int slot = _free++;
    buffer[slot] = new ConceptLocation(intern(token), number, number);
}
/**
 * Echoes the token on standard output and stores it at the next word
 * position, advancing the word counter.
 */
private void storeLocation(String token) throws Exception {
    System.out.println(token);
    final int position = _lastWordNumber++;
    storeLocation(token, position);
}
/**
 * Tokenizes a run of text and stores a location for every lower-cased
 * token that is not in the stop list; stop words only advance the word
 * counter. If anything was stored, the run's first word position and the
 * current node are recorded for finish().
 *
 * @param text      the text to index
 * @param tokenizer tokenizer used to split the text
 */
private void indexText(String text, Tokenizer tokenizer)
    throws Exception {
    tokenizer.setText(text);
    _firstWord = _lastWordNumber;
    _anyLocationsStored = false;
    for (Token current = tokenizer.nextToken();
         current != null;
         current = tokenizer.nextToken()) {
        final String word = current.toLowerCaseString();
        if (_stoplist.containsKey(word)) {
            // stop word: occupies a position but is not stored
            _lastWordNumber++;
        }
        else {
            storeLocation(word);
            _anyLocationsStored = true;
        }
    }
    if (_anyLocationsStored && _firstWord > -1) {
        _initialWords.add(_firstWord);
        _textNodes.addElement(_currentNode);
    }
    // reset before next batch
    _firstWord = -1;
}
/**
 * Returns the tokenizer for the given class name, instantiating and
 * caching it on first use.
 *
 * The new instance is cast to Tokenizer before it is cached. Caching first
 * would leave an instance of a non-Tokenizer class in the cache, and the
 * next lookup of that name would then throw an uncaught ClassCastException.
 *
 * @param className fully qualified name of a Tokenizer implementation
 * @return the cached or newly created tokenizer; the default tokenizer
 *         (which may still be null while it is itself being created) if
 *         the class cannot be loaded, instantiated, or is not a Tokenizer
 */
private Tokenizer getTokenizer(String className) {
    final Object cached = _tokenizers.get(className);
    if (cached != null) {
        return (Tokenizer)cached;
    }
    try {
        // cast first so that only genuine tokenizers ever reach the cache
        final Tokenizer created =
            (Tokenizer)Class.forName(className).newInstance();
        _tokenizers.put(className, created);
        return created;
    }
    catch (Exception e) {
        System.err.println(e);
        return _defaultTokenizer;
    }
}
} // end of IndexAdapter
// mapping from generated node ids to nodes themselves
private Hashtable _nodes = new Hashtable(4096);
// per-document context tables, filled while indexing and in
// IndexAdapter.finish(), then compressed in closeDocument():
// first word position of each text run that stored at least one location
private IntegerArray _initialWords = new IntegerArray(512);
// link code (element name) of each element context
private IntegerArray _links = new IntegerArray(512);
// context number of each context's parent
private IntegerArray _dests = new IntegerArray(512);
// sequence number of each context among its siblings
private IntegerArray _seqNumbers = new IntegerArray(512);
// NOTE(review): _defaultIndexer is never assigned in the visible code (its
// only assignment is commented out in init)
private ElementIndexer _defaultIndexer;
// optional translator applied to document names in openDocument
private PrefixTranslator _prefixTransl;
// GTM new:
// local directory holding the stylesheets; used only when
// _useLocalTransformFile is true
private String _transformLocation = null;
// by default use web server to get transform files...
private boolean _useLocalTransformFile = false;
// XSLT machinery, created in initXmlProcessor
private TreeBuilder _treeBuilder;
private MyXslEngine _transformEngine;
private Transform _indexingTransform;
private Transform _defaultTransform;
private IndexAdapter _indexAdapter;
public XmlIndexBuilder(String indexDir) throws Exception {
_index = new XmlIndex(indexDir, true);
}
/**
 * Prepares the builder for indexing: initializes the index, reads the stop
 * list from the file "etc/StopList" under the directory named by the
 * XMLSEARCH system property, registers the link names already known to the
 * index and sets up the XSLT machinery.
 *
 * @param transform name of the indexing stylesheet (without ".xsl")
 * @return false if the index could not be initialized, true otherwise
 */
public boolean init(String transform) throws Exception {
    if (!_index.init()) {
        return false;
    }
    final File etcDir = new File(System.getProperty("XMLSEARCH"), "etc");
    readStoplist(new File(etcDir, "StopList"));
    // _defaultIndexer = new DefaultElementIndexer(this);
    reset();
    // initialize vector and hashtable
    final String[] knownLinks = _index.getLinkNames();
    if (knownLinks != null) {
        int position = 0;
        while (position < knownLinks.length) {
            getLinkCode(knownLinks[position]);
            position++;
        }
    }
    initXmlProcessor(transform);
    return true;
}
/**
 * Parses the document at the given URL, runs the indexing transform over
 * it and stores the resulting locations and context tables in the index.
 *
 * Compared with a straight-line implementation this method
 * (1) always closes the stream opened on the document,
 * (2) records the nodes under every top-level child of the document root,
 *     so a comment or processing instruction preceding the root element
 *     does not prevent the element tree from being recorded, and
 * (3) on failure abandons the half-indexed document and clears the node
 *     table, so the builder can go on to index other documents instead of
 *     reporting "document already open".
 *
 * @param docURL location of the XML document
 * @param title  title stored for the document
 * @throws Exception if loading, transforming or storing fails
 */
public void indexDocument(URL docURL, String title) throws Exception {
    final InputStream docStream = docURL.openStream();
    // true only between a successful openDocument and closeDocument
    boolean documentOpen = false;
    try {
        InputSource source = new InputSource(docStream);
        source.setSystemId(docURL.toString());
        Parser sourceParser = new ValidatingParser();
        sourceParser.setFastStandalone(true);
        XMLProcessorEx sourceLoader = new XMLProcessorImpl(sourceParser);
        Node root = sourceLoader.load(source, 0,
                                      _transformEngine.getSourceLoadContext(),
                                      _transformEngine.getNameTable());
        // build association from generated node ids to nodes; recordNodes
        // ignores top-level children that are neither elements nor text
        final SafeNodeIterator topLevel = root.getChildren();
        Node child;
        while ((child = topLevel.next()) != null) {
            recordNodes(child);
        }
        openDocument(docURL.toString());
        documentOpen = true;
        _indexAdapter.init();
        _indexingTransform.transform(root, _indexAdapter);
        _indexAdapter.finish();
        closeDocument(title);
        documentOpen = false;
    }
    finally {
        if (documentOpen) {
            // failed part-way: drop buffered locations, state: nothing open
            _free = 0;
            _currentDocID = 0;
        }
        _nodes.clear();
        try {
            docStream.close();
        }
        catch (IOException ignored) {
            // best effort: a failed close must not mask the real outcome
        }
    }
}
/**
 * Registers the given node in the id-to-node table under its generated id.
 * For an element its attributes are registered as well and its children
 * are processed recursively; text nodes are registered as leaves; nodes of
 * any other type are ignored.
 */
private void recordNodes(Node node) {
    final int type = node.getType();
    if (type == Node.TEXT) {
        _nodes.put(node.getGeneratedId(), node);
        return;
    }
    if (type != Node.ELEMENT) {
        return;
    }
    _nodes.put(node.getGeneratedId(), node);
    final SafeNodeIterator attributes = node.getAttributes();
    for (Node attr = attributes.next(); attr != null; attr = attributes.next()) {
        _nodes.put(attr.getGeneratedId(), attr);
    }
    final SafeNodeIterator children = node.getChildren();
    for (Node child = children.next(); child != null; child = children.next()) {
        recordNodes(child);
    }
}
/**
 * Creates the transform engine, the tree builder, the "default" transform,
 * the indexing transform and the index adapter, in that order. The tree
 * builder and the adapter share the engine's name table.
 *
 * @param transform name of the indexing stylesheet (without ".xsl")
 */
private final void initXmlProcessor(String transform) throws Exception {
    final MyXslEngine engine = new MyXslEngine();
    _transformEngine = engine;
    final NameTable sharedNames = engine.getNameTable();
    _treeBuilder = new TreeBuilder(sharedNames);
    // getTransform relies on _transformEngine having been set above
    _defaultTransform = getTransform("default");
    _indexingTransform = getTransform(transform);
    _indexAdapter = new IndexAdapter(sharedNames);
}
/**
 * Parses the document at the given URL with the tree builder and returns
 * the root it produces.
 */
private Node parseTargetDocument(URL docUrl) throws Exception {
    final Node root = _treeBuilder.getRoot(docUrl);
    return root;
}
/**
 * Loads the stylesheet "&lt;stylesheetName&gt;.xsl" and compiles it into a
 * Transform. The stylesheet is read from the local transform location when
 * one is in use, otherwise from the built-in HTTP base URL. The stream
 * opened on the stylesheet is always closed before returning.
 *
 * @param stylesheetName stylesheet name without the ".xsl" extension
 * @return the compiled transform
 * @throws Exception if the stylesheet cannot be read, parsed or compiled
 */
private Transform getTransform(String stylesheetName) throws Exception {
    System.out.println("creating indexing transform: " + stylesheetName);
    final URL stylesheetUrl;
    if (_useLocalTransformFile) {
        stylesheetUrl = new URL("file:" +
                                _transformLocation +
                                "/" + stylesheetName + ".xsl");
    }
    else {
        stylesheetUrl = new URL(Http + stylesheetName + ".xsl");
    }
    System.out.println(stylesheetUrl.toString());
    final InputStream stylesheetStream = stylesheetUrl.openStream();
    try {
        XmlDocument sheet = XmlDocument.createXmlDocument(stylesheetStream,
                                                          false);
        return _transformEngine.createTransform(sheet);
    }
    finally {
        // the stylesheet is fully parsed (or parsing failed) at this point
        try {
            stylesheetStream.close();
        }
        catch (IOException ignored) {
            // best effort: a failed close must not mask the real outcome
        }
    }
}
/**
 * Selects where stylesheets are loaded from (GTM addition).
 * If the given path exists it becomes the local transform location and
 * stylesheets are read from it; otherwise, including when the argument is
 * null, the local location is cleared and the HTTP base URL is used.
 *
 * @param filelocation path of the directory holding the stylesheets;
 *        may be null
 */
public void setTransformLocation(String filelocation){
    _transformLocation = null;
    _useLocalTransformFile = false;
    if (filelocation == null) {
        // new File(null) would throw; null simply means "no local location"
        return;
    }
    final File testfile = new File(filelocation);
    if (testfile.exists()) {
        _transformLocation = filelocation;
        _useLocalTransformFile = true;
    }
}
/**
 * Installs the translator that openDocument applies to document names
 * before interning them; null disables translation.
 */
public void setPrefixTranslator(PrefixTranslator translator) {
    this._prefixTransl = translator;
}
/**
 * Brings the index up to date: prunes the entries listed in toRemove and
 * toRefresh, then indexes every document named by a key of toRefresh and
 * of toAdd. Keys are document URLs given as strings; each document is
 * indexed with the title "xml".
 */
public void updateIndex(Hashtable toRemove, Hashtable toRefresh,
                        Hashtable toAdd) throws Exception {
    // first prune microindexes to be removed or replaced
    _index.pruneIndex(toRemove, toRefresh);
    // reindex docs to be refreshed
    for (Enumeration stale = toRefresh.keys(); stale.hasMoreElements(); ) {
        final String location = (String)stale.nextElement();
        System.out.println("= " + location);
        indexDocument(new URL(location), "xml");
    }
    // index new documents
    for (Enumeration fresh = toAdd.keys(); fresh.hasMoreElements(); ) {
        final String location = (String)fresh.nextElement();
        System.out.println("+ " + location);
        indexDocument(new URL(location), "xml");
    }
}
/**
 * Clears the underlying index.
 *
 * @throws IOException if the index reports an I/O failure
 */
public void clearIndex() throws IOException {
    this._index.clear();
}
/**
 * Interns the given name in the index and returns the id the index
 * assigns to it.
 */
private int intern(String name) throws Exception {
    final int id = _index.intern(name);
    return id;
}
/**
 * Opens a document for indexing: interns its name (after prefix
 * translation, if a translator is installed) as the current document id
 * and resets the context gathering state.
 *
 * @param name name of the document
 * @throws Exception if a document is already open
 */
public void openDocument(String name) throws Exception {
    if (_currentDocID != 0) {
        throw new Exception("document already open");
    }
    final String storedName;
    if (_prefixTransl == null) {
        storedName = name;
    }
    else {
        storedName = _prefixTransl.translatePrefix(name);
    }
    _currentDocID = intern(storedName);
    reset(); // reset context gathering state
}
/**
 * Closes the currently open document. If any locations were buffered, the
 * four context tables are compressed, the value each compressor returns is
 * itself compressed into a leading table, the compressed parts are
 * concatenated behind it and everything is handed to the index together
 * with the buffered locations. Afterwards no document is open.
 *
 * @param title title stored for the document
 * @throws Exception if no document is open
 */
public void closeDocument(String title) throws Exception {
    if (_currentDocID == 0) {
        throw new Exception("no document open");
    }
    if (_free <= 0) {
        System.out.println("no indexable content");
    }
    else {
        final IntegerArray partKeys = new IntegerArray();
        final Compressor wordsPart = new Compressor();
        final Compressor destsPart = new Compressor();
        final Compressor linksPart = new Compressor();
        final Compressor seqPart = new Compressor();
        partKeys.add(wordsPart.compressAscending(_initialWords));
        partKeys.add(destsPart.minimize(_dests, 2));
        partKeys.add(linksPart.minimize(_links, 2));
        partKeys.add(seqPart.minimize(_seqNumbers, 2));
        // header first, then the four parts in the same order as their keys
        final Compressor packed = new Compressor();
        final int k0 = packed.minimize(partKeys, 4);
        packed.concatenate(wordsPart);
        packed.concatenate(destsPart);
        packed.concatenate(linksPart);
        packed.concatenate(seqPart);
        _index.compress(_currentDocID,
                        intern(title),
                        _locations,
                        _free,
                        null, // extents
                        0,    // extent count
                        k0,
                        packed);
    }
    _free = 0;
    _currentDocID = 0; // state: nothing open
}
/**
 * Loads the stop list: every line of the given file, trimmed, becomes a
 * key of the stop-list table. An I/O failure is reported on standard error
 * and leaves the table with whatever was read so far. The file stream is
 * always closed.
 *
 * @param file the stop-list file, one word per line
 */
private void readStoplist(File file) {
    FileInputStream stream = null;
    try {
        stream = new FileInputStream(file);
        final LineInput in = new LineInput(stream);
        String line;
        while ((line = in.readLine()) != null) {
            // value doesn't matter...
            _stoplist.put(line.trim(), this);
        }
    }
    catch (IOException e) {
        e.printStackTrace();
    }
    finally {
        if (stream != null) {
            try {
                stream.close();
            }
            catch (IOException ignored) {
                // best effort: nothing useful can be done if close fails
            }
        }
    }
}
/**
 * Saves the link names, in the order they were added, to the index, closes
 * the index and prints "done".
 */
public void close() throws Exception {
    // store link names
    final String[] names = new String[_linknames.size()];
    _linknames.copyInto(names);
    final Object[] linkNames = names;
    _index.saveLinkNames(linkNames);
    _index.close();
    System.out.println("done");
}
/**
 * Resets the context gathering state: empties the four context tables,
 * discards buffered locations and restarts word and context numbering.
 */
private void reset() {
    // all the contexts' tables
    _seqNumbers.clear();
    _links.clear();
    _dests.clear();
    _initialWords.clear();
    // counters and flags
    _anyLocationsStored = false;
    _free = 0;
    _lastWordNumber = 0;
    _availContextNumber = 0;
}
/**
 * Assigns the next word position to the token and stores a location for
 * it unless the token is in the stop list. Stop words still consume a
 * position.
 */
private void storeToken(String token) throws Exception {
    final int position = _lastWordNumber++; // counting all tokens now
    if (_stoplist.containsKey(token)) {
        return; // found in stoplist
    }
    storeLocation(token, position);
}
/**
 * Appends a location for the given text to the locations buffer, doubling
 * the buffer first when it is full, and flags that a location has been
 * stored. The token number is used as both position arguments of the
 * ConceptLocation.
 *
 * @param text    text to intern as the concept
 * @param tokenNo word position of the text
 */
private void storeLocation(String text, int tokenNo) throws Exception {
    if (_free == _size) {
        _size *= 2;
        final ConceptLocation[] grown = new ConceptLocation[_size];
        System.arraycopy(_locations, 0, grown, 0, _free);
        _locations = grown;
    }
    // the slot is claimed before the text is interned
    final ConceptLocation[] buffer = _locations;
    final int slot = _free++;
    buffer[slot] = new ConceptLocation(intern(text), tokenNo, tokenNo);
    _anyLocationsStored = true;
}
/**
 * Returns the code of the given link name, assigning the next free code
 * the first time a name is seen.
 *
 * A new code is the name's position in this builder's list of link names.
 * Codes therefore always match the order in which the names are saved,
 * including when more than one builder has been used in the same JVM.
 *
 * @param linkName name of the link (element name)
 * @return the code associated with the name
 */
private int getLinkCode(String linkName) {
    final Integer known = (Integer)_linkCodes.get(linkName);
    if (known != null) {
        return known.intValue();
    }
    // position the name is about to occupy in _linknames
    final int newCode = _linknames.size();
    _linknames.addElement(linkName);
    _linkCodes.put(linkName, new Integer(newCode));
    return newCode;
}
/**
 * Appends one edge to the context tables: its destination, its sequence
 * number and its relation (link) code, one value per table.
 */
private void storeEdge(int relation, int seqNumber, int destination) {
    _dests.add(destination);
    _seqNumbers.add(seqNumber);
    _links.add(relation);
}
}