// Source code of com.sun.xmlsearch.xml.indexer.XmlIndexBuilder

/*************************************************************************
 *
 *  $RCSfile: XmlIndexBuilder.java,v $
 *
 *  $Revision: 1.1 $
 *
 *  last change: $Author: abi $ $Date: 2000/11/30 18:03:48 $
 *
 *  The Contents of this file are made available subject to the terms of
 *  either of the following licenses
 *
 *         - GNU Lesser General Public License Version 2.1
 *         - Sun Industry Standards Source License Version 1.1
 *
 *  Sun Microsystems Inc., October, 2000
 *
 *  GNU Lesser General Public License Version 2.1
 *  =============================================
 *  Copyright 2000 by Sun Microsystems, Inc.
 *  901 San Antonio Road, Palo Alto, CA 94303, USA
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License version 2.1, as published by the Free Software Foundation.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public
 *  License along with this library; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston,
 *  MA  02111-1307  USA
 *
 *
 *  Sun Industry Standards Source License Version 1.1
 *  =================================================
 *  The contents of this file are subject to the Sun Industry Standards
 *  Source License Version 1.1 (the "License"); You may not use this file
 *  except in compliance with the License. You may obtain a copy of the
 *  License at http://www.openoffice.org/license.html.
 *
 *  Software provided under this License is provided on an "AS IS" basis,
 *  WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING,
 *  WITHOUT LIMITATION, WARRANTIES THAT THE SOFTWARE IS FREE OF DEFECTS,
 *  MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE, OR NON-INFRINGING.
 *  See the License for the specific provisions governing your rights and
 *  obligations concerning the Software.
 *
 *  The Initial Developer of the Original Code is: Sun Microsystems, Inc.
 *
 *  Copyright: 2000 by Sun Microsystems, Inc.
 *
 *  All Rights Reserved.
 *
 *  Contributor(s): _______________________________________
 *
 *
 ************************************************************************/


package com.sun.xmlsearch.xml.indexer;


import java.io.*;
import java.util.Hashtable;
import java.util.Vector;
import java.util.Enumeration;
import java.net.URL;
import org.xml.sax.InputSource;
import org.xml.sax.HandlerBase;
import com.sun.xml.parser.Resolver;
import com.sun.xml.tree.XmlDocument;
import com.sun.xmlsearch.tree.*;
import com.sun.xmlsearch.util.*;
import com.sun.xmlsearch.db.*;
import com.sun.xmlsearch.xml.XmlIndex;


import com.sun.xml.parser.Parser;
import com.sun.xml.parser.ValidatingParser;


import com.jclark.xsl.om.*;
import com.jclark.xsl.sax.*;
import com.jclark.xsl.tr.Result;
import com.jclark.xsl.tr.OutputMethod;
import com.jclark.xsl.tr.LoadContext;


import com.jclark.xsl.dom.Transform;
import com.jclark.xsl.dom.TransformEngine;
import com.jclark.xsl.dom.TransformException;
import com.jclark.xsl.dom.XSLTransformEngine;


public final class XmlIndexBuilder {
    final class MyXslEngine extends XSLTransformEngine {
  public Node load(URL url,
       int documentIndex,
       LoadContext context,
       NameTable nameTable) throws XSLException {
      System.out.println("loading for indexing " + url.toString());
      try {
    System.out.println("parsing");
    final Node root = parseTargetDocument(url);
    System.out.println("parsed");
    return root;
      }
      catch (Exception e) {
    throw new XSLException(e);
      }
  }
    } // end of MyXslEngine
    
    // Base URL that getTransform() prepends to "<stylesheetName>.xsl" when
    // no local transform location has been set.
    // GTM: this needs to be parameterized; earlier hard-coded values were
    // GTM: "http://localhost:8089/" and "file:///home/jacek/docs/".
    //  private final String Http = "http://localhost:8089/";
    //private final String Http = "file:///home/jacek/docs/";
    private final String Http = 
  "http://localhost:8084/";


    // locations array data
    // InitSize: initial capacity of _locations
    private static int InitSize = 4096;
    // current capacity of _locations (doubled when full)
    private int _size = InitSize;
    // number of entries currently stored in _locations
    private int _free = 0;
    private ConceptLocation[] _locations = new ConceptLocation[_size];
  
    // the index being built; created in the constructor
    private XmlIndex _index;
  
    // interned id of the open document; 0 means no document is open
    private int _currentDocID = 0;


    // NOTE(review): _indexers is not referenced anywhere in the visible code
    private Hashtable _indexers = new Hashtable();
    // stop words, filled by readStoplist(); only key presence is tested
    private Hashtable _stoplist = new Hashtable();
  
    // link name -> Integer code
    private Hashtable _linkCodes = new Hashtable();
    // link names in registration order; saved by close()
    private Vector _linknames = new Vector();
  
    // indexing state
    // static link-code counter maintained by getLinkCode();
    // NOTE(review): shared by every builder instance although the
    // link tables above are per-instance
    private static int CurrenMaxLinkCode = 0;
    // next context number to hand out while numbering nodes
    private int _availContextNumber;
    // running word position within the current document
    private int _lastWordNumber;
    // word position at the start of the text run being indexed; -1 otherwise
    private int _firstWord;
    private boolean _anyLocationsStored = false;
    
    private final class IndexAdapter extends ResultAdapter {
  // namespace URI used when creating the index:* names below
  private static final String IndexNS = "http://sun.com/2000/XMLSearch";
  // class name of the tokenizer used when none has been selected
  private static final String DefTok  = 
      "com.sun.xmlsearch.util.SimpleTokenizer";
  // fixed capacity of the three stacks below (no overflow check is made)
  private static final int StackSize = 64;
    
  // names of indexing elements and attributes
  private final Name _indexText_Name;
  private final Name _indexElement_Name;
  private final Name _indexAttribute_Name;
  private final Name _nodeID_Name;
  private final Name _tokenizer_Name;
  private final Name _attributeName_Name;


  // nodes for which indexText() stored at least one location
  private Vector    _textNodes  = new Vector(512);
  // tokenizer class name -> Tokenizer instance cache
  private Hashtable _tokenizers = new Hashtable();
  private Tokenizer _defaultTokenizer;
  // node -> Integer context number, filled by finish()
  private Hashtable _numberedNodes = new Hashtable(1024*4);
    
  // one entry pushed per open index:element; _sp is the top index (-1 = empty)
  private boolean[]  _indexOnOffStack = new boolean[StackSize];
  private int       _sp;
  // tokenizers selected via index:tokenizer; _tsp is the top index (-1 = empty)
  private Tokenizer[] _tokenizerStack = new Tokenizer[StackSize];
  private int       _tsp;


  // "elementName<attrName<attrValue" strings for open index:attribute elements
  private String[] _attributeStack = new String[StackSize];
  private int       _attrSP;


  // node selected by the most recent index:nodeID attribute
  private Node _currentNode;
    
  /**
   * Creates the names of the indexing elements and attributes in the
   * IndexNS namespace and loads the default tokenizer.
   *
   * NOTE(review): getTokenizer() falls back to _defaultTokenizer on
   * failure, which is still null at this point, so _defaultTokenizer
   * stays null if the DefTok class cannot be instantiated.
   *
   * @param nameTable table used to create the Name objects
   */
  public IndexAdapter(NameTable nameTable) {
      _indexText_Name = nameTable.createName("index:text", IndexNS);
      _indexElement_Name = nameTable.createName("index:element", IndexNS);
      _indexAttribute_Name =nameTable.createName("index:attribute",IndexNS);
      _nodeID_Name = nameTable.createName("index:nodeID", IndexNS);
      _tokenizer_Name = nameTable.createName("index:tokenizer", IndexNS);
      _attributeName_Name = nameTable.createName("index:attributeName", 
                   IndexNS);
      _defaultTokenizer = getTokenizer(DefTok);
  }


  public void init() throws XSLException {
      _availContextNumber = 0;
      _lastWordNumber = 0;
      _anyLocationsStored = false;
      // all the contexts' tables
      _initialWords.clear();
      _sp     = -1;
      _tsp     = -1;
      _attrSP = -1;
      _free   = 0;
  }


  /**
   * Builds the context tables (_dests, _seqNumbers, _links) for the
   * document just transformed.
   *
   * Entries 0..nTextNodes-1 describe the nodes collected in _textNodes;
   * later entries describe their element ancestors, numbered in the order
   * they are first encountered.  For every entry _dests holds the context
   * number of its parent and _seqNumbers its 1-based position among
   * comparable siblings.  _links holds a link code per ancestor element
   * only.  The root element's _dests entry is finally pointed at
   * _availContextNumber, one past the last number handed out.
   *
   * @throws XSLException declared for callers; not thrown explicitly here
   */
  public void finish() throws XSLException {
      _numberedNodes.clear();
      _dests.clear();
      _seqNumbers.clear();
      _links.clear();


            final int nTextNodes = _textNodes.size();
      // ancestor contexts are numbered after the text-node contexts
      _availContextNumber = nTextNodes;
      // vector to hold parents of text nodes
      Vector parents = new Vector(nTextNodes * 2);
      /*****
      for each of the text nodes its sequence number is stored
      as well as the index of its parent (in _dests)
      _link is not stored as it is always "text()"
      _availContextNumber only used to number parent element contexts
      ******/
      for (int i = 0; i < nTextNodes; i++) {
    final Node node = (Node)_textNodes.elementAt(i);
    final Node parent = node.getParent();
    // find this text node's seq number
    final SafeNodeIterator siblings = parent.getChildren();
    Node sibling;
    int counter = 1;
    // count TEXT siblings that precede this node
    // NOTE(review): assumes node is among parent's children; otherwise
    // next() presumably returns null and getType() would fail
    while ((sibling = siblings.next()) != node) {
        if (sibling.getType() == Node.TEXT)
      ++counter;
    }
    _seqNumbers.add(counter);
    // check whether parent already encountered
    Object number = _numberedNodes.get(parent);
    if (number == null) {  // not yet seen
        final int newContext = _availContextNumber++;
        _numberedNodes.put(parent, new Integer(newContext));
        _dests.add(newContext);
        // enqueue parent: its parent will need a number too
        parents.addElement(parent);
        //  System.out.println(parent.getName().toString() + 
        //       " -> " + newContext);
    }
    else {
        _dests.add(((Integer)number).intValue());
    }
      }// end for


      _textNodes.setSize(0);
      
      // store info about element ancestry of the above text nodes
      // grandparents are added to the end of the vector 
      int rootElementPos = 0;
      // parents grows while it is traversed, so the loop re-reads size()
      for (int i = 0; i < parents.size(); i++) {
    final Node node = (Node)parents.elementAt(i);
    final Name name = node.getName();
    final Node parent = node.getParent();
  
    _links.add(getLinkCode(name.toString()));


    if (parent.getType() == Node.ELEMENT) {  // not ROOT
        // find sequence number
        final SafeNodeIterator siblings = parent.getChildren();
        Node sibling;
        int counter = 1;
        // count preceding siblings with the same (identical) Name
        while ((sibling = siblings.next()) != node) {
      if (sibling.getName() == name)
          ++counter;
        }
    
        _seqNumbers.add(counter);
    
        // check whether parent already known
        Object number = _numberedNodes.get(parent);
        if (number == null) {
      final int newContext = _availContextNumber++;
      _numberedNodes.put(parent, new Integer(newContext));
      _dests.add(newContext);
      // enqueue parent: its parent will need a number too
      parents.addElement(parent);
      //System.out.println(parent.getName().toString() +
      //   " -> " + newContext);
        }
        else {
      _dests.add(((Integer)number).intValue());
        }
    }
    else {
        // root element: remember its slot so it can be patched below
        _dests.add(0);  // placeholder
        _seqNumbers.add(1);
        rootElementPos = i + nTextNodes;
        //    System.out.println("rootElementPos = " + i);
    }
      } // end for






      // index to sentinel
      // NOTE(review): when no text nodes were collected _dests is empty
      // and rootElementPos is still 0 -- confirm IntegerArray.set
      // tolerates that
      _dests.set(rootElementPos, _availContextNumber);
      /******
       _dests.add(-1);
       final int card = _dests.cardinality();
       boolean failed = false;
       for (int k = 0; k < card && !failed; k++) {
       int counter = 0;
       for (int context = _dests.at(k);
       context != -1;
       context = _dests.at(context))
       if (++counter > 2*card) {
       System.err.println("test failed at " + k);
       failed = true;
       break;
       }
       }
       if (failed) {
       System.err.println("nTextNodes = " + nTextNodes);
       for (int k = 0; k < card; k++) {
       System.err.println(k+":"+_dests.at(k));
       }
       System.exit(1);
       }
       _dests.pop();
       System.err.println("nTextNodes = " + nTextNodes);
       System.out.println("|_initialWords| " + 
       _initialWords.cardinality());
       System.out.println("|_dests| " + _dests.cardinality());
       System.out.println("|_seqNumbers| " + 
       _seqNumbers.cardinality());
       System.out.println("|_links| " + _links.cardinality());
      ******/
  } // end public void finish


  public void characters(String str) throws XSLException {
      if (_sp >= 0 && _indexOnOffStack[_sp]) {
    try {
        indexText(str, _tsp != -1
            ? _tokenizerStack[_tsp]
            : _defaultTokenizer);
    }
    catch (Exception e) {
        throw new XSLException(e);
    }
      }
  }


  public void startElement(Name elementType, NamespacePrefixMap nsMap)
      throws XSLException {
      //System.out.println("startElement: " + elementType.toString());
      if (elementType == _indexElement_Name) {
    _indexOnOffStack[++_sp] = true;
    // pop Tokenizer stack
    // following attribute can push selected Tokenizer
    if (_tsp != -1)
        _tsp--;
      }
      else if (elementType == _indexText_Name) {
      }
      else if (elementType == _indexAttribute_Name) {
    _attrSP++;
      }
  }


  /**
   * Handles the attributes of indexing elements.
   * index:nodeID selects the current node by looking the value up in
   * _nodes.  index:tokenizer pushes the named tokenizer.
   * index:attributeName resolves the named attribute on the current
   * node, records "elementName&lt;attrName&lt;attrValue" on the
   * attribute stack and stores a location for the token "+&lt;" followed
   * by that string.
   *
   * @param name  attribute name
   * @param value attribute value
   * @throws XSLException wrapping any exception raised while handling
   *         index:attributeName
   */
  public void attribute(Name name, String value) throws XSLException {
      // System.out.println("attribute: " + name.toString() +
      //      " = " + value);
      if (name == _nodeID_Name)
    _currentNode = (Node)_nodes.get(value);
      else if (name == _tokenizer_Name)
    _tokenizerStack[++_tsp] = getTokenizer(value);
      else if (name == _attributeName_Name) {
    try {
        // expand the possibly prefixed attribute name in the scope
        // of the current node
        NamespacePrefixMap nspm = 
      _currentNode.getNamespacePrefixMap();
        Name attributeName = 
      nspm.expandAttributeName(value, _currentNode);
        String attrVal = _currentNode.getAttributeValue(
                    attributeName);
        //System.out.println("attrVal = " + attrVal);
        // the slot was reserved by startElement(index:attribute)
        _attributeStack[_attrSP] = 
      _currentNode.getName().toString() + 
      '<'+value+'<'+attrVal;
        storeLocation("+<" + _attributeStack[_attrSP]);
    }
    catch (Exception e) {
        throw new XSLException(e);
    }
      }
  }


  public void endElement(Name elementType) throws XSLException {
      if (elementType == _indexElement_Name) {
    _sp--;
      }
      else if (elementType == _indexText_Name) {
    // reset
      }
      else if (elementType == _indexAttribute_Name) {
    try {
        storeLocation("-<" + _attributeStack[_attrSP--]);
    }
    catch (Exception e) {
        throw new XSLException(e);
    }
      }
  }
  
  private void storeLocation(String token, int number) throws Exception {
      if (_free == _size) {
    ConceptLocation[] newArray = new ConceptLocation[_size *= 2];
    System.arraycopy(_locations, 0, newArray, 0, _free);
    _locations = newArray;
      }
      _locations[_free++] = new ConceptLocation(intern(token), 
                  number, number);
   }


  /**
   * Stores a location for {@code token} at the next word position and
   * advances the word counter.
   *
   * NOTE(review): every stored token is echoed to standard output; this
   * looks like leftover debugging output -- confirm before removing.
   */
  private void storeLocation(String token) throws Exception {
      System.out.println(token);
      
      storeLocation(token, _lastWordNumber++);
  }


  private void indexText(String text, Tokenizer tokenizer)
      throws Exception {
      tokenizer.setText(text);
      _firstWord = _lastWordNumber;
      _anyLocationsStored = false;
      Token token;
      while ((token = tokenizer.nextToken()) != null) {
    final String lowercaseToken = token.toLowerCaseString();
    if (_stoplist.get(lowercaseToken) == null) {
        storeLocation(lowercaseToken);
        _anyLocationsStored = true;
    }
    else {
        _lastWordNumber++;
    }
      }
  
      if (_anyLocationsStored && _firstWord > -1) {
    _initialWords.add(_firstWord);
    _textNodes.addElement(_currentNode);
      }
      // reset before next batch
      _firstWord = -1;
  }
    
  private Tokenizer getTokenizer(String className) {
      Object tokenizer;
      if ((tokenizer = _tokenizers.get(className)) != null)
    return (Tokenizer)tokenizer;
      else {
    try {
        tokenizer = Class.forName(className).newInstance();
        _tokenizers.put(className, tokenizer);
        return (Tokenizer)tokenizer;
    }
    catch (Exception e) {
        System.err.println(e);
        return _defaultTokenizer;
    }
      }
  }
    } // end of IndexAdapter
 
    // mapping from generated node ids to nodes themselves
    // (filled by recordNodes(), cleared at the end of indexDocument())
    private Hashtable _nodes = new Hashtable(4096);
  
    // per-document context tables; filled during the transform and in
    // IndexAdapter.finish(), compressed in closeDocument()
    private IntegerArray _initialWords = new IntegerArray(512);
    private IntegerArray _links        = new IntegerArray(512);
    private IntegerArray _dests        = new IntegerArray(512);
    private IntegerArray _seqNumbers   = new IntegerArray(512);
  
    // NOTE(review): _defaultIndexer is never assigned in the visible code
    private ElementIndexer _defaultIndexer;
    // optional translator applied to document names in openDocument()
    private PrefixTranslator _prefixTransl;


    // GTM new:
    // directory holding the .xsl files when local transform files are used
    private String  _transformLocation = null;
    // by default use web server to get transform files...
    private boolean _useLocalTransformFile = false; 
  
    // XML/XSL processing objects, created in initXmlProcessor()
    private TreeBuilder _treeBuilder;
    private MyXslEngine _transformEngine;
    private Transform _indexingTransform;
    private Transform _defaultTransform;
    private IndexAdapter _indexAdapter;


    /**
     * Creates a builder for the index stored in {@code indexDir}.
     * init(String) must be called before documents are indexed.
     *
     * @param indexDir location passed to the XmlIndex constructor
     */
    public XmlIndexBuilder(String indexDir) throws Exception {
  // NOTE(review): the meaning of the boolean argument is defined by
  // XmlIndex; presumably it opens the index for update -- confirm
  _index = new XmlIndex(indexDir, true);
    }


    public boolean init(String transform) throws Exception {
  if (_index.init()) {
      File etc = new File(System.getProperty("XMLSEARCH"), "etc");
      readStoplist(new File(etc, "StopList"));
      // _defaultIndexer = new DefaultElementIndexer(this);
      reset();
      // initialize vector and hashtable
      String[] linkNames = _index.getLinkNames();
      if (linkNames != null){
    for (int i = 0; i < linkNames.length; i++){
        getLinkCode(linkNames[i]);
    }
      }
      initXmlProcessor(transform);
      return true;
  }
  else {
      return false;
  }
    }


    public void indexDocument(URL docURL, String title) throws Exception {
  InputSource source = new InputSource(docURL.openStream());
  source.setSystemId(docURL.toString());
  Parser sourceParser = new ValidatingParser();
  sourceParser.setFastStandalone(true);
  XMLProcessorEx sourceLoader = new XMLProcessorImpl(sourceParser);
  //  long start = System.currentTimeMillis();
  Node root = sourceLoader.load(source, 0,
              _transformEngine.getSourceLoadContext(),
              _transformEngine.getNameTable());
    
  //System.out.println((System.currentTimeMillis()-start)+" msec parse");
  // build association from generated node ids to nodes
  // start = System.currentTimeMillis();
  recordNodes(root.getChildren().next());
  // System.out.println((System.currentTimeMillis()-start) +
  //  " record nodes");


  openDocument(docURL.toString());
  _indexAdapter.init();
  // start = System.currentTimeMillis();
  _indexingTransform.transform(root, _indexAdapter);
  // System.out.println((System.currentTimeMillis()-start)+" transform");
  // start = System.currentTimeMillis();
  _indexAdapter.finish();
  // System.out.println((System.currentTimeMillis()-start)+" finish");
  // start = System.currentTimeMillis();
  closeDocument(title);
  // System.out.println((System.currentTimeMillis()-start)+" close");
  _nodes.clear();
    }


    private void recordNodes(Node node) {
  switch (node.getType()) {
  case Node.ELEMENT:
      _nodes.put(node.getGeneratedId(), node);
      SafeNodeIterator iterator = node.getAttributes();
      Node node1;
      while ((node1 = iterator.next()) != null)
    _nodes.put(node1.getGeneratedId(), node1);
      iterator = node.getChildren();
      while ((node1 = iterator.next()) != null)
    recordNodes(node1);
      break;
    
  case Node.TEXT:
      _nodes.put(node.getGeneratedId(), node);
      break;
  }
    }


    /**
     * Creates the XSL engine, the tree builder sharing its name table,
     * the "default" transform, the indexing transform named by
     * {@code transform} and the index adapter.  The engine must exist
     * before the transforms are created because getTransform() uses it.
     *
     * @param transform name of the indexing stylesheet, without ".xsl"
     */
    private final void initXmlProcessor(String transform) throws Exception {
  _transformEngine   = new MyXslEngine();
  NameTable nameTable = _transformEngine.getNameTable();
  _treeBuilder       = new TreeBuilder(nameTable);
  // NOTE(review): _defaultTransform is not used elsewhere in the
  // visible code
  _defaultTransform  = getTransform("default");
  _indexingTransform = getTransform(transform);
  _indexAdapter      = new IndexAdapter(nameTable);
    }


    /**
     * Returns the root node that the tree builder produces for
     * {@code docUrl}.  Called by MyXslEngine.load().
     */
    private Node parseTargetDocument(URL docUrl) throws Exception {
  return _treeBuilder.getRoot(docUrl);
    }
 
    private Transform getTransform(String stylesheetName) throws Exception {
  System.out.println("creating indexing transform: " + stylesheetName);
  URL stylesheetUrl = null;
  if(_useLocalTransformFile){
      stylesheetUrl = new URL("file:" + 
            _transformLocation +
            "/" + stylesheetName + ".xsl");  
  }
  else{
      stylesheetUrl = new URL(Http + stylesheetName + ".xsl");
  }
  System.out.println(stylesheetUrl.toString());
  InputStream stylesheetStream = stylesheetUrl.openStream();
  XmlDocument sheet = XmlDocument.createXmlDocument(stylesheetStream, 
                false);
  return _transformEngine.createTransform(sheet);
    }


    // GTM new:
    public void setTransformLocation(String filelocation){
   _transformLocation = null; 
        _useLocalTransformFile = false;
  final File testfile = new File(filelocation);
  if(testfile.exists()){
      _transformLocation = filelocation;
      _useLocalTransformFile = true;
  }
    }
  
    /**
     * Sets the translator that openDocument() applies to document names
     * before interning them.  A null translator leaves names unchanged.
     */
    public void setPrefixTranslator(PrefixTranslator translator) {
  _prefixTransl = translator;
    }


    public void updateIndex(Hashtable toRemove, Hashtable toRefresh,
          Hashtable toAdd) throws Exception {
  // first prune microindexes to be removed or replaced
  _index.pruneIndex(toRemove, toRefresh);
  // reindex docs to be refreshed
  Enumeration documents = toRefresh.keys();
  while (documents.hasMoreElements()) {
      final String document = (String)documents.nextElement();
      System.out.println("= " + document);
      indexDocument(new URL(document), "xml");
  }
  // index new documents
  documents = toAdd.keys();
  while (documents.hasMoreElements()) {
      final String document = (String)documents.nextElement();
      System.out.println("+ " + document);
      indexDocument(new URL(document), "xml");
  }
    }
  
    /**
     * Clears the underlying index by delegating to XmlIndex.clear().
     */
    public void clearIndex() throws IOException {
  _index.clear();
    }
  
    /**
     * Returns the integer id that the index assigns to {@code name}.
     */
    private int intern(String name) throws Exception {
  return _index.intern(name);
    }
  
    public void openDocument(String name) throws Exception {
  if (_currentDocID != 0){
      throw new Exception("document already open");
  }
  _currentDocID =
      intern(_prefixTransl != null
       ? _prefixTransl.translatePrefix(name)
       : name);
  reset();      // reset context gathering state
    }
  
    /**
     * Closes the open document.  If any locations were stored, the four
     * context tables are compressed, each with its own Compressor, and
     * the integers returned for them are collected in kTable.  kTable is
     * then compressed with compressor0, the four table compressors are
     * concatenated onto compressor0 in table order, and the result is
     * handed to the index together with the locations.  Otherwise
     * "no indexable content" is printed.  On normal completion the
     * builder returns to the "nothing open" state.
     *
     * NOTE(review): if compression or interning throws, _currentDocID is
     * left non-zero and later openDocument() calls will fail.
     *
     * @param title document title, interned and stored with the document
     * @throws Exception if no document is open
     */
    public void closeDocument(String title) throws Exception {
  if (_currentDocID == 0)
      throw new Exception("no document open");
  else if (_free > 0) {
      // values returned by the per-table compressors, in table order
      IntegerArray kTable = new IntegerArray();
     
      Compressor compressor1 = new Compressor();
      Compressor compressor2 = new Compressor();
      Compressor compressor3 = new Compressor();
      Compressor compressor4 = new Compressor();
      
      kTable.add(compressor1.compressAscending(_initialWords));
      kTable.add(compressor2.minimize(_dests,      2));
      kTable.add(compressor3.minimize(_links,      2));
      kTable.add(compressor4.minimize(_seqNumbers, 2));
      
      // kTable itself is compressed first, so it precedes the four
      // tables in the concatenated output
      Compressor compressor0 = new Compressor();
      int k0 = compressor0.minimize(kTable, 4);
      
      compressor0.concatenate(compressor1);
      compressor0.concatenate(compressor2);
      compressor0.concatenate(compressor3);
      compressor0.concatenate(compressor4);
      
      _index.compress(_currentDocID,
          intern(title),
          _locations,
          _free,
          null, // extents
          0, // extent count
          k0,
          compressor0);
  }
  else {
      System.out.println("no indexable content");
  }
        _free = 0;
  _currentDocID = 0;    // state: nothing open
    }


    private void readStoplist(File file) {
  try {
      final LineInput in = new LineInput(new FileInputStream(file));
      String line;
      while ((line = in.readLine()) != null) {
    // value doesn't matter...
    _stoplist.put(line.trim(), this); 
      }
  }
  catch (IOException e) {
      e.printStackTrace();
  }
    }
  
    /**
     * Saves the registered link names, in registration order, to the
     * index, closes the index and prints "done".
     */
    public void close() throws Exception {
  // store link names
  // the array position of each name equals the code getLinkCode() gave it
  Object[] linkNames = _linknames.toArray(new String[_linknames.size()]);
  _index.saveLinkNames(linkNames);


  // output link codes
  /*
    Enumeration keys = _linknames.elements();
    while (keys.hasMoreElements())
    System.out.println((String)keys.nextElement());
  */
    
  _index.close();
  System.out.println("done");
    }


    private void reset() {
  _availContextNumber = 0;
  _lastWordNumber = 0;
  _free = 0;
  _anyLocationsStored = false;
  // all the contexts' tables
  _initialWords.clear();
  _dests.clear();
  _links.clear();
  _seqNumbers.clear();
    }


    /**
     * Assigns the next word number to {@code token} and stores a
     * location for it unless it is in the stop list.
     *
     * NOTE(review): this method is not called anywhere in the visible
     * code.
     */
    private void storeToken(String token) throws Exception {
  final int number = _lastWordNumber++; // counting all tokens now
  if (_stoplist.get(token) == null)   // not found in stoplist
      storeLocation(token, number);
    }


    private void storeLocation(String text, int tokenNo) throws Exception {
  // System.out.println(text + "\t\t@\t" + tokenNo);
  if (_free == _size) {
      ConceptLocation[] newArray = new ConceptLocation[_size *= 2];
      System.arraycopy(_locations, 0, newArray, 0, _free);
      _locations = newArray;
  }
  _locations[_free++] = new ConceptLocation(intern(text), tokenNo, 
              tokenNo);
  _anyLocationsStored = true;
    }




    private int getLinkCode(String linkName) {
  final Integer code;
  if ((code = (Integer)_linkCodes.get(linkName)) != null){
      return code.intValue();
  }
  else {
      _linknames.addElement(linkName);
      final int newCode = CurrenMaxLinkCode++;
      _linkCodes.put(linkName, new Integer(newCode));
      return newCode;
  }
    }
  
    /**
     * Appends one edge to the context tables: the link code, the
     * sequence number and the destination context.
     *
     * NOTE(review): this method is not called anywhere in the visible
     * code.
     */
    private void storeEdge(int relation, int seqNumber, int destination) {
  _links.add(relation);
  _seqNumbers.add(seqNumber);
  _dests.add(destination);
    }
}
// Source code of com.sun.xmlsearch.xml.indexer.XmlIndexBuilder

// Related classes of com.sun.xmlsearch.xml.indexer.XmlIndexBuilder