/*
* eXist Open Source Native XML Database
* Copyright (C) 2001-07 The eXist Project
* http://exist-db.org
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*
* $Id$
*/
package org.exist.storage;
import org.exist.EXistException;
import org.exist.collections.Collection;
import org.exist.dom.*;
import org.exist.fulltext.ElementContent;
import org.exist.fulltext.FTMatch;
import org.exist.numbering.NodeId;
import org.exist.security.PermissionDeniedException;
import org.exist.storage.analysis.TextToken;
import org.exist.storage.btree.BTreeCallback;
import org.exist.storage.btree.BTreeException;
import org.exist.storage.btree.DBException;
import org.exist.storage.btree.IndexQuery;
import org.exist.storage.btree.Value;
import org.exist.storage.index.BFile;
import org.exist.storage.io.VariableByteArrayInput;
import org.exist.storage.io.VariableByteInput;
import org.exist.storage.io.VariableByteOutputStream;
import org.exist.storage.lock.Lock;
import org.exist.util.ByteArray;
import org.exist.util.ByteConversion;
import org.exist.util.Configuration;
import org.exist.util.LockException;
import org.exist.util.Occurrences;
import org.exist.util.ProgressIndicator;
import org.exist.util.ReadOnlyException;
import org.exist.util.UTF8;
import org.exist.util.XMLString;
import org.exist.xquery.Constants;
import org.exist.xquery.TerminatedException;
import org.exist.xquery.XQueryContext;
import org.w3c.dom.Node;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.regex.Pattern;
import static java.nio.charset.StandardCharsets.UTF_8;
/**
* This class is responsible for fulltext indexing. Text nodes are handed over
* to this class to be fulltext indexed. Method storeText() is called by the
* broker whenever it finds a text node. Method getNodesContaining() is used
* by the XPath engine to process queries involving a fulltext operator. The
* index is kept in a single database file, <code>words.dbx</code>: each key combines a
* collection id with a token (optionally qualified by a QName), and the value
* holds the per-document occurrence lists (the inverted index).
*
* TODO: store node type (attribute or text) with each entry
*
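* <p>A minimal usage sketch (hypothetical wiring; the broker, document and
* index configuration are assumed to come from the surrounding storage layer):</p>
* <pre>{@code
* NativeTextEngine engine = ...; // the broker's fulltext engine
* engine.setDocument(doc);
* engine.storeText(textNode, NativeTextEngine.TOKENIZE, indexSpec, false);
* engine.flush(); // persist the pending occurrence lists to words.dbx
* NodeSet hits = engine.getNodesContaining(context, docs, contextSet,
*     NodeSet.ANCESTOR, null, "word", DBBroker.MATCH_EXACT, true);
* }</pre>
*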
* @author Wolfgang Meier
*/
public class NativeTextEngine extends TextSearchEngine implements ContentLoadingObserver {
public static final String FILE_NAME = "words.dbx";
public static final String FILE_KEY_IN_CONFIG = "db-connection.words";
public static final double DEFAULT_WORD_CACHE_GROWTH = 1.4;
public static final double DEFAULT_WORD_KEY_THRESHOLD = 0.01;
public static final double DEFAULT_WORD_VALUE_THRESHOLD = 0.015;
public final static byte TEXT_SECTION = 0;
public final static byte ATTRIBUTE_SECTION = 1;
public final static byte QNAME_SECTION = 2;
private final static byte IDX_GENERIC = 0;
private final static byte IDX_QNAME = 1;
public final static int ATTRIBUTE_BY_QNAME = 0;
public final static int ATTRIBUTE_NOT_BY_QNAME = 1;
public final static int TOKENIZE = 0;
public final static int DO_NOT_TOKENIZE = 1;
public final static int TEXT_BY_QNAME = 2;
public final static int LENGTH_NODE_TYPE = 1; //sizeof byte
public final static int LENGTH_NODE_IDS_FREQ_OFFSETS = 4; //sizeof int
public final static int OFFSET_NODE_TYPE = 0;
public final static int OFFSET_ELEMENT_CHILDREN_COUNT = OFFSET_NODE_TYPE + LENGTH_NODE_TYPE; //1
public final static int OFFSET_ATTRIBUTE_DLN_LENGTH = OFFSET_NODE_TYPE + LENGTH_NODE_TYPE; //1
public final static int OFFSET_TEXT_DLN_LENGTH = OFFSET_NODE_TYPE + LENGTH_NODE_TYPE; //1
public final static int OFFSET_DLN = OFFSET_TEXT_DLN_LENGTH + NodeId.LENGTH_NODE_ID_UNITS; //3
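//The offsets above describe the raw persistent-DOM records parsed in collect():
//a signature byte first, then either the child count (element nodes) or the
//DLN length followed by the node id itself (text and attribute nodes).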
/** Length limit for the tokens */
public final static int MAX_TOKEN_LENGTH = 2048;
/** The data-store for this token index */
protected BFile dbTokens;
protected InvertedIndex invertedIndex;
/** The current document */
private DocumentImpl doc;
/** Work output Stream that should be cleared before every use */
private VariableByteOutputStream os = new VariableByteOutputStream(7);
public NativeTextEngine(DBBroker broker, BFile dbFile, Configuration config) throws DBException {
super(broker, config);
this.invertedIndex = new InvertedIndex();
this.dbTokens = dbFile;
}
public String getFileName() {
return FILE_NAME;
}
public String getConfigKeyForFile() {
return FILE_KEY_IN_CONFIG;
}
public NativeTextEngine getInstance() {
return this;
}
/**
* Checks if the given string contains characters that could form a wildcard
* or regular expression pattern ('*', '?', '\', '[' or ']'). For example,
* "he*o" and "h[ae]llo" qualify, while "hello" does not.
*
* @param str the string to check
* @return true if the string contains wildcard characters
*/
public final static boolean containsWildcards(String str) {
if (str == null || str.length() == 0)
{return false;}
for (int i = 0; i < str.length(); i++)
switch (str.charAt(i)) {
case '*' :
case '?' :
case '\\' :
case '[' :
case ']' :
return true;
}
return false;
}
public final static boolean startsWithWildcard(String str) {
if (str == null || str.length() == 0)
{return false;}
switch (str.charAt(0)) {
case '*' :
case '?' :
case '\\' :
case '[' :
return true;
default:
return false;
}
}
public int getTrackMatches() {
return trackMatches;
}
public void setTrackMatches(int flags) {
trackMatches = flags;
}
public void setDocument(DocumentImpl document) {
if (this.doc != null && this.doc.getDocId() != document.getDocId())
{flush();}
this.doc = document;
invertedIndex.setDocument(doc);
}
/**
* Indexes the tokens contained in an attribute.
*
* @param node The attribute to be indexed
*/
//TODO : unify functionalities with storeText -pb
public void storeAttribute(AttrImpl node, NodePath currentPath, int indexingHint, FulltextIndexSpec indexSpec, boolean remove) {
if (indexingHint == ATTRIBUTE_BY_QNAME ||
indexingHint == ATTRIBUTE_NOT_BY_QNAME) {
//TODO : case conversion should be handled by the tokenizer -pb
tokenizer.setText(node.getValue().toLowerCase());
TextToken token;
while (null != (token = tokenizer.nextToken())) {
if (token.length() > MAX_TOKEN_LENGTH) {
LOG.warn("Token length exceeded " + MAX_TOKEN_LENGTH + ": " +
token.getText().substring(0,20) + "...");
continue;
}
if (stoplist.contains(token)) {
continue;
}
//TODO : the tokenizer should strip unwanted token types itself -pb
if (!token.isAlpha() && indexSpec != null && !indexSpec.getIncludeAlphaNum()) {
continue;
}
if (indexingHint == ATTRIBUTE_BY_QNAME)
{invertedIndex.addAttribute(token, node, remove);}
else
{invertedIndex.addAttribute(token, node.getNodeId(), remove);}
}
}
}
//TODO : unify with above choosing one of these 2 strategies :
//1) compute the indexing strategy from the broker (introduce some kind of dependency)
//2) read the configuration from the indexer (possible performance loss)
public void storeAttribute(AttrImpl node, NodePath currentPath, int indexingHint,
RangeIndexSpec idx, boolean remove) {
//Nothing actually done
}
/**
* Indexes the tokens contained in a text node.
*
* @param indexSpec The index configuration
* @param node The text node to be indexed
* @param indexingHint
* if <code>DO_NOT_TOKENIZE</code>, the given text is indexed as a single token;
* if <code>TOKENIZE</code>, it is tokenized before being indexed
*/
//TODO : use an indexSpec member in order to get rid of <code>noTokenizing</code>
public void storeText(CharacterDataImpl node, int indexingHint, FulltextIndexSpec indexSpec, boolean remove) {
if (indexingHint == TOKENIZE || indexingHint == DO_NOT_TOKENIZE) {
//TODO : case conversion should be handled by the tokenizer -pb
final XMLString t = node.getXMLString().transformToLower();
TextToken token;
if (indexingHint == DO_NOT_TOKENIZE) {
token = new TextToken(TextToken.ALPHA, t, 0, t.length());
invertedIndex.addText(token, node.getNodeId(), remove);
} else if (indexingHint == TOKENIZE){
tokenizer.setText(t);
while (null != (token = tokenizer.nextToken())) {
if (token.length() > MAX_TOKEN_LENGTH) {
LOG.warn("Token length exceeded " + MAX_TOKEN_LENGTH +
": " + token.getText().substring(0,20) + "...");
continue;
}
if (stoplist.contains(token)) {
continue;
}
if (indexSpec != null) {
//TODO : the tokenizer should strip unwanted token types itself -pb
if (!indexSpec.getIncludeAlphaNum() && !token.isAlpha()) {
continue;
}
}
invertedIndex.addText(token, node.getNodeId(), remove);
}
}
}
}
public void storeText(StoredNode parent, ElementContent text, int indexingHint, FulltextIndexSpec indexSpec, boolean remove) {
//TODO : case conversion should be handled by the tokenizer -pb
TextToken token;
ElementContent.TextSpan span = text.getFirst();
XMLString data = null;
int currentOffset = 0;
while (span != null) {
if (data == null)
{data = span.getContent().transformToLower();}
else {
currentOffset = data.length();
data.append(span.getContent().transformToLower());
}
tokenizer.setText(data, currentOffset);
while (null != (token = tokenizer.nextToken())) {
if (token.length() > MAX_TOKEN_LENGTH) {
LOG.warn("Token length exceeded " + MAX_TOKEN_LENGTH + ": " + token.getText().substring(0,20) + "...");
continue;
}
if (stoplist.contains(token)) {
continue;
}
if (indexSpec != null) {
//TODO : the tokenizer should strip unwanted token types itself -pb
if (!indexSpec.getIncludeAlphaNum() && !token.isAlpha()) {
continue;
}
}
if (indexingHint == TEXT_BY_QNAME)
{invertedIndex.addText(token, (ElementImpl) parent, remove);}
else
{invertedIndex.addText(token, parent.getNodeId(), remove);}
}
span = span.getNext();
}
}
public void storeText(TextImpl node, NodePath currentPath, int indexingHint) {
// TODO Auto-generated method stub
}
public void removeNode(StoredNode node, NodePath currentPath, String content) {
// TODO Auto-generated method stub
}
/* (non-Javadoc)
* @see org.exist.storage.ContentLoadingObserver#sync()
*/
public void sync() {
final Lock lock = dbTokens.getLock();
try {
lock.acquire(Lock.WRITE_LOCK);
dbTokens.flush();
} catch (final LockException e) {
LOG.warn("Failed to acquire lock for '" + dbTokens.getFile().getName() + "'", e);
//TODO : throw an exception ? -pb
} catch (final DBException e) {
LOG.error(e.getMessage(), e);
//TODO : throw an exception ? -pb
} finally {
lock.release(Lock.WRITE_LOCK);
}
}
/* (non-Javadoc)
* @see org.exist.storage.ContentLoadingObserver#flush()
*/
public void flush() {
invertedIndex.flush();
}
public void remove() {
invertedIndex.remove();
}
/* Drop all index entries for the given collection.
* @see org.exist.storage.ContentLoadingObserver#dropIndex(org.exist.collections.Collection)
*/
public void dropIndex(Collection collection) {
final Lock lock = dbTokens.getLock();
try {
lock.acquire(Lock.WRITE_LOCK);
// remove generic index
Value value = new WordRef(collection.getId());
dbTokens.removeAll(null, new IndexQuery(IndexQuery.TRUNC_RIGHT, value));
// remove QName index
value = new QNameWordRef(collection.getId());
dbTokens.removeAll(null, new IndexQuery(IndexQuery.TRUNC_RIGHT, value));
} catch (final LockException e) {
LOG.warn("Failed to acquire lock for '" + dbTokens.getFile().getName() + "'", e);
//TODO : throw exception ? -pb
} catch (final BTreeException e) {
LOG.error(e.getMessage(), e);
//TODO : throw exception ? -pb
} catch (final IOException e) {
LOG.error(e.getMessage(), e);
//TODO: throw exception ? -pb
} finally {
lock.release(Lock.WRITE_LOCK);
}
}
/* Drop all index entries for the given document.
* @see org.exist.storage.ContentLoadingObserver#dropIndex(org.exist.dom.DocumentImpl)
*/
public void dropIndex(DocumentImpl document) {
invertedIndex.dropIndex(document);
}
public NodeSet getNodesContaining(XQueryContext context, DocumentSet docs, NodeSet contextSet, int axis,
QName qname, String expr, int type, boolean matchAll) throws TerminatedException {
if (type == DBBroker.MATCH_EXACT && containsWildcards(expr)) {
//TODO : log this fallback ? -pb
type = DBBroker.MATCH_WILDCARDS;
}
switch (type) {
case DBBroker.MATCH_EXACT :
return getNodesExact(context, docs, contextSet, axis, qname, expr);
//TODO : stricter control -pb
default :
return getNodesRegexp(context, docs, contextSet, axis, qname, expr, type, matchAll);
}
}
/**
* Get all nodes whose content exactly matches the given expression.
*/
public NodeSet getNodesExact(XQueryContext context, DocumentSet docs, NodeSet contextSet, int axis,
QName qname, String expr) throws TerminatedException {
//Return early
if (expr == null)
{return null;}
//TODO : filter the expression *before* -pb
if (stoplist.contains(expr))
{return null;}
//TODO : case conversion should be handled by the tokenizer -pb
expr = expr.toLowerCase();
//TODO : use an indexSpec member in order to get rid of this or do the job *before* -pb
String token;
if (stem)
{token = stemmer.stem(expr);}
else
{token = expr;}
final NodeSet result = new NewArrayNodeSet(docs.getDocumentCount(), 250);
for (final Iterator<Collection> iter = docs.getCollectionIterator(); iter.hasNext();) {
final int collectionId = (iter.next()).getId();
Value key;
if (qname == null)
{key = new WordRef(collectionId, token);}
else {
key = new QNameWordRef(collectionId, qname, token, broker.getBrokerPool().getSymbols());
}
final Lock lock = dbTokens.getLock();
try {
lock.acquire(Lock.READ_LOCK);
final VariableByteInput is = dbTokens.getAsStream(key);
//Does the token already have data in the index ?
if (is == null)
{continue;}
while (is.available() > 0) {
final int storedDocId = is.readInt();
final int storedSection = is.readByte();
final int gidsCount = is.readInt();
//Read (variable) length of node IDs + frequency + offsets
final int length = is.readFixedInt();
final DocumentImpl storedDocument = docs.getDoc(storedDocId);
//Skip the entry if the document is not in the input set
if (storedDocument == null) {
is.skipBytes(length);
continue;
}
//Process the nodes
NodeId previous = null;
for (int m = 0; m < gidsCount; m++) {
NodeId nodeId = broker.getBrokerPool().getNodeFactory().createFromStream(previous, is);
previous = nodeId;
final int freq = is.readInt();
NodeProxy storedNode;
switch (storedSection) {
case ATTRIBUTE_SECTION :
storedNode = new NodeProxy(storedDocument, nodeId, Node.ATTRIBUTE_NODE);
break;
case TEXT_SECTION :
storedNode = new NodeProxy(storedDocument, nodeId, Node.TEXT_NODE);
break;
case QNAME_SECTION :
storedNode = new NodeProxy(storedDocument, nodeId,
qname.getNameType() == ElementValue.ATTRIBUTE ?
Node.ATTRIBUTE_NODE : Node.ELEMENT_NODE);
break;
default :
throw new IllegalArgumentException("Invalid section type in '" +
dbTokens.getFile().getName() + "'");
}
// if a context set is specified, we can directly check if the
// matching text node is a descendant of one of the nodes
// in the context set.
if (contextSet != null) {
NodeProxy parent;
switch(storedSection) {
case ATTRIBUTE_SECTION :
if (contextSet instanceof VirtualNodeSet) {
parent = contextSet.parentWithChild(storedNode,
false, true, NodeProxy.UNKNOWN_NODE_LEVEL);
if (parent != null && !parent.getNodeId().equals(storedNode.getNodeId()))
{parent = null;}
} else
{parent = contextSet.get(storedNode);}
break;
case QNAME_SECTION:
case TEXT_SECTION :
parent = contextSet.parentWithChild(storedNode,
false, true, NodeProxy.UNKNOWN_NODE_LEVEL);
break;
default :
throw new IllegalArgumentException("Invalid section type in '" + dbTokens.getFile().getName() + "'");
}
if (parent != null) {
final Match match = new FTMatch(-1, nodeId, token, freq);
readOccurrences(freq, is, match, token.length());
if (axis == NodeSet.ANCESTOR) {
parent.addMatch(match);
final int sizeHint = contextSet.getSizeHint(storedDocument);
result.add(parent, sizeHint);
} else {
storedNode.addMatch(match);
final int sizeHint = contextSet.getSizeHint(storedDocument);
result.add(storedNode, sizeHint);
}
} else {
is.skip(freq);
}
//Otherwise, we add all text nodes without check
} else {
final Match match = new FTMatch(-1, nodeId, token, freq);
readOccurrences(freq, is, match, token.length());
storedNode.addMatch(match);
result.add(storedNode, Constants.NO_SIZE_HINT);
}
context.proceed();
}
}
} catch (final LockException e) {
LOG.warn("Failed to acquire lock for '" + dbTokens.getFile().getName() + "'", e);
//TODO : throw exception ? -pb
} catch (final IOException e) {
LOG.error(e.getMessage() + " in '" +
dbTokens.getFile().getName() + "'", e);
//TODO : throw exception ? -pb
} finally {
lock.release(Lock.READ_LOCK);
}
}
return result;
}
private NodeSet getNodesRegexp(XQueryContext context, DocumentSet docs,
NodeSet contextSet, int axis, QName qname, String expr, int type,
boolean matchAll) throws TerminatedException {
//Return early
if (expr == null)
{return null;}
if (stoplist.contains(expr))
{return null;}
//TODO : case conversion should be handled by the tokenizer -pb
expr = expr.toLowerCase();
//If the regexp starts with a char sequence, we restrict the index scan
//to entries starting with the same sequence. Otherwise, we have to scan
//the whole index.
CharSequence start = "";
if (matchAll) {
StringBuilder buf = new StringBuilder();
for (int i = 0; i < expr.length(); i++) {
if (Character.isLetterOrDigit(expr.charAt(i)))
{buf.append(expr.charAt(i));}
else
{break;}
}
start = buf;
}
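//e.g. for expr "cat.*" with matchAll set, start is "cat", so only index
//keys sharing that prefix need to be scanned below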
try {
final TermMatcher comparator = new RegexMatcher(expr, type,
Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE, matchAll);
return getNodes(context, docs, contextSet, axis, qname, comparator, start);
} catch (final EXistException e) {
//TODO : throw exception ? -pb
return null;
}
}
/* Return all nodes for which the matcher matches.
* @see org.exist.storage.TextSearchEngine#getNodes(org.exist.xquery.XQueryContext, org.exist.dom.DocumentSet, org.exist.dom.NodeSet, org.exist.storage.TermMatcher, java.lang.CharSequence)
*/
public NodeSet getNodes(XQueryContext context, DocumentSet docs,
NodeSet contextSet, int axis, QName qname, TermMatcher matcher,
CharSequence startTerm) throws TerminatedException {
if (LOG.isTraceEnabled() && qname != null)
{LOG.trace("Index lookup by QName: " + qname);}
final NodeSet result = new NewArrayNodeSet();
final SearchCallback cb = new SearchCallback(context, matcher, result,
contextSet, axis, docs, qname);
final Lock lock = dbTokens.getLock();
for (final Iterator<Collection> iter = docs.getCollectionIterator(); iter.hasNext();) {
final int collectionId = iter.next().getId();
//Compute a key for the token
Value value;
if (startTerm != null && startTerm.length() > 0) {
//TODO : case conversion should be handled by the tokenizer -pb
if (qname == null) {
value = new WordRef(collectionId, startTerm.toString().toLowerCase());
} else {
value = new QNameWordRef(collectionId, qname, startTerm.toString().toLowerCase(),
broker.getBrokerPool().getSymbols());
}
} else {
if (qname == null) {
value = new WordRef(collectionId);
} else {
value = new QNameWordRef(collectionId, qname, broker.getBrokerPool().getSymbols());
}
}
final IndexQuery query = new IndexQuery(IndexQuery.TRUNC_RIGHT, value);
try {
lock.acquire(Lock.READ_LOCK);
dbTokens.query(query, cb);
} catch (final LockException e) {
LOG.warn("Failed to acquire lock for '" + dbTokens.getFile().getName() + "'", e);
//TODO : throw exception ? -pb
} catch (final BTreeException e) {
LOG.error(e.getMessage(), e);
//TODO : throw exception ? -pb
} catch (final IOException e) {
LOG.error(e.getMessage(), e);
//TODO : throw exception ? -pb
} finally {
lock.release(Lock.READ_LOCK);
}
}
return result;
}
public String[] getIndexTerms(DocumentSet docs, TermMatcher matcher) {
final IndexCallback cb = new IndexCallback(null, matcher);
final Lock lock = dbTokens.getLock();
for (final Iterator<Collection> iter = docs.getCollectionIterator(); iter.hasNext();) {
final int collectionId = iter.next().getId();
//Compute a key for the token
final Value value = new WordRef(collectionId);
final IndexQuery query = new IndexQuery(IndexQuery.TRUNC_RIGHT, value);
try {
lock.acquire(Lock.READ_LOCK);
dbTokens.query(query, cb);
} catch (final LockException e) {
LOG.warn("Failed to acquire lock for '" + dbTokens.getFile().getName() + "'", e);
//TODO : throw exception ? -pb
} catch (final IOException e) {
LOG.error(e.getMessage(), e);
//TODO : throw exception ? -pb
} catch (final BTreeException e) {
LOG.error(e.getMessage(), e);
//TODO: throw exception ? -pb
} catch (final TerminatedException e) {
LOG.warn(e.getMessage(), e);
//TODO : throw exception ? -pb
} finally {
lock.release(Lock.READ_LOCK);
}
}
return cb.getMatches();
}
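/**
* Scans the index and collects, per distinct term, the matching documents and
* occurrence counts: all terms of the collections if <code>start</code> is null,
* the terms prefixed by <code>start</code> if <code>end</code> is null, and the
* terms between both bounds otherwise.
*/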
public Occurrences[] scanIndexTerms(DocumentSet docs, NodeSet contextSet,
String start, String end) throws PermissionDeniedException {
final IndexScanCallback cb = new IndexScanCallback(docs, contextSet, false);
final Lock lock = dbTokens.getLock();
for (final Iterator<Collection> i = docs.getCollectionIterator(); i.hasNext();) {
final int collectionId = (i.next()).getId();
final IndexQuery query;
if (start == null) {
final Value startRef = new WordRef(collectionId);
query = new IndexQuery(IndexQuery.TRUNC_RIGHT, startRef);
} else if (end == null) {
final Value startRef = new WordRef(collectionId, start.toLowerCase());
query = new IndexQuery(IndexQuery.TRUNC_RIGHT, startRef);
} else {
final Value startRef = new WordRef(collectionId, start.toLowerCase());
final Value endRef = new WordRef(collectionId, end.toLowerCase());
query = new IndexQuery(IndexQuery.BW, startRef, endRef);
}
try {
lock.acquire(Lock.READ_LOCK);
dbTokens.query(query, cb);
} catch (final LockException e) {
LOG.warn("Failed to acquire lock for '" + dbTokens.getFile().getName() + "'", e);
//TODO : throw exception ? -pb
} catch (final IOException e) {
LOG.error(e.getMessage(), e);
//TODO : throw exception ? -pb
} catch (final BTreeException e) {
LOG.error(e.getMessage(), e);
//TODO : throw exception ? -pb
} catch (final TerminatedException e) {
LOG.warn(e.getMessage(), e);
//TODO : throw exception ? -pb
} finally {
lock.release(Lock.READ_LOCK);
}
}
final Occurrences[] result = new Occurrences[cb.map.size()];
return (Occurrences[]) cb.map.values().toArray(result);
}
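/**
* Same as {@link #scanIndexTerms(DocumentSet, NodeSet, String, String)}, but
* restricted to terms indexed under the given QNames.
*/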
public Occurrences[] scanIndexTerms(DocumentSet docs, NodeSet contextSet,
QName[] qnames, String start, String end) throws PermissionDeniedException {
final Lock lock = dbTokens.getLock();
final IndexScanCallback cb = new IndexScanCallback(docs, contextSet, true);
for (int q = 0; q < qnames.length; q++) {
for (final Iterator<Collection> i = docs.getCollectionIterator(); i.hasNext();) {
final int collectionId = (i.next()).getId();
final IndexQuery query;
if (start == null) {
final Value startRef = new QNameWordRef(collectionId, qnames[q],
broker.getBrokerPool().getSymbols());
query = new IndexQuery(IndexQuery.TRUNC_RIGHT, startRef);
} else if (end == null) {
final Value startRef = new QNameWordRef(collectionId, qnames[q],
start.toLowerCase(), broker.getBrokerPool().getSymbols());
query = new IndexQuery(IndexQuery.TRUNC_RIGHT, startRef);
} else {
final Value startRef = new QNameWordRef(collectionId, qnames[q], start.toLowerCase(),
broker.getBrokerPool().getSymbols());
final Value endRef = new QNameWordRef(collectionId, qnames[q], end.toLowerCase(),
broker.getBrokerPool().getSymbols());
query = new IndexQuery(IndexQuery.BW, startRef, endRef);
}
try {
lock.acquire(Lock.READ_LOCK);
dbTokens.query(query, cb);
} catch (final LockException e) {
LOG.warn("Failed to acquire lock for '" + dbTokens.getFile().getName() + "'", e);
//TODO : throw exception ? -pb
} catch (final IOException e) {
LOG.error(e.getMessage(), e);
//TODO : throw exception ? -pb
} catch (final BTreeException e) {
LOG.error(e.getMessage(), e);
//TODO : throw exception ? -pb
} catch (final TerminatedException e) {
LOG.warn(e.getMessage(), e);
//TODO : throw exception ? -pb
} finally {
lock.release(Lock.READ_LOCK);
}
}
}
final Occurrences[] result = new Occurrences[cb.map.size()];
return (Occurrences[]) cb.map.values().toArray(result);
}
/**
* Reads <code>freq</code> occurrence offsets from the input stream and adds
* them to the given match.
*
* @param freq the number of offsets to read
* @param is the input positioned at the offsets
* @param match the match that collects the offsets
* @param length the length of the matched term
* @throws IOException if the stream cannot be read
*/
private void readOccurrences(int freq, VariableByteInput is, Match match,
int length) throws IOException {
for (int n = 0; n < freq; n++) {
match.addOffset(is.readInt(), length);
}
}
/**
* Collect all words from a document subtree, to be removed from the index.
*
* @param words the set that collects the (lower-cased) words
* @param domIterator iterator over the raw persistent-DOM records of the subtree
*/
//TODO : unify functionalities with storeText -pb
private void collect(Set words, Iterator domIterator) {
TextToken token;
int readOffset;
final byte[] data = ((Value) domIterator.next()).getData();
final short type = Signatures.getType(data[OFFSET_NODE_TYPE]);
switch (type) {
case Node.ELEMENT_NODE :
final int childrenCount = ByteConversion.byteToInt(data, OFFSET_ELEMENT_CHILDREN_COUNT);
for (int i = 0; i < childrenCount; i++)
//recursive call on children
collect(words, domIterator);
break;
case Node.TEXT_NODE :
int dlnLen = ByteConversion.byteToShort(data, OFFSET_TEXT_DLN_LENGTH);
int nodeIdLen = broker.getBrokerPool().getNodeFactory().lengthInBytes(dlnLen, data, OFFSET_DLN);
readOffset = nodeIdLen + OFFSET_DLN;
final String s = new String(data, readOffset, data.length - readOffset, UTF_8);
tokenizer.setText(s);
while (null != (token = tokenizer.nextToken())) {
final String word = token.getText();
if (stoplist.contains(word))
{continue;}
words.add(word.toLowerCase());
}
break;
case Node.ATTRIBUTE_NODE :
final byte idSizeType = (byte) (data[OFFSET_NODE_TYPE] & 0x3);
final boolean hasNamespace = (data[OFFSET_NODE_TYPE] & 0x10) == 0x10;
dlnLen = ByteConversion.byteToShort(data, OFFSET_ATTRIBUTE_DLN_LENGTH);
nodeIdLen = broker.getBrokerPool().getNodeFactory().lengthInBytes(dlnLen, data, OFFSET_DLN);
readOffset = Signatures.getLength(idSizeType) + nodeIdLen + OFFSET_DLN;
if (hasNamespace) {
//TODO : check the order in which both pieces of info are read (and discarded)
readOffset += SymbolTable.LENGTH_LOCAL_NAME; // skip namespace id
final short prefixLen = ByteConversion.byteToShort(data, readOffset);
readOffset += prefixLen + SymbolTable.LENGTH_NS_URI; // skip prefix
}
final String val = new String(data, readOffset, data.length - readOffset, UTF_8);
tokenizer.setText(val);
while (null != (token = tokenizer.nextToken())) {
final String word = token.getText();
if (stoplist.contains(word))
{continue;}
words.add(word.toLowerCase());
}
break;
default :
//Other types are ignored : some may be useful though -pb
//TOUNDERSTAND : it looks like other types (got : Node.PROCESSING_INSTRUCTION_NODE)
//are stored in the index ??? -pb
}
}
public void closeAndRemove() {
config.setProperty(getConfigKeyForFile(), null);
dbTokens.closeAndRemove();
}
public boolean close() throws DBException {
config.setProperty(getConfigKeyForFile(), null);
return dbTokens.close();
}
public void printStatistics() {
dbTokens.printStatistics();
}
public String toString() {
return this.getClass().getName() + " at "+ dbTokens.getFile().getName() +
" owned by " + broker.toString();
}
/**
* This inner class is responsible for actually storing the list of
* occurrences.
*
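* Tokens are first buffered in per-section in-memory maps by the add*()
* methods; flush() appends them to the BFile, while remove() rewrites the
* existing entries with the pending occurrences stripped out.
*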
* @author Wolfgang Meier <meier@ifs.tu-darmstadt.de>
*/
final class InvertedIndex {
private class QNameTerm implements Comparable {
QName qname;
String term;
public QNameTerm(QName qname, String term) {
this.qname = qname;
this.term = term;
}
public int compareTo(Object o) {
final QNameTerm other = (QNameTerm) o;
final int cmp = qname.compareTo(other.qname);
if (cmp == 0)
{return term.compareTo(other.term);}
else
{return cmp;}
}
}
private DocumentImpl doc = null;
// To distinguish between text, attribute values and QName-indexed
// terms, we use three maps: words[TEXT_NODES] collects text,
// words[ATTRIBUTE_NODES] stores attribute values and words[BY_QNAME]
// stores the QName-indexed terms.
//TODO : very tricky. Why not separate inverted indexes ??? -pb
private Map[] words = new Map[3];
private final int TEXT_NODES = 0;
private final int ATTRIBUTE_NODES = 1;
private final int BY_QNAME = 2;
public InvertedIndex() {
words[TEXT_NODES] = new HashMap(512);
words[ATTRIBUTE_NODES] = new HashMap(256);
//a sorted map : QNameTerm implements Comparable, so QName terms are kept in order
words[BY_QNAME] = new TreeMap();
}
public void setDocument(DocumentImpl document) {
if (this.doc != null && this.doc.getDocId() != document.getDocId())
{flush();}
this.doc = document;
}
public void addText(TextToken token, NodeId nodeId, boolean remove) {
if (!remove) {
//Is this token already pending ?
OccurrenceList list = (OccurrenceList) words[TEXT_NODES].get(token);
//Create a GIDs list
if (list == null) {
list = new OccurrenceList();
list.add(nodeId, token.startOffset());
words[TEXT_NODES].put(token.getText(), list);
} else {
//Add node's GID to the list
list.add(nodeId, token.startOffset());
}
} else {
if (!words[TEXT_NODES].containsKey(token))
{words[TEXT_NODES].put(token, null);}
}
}
public void addText(TextToken token, ElementImpl ancestor, boolean remove) {
final QNameTerm term = new QNameTerm(ancestor.getQName(), token.getText());
if (!remove) {
//Is this token already pending ?
OccurrenceList list = (OccurrenceList) words[BY_QNAME].get(term);
//Create a GIDs list
if (list == null) {
list = new OccurrenceList();
list.add(ancestor.getNodeId(), token.startOffset());
words[BY_QNAME].put(term, list);
} else {
//Add node's GID to the list
list.add(ancestor.getNodeId(), token.startOffset());
}
} else {
if (!words[BY_QNAME].containsKey(term))
{words[BY_QNAME].put(term, null);}
}
}
//TODO : unify functionalities with addText -pb
public void addAttribute(TextToken token, NodeId nodeId, boolean remove) {
//Is this token already pending ?
if (!remove) {
OccurrenceList list = (OccurrenceList) words[ATTRIBUTE_NODES].get(token);
//Create a GIDs list
if (list == null) {
list = new OccurrenceList();
list.add(nodeId, token.startOffset());
words[ATTRIBUTE_NODES].put(token.getText(), list);
} else {
//Add node's GID to the list
list.add(nodeId, token.startOffset());
}
} else {
if (!words[ATTRIBUTE_NODES].containsKey(token))
{words[ATTRIBUTE_NODES].put(token, null);}
}
}
public void addAttribute(TextToken token, AttrImpl attr, boolean remove) {
final QNameTerm term = new QNameTerm(attr.getQName(), token.getText());
if (!remove) {
//Is this token already pending ?
OccurrenceList list = (OccurrenceList) words[BY_QNAME].get(term);
//Create a GIDs list
if (list == null) {
list = new OccurrenceList();
list.add(attr.getNodeId(), token.startOffset());
words[BY_QNAME].put(term, list);
} else {
//Add node's GID to the list
list.add(attr.getNodeId(), token.startOffset());
}
} else {
if (!words[BY_QNAME].containsKey(term))
{words[BY_QNAME].put(term, null);}
}
}
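/* Each entry appended by flush()/flushWord() has the layout:
* docId (int) | section (byte) | termCount (int) | length (fixed int) |
* termCount * [ delta-coded nodeId | freq (int) | freq * offset (int) ]
* where length counts the bytes that follow it (node ids, frequencies and offsets).
*/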
public void flush() {
//return early
if (this.doc == null)
{return;}
final int wordsCount = words[TEXT_NODES].size() +
words[ATTRIBUTE_NODES].size() + words[BY_QNAME].size();
if (wordsCount == 0)
{return;}
final ProgressIndicator progress = new ProgressIndicator(wordsCount, 100);
final int collectionId = this.doc.getCollection().getId();
int count = 0;
for (byte currentSection = 0; currentSection <= QNAME_SECTION; currentSection++) {
//Not very necessary, but anyway...
switch (currentSection) {
case TEXT_SECTION :
case ATTRIBUTE_SECTION :
case QNAME_SECTION :
break;
default :
throw new IllegalArgumentException("Invalid section type in '" +
dbTokens.getFile().getName() + "' (inverted index)");
}
for (final Iterator i = words[currentSection].entrySet().iterator(); i.hasNext(); count++) {
final Map.Entry entry = (Map.Entry) i.next();
final Object token = entry.getKey();
final OccurrenceList occurrences = (OccurrenceList) entry.getValue();
if (occurrences == null)
{continue;} // may happen if the index is in an invalid state due to earlier errors
//Don't forget this one
occurrences.sort();
os.clear();
os.writeInt(this.doc.getDocId());
os.writeByte(currentSection);
os.writeInt(occurrences.getTermCount());
//Mark position
final int lenOffset = os.position();
//Dummy value : actual one will be written below
os.writeFixedInt(0);
NodeId previous = null;
for (int m = 0; m < occurrences.getSize(); ) {
try {
previous = occurrences.getNode(m).write(previous, os);
} catch (final IOException e) {
LOG.error("IOException while writing fulltext index: " + e.getMessage(), e);
//TODO : throw exception ? -pb
}
final int freq = occurrences.getOccurrences(m);
os.writeInt(freq);
for (int n = 0; n < freq; n++) {
os.writeInt(occurrences.getOffset(m + n));
}
m += freq;
}
//Write (variable) length of node IDs + frequency + offsets
os.writeFixedInt(lenOffset, os.position() - lenOffset - LENGTH_NODE_IDS_FREQ_OFFSETS);
flushWord(currentSection, collectionId, token, os.data());
progress.setValue(count);
if (progress.changed()) {
setChanged();
notifyObservers(progress);
}
}
//TOUNDERSTAND : is this a flush ?
//If so, the ProgressIndicator should be reinitialized -pb
if (wordsCount > 100) {
progress.finish();
setChanged();
notifyObservers(progress);
}
words[currentSection].clear();
}
}
private void flushWord(int currentSection, int collectionId, Object token, ByteArray data) {
//return early
//TODO : is this ever called ? -pb
if (data.size() == 0)
{return;}
final Lock lock = dbTokens.getLock();
try {
lock.acquire(Lock.WRITE_LOCK);
Value key;
if (currentSection == QNAME_SECTION) {
final QNameTerm term = (QNameTerm) token;
key = new QNameWordRef(collectionId, term.qname, term.term,
broker.getBrokerPool().getSymbols());
} else {
key = new WordRef(collectionId, token.toString());
}
dbTokens.append(key, data);
} catch (final LockException e) {
LOG.warn("Failed to acquire lock for '" +
dbTokens.getFile().getName() + "' (inverted index)", e);
//TODO : throw exception ? -pb
} catch (final ReadOnlyException e) {
LOG.warn("Read-only error on '" + dbTokens.getFile().getName() +
"' (inverted index)", e);
//TODO : throw exception ?
} catch (final IOException e) {
LOG.error(e.getMessage() + "' in '" + dbTokens.getFile().getName() +
"' (inverted index)", e);
//TODO : throw exception ? -pb
} finally {
lock.release(Lock.WRITE_LOCK);
os.clear();
}
}
public void dropIndex(DocumentImpl document) {
//Return early
if (document == null)
{return;}
final int collectionId = document.getCollection().getId();
final Lock lock = dbTokens.getLock();
for (byte currentSection = 0; currentSection <= QNAME_SECTION; currentSection++) {
//Not very necessary, but anyway...
switch (currentSection) {
case TEXT_SECTION :
case ATTRIBUTE_SECTION :
case QNAME_SECTION :
break;
default :
throw new IllegalArgumentException("Invalid section type in '" +
dbTokens.getFile().getName() + "' (inverted index)");
}
LOG.debug("Removing " + words[currentSection].size() + " tokens");
for (final Iterator i = words[currentSection].entrySet().iterator(); i.hasNext();) {
//Compute a key for the token
final Map.Entry entry = (Map.Entry) i.next();
final Object token = entry.getKey();
Value key;
if (currentSection == QNAME_SECTION) {
final QNameTerm term = (QNameTerm) token;
key = new QNameWordRef(collectionId, term.qname,
term.term, broker.getBrokerPool().getSymbols());
} else {
key = new WordRef(collectionId, token.toString());
}
os.clear();
try {
lock.acquire(Lock.WRITE_LOCK);
boolean changed = false;
os.clear();
final VariableByteInput is = dbTokens.getAsStream(key);
//Does the token already have data in the index ?
if (is == null)
{continue;}
while (is.available() > 0) {
final int storedDocId = is.readInt();
final byte section = is.readByte();
final int gidsCount = is.readInt();
//Read (variable) length of node IDs + frequency + offsets
final int length = is.readFixedInt();
if (storedDocId != document.getDocId()) {
// data are related to another document:
// copy them to any existing data
os.writeInt(storedDocId);
os.writeByte(section);
os.writeInt(gidsCount);
os.writeFixedInt(length);
is.copyRaw(os, length);
} else {
// data are related to our document:
// skip them
changed = true;
is.skipBytes(length);
}
}
//Store new data, if relevant
if (changed) {
//Well, nothing to store : remove the existing data
if (os.data().size() == 0) {
dbTokens.remove(key);
} else {
if (dbTokens.put(key, os.data()) == BFile.UNKNOWN_ADDRESS) {
LOG.error("Could not put index data for token '" +
token + "' in '" + dbTokens.getFile().getName() + "'");
//TODO : throw an exception ?
}
}
}
} catch (final LockException e) {
LOG.warn("Failed to acquire lock for '" +
dbTokens.getFile().getName() + "'", e);
//TODO : throw exception ? -pb
} catch (final IOException e) {
LOG.error(e.getMessage() + " in '" +
dbTokens.getFile().getName() + "'", e);
//TODO : throw exception ? -pb
} catch (final ReadOnlyException e) {
LOG.error(e.getMessage() + " in '" +
dbTokens.getFile().getName() + "'", e);
//TODO : throw exception ? -pb
} finally {
lock.release(Lock.WRITE_LOCK);
os.clear();
}
}
words[currentSection].clear();
}
}
/**
* Remove the entries in the current list from the index.
*/
//TODO: use VariableInputStream
public void remove() {
//Return early
if (doc == null)
{return;}
final int collectionId = this.doc.getCollection().getId();
final Lock lock = dbTokens.getLock();
for (byte currentSection = 0; currentSection <= QNAME_SECTION; currentSection++) {
//Not very necessary, but anyway...
switch (currentSection) {
case TEXT_SECTION :
case ATTRIBUTE_SECTION :
case QNAME_SECTION :
break;
default :
throw new IllegalArgumentException("Invalid section type in '" +
dbTokens.getFile().getName() + "' (inverted index)");
}
for (final Iterator i = words[currentSection].entrySet().iterator(); i.hasNext();) {
//Compute a key for the token
final Map.Entry entry = (Map.Entry) i.next();
final OccurrenceList storedOccurrencesList = (OccurrenceList) entry.getValue();
final Object token = entry.getKey();
Value key;
if (currentSection == QNAME_SECTION) {
final QNameTerm term = (QNameTerm) token;
key = new QNameWordRef(collectionId, term.qname, term.term,
broker.getBrokerPool().getSymbols());
} else {
key = new WordRef(collectionId, token.toString());
}
final OccurrenceList newOccurrencesList = new OccurrenceList();
os.clear();
try {
lock.acquire(Lock.WRITE_LOCK);
final Value value = dbTokens.get(key);
if (value == null)
{continue;}
//Add its data to the new list
final VariableByteArrayInput is = new VariableByteArrayInput(value.getData());
while (is.available() > 0) {
final int storedDocId = is.readInt();
final byte storedSection = is.readByte();
final int termCount = is.readInt();
//Read (variable) length of node IDs + frequency + offsets
final int length = is.readFixedInt();
if (storedSection != currentSection || storedDocId != this.doc.getDocId()) {
// data are related to another section or document:
// append them to any existing data
os.writeInt(storedDocId);
os.writeByte(storedSection);
os.writeInt(termCount);
os.writeFixedInt(length);
is.copyRaw(os, length);
} else {
// data are related to our section and document:
// feed the new list with the GIDs
NodeId previous = null;
for (int m = 0; m < termCount; m++) {
NodeId nodeId = broker.getBrokerPool()
.getNodeFactory().createFromStream(previous, is);
previous = nodeId;
final int freq = is.readInt();
// add the node to the new list if it is not
// in the list of removed nodes
if (!storedOccurrencesList.contains(nodeId)) {
for (int n = 0; n < freq; n++) {
newOccurrencesList.add(nodeId, is.readInt());
}
} else {
is.skip(freq);
}
}
}
}
//append the data from the new list
if (newOccurrencesList.getSize() > 0) {
//Don't forget this one
newOccurrencesList.sort();
os.writeInt(this.doc.getDocId());
os.writeByte(currentSection);
os.writeInt(newOccurrencesList.getTermCount());
//Mark position
final int lenOffset = os.position();
//Dummy value : actual one will be written below
os.writeFixedInt(0);
NodeId previous = null;
for (int m = 0; m < newOccurrencesList.getSize();) {
previous = newOccurrencesList.getNode(m).write(previous, os);
final int freq = newOccurrencesList.getOccurrences(m);
os.writeInt(freq);
for (int n = 0; n < freq; n++) {
os.writeInt(newOccurrencesList.getOffset(m + n));
}
m += freq;
}
//Write (variable) length of node IDs + frequency + offsets
os.writeFixedInt(lenOffset, os.position() -
lenOffset - LENGTH_NODE_IDS_FREQ_OFFSETS);
}
//Store the data
if(os.data().size() == 0)
{dbTokens.remove(key);}
else if (dbTokens.update(value.getAddress(), key,
os.data()) == BFile.UNKNOWN_ADDRESS) {
LOG.error("Could not update index data for token '" +
token + "' in '" + dbTokens.getFile().getName() +
"' (inverted index)");
//TODO : throw an exception ?
}
} catch (final LockException e) {
LOG.warn("Failed to acquire lock for '" +
dbTokens.getFile().getName() + "' (inverted index)", e);
//TODO : throw exception ? -pb
} catch (final IOException e) {
LOG.error(e.getMessage() + "' in '" +
dbTokens.getFile().getName() + "' (inverted index)", e);
//TODO : throw exception ? -pb
} finally {
lock.release(Lock.WRITE_LOCK);
os.clear();
}
}
words[currentSection].clear();
}
}
}
private class IndexCallback implements BTreeCallback {
List<String> matches = new ArrayList<String>();
TermMatcher matcher;
XQueryContext context;
public IndexCallback(XQueryContext context, TermMatcher matcher) {
this.matcher = matcher;
this.context = context;
}
public String[] getMatches() {
final String[] a = new String[matches.size()];
return matches.toArray(a);
}
/* (non-Javadoc)
* @see org.dbxml.core.filer.BTreeCallback#indexInfo(org.dbxml.core.data.Value, long)
*/
public boolean indexInfo(Value key, long pointer) throws TerminatedException {
if(context != null)
{context.proceed();}
//skip the idx-type byte and the collection id (see WordRef.decode)
final String word = new String(key.getData(), WordRef.OFFSET_WORD,
key.getLength() - WordRef.OFFSET_WORD, UTF_8);
if (matcher.matches(word))
{matches.add(word);}
return true;
}
}
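/**
* B-tree callback behind getNodes() : decodes each candidate key, tests the
* term against the matcher and, on a match, loads the stored occurrences into
* the result set, filtered through the context set when one is given.
*/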
private final class SearchCallback implements BTreeCallback {
DocumentSet docs;
TermMatcher matcher;
NodeSet result;
NodeSet contextSet;
int axis;
XQueryContext context;
XMLString word = new XMLString(64);
QName qname;
public SearchCallback(XQueryContext context, TermMatcher comparator,
NodeSet result, NodeSet contextSet, int axis, DocumentSet docs, QName qname) {
this.matcher = comparator;
this.result = result;
this.docs = docs;
this.contextSet = contextSet;
this.context = context;
this.qname = qname;
this.axis = axis;
}
public boolean indexInfo(Value key, long pointer) throws TerminatedException {
VariableByteInput is;
try {
is = dbTokens.getAsStream(pointer);
} catch (final IOException e) {
LOG.error(e.getMessage(), e);
return true;
}
word.reuse();
if (qname == null)
{WordRef.decode(key, word);}
else
{QNameWordRef.decode(key, word);}
if (matcher.matches(word)) {
try {
while (is.available() > 0) {
if(context != null)
{context.proceed();}
final int storedDocId = is.readInt();
final byte storedSection = is.readByte();
final int termCount = is.readInt();
//Read (variable) length of node IDs + frequency + offsets
final int length = is.readFixedInt();
final DocumentImpl storedDocument = docs.getDoc(storedDocId);
//Skip the entry if the document is not in the input set
if (storedDocument == null) {
is.skipBytes(length);
continue;
}
NodeId previous = null;
for (int m = 0; m < termCount; m++) {
NodeId nodeId = broker.getBrokerPool().getNodeFactory().createFromStream(previous, is);
previous = nodeId;
final int freq = is.readInt();
NodeProxy storedNode;
switch (storedSection) {
case TEXT_SECTION :
storedNode = new NodeProxy(storedDocument, nodeId, Node.TEXT_NODE);
break;
case ATTRIBUTE_SECTION :
storedNode = new NodeProxy(storedDocument, nodeId, Node.ATTRIBUTE_NODE);
break;
case QNAME_SECTION :
storedNode = new NodeProxy(storedDocument, nodeId,
qname.getNameType() == ElementValue.ATTRIBUTE ?
Node.ATTRIBUTE_NODE : Node.ELEMENT_NODE);
break;
default :
throw new IllegalArgumentException("Invalid section type in '" + dbTokens.getFile().getName() + "'");
}
if (contextSet != null) {
NodeProxy parentNode;
switch (storedSection) {
case TEXT_SECTION :
case QNAME_SECTION:
parentNode = contextSet.parentWithChild(storedNode,
false, true, NodeProxy.UNKNOWN_NODE_LEVEL);
break;
case ATTRIBUTE_SECTION :
if (contextSet instanceof VirtualNodeSet) {
parentNode = contextSet.parentWithChild(storedNode,
false, true, NodeProxy.UNKNOWN_NODE_LEVEL);
if (parentNode != null && !parentNode.getNodeId().equals(nodeId))
{parentNode = null;}
} else {
parentNode = contextSet.get(storedNode);
}
break;
default :
throw new IllegalArgumentException("Invalid section type in '" +
dbTokens.getFile().getName() + "'");
}
if (parentNode != null) {
final Match match = new FTMatch(-1, nodeId, word.toString(), freq);
readOccurrences(freq, is, match, word.length());
final int sizeHint = contextSet.getSizeHint(storedDocument);
if (axis == NodeSet.ANCESTOR) {
parentNode.addMatch(match);
result.add(parentNode, sizeHint);
} else {
storedNode.addMatch(match);
result.add(storedNode, sizeHint);
}
} else
{is.skip(freq);}
} else {
final Match match = new FTMatch(-1, nodeId, word.toString(), freq);
readOccurrences(freq, is, match, word.length());
storedNode.addMatch(match);
result.add(storedNode, Constants.NO_SIZE_HINT);
}
}
}
} catch (final IOException e) {
LOG.error(e.getMessage() + " in '" + dbTokens.getFile().getName() + "'", e);
//TODO : throw exception ? -pb
}
}
//TOUNDERSTAND : why sort here ? -pb
if (contextSet != null)
{((NewArrayNodeSet) result).sort();}
return true;
}
}
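/**
* B-tree callback behind the scanIndexTerms() methods : decodes each key and
* aggregates per-term Occurrences for the documents concerned.
*/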
private final class IndexScanCallback implements BTreeCallback{
private DocumentSet docs;
private NodeSet contextSet;
private Map map = new TreeMap();
private XMLString word = new XMLString(64);
private boolean byQName;
IndexScanCallback(DocumentSet docs, NodeSet contextSet, boolean byQName) {
this.docs = docs;
this.contextSet = contextSet;
this.byQName = byQName;
}
/* (non-Javadoc)
* @see org.dbxml.core.filer.BTreeCallback#indexInfo(org.dbxml.core.data.Value, long)
*/
public boolean indexInfo(Value key, long pointer) throws TerminatedException {
word.reuse();
if (byQName)
{QNameWordRef.decode(key, word);}
else
{WordRef.decode(key, word);}
final String term = word.toString();
VariableByteInput is;
try {
is = dbTokens.getAsStream(pointer);
} catch (final IOException e) {
LOG.error(e.getMessage(), e);
//TODO : throw exception ? -pb
return true;
}
try {
while (is.available() > 0) {
boolean docAdded = false;
final int storedDocId = is.readInt();
final byte storedSection = is.readByte();
final int termCount = is.readInt();
//Read (variable) length of node IDs + frequency + offsets
final int length = is.readFixedInt();
final DocumentImpl storedDocument = docs.getDoc(storedDocId);
//Skip the entry if the document is not in the input set
if (storedDocument == null) {
is.skipBytes(length);
continue;
}
NodeId previous = null;
for (int m = 0; m < termCount; m++) {
NodeId nodeId = broker.getBrokerPool().getNodeFactory()
.createFromStream(previous, is);
previous = nodeId;
final int freq = is.readInt();
is.skip(freq);
if (contextSet != null) {
boolean include = false;
final NodeProxy parentNode = contextSet.parentWithChild(storedDocument,
nodeId, false, true);
switch (storedSection) {
case TEXT_SECTION :
case QNAME_SECTION :
//TODO : also test on Node.TEXT_NODE like below ? -pb
include = (parentNode != null);
break;
case ATTRIBUTE_SECTION :
include = (parentNode != null &&
parentNode.getNodeType() == Node.ATTRIBUTE_NODE);
break;
default :
throw new IllegalArgumentException("Invalid section type in '" +
dbTokens.getFile().getName() + "'");
}
if (include) {
Occurrences oc = (Occurrences) map.get(term);
if (oc == null) {
oc = new Occurrences(term);
map.put(term, oc);
}
if (!docAdded) {
oc.addDocument(storedDocument);
docAdded = true;
}
oc.addOccurrences(freq);
}
}
}
}
} catch(final IOException e) {
LOG.error(e.getMessage() + " in '" + dbTokens.getFile().getName() + "'", e);
//TODO : throw exception ? -pb
}
return true;
}
}
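/**
* A singly-linked list of long values, each carrying an occurrence counter.
*/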
private static class TermFrequencyList {
protected static class TermFreq implements Comparable {
long l;
int count = 1;
TermFreq next = null;
public TermFreq(long l) {
this.l = l;
}
public void increment() {
++count;
}
public int compareTo(Object o) {
final TermFreq other = (TermFreq)o;
if(l == other.l)
{return Constants.EQUAL;}
else
{return l < other.l ? Constants.INFERIOR : Constants.SUPERIOR;}
}
}
private TermFreq first = null;
private TermFreq last = null;
private int count = 0;
public void add(long l) {
if (first == null) {
first = new TermFreq( l );
last = first;
} else {
TermFreq next = new TermFreq( l );
last.next = next;
last = next;
}
++count;
}
public void incLastTerm() {
if(last != null)
{last.increment();}
}
public void setLastTermFreq(int freq) {
if (last != null)
{last.count = freq;}
}
public long getLast() {
if(last != null)
{return last.l;}
else
{return -1;}
}
public boolean contains(long l) {
TermFreq next = first;
while (next != null ) {
if(next.l == l)
{return true;}
next = next.next;
}
return false;
}
public int getSize() {
return count;
}
public TermFreq[] toArray() {
final TermFreq[] data = new TermFreq[count];
TermFreq next = first;
int i = 0;
while( next != null ) {
data[i++] = next;
next = next.next;
}
return data;
}
}
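/* Key layout for generic (non-QName) tokens :
* idx type (byte, IDX_GENERIC) | collection id | UTF-8 encoded token
*/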
private final static class WordRef extends Value {
public final static int LENGTH_IDX_TYPE = 1; //sizeof byte
public final static int OFFSET_IDX_TYPE = 0;
public final static int OFFSET_COLLECTION_ID = OFFSET_IDX_TYPE + WordRef.LENGTH_IDX_TYPE; //1
public final static int OFFSET_WORD = OFFSET_COLLECTION_ID + Collection.LENGTH_COLLECTION_ID; //3
public WordRef(int collectionId) {
len = WordRef.LENGTH_IDX_TYPE + Collection.LENGTH_COLLECTION_ID;
data = new byte[len];
data[OFFSET_IDX_TYPE] = IDX_GENERIC;
ByteConversion.intToByte(collectionId, data, OFFSET_COLLECTION_ID);
}
public WordRef(int collectionId, String word) {
len = WordRef.LENGTH_IDX_TYPE + Collection.LENGTH_COLLECTION_ID + UTF8.encoded(word);
data = new byte[len];
data[OFFSET_IDX_TYPE] = IDX_GENERIC;
ByteConversion.intToByte(collectionId, data, OFFSET_COLLECTION_ID);
UTF8.encode(word, data, OFFSET_WORD);
}
public static XMLString decode(Value key, XMLString word) {
final int prefixLength = WordRef.LENGTH_IDX_TYPE + Collection.LENGTH_COLLECTION_ID;
return UTF8.decode(key.getData(), prefixLength, key.getLength() - prefixLength, word);
}
public String toString() {
if (len > OFFSET_WORD)
{return new String(data, OFFSET_WORD, len - OFFSET_WORD, UTF_8);}
else {return "no word";}
}
}
//TODO : extend WordRef ?
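/* Key layout for QName-indexed tokens :
* idx type (byte, IDX_QNAME) | collection id | name type (byte) |
* namespace symbol (short) | local name symbol (short) | UTF-8 encoded token
*/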
private final static class QNameWordRef extends Value {
public final static int LENGTH_IDX_TYPE = 1; //sizeof byte
public final static int LENGTH_QNAME_TYPE = 1; //sizeof byte
public final static int OFFSET_IDX_TYPE = 0;
public final static int OFFSET_COLLECTION_ID = OFFSET_IDX_TYPE + QNameWordRef.LENGTH_IDX_TYPE; //1
public final static int OFFSET_QNAME_TYPE = OFFSET_COLLECTION_ID + Collection.LENGTH_COLLECTION_ID; //3
public final static int OFFSET_NS_URI = OFFSET_QNAME_TYPE + LENGTH_QNAME_TYPE; //4
public final static int OFFSET_LOCAL_NAME = OFFSET_NS_URI + SymbolTable.LENGTH_NS_URI; //6
public final static int OFFSET_WORD = OFFSET_LOCAL_NAME + SymbolTable.LENGTH_LOCAL_NAME; //8
public QNameWordRef(int collectionId) {
len = QNameWordRef.LENGTH_IDX_TYPE + Collection.LENGTH_COLLECTION_ID;
data = new byte[len];
data[OFFSET_IDX_TYPE] = IDX_QNAME;
ByteConversion.intToByte(collectionId, data, OFFSET_COLLECTION_ID);
pos = OFFSET_IDX_TYPE;
}
public QNameWordRef(int collectionId, QName qname, SymbolTable symbols) {
len = QNameWordRef.LENGTH_IDX_TYPE + Collection.LENGTH_COLLECTION_ID +
QNameWordRef.LENGTH_QNAME_TYPE + SymbolTable.LENGTH_NS_URI +
SymbolTable.LENGTH_LOCAL_NAME;
data = new byte[len];
final short namespaceId = symbols.getNSSymbol(qname.getNamespaceURI());
final short localNameId = symbols.getSymbol(qname.getLocalName());
data[OFFSET_IDX_TYPE] = IDX_QNAME;
ByteConversion.intToByte(collectionId, data, OFFSET_COLLECTION_ID);
data[OFFSET_QNAME_TYPE] = qname.getNameType();
ByteConversion.shortToByte(namespaceId, data, OFFSET_NS_URI);
ByteConversion.shortToByte(localNameId, data, OFFSET_LOCAL_NAME);
}
public QNameWordRef(int collectionId, QName qname, String word, SymbolTable symbols) {
len = UTF8.encoded(word) + QNameWordRef.LENGTH_IDX_TYPE + Collection.LENGTH_COLLECTION_ID +
LENGTH_QNAME_TYPE + SymbolTable.LENGTH_NS_URI + SymbolTable.LENGTH_LOCAL_NAME;
data = new byte[len];
final short namespaceId = symbols.getNSSymbol(qname.getNamespaceURI());
final short localNameId = symbols.getSymbol(qname.getLocalName());
data[OFFSET_IDX_TYPE] = IDX_QNAME;
ByteConversion.intToByte(collectionId, data, OFFSET_COLLECTION_ID);
data[OFFSET_QNAME_TYPE] = qname.getNameType();
ByteConversion.shortToByte(namespaceId, data, OFFSET_NS_URI);
ByteConversion.shortToByte(localNameId, data, OFFSET_LOCAL_NAME);
UTF8.encode(word, data, OFFSET_WORD);
}
public static XMLString decode(Value key, XMLString word) {
final int prefixLength = QNameWordRef.LENGTH_IDX_TYPE + Collection.LENGTH_COLLECTION_ID +
QNameWordRef.LENGTH_QNAME_TYPE + SymbolTable.LENGTH_NS_URI + SymbolTable.LENGTH_LOCAL_NAME;
return UTF8.decode(key.getData(), prefixLength, key.getLength() - prefixLength, word);
}
public String toString() {
if (len > OFFSET_WORD)
{return new String(data, OFFSET_WORD, len - OFFSET_WORD, UTF_8);}
else {return "no word";}
}
}
}