Source Code of org.exist.xquery.functions.fn.ExtPhrase

/*
 * eXist Open Source Native XML Database
 * Copyright (C) 2005-2009 The eXist Project
 * http://exist-db.org
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *  
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program; if not, write to the Free Software Foundation
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 *  
 *  $Id$
 */
package org.exist.xquery.functions.fn;


import org.apache.log4j.Logger;


import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;


import org.exist.EXistException;
import org.exist.dom.ExtArrayNodeSet;
import org.exist.dom.Match;
import org.exist.dom.NodeProxy;
import org.exist.dom.NodeSet;
import org.exist.numbering.NodeId;
import org.exist.storage.NativeTextEngine;
import org.exist.storage.analysis.TextToken;
import org.exist.storage.analysis.Tokenizer;
import org.exist.util.GlobToRegex;
import org.exist.xquery.Constants;
import org.exist.xquery.XPathException;
import org.exist.xquery.XQueryContext;
import org.exist.xquery.util.ExpressionDumper;
import org.exist.xquery.value.Sequence;


/**
 *  phrase() function : search an exact phrase in a NodeSet
 *
 *@author     Bruno Chatel <bcha@chadocs.com> (March 30, 2005)
 */
public class ExtPhrase extends ExtFulltext {
    private static final Logger logger = Logger.getLogger(ExtPhrase.class);


    /**
     * 
     * @param context
     */
    public ExtPhrase(XQueryContext context) {
        super(context, Constants.FULLTEXT_AND);
    }


    /**
     * 
     * @param searchArg
     * @param nodes 
     */
    public Sequence evalQuery(String searchArg,  NodeSet nodes) throws XPathException {
        String[] terms;
        try {
            terms = getSearchTerms(searchArg);
        } catch (final EXistException e) {
            throw new XPathException(e.getMessage());
        }
        final NodeSet hits = processQuery(terms, nodes);
        if (hits == null)
            {return Sequence.EMPTY_SEQUENCE;}
        boolean hasWildcards = false;
        for (int i = 0; i < terms.length; i++) {
            hasWildcards |=  NativeTextEngine.containsWildcards(terms[i]);
        }
        return hasWildcards  ? patternMatch(context, terms, hits) : exactMatch(context, terms, hits);
    }


    /**
     * 
     * @param context
     * @param result
     */
    private Sequence exactMatch(XQueryContext context, String[] terms, NodeSet result) {
        TextToken token;
        final NodeSet r = new ExtArrayNodeSet();
        final Tokenizer tok = context.getBroker().getTextEngine().getTokenizer();
        //Define search phrase for matches
        String matchTerm = "";
        for (int k = 0; k < terms.length ; k++) {
            matchTerm = matchTerm + terms[k];
            if (k != terms.length - 1)
                {matchTerm = matchTerm + "\\W*";} 
        }
        //Iterate on results
        for (final NodeProxy current : result) {
            final Vector<NodeId> matchNodeIDs = new Vector<NodeId>();
            //Get first match
            Match nextMatch = current.getMatches();
            //Remove previously found matches on current
            current.setMatches(null);
            //Iterate on attach matches, with unicity of related nodeproxy gid
            String term;
            while(nextMatch != null) {
                final NodeId nodeId= nextMatch.getNodeId();
                //If current node id has not been previously processed
                if (!matchNodeIDs.contains(nodeId)) {
                    final NodeProxy mcurrent = new NodeProxy(current.getDocument(), nodeId);
                    Match match = null;
                    int firstOffset = -1;
                    matchNodeIDs.add(nodeId);
                    final String value = mcurrent.getNodeValue();
                    tok.setText(value);
                    int j = 0;
                    if (j < terms.length)
                        {term = terms[j];}
                    else
                        {break;}
                    int frequency = 0;
                    while ((token = tok.nextToken()) != null) {
                        final String word = token.getText().toLowerCase();
                        if (word.equalsIgnoreCase(term)) {
                            j++;
                            if (j == terms.length) {
                                //All terms found
                                if (match == null)
                                    {match = nextMatch.createInstance(getExpressionId(),
                                        nodeId, matchTerm);}
                                if (firstOffset < 0)
                                    {firstOffset = token.startOffset();}
                                match.addOffset(firstOffset, token.endOffset() - firstOffset);
                                frequency++;
                                //Start again on fist term
                                j = 0;
                                term = terms[j];
                                continue;
                            } else {
                                term = terms[j];
                                if (firstOffset < 0)
                                    {firstOffset = token.startOffset();}
                            }
                        } else if (j > 0 && word.equalsIgnoreCase(terms[0])) {
                            //First search term found: start again
                            j = 1;
                            term = terms[j];
                            firstOffset = token.startOffset();
                            continue;
                        } else {
                            //Reset
                            j = 0;
                            firstOffset = -1;
                            term = terms[j];
                        }
                    }
                    //If phrase found
                    if (frequency != 0) {
                        //Add new match to current
                        current.addMatch(match);
                        //Add current to result
                        r.add(current);
                        //Reset frequency
                        frequency=0;
                    }
                }
                //Process next match
                nextMatch = nextMatch.getNextMatch();
            }
        }
        return r;
    }


    /**
     * 
     * @param context
     * @param result
     * 
     */
    private Sequence patternMatch(XQueryContext context, String[] terms, NodeSet result) {
        //Generate list of search term patterns
        final Pattern patterns[] = new Pattern[terms.length];
        final Matcher matchers[] = new Matcher[terms.length];
        for (int i = 0; i < patterns.length; i++) {
            try {
                patterns[i] = Pattern.compile(GlobToRegex.globToRegexp(terms[i]), 
                        Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);
                matchers[i] = patterns[i].matcher("");
            } catch (final PatternSyntaxException e) {
                logger.error("malformed pattern" + e.getMessage());
                return Sequence.EMPTY_SEQUENCE;
            }
        }
        //Walk through hits 
        final ExtArrayNodeSet r = new ExtArrayNodeSet();
        final Tokenizer tok = context.getBroker().getTextEngine().getTokenizer();
        Matcher matcher;
        for (final NodeProxy current : result) {
            Match nextMatch;
            final Vector<NodeId> matchGid = new Vector<NodeId>();
            //Get first match
            nextMatch = current.getMatches();
            //Remove previously found matches on current
            current.setMatches(null);
            //Iterate on attach matches, with unicity of related nodeproxy gid
            while (nextMatch != null) {
                final Hashtable<String, Match> matchTable = new Hashtable<String, Match>();
                final NodeId nodeId = nextMatch.getNodeId(); 
                //If current node id has not been previously processed
                if (!matchGid.contains(nodeId)) {
                    final NodeProxy mcurrent = new NodeProxy(current.getDocument(), nodeId);
                    //Add it in node id array
                    matchGid.add(nodeId);
                    final String value = mcurrent.getNodeValue();
                    tok.setText(value);
                    int j = 0;
                    if (j < patterns.length) {
                        matcher = matchers[j];
                    } else
                        {break;}
                    String matchTerm = null;
                    TextToken token;
                    while ((token = tok.nextToken()) != null) {
                        String word = token.getText().toLowerCase();
                        matcher.reset(word);
                        matchers[0].reset(word);
                        if (matcher.matches()) {
                            j++;
                            if (matchTerm == null)
                                {matchTerm=word;}
                            else
                                {matchTerm = matchTerm + "\\W*" + word;}
                            if (j == patterns.length) {
                                //All terms found
                                if (matchTable.containsKey(matchTerm)) {
                                    //Previously found matchTerm
                                    final Match match = matchTable.get(matchTerm);
                                    match.addOffset(token.startOffset(), matchTerm.length());
                                } else {
                                    final Match match = nextMatch.createInstance(getExpressionId(),
                                        nodeId, matchTerm);
                                    match.addOffset(token.startOffset(), matchTerm.length());
                                    matchTable.put(matchTerm,match);
                                }
                                //Start again on fist term
                                j = 0;
                                matcher = matchers[j];
                                matchTerm = null;
                                continue;
                            } else {
                                matcher = matchers[j];
                            }
                        } else if (j > 0 && matchers[0].matches()) {
                            //First search term found: start again
                            j = 1;
                            //Pattern term = patterns[j];
                            matcher = matchers[j];
                            matchTerm = word;
                            continue;
                        } else {
                            //Reset
                            j = 0;
                            matcher = matchers[j];
                            matchTerm = null;
                            continue;
                        }
                    }
                    //One or more match found
                    if (matchTable.size() != 0) {
                        final Enumeration<Match> eMatch = matchTable.elements();
                        while (eMatch.hasMoreElements()){
                            final Match match = eMatch.nextElement();
                            current.addMatch(match);
                        }
                        //Add current to result
                        r.add(current);
                    }
                }
                //Process next match
                nextMatch = nextMatch.getNextMatch();
            }
        }
        return r;
    }


    /* (non-Javadoc)
     * @see org.exist.xquery.functions.ExtFulltext#dump(org.exist.xquery.util.ExpressionDumper)
     */
    public void dump(ExpressionDumper dumper) {
        dumper.display("phrase(");
        path.dump(dumper);
        dumper.display(", ");
        searchTerm.dump(dumper);
        dumper.display(")");
    }


    public String toString() {
        final StringBuilder buf = new StringBuilder();
        buf.append("phrase(");
        buf.append(path);
        buf.append(", ");
        buf.append(searchTerm);
        buf.append(")");
        return buf.toString();
    }


}
Source Code of org.exist.xquery.functions.fn.ExtPhrase

Related Classes of org.exist.xquery.functions.fn.ExtPhrase