package org.exist.xquery.functions.fn;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.exist.dom.QName;
import org.exist.memtree.MemTreeBuilder;
import org.exist.xquery.BasicFunction;
import org.exist.xquery.Cardinality;
import org.exist.xquery.Function;
import org.exist.xquery.FunctionSignature;
import org.exist.xquery.XPathException;
import org.exist.xquery.XQueryContext;
import org.exist.xquery.value.FunctionParameterSequenceType;
import org.exist.xquery.value.FunctionReturnSequenceType;
import org.exist.xquery.value.NodeValue;
import org.exist.xquery.value.Sequence;
import org.exist.xquery.value.SequenceType;
import org.exist.xquery.value.Type;
import org.xml.sax.helpers.AttributesImpl;
/**
 * XPath and XQuery 3.0 F+O fn:analyze-string()
 *
 * The input string is scanned with a {@link java.util.regex.Pattern} and the
 * result is built as an in-memory XML tree: an analyze-string-result element
 * containing match / non-match children, where each match may contain
 * (possibly nested) group elements carrying an "nr" attribute.
 *
 * @author Adam Retter <adam@exist-db.org>
 * @serial 201211101626
 *
 * Corrections were made to the previous buggy version
 * by taking inspiration from the BaseX 7.3 version.
 */
public class FunAnalyzeString extends BasicFunction {

    private static final QName fnAnalyzeString = new QName("analyze-string", Function.BUILTIN_FUNCTION_NS);
    private static final QName QN_RESULT = new QName("analyze-string-result", Function.BUILTIN_FUNCTION_NS);
    private static final QName QN_MATCH = new QName("match", Function.BUILTIN_FUNCTION_NS);
    private static final QName QN_GROUP = new QName("group", Function.BUILTIN_FUNCTION_NS);
    private static final QName QN_NR = new QName("nr");
    private static final QName QN_NON_MATCH = new QName("non-match", Function.BUILTIN_FUNCTION_NS);

    /** Description shared by the two-argument and three-argument signatures. */
    private static final String DESCRIPTION =
        "Analyzes a string using a regular expression, returning an XML " +
        "structure that identifies which parts of the input string matched " +
        "or failed to match the regular expression, and in the case of " +
        "matched substrings, which substrings matched each " +
        "capturing group in the regular expression.";

    /** Signatures of fn:analyze-string: (input, pattern) and (input, pattern, flags). */
    public static final FunctionSignature signatures[] = {
        new FunctionSignature(
            fnAnalyzeString,
            DESCRIPTION,
            new SequenceType[] {
                new FunctionParameterSequenceType("input", Type.STRING,
                    Cardinality.ZERO_OR_ONE, "The input string"),
                new FunctionParameterSequenceType("pattern", Type.STRING,
                    Cardinality.EXACTLY_ONE, "The pattern")
            },
            new FunctionReturnSequenceType(Type.ELEMENT,
                Cardinality.EXACTLY_ONE, "The result of the analysis")
        ),
        new FunctionSignature(
            fnAnalyzeString,
            DESCRIPTION,
            new SequenceType[] {
                new FunctionParameterSequenceType("input", Type.STRING,
                    Cardinality.ZERO_OR_ONE, "The input string"),
                new FunctionParameterSequenceType("pattern", Type.STRING,
                    Cardinality.EXACTLY_ONE, "The pattern"),
                new FunctionParameterSequenceType("flags", Type.STRING,
                    Cardinality.EXACTLY_ONE, "Flags"),
            },
            new FunctionReturnSequenceType(Type.ELEMENT,
                Cardinality.EXACTLY_ONE, "The result of the analysis")
        )
    };

    public FunAnalyzeString(final XQueryContext context, final FunctionSignature signature) {
        super(context, signature);
    }

    /**
     * Builds the analyze-string-result element for the supplied arguments.
     *
     * An empty sequence or a zero-length string as input yields an empty
     * result element; in that case the pattern and flags are not evaluated.
     *
     * @param args args[0] is the input, args[1] the pattern and, when three
     *             arguments are passed, args[2] the flags
     * @param contextSequence not used by this function
     * @return the analyze-string-result element
     * @throws XPathException if the flags or the pattern are invalid
     */
    @Override
    public Sequence eval(final Sequence[] args, final Sequence contextSequence) throws XPathException {
        final MemTreeBuilder builder = new MemTreeBuilder(context);
        builder.startDocument();
        builder.startElement(QN_RESULT, null);

        final String input = args[0].isEmpty() ? "" : args[0].itemAt(0).getStringValue();
        if (!input.isEmpty()) {
            final String pattern = args[1].itemAt(0).getStringValue();
            final String flags = (args.length == 3) ? args[2].itemAt(0).getStringValue() : null;
            analyzeString(builder, input, pattern, flags);
        }

        builder.endElement();
        builder.endDocument();
        return (NodeValue) builder.getDocument().getDocumentElement();
    }

    /**
     * Scans the input with the pattern and emits alternating non-match and
     * match elements into the builder, in document order.
     *
     * @param builder receives the generated elements
     * @param input the non-empty string to analyze
     * @param pattern the regular expression, compiled with java.util.regex
     * @param flags XPath regex flags, or null when none were supplied
     * @throws XPathException if the flags contain an unsupported character or
     *                        the pattern cannot be compiled
     */
    private void analyzeString(final MemTreeBuilder builder, final String input, final String pattern, final String flags) throws XPathException {
        final int iFlags = (flags == null) ? 0 : parseStringFlags(flags);

        final Pattern ptn;
        try {
            ptn = Pattern.compile(pattern, iFlags);
        } catch (final IllegalArgumentException e) {
            // PatternSyntaxException is a subclass of IllegalArgumentException;
            // report it as an XPath error instead of leaking an unchecked exception
            throw new XPathException(this, "err:FORX0002: Invalid regular expression: " + e.getMessage(), e);
        }

        final Matcher matcher = ptn.matcher(input);
        int offset = 0;
        while (matcher.find()) {
            // text between the previous match and this one did not match
            if (matcher.start() != offset) {
                nonMatch(builder, input.substring(offset, matcher.start()));
            }
            match(builder, matcher, input, 0);
            offset = matcher.end();
        }

        // trailing text after the last match
        if (offset != input.length()) {
            nonMatch(builder, input.substring(offset));
        }
    }

    /**
     * Cursor handed back by {@link #match}: the next capturing group that
     * still has to be considered and the input offset up to which text has
     * already been written.
     */
    private static final class GroupPosition {
        public int groupNumber;
        public int position;

        public GroupPosition(final int groupNumber, final int position) {
            this.groupNumber = groupNumber;
            this.position = position;
        }
    }

    /**
     * Emits the element for one group of the current match: a match element
     * for group 0, otherwise a group element with an "nr" attribute. Groups
     * with a higher number whose end offset lies within this group are
     * emitted recursively as children, interleaved with the text between them.
     *
     * NOTE(review): nesting is inferred from match offsets rather than from
     * the structure of the pattern, so sibling groups that do not match in
     * left-to-right order (e.g. alternatives inside a repetition) may be
     * reported as nested - confirm whether this needs a pattern-aware fix.
     *
     * @param builder receives the generated element
     * @param matcher matcher positioned on the current match
     * @param input the string being analyzed
     * @param group number of the group to emit, 0 for the whole match
     * @return the next group number to process and the offset written so far
     */
    private GroupPosition match(final MemTreeBuilder builder, final Matcher matcher, final String input, final int group) {
        if (group == 0) {
            builder.startElement(QN_MATCH, null);
        } else {
            final AttributesImpl attributes = new AttributesImpl();
            attributes.addAttribute("", QN_NR.getLocalName(), QN_NR.getLocalName(), "int", Integer.toString(group));
            builder.startElement(QN_GROUP, attributes);
        }

        final int groupStart = matcher.start(group);
        final int groupEnd = matcher.end(group);
        final int groupCount = matcher.groupCount();

        GroupPosition groupAndPosition = new GroupPosition(group + 1, groupStart);
        while (groupAndPosition.groupNumber <= groupCount && matcher.end(groupAndPosition.groupNumber) <= groupEnd) {
            final int start = matcher.start(groupAndPosition.groupNumber);
            if (start >= 0) { //group matched
                if (groupAndPosition.position < start) {
                    // text of this group that precedes the nested group
                    builder.characters(input.substring(groupAndPosition.position, start));
                }
                groupAndPosition = match(builder, matcher, input, groupAndPosition.groupNumber);
            } else {
                groupAndPosition.groupNumber++; //skip to next group
            }
        }

        // remaining text of this group after its last nested group
        if (groupAndPosition.position < groupEnd) {
            builder.characters(input.substring(groupAndPosition.position, groupEnd));
            groupAndPosition.position = groupEnd;
        }

        builder.endElement();
        return groupAndPosition;
    }

    /**
     * Emits a non-match element containing the given text.
     *
     * @param builder receives the generated element
     * @param nonMatch the part of the input that did not match
     */
    private void nonMatch(final MemTreeBuilder builder, final String nonMatch) {
        builder.startElement(QN_NON_MATCH, null);
        builder.characters(nonMatch);
        builder.endElement();
    }

    /**
     * Translates XPath regex flag characters into java.util.regex flags.
     *
     * @param flags the flags string; may be empty
     * @return the bitwise OR of the corresponding {@link Pattern} flags
     * @throws XPathException if a character other than s, m, i, x or q is found
     */
    private int parseStringFlags(final String flags) throws XPathException {
        int iFlags = 0;
        for (final char c : flags.toCharArray()) {
            switch (c) {
                case 's':
                    iFlags |= Pattern.DOTALL;
                    break;
                case 'm':
                    iFlags |= Pattern.MULTILINE;
                    break;
                case 'i':
                    iFlags |= Pattern.CASE_INSENSITIVE;
                    break;
                case 'x':
                    // 'x' means "ignore whitespace in the pattern"; COMMENTS is the
                    // closest Java flag (it additionally treats '#' as a comment start).
                    // CANON_EQ, used previously, enables canonical equivalence instead.
                    iFlags |= Pattern.COMMENTS;
                    break;
                case 'q':
                    iFlags |= Pattern.LITERAL;
                    break;
                default:
                    throw new XPathException(this, "err:FORX0001: Invalid regular expression flag: '" + c + "'");
            }
        }
        return iFlags;
    }
}