/**
* This file is part of Erjang - A JVM-based Erlang VM
*
* Copyright (c) 2010 by Trifork
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
**/
package erjang.m.re;
import java.io.CharArrayWriter;
import java.io.IOException;
import java.io.StringWriter;
import java.io.UnsupportedEncodingException;
import java.lang.Character.UnicodeBlock;
import java.lang.Character.UnicodeScript;
import java.math.BigInteger;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.MatchResult;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import erjang.BIF;
import erjang.CharCollector;
import erjang.EAtom;
import erjang.EBigString;
import erjang.EBinary;
import erjang.ENative;
import erjang.EObject;
import erjang.ERT;
import erjang.ESeq;
import erjang.ESmall;
import erjang.EString;
import erjang.ETuple;
import erjang.ETuple2;
import erjang.ETuple4;
import erjang.NotImplemented;
import erjang.CharCollector.CollectingException;
import erjang.CharCollector.InvalidElementException;
import erjang.CharCollector.PartialDecodingException;
import erjang.driver.IO;
public class Native extends ENative {
/**
 * When true, {@code index} captures taken from a unicode subject are
 * reported as byte offset/length into the UTF-8 encoding of the subject
 * rather than as Java {@code char} positions (see {@code capture}).
 */
private static final boolean INDEX_COMPATIBLE = true;
// --- result atoms returned by run ---
public static EAtom am_nomatch = EAtom.intern("nomatch");
public static EAtom am_match = EAtom.intern("match");
// --- encodings (am_unicode doubles as the `unicode` option) ---
public static EAtom am_latin1 = EAtom.intern("latin1");
public static EAtom am_unicode = EAtom.intern("unicode");
// --- plain option atoms recognized by Options.init ---
public static EAtom am_anchored = EAtom.intern("anchored");
public static EAtom am_caseless = EAtom.intern("caseless");
public static EAtom am_dollar_endonly = EAtom.intern("dollar_endonly");
public static EAtom am_dotall = EAtom.intern("dotall");
public static EAtom am_extended = EAtom.intern("extended");
public static EAtom am_firstline = EAtom.intern("firstline");
public static EAtom am_multiline = EAtom.intern("multiline");
public static EAtom am_no_auto_capture = EAtom.intern("no_auto_capture");
public static EAtom am_dupnames = EAtom.intern("dupnames");
public static EAtom am_ungreedy = EAtom.intern("ungreedy");
// --- newline handling: tag of the {newline, Spec} option, bsr_* options, and Spec values ---
public static EAtom am_newline = EAtom.intern("newline");
public static EAtom am_bsr_anycrlf = EAtom.intern("bsr_anycrlf");
public static EAtom am_bsr_unicode = EAtom.intern("bsr_unicode");
public static EAtom am_cr = EAtom.intern("cr");
public static EAtom am_lf = EAtom.intern("lf");
public static EAtom am_crlf = EAtom.intern("crlf");
public static EAtom am_anycrlf = EAtom.intern("anycrlf");
public static EAtom am_any = EAtom.intern("any");
// --- run-time options: global matching, capture specs and capture types ---
public static EAtom am_global = EAtom.intern("global");
public static EAtom am_none = EAtom.intern("none");
public static EAtom am_index = EAtom.intern("index");
public static EAtom am_binary = EAtom.intern("binary");
public static EAtom am_list = EAtom.intern("list");
public static EAtom am_all = EAtom.intern("all");
public static EAtom am_first = EAtom.intern("first");
public static EAtom am_all_but_first = EAtom.intern("all_but_first");
// --- tags of the {capture, ...} and {offset, N} option tuples ---
public static EAtom am_capture = EAtom.intern("capture");
public static EAtom am_offset = EAtom.intern("offset");
/**
 * BIF re:run/2: matches {@code subject} against {@code pattern} using an
 * empty option list, by delegating to the three-argument form.
 */
@BIF
public static EObject run(EObject subject, EObject pattern) {
    final EObject noOptions = ERT.NIL;
    return run(subject, pattern, noOptions);
}
/**
 * BIF re:run/3. Thin entry point around {@link #run2}; any
 * RuntimeException raised by run2 propagates to the caller unchanged.
 * It is kept as a separate method so call tracing can be added in one
 * place when debugging.
 */
@BIF
static public EObject run(EObject subj, EObject re, EObject opts) {
    return run2(subj, re, opts);
}
/**
 * Implementation of re:run/3.
 *
 * <p>{@code re} is either an already compiled {@code ECompiledRE} or
 * something {@link #compile(EObject, EObject)} accepts; in the latter case it
 * is compiled with {@code opts}, and if compilation does not yield
 * {@code {ok, ECompiledRE}} the second element of the compile result is
 * returned as-is.
 *
 * <p>The run-time options in {@code opts} are layered on top of the options
 * stored in the compiled expression. Matching starts at the configured
 * offset; all capture positions are reported relative to the start of the
 * whole subject.
 *
 * @param subj subject (anything {@code Options.decode} understands)
 * @param re   compiled regular expression or pattern source
 * @param opts option list
 * @return {@code nomatch}, {@code match} (capture spec {@code none}), or
 *         {@code {match, Captures}}
 * @throws RuntimeException badarg (via {@code ERT.badarg}) when the options
 *         are not a valid list, the subject cannot be decoded, the offset is
 *         out of range or the capture spec is unsupported
 */
static public EObject run2(EObject subj, EObject re, EObject opts) {
    ECompiledRE regex;
    if (re instanceof ECompiledRE) {
        regex = (ECompiledRE) re;
    } else {
        ETuple2 res = compile(re, opts);
        EObject val = res.elem2;
        if (res.elem1 == ERT.am_ok && (val instanceof ECompiledRE)) {
            regex = (ECompiledRE) val;
        } else {
            return val;
        }
    }

    ESeq o = opts.testSeq();
    if (o == null) {
        // Options that are not a list used to blow up with a
        // NullPointerException below; report them as badarg instead.
        throw ERT.badarg(subj, re, opts);
    }

    Options o2;
    if (o.isNil()) {
        o2 = regex.options;
    } else {
        o2 = regex.options.re_init(o);
        if (o2 == null) {
            throw ERT.badarg(subj, re, opts);
        }
    }

    String subject = regex.options.decode(subj);
    if (subject == null) {
        throw ERT.badarg(subj, re, opts);
    }

    if (o2.offset > subject.length() || o2.offset < 0) {
        throw ERT.badarg(subj, re, opts);
    }

    // Match on a region of the full subject instead of on a substring.
    // With the default (anchoring, non-transparent) bounds the match
    // semantics are the same as matching the substring, but the
    // MatchResult positions stay relative to the full subject, which is
    // what capture() expects.
    Matcher matcher = regex.patt.matcher(subject);
    matcher.region(o2.offset, subject.length());

    if (o2.global) {
        ESeq result = ERT.NIL;

        while (matcher.find()) {
            MatchResult mr = matcher.toMatchResult();

            ESeq list;
            if (o2.capture_spec == am_all) {
                // Walk the groups from last to first so that cons
                // produces them in ascending order.
                ESeq l = ERT.NIL;
                for (int i = mr.groupCount(); i >= 0; i--) {
                    l = l.cons(capture(subject, mr, i, o2));
                }
                result = result.cons(l);
            } else if ((list = o2.capture_spec.testSeq()) != null) {
                ESeq l = ERT.NIL;
                while (!list.isNil()) {
                    EObject group = list.head();
                    ESmall num;
                    EAtom nam;
                    EString nam2;
                    if ((num = group.testSmall()) != null) {
                        l = l.cons(capture(subject, mr, num.value, o2));
                    } else if ((nam = group.testAtom()) != null) {
                        // unknown group names are silently skipped
                        Integer groupNo = o2.named_groups.get(nam.getName());
                        if (groupNo != null) {
                            l = l.cons(capture(subject, mr, groupNo.intValue(), o2));
                        }
                    } else if ((nam2 = group.testString()) != null) {
                        Integer groupNo = o2.named_groups.get(nam2.stringValue());
                        if (groupNo != null) {
                            l = l.cons(capture(subject, mr, groupNo.intValue(), o2));
                        }
                    } else {
                        throw new NotImplemented("named capture groups");
                    }
                    list = list.tail();
                }
                // l was built by prepending, so it is in reverse order of
                // the requested groups; restore the requested order.
                result = result.cons(l.reverse());
            } else {
                throw new NotImplemented("global and not all");
            }
        }

        if (result == ERT.NIL) {
            return am_nomatch;
        } else {
            return new ETuple2(am_match, result.reverse());
        }
    } else {
        if (matcher.find()) {
            if (o2.capture_spec == am_none) {
                return am_match;
            }

            MatchResult mr = matcher.toMatchResult();

            // Trailing groups that did not participate in the match are
            // dropped; group 0 always participates, so this terminates.
            int max = mr.groupCount();
            while (mr.start(max) == -1) {
                max -= 1;
            }

            ESeq il;
            if (o2.capture_spec == am_all) {
                ESeq l = ERT.NIL;
                for (int i = max; i >= 0; i--) {
                    l = l.cons(capture(subject, mr, i, o2));
                }
                return new ETuple2(am_match, l);
            } else if (o2.capture_spec == am_all_but_first) {
                ESeq l = ERT.NIL;
                for (int i = max; i > 0; i--) {
                    l = l.cons(capture(subject, mr, i, o2));
                }
                return new ETuple2(am_match, l);
            } else if (o2.capture_spec == am_first) {
                EObject l = capture(subject, mr, 0, o2);
                return new ETuple2(am_match, l);
            } else if ((il = o2.capture_spec.testSeq()) != null) {
                ESeq out = ERT.NIL;
                for (; !il.isNil(); il = il.tail()) {
                    EObject what = il.head();
                    ESmall idx = what.testSmall();
                    EAtom nam;
                    if (idx != null && mr.start(idx.value) != -1) {
                        EObject val = capture(subject, mr, idx.value, o2);
                        out = out.cons(val);
                    } else if ((nam = what.testAtom()) != null) {
                        Integer idx2 = o2.named_groups.get(nam.getName());
                        if (idx2 != null) {
                            EObject val = capture(subject, mr, idx2, o2);
                            out = out.cons(val);
                        } else {
                            // badarg?
                        }
                    } else {
                        out = out.cons(nocapture(o2));
                    }
                }
                return new ETuple2(am_match, out.reverse());
            } else {
                throw ERT.badarg(subj, re, opts);
            }
        } else {
            return am_nomatch;
        }
    }
}
/**
 * Value reported for a capture group that did not participate in a match.
 *
 * @param opts options in effect; only {@code capture_type} is consulted
 * @return the empty binary for {@code binary} captures, the empty list for
 *         {@code list} captures, and the atom {@code nomatch} for
 *         {@code index} captures
 * @throws InternalError if {@code capture_type} is none of the above
 */
private static EObject nocapture(Options opts) {
    if (opts.capture_type == am_binary) {
        return EBinary.EMPTY;
    } else if (opts.capture_type == am_list) {
        return ERT.NIL;
    } else if (opts.capture_type == am_index) {
        // This branch used to test capture_spec, which never holds `index`,
        // so index captures of unmatched groups ended in InternalError.
        return am_nomatch;
    } else {
        throw new InternalError("bad capture_type "+opts.capture_type);
    }
}
/**
 * Builds the Erlang term for one capture group of a match.
 *
 * @param subject     the string the positions in {@code mr} refer to
 * @param mr          match result to read the group boundaries from
 * @param group_index group number (0 is the whole match)
 * @param opts        options; {@code capture_type} selects the result
 *                    shape and {@code unicode} the encoding
 * @return {@code {Start, Length}} for {@code index} captures (UTF-8 byte
 *         based when INDEX_COMPATIBLE and unicode are both set), otherwise
 *         the captured text converted to a list or binary; the
 *         {@code nocapture} value when the group did not participate
 * @throws InternalError on an unknown capture type
 */
private static EObject capture(String subject, MatchResult mr, int group_index,
        Options opts) {
    final int from = mr.start(group_index);
    if (from == -1) {
        return nocapture(opts);
    }
    final int to = mr.end(group_index);

    if (opts.capture_type == am_index) {
        if (INDEX_COMPATIBLE && opts.unicode) {
            // Translate char positions into positions in the UTF-8 byte stream.
            final int byteStart = subject.substring(0, from).getBytes(StandardCharsets.UTF_8).length;
            final int byteLength = subject.substring(from, to).getBytes(StandardCharsets.UTF_8).length;
            return new ETuple2(ERT.box(byteStart), ERT.box(byteLength));
        }
        return new ETuple2(ERT.box(from), ERT.box(to - from));
    }

    if (opts.capture_type == am_list) {
        final EBigString text = EBigString.fromString(subject.substring(from, to));
        return erjang.m.unicode.Native.characters_to_list(text,
                opts.unicode ? am_unicode : am_latin1);
    }

    if (opts.capture_type == am_binary) {
        final EBigString text = EBigString.fromString(subject.substring(from, to));
        return erjang.m.unicode.Native.characters_to_binary(text,
                opts.unicode ? am_unicode : am_latin1);
    }

    throw new InternalError("bad capture type: "+opts.capture_type);
}
/**
 * BIF re:compile/1: compiles {@code obj1} with an empty option list by
 * delegating to the two-argument form.
 */
@BIF
static public EObject compile(EObject obj1) {
    final EObject noOptions = ERT.NIL;
    return compile(obj1, noOptions);
}
static class Options implements java.lang.Cloneable{
// Position in the subject at which run2 starts matching; set by {offset, N}.
public int offset = 0;
// Shape of each reported capture: am_index, am_list or am_binary (validated in init).
public EObject capture_type = am_index;
// Which groups to report: am_all, am_all_but_first, am_first, am_none,
// or a list of group numbers / names (validated in init).
public EObject capture_spec = am_all;
// Set by the `global` option; run2 then collects every match.
public boolean global;
// Set by the `unicode` option; selects UTF-8 decoding of subjects/patterns.
boolean unicode = false;
// Which line terminators are recognized; folded into `flags` at the end of init.
boolean newline_cr = false;
boolean newline_lf = true;
boolean newline_crlf = false;
boolean newline_any = false;
// java.util.regex.Pattern flag bits, passed to Pattern.compile by compile/2.
int flags = 0;
// Set to true by the `anchored` option. NOTE(review): it already defaults
// to true and is not read anywhere in this part of the file - confirm intent.
boolean anchored = true;
// Group name -> group number, filled in by countGroups.
Map<String,Integer> named_groups = new HashMap<>(3);
// Number of groups counted by countGroups in the most recent pattern.
public int group_count;
/**
 * Derives a new Options from this one with {@code opts} applied on top.
 * The copy is made with {@code clone()}, which this class does not
 * override, so reference fields such as {@code named_groups} are shared
 * with the original.
 *
 * @param opts option list to apply to the copy
 * @return the updated copy, or {@code null} if {@code init} rejects the options
 */
Options re_init(ESeq opts) {
    final Options copy;
    try {
        copy = (Options) this.clone();
    } catch (CloneNotSupportedException e) {
        throw new InternalError();
    }
    return copy.init(opts) ? copy : null;
}
/**
 * Applies a list of re options to this instance and validates the result.
 *
 * <p>Recognized: unicode | anchored | global | caseless | dotall |
 * extended | multiline | bsr_anycrlf | bsr_unicode | {newline, NLSpec} |
 * {capture, Spec} | {capture, Spec, Type} | {offset, N}. The options
 * dollar_endonly, firstline, no_auto_capture, dupnames and ungreedy are
 * known but raise NotImplemented.
 *
 * <p>A {@code null} list is accepted and returns immediately, before the
 * validation and newline-flag steps at the end.
 *
 * @param opts option list, may be {@code null}
 * @return {@code true} if all options were understood and consistent,
 *         {@code false} otherwise (fields may already have been modified)
 * @throws NotImplemented for unsupported options or newline combinations
 */
boolean init(ESeq opts) {
    if (opts == null) {
        return true;
    }

    for (ESeq remaining = opts; !remaining.isNil(); remaining = remaining.tail()) {
        final EObject opt = remaining.head();

        if (opt == am_unicode) {
            unicode = true;
        } else if (opt == am_anchored) {
            anchored = true;
        } else if (opt == am_global) {
            global = true;
        } else if (opt == am_caseless) {
            flags |= Pattern.CASE_INSENSITIVE;
        } else if (opt == am_dotall) {
            flags |= Pattern.DOTALL;
        } else if (opt == am_extended) {
            flags |= Pattern.COMMENTS;
        } else if (opt == am_multiline) {
            flags |= Pattern.MULTILINE;
        } else if (opt == am_dollar_endonly
                || opt == am_firstline
                || opt == am_no_auto_capture
                || opt == am_dupnames
                || opt == am_ungreedy) {
            throw new NotImplemented("regex option "+opt);
        } else if (opt == am_bsr_anycrlf) {
            newline_cr = true;
            newline_crlf = true;
            newline_lf = true;
            newline_any = false;
        } else if (opt == am_bsr_unicode) {
            newline_any = true;
        } else {
            // Everything else must be a tagged tuple of arity 2 or 3.
            final ETuple tup = opt.testTuple();
            if (tup == null) {
                return false;
            }
            final int arity = tup.arity();
            if (arity != 2 && arity != 3) {
                return false;
            }
            final EObject tag = tup.elm(1);

            if (arity == 2 && tag == am_newline) {
                // start from a clean slate, then enable what NLSpec asks for
                newline_cr = false;
                newline_crlf = false;
                newline_lf = false;
                newline_any = false;

                final EObject nlSpec = tup.elm(2);
                if (nlSpec == am_cr) {
                    newline_cr = true;
                } else if (nlSpec == am_lf) {
                    newline_lf = true;
                } else if (nlSpec == am_crlf) {
                    newline_crlf = true;
                } else if (nlSpec == am_anycrlf) {
                    newline_cr = true;
                    newline_lf = true;
                    newline_crlf = true;
                } else if (nlSpec == am_any) {
                    newline_any = true;
                } else {
                    return false;
                }
            } else if (tag == am_capture) {
                // {capture, Spec} implies the index type; {capture, Spec, Type} names it
                this.capture_spec = tup.elm(2);
                this.capture_type = (arity == 2) ? am_index : tup.elm(3);
            } else if (arity == 2 && tag == am_offset) {
                final ESmall off = tup.elm(2).testSmall();
                if (off == null) {
                    return false;
                }
                this.offset = off.value;
            } else {
                return false;
            }
        }
    }

    // capture_spec: one of the keyword atoms, or a list whose elements are
    // integers, strings or atoms.
    final boolean keywordSpec = capture_spec == am_all
            || capture_spec == am_all_but_first
            || capture_spec == am_first
            || capture_spec == am_none;
    if (!keywordSpec) {
        ESeq spec = capture_spec.testSeq();
        if (spec == null) {
            return false;
        }
        for (; !spec.isNil(); spec = spec.tail()) {
            final EObject val = spec.head();
            if (val.testSmall() == null && val.testString() == null && val.testAtom() == null) {
                return false;
            }
        }
    }

    if (capture_type != am_index
            && capture_type != am_list
            && capture_type != am_binary) {
        return false;
    }

    if (unicode && (flags & Pattern.CASE_INSENSITIVE) != 0) {
        flags |= Pattern.UNICODE_CASE;
    }

    newline_any |= (newline_lf & newline_cr & newline_crlf);

    if (!newline_any) {
        // "any" is the Java default and needs no flag; besides that only
        // lf-only can be expressed, via UNIX_LINES.
        final boolean lfOnly = newline_lf && !newline_cr && !newline_crlf;
        if (!lfOnly) {
            // TODO: this combination not supported by Java.
            throw new NotImplemented("regex newline options lf="
                    + newline_lf + "; cr=" + newline_cr + "; crlf="
                    + newline_crlf);
        }
        flags |= Pattern.UNIX_LINES;
    }

    return true;
}
/**
 * Converts a subject or pattern term into a Java string.
 *
 * <ul>
 * <li>An {@code ECompiledRE} yields the source of its compiled Java pattern.</li>
 * <li>With {@code unicode} set, the term is collected as UTF-8 character data.</li>
 * <li>Otherwise binaries, strings and iolists are read as latin1, one
 *     char per byte.</li>
 * </ul>
 *
 * @param io_or_char_list term to decode
 * @return the decoded string, or {@code null} if the term cannot be decoded
 */
String decode(EObject io_or_char_list) {
    if (io_or_char_list instanceof ECompiledRE) {
        ECompiledRE cr = (ECompiledRE) io_or_char_list;
        return cr.patt.pattern();
    }

    if (unicode) {
        CharArrayWriter out = new CharArrayWriter();
        CharCollector cc = new CharCollector(StandardCharsets.UTF_8, out);
        try {
            io_or_char_list.collectCharList(cc, ERT.NIL);
            cc.end();
        } catch (CollectingException e) {
            return null;
        } catch (InvalidElementException e) {
            return null;
        } catch (IOException e) {
            return null;
        } catch (PartialDecodingException e) {
            return null;
        }
        return out.toString();
    }

    EBinary bin = io_or_char_list.testBinary();
    if (bin != null) {
        return EString.make(bin).stringValue();
    }

    EString str = io_or_char_list.testString();
    if (str != null) {
        return str.stringValue();
    }

    List<ByteBuffer> bb = new ArrayList<ByteBuffer>();
    if (!io_or_char_list.collectIOList(bb)) {
        return null;
    }

    StringBuilder sb = new StringBuilder();
    for (ByteBuffer b : bb) {
        while (b.hasRemaining()) {
            // Mask to an unsigned value: a plain (char) cast sign-extends
            // bytes >= 0x80 into U+FF80..U+FFFF instead of U+0080..U+00FF.
            sb.append((char) (b.get() & 0xff));
        }
    }
    return sb.toString();
}
/** @return whether the {@code unicode} option is set on these options. */
public boolean isUnicode() {
    return this.unicode;
}
// Matches the opening of a named group, "(?<name>", followed by the rest of
// the line; group 1 is the name. Used by countGroups via find(i)/start().
static Pattern NAMED_GROUP = Pattern.compile("\\(\\?<([a-zA-Z0-9_]+)>.*");
// Matches a "\p{Name}" escape anywhere in a line: group 1 is the whole
// escape, group 2 (also named "name") is the property name. Because of the
// leading greedy ".*", group 1 is the LAST such escape on the line.
static Pattern PREDEFINED = Pattern.compile(".*(\\\\p\\{(?<name>[a-zA-Z][a-zA-Z0-9]*)\\}).*");
/**
 * Matches a "\p{Name}" or "\P{Name}" escape; meant to be used with
 * region(i, end).lookingAt() so that it only matches exactly at i.
 * Group 1 is the property name.
 */
private static final Pattern PREDEFINED_ANCHORED =
        Pattern.compile("\\\\[pP]\\{([a-zA-Z][a-zA-Z0-9]*)\\}");

/**
 * Scans an Erlang (PCRE style) pattern, records its groups and rewrites the
 * constructs java.util.regex does not accept as written.
 *
 * <p>Side effects: resets and recomputes {@code group_count} and adds each
 * named group to {@code named_groups}.
 *
 * <p>Rewrites performed:
 * <ul>
 * <li>{@code \p{Name}}: when Name is a Unicode script it becomes
 *     {@code \p{IsName}}, when it is a Unicode block {@code \p{InName}},
 *     otherwise it is left alone;</li>
 * <li>{@code (?<name>} is reduced to a plain {@code (} (the name to
 *     number mapping is kept in {@code named_groups});</li>
 * <li>a {@code [} inside a character class is escaped;</li>
 * <li>a <code>{</code> not followed by a digit is escaped, since Java
 *     rejects it as a malformed repetition.</li>
 * </ul>
 *
 * @param pattern pattern source
 * @return the rewritten pattern, or {@code pattern} itself when nothing
 *         had to be changed
 */
private String countGroups(String pattern) {
    // index of the first char of `pattern` not yet copied into `sb`
    int start = 0;
    StringBuilder sb = new StringBuilder();
    boolean in_ch_class = false;

    group_count = 0;

    Matcher namedGroupMatcher = NAMED_GROUP.matcher(pattern);
    Matcher predefinedMatcher = PREDEFINED_ANCHORED.matcher(pattern);

    for (int i = 0; i < pattern.length(); i++) {
        char ch = pattern.charAt(i);

        if (ch == '\\') {
            // Anchor the check at i. Searching from i with a ".*" prefixed
            // pattern only ever lined up with the last \p{..} on the line,
            // so earlier ones fell through and had their '{' escaped below.
            if (predefinedMatcher.region(i, pattern.length()).lookingAt()) {
                String predefined = predefinedMatcher.group(1);
                String javaPredefName = predefined;

                try {
                    UnicodeScript.forName(predefined);
                    javaPredefName = "Is" + predefined;
                } catch (IllegalArgumentException notAScript) {
                    try {
                        UnicodeBlock.forName(predefined);
                        javaPredefName = "In" + predefined;
                    } catch (IllegalArgumentException notABlock) {
                        // neither script nor block: keep the name as written
                    }
                }

                // copy everything up to and including "\p{", then the new name
                sb.append(pattern, start, i + 3).append(javaPredefName).append('}');
                i += predefined.length() + 3; // now at the closing '}'
                start = i + 1;
                continue;
            }

            // any other escape: skip the escaped character
            i += 1;
            continue;
        }

        if (ch == '(' && !lookingAt(pattern, i + 1, "?:")) {
            group_count += 1;

            if (namedGroupMatcher.find(i)) {
                if (namedGroupMatcher.start() == i) {
                    String name = namedGroupMatcher.group(1);
                    named_groups.put(name, group_count);
                    // keep "(" and drop "?<name>"
                    sb.append(pattern, start, i + 1);
                    start = i + 4 + name.length();
                    i = start - 1;
                }
            }
        } else if (ch == '[') {
            if (!in_ch_class) {
                in_ch_class = true;
            } else {
                sb.append(pattern, start, i).append('\\');
                start = i;
            }
        } else if (ch == ']') {
            in_ch_class = false;
        } else if (ch == '{' && i + 1 < pattern.length()) {
            // if seeing "{[^0-9]" then insert escape, as java complains about non-numeric
            char next = pattern.charAt(i + 1);
            if (!(next >= '0' && next <= '9')) {
                sb.append(pattern, start, i).append('\\');
                start = i;
            }
        }
    }

    // Every rewrite appends at least one char to sb, so an empty sb means
    // the pattern is unchanged. (Testing `start == 0` here lost an escape
    // inserted at index 0, e.g. for a pattern starting with "{a".)
    if (sb.length() == 0) {
        return pattern;
    }

    sb.append(pattern, start, pattern.length());
    return sb.toString();
}
/**
 * Tests whether {@code find} occurs in {@code base} starting exactly at
 * index {@code i}.
 *
 * @return {@code false} when fewer than {@code find.length()} characters
 *         remain from {@code i}, or when any character differs
 */
private static boolean lookingAt(String base, int i, CharSequence find) {
    final int needed = find.length();
    if (needed > base.length() - i) {
        return false;
    }
    int matched = 0;
    while (matched < needed) {
        if (base.charAt(i + matched) != find.charAt(matched)) {
            return false;
        }
        matched++;
    }
    return true;
}
/**
 * Translates an Erlang pattern into the form handed to
 * {@code Pattern.compile}; this simply delegates to {@code countGroups},
 * which also records group information on this instance.
 */
public String process_erl2java(String pattern) {
    final String translated = countGroups(pattern);
    return translated;
}
}
//
// This is an enormous ugly hack, but since Elixir just has two special-case
// regular expressions, we'll encode them specifically into erjang.
// Each pattern is kept both as the Java regex source to use and as the
// byte image that is_special_pattern compares incoming data against.
//
// output of BEAM's re:compile(<<"\\(\\?<(?<G>[^>]*)>">>)
static String ELIXIR_GROUPS_PATTERN = "\\(\\?<(?<G>[^>]*)>";
static byte[] ELIXIR_GROUPS_PATTERN_BEAM = new byte[] {
69, 82, 67, 80, 80, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 1, 0, 0, 0, 40, 0, 62,
2, 48, 0, 4, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1, 71, 0, 93, 0, 21, 27, 40, 27, 63, 27, 60, 94, 0, 7, 0, 1, 43, 62, 84,
0, 7, 27, 62, 84, 0, 21, 0, -1, -1, -1
};
// output of BEAM's re:compile(<<"[.^$*+?()[{\\\|\s#]">>, [unicode]).
static String ELIXIR_ESCAPE_PATTERN = "[\\.\\^\\$\\*\\+\\?\\(\\)\\[\\{\\\\\\|\\s\\#]";
static byte[] ELIXIR_ESCAPE_PATTERN_BEAM = new byte[] {
69, 82, 67, 80, 88, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
48, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 93, 0,
36, 77, 0, 54, 0, 0, 25, 79, 0, -128, 0, 0, 0, 88, 0, 0, 0, 24, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 84, 0, 36, 0
};
/**
 * Recognizes the two hard-coded Elixir patterns from their byte image.
 *
 * <p>The escape pattern must match byte for byte and, as a side effect,
 * switches {@code o.unicode} on. The groups pattern must have the same
 * length and agree on everything except the last four bytes, which are
 * not compared.
 *
 * @param raw byte image to inspect
 * @param o   options to update when the escape pattern is recognized
 * @return the Java regex source of the recognized pattern, or {@code null}
 */
static String is_special_pattern(byte[] raw, Options o) {
    if (Arrays.equals(raw, ELIXIR_ESCAPE_PATTERN_BEAM)) {
        o.unicode = true;
        return ELIXIR_ESCAPE_PATTERN;
    }

    if (raw.length != ELIXIR_GROUPS_PATTERN_BEAM.length) {
        return null;
    }

    final int compared = raw.length - 4;
    for (int pos = 0; pos < compared; pos++) {
        if (raw[pos] != ELIXIR_GROUPS_PATTERN_BEAM[pos]) {
            return null;
        }
    }
    return ELIXIR_GROUPS_PATTERN;
}
/**
 * BIF re:compile/2.
 *
 * <p>Accepted inputs for {@code obj1}:
 * <ul>
 * <li>an {@code ECompiledRE}, returned unchanged as {@code {ok, RE}};</li>
 * <li>a 4-tuple tagged {@code ECompiledRE.am_re_pattern} whose fourth
 *     element is a binary: either {@code /pattern/options} text, or one of
 *     the byte images known to {@code is_special_pattern};</li>
 * <li>anything {@code Options.decode} can turn into a pattern string.</li>
 * </ul>
 *
 * @param obj1 pattern
 * @param obj2 option list
 * @return {@code {ok, ECompiledRE}}, or {@code {error, {Description, Index}}}
 *         when Java rejects the translated pattern
 * @throws RuntimeException badarg (via {@code ERT.badarg}) for invalid
 *         options or a pattern that cannot be decoded
 */
@BIF
static public ETuple2 compile(EObject obj1, EObject obj2) {
    if (obj1 instanceof ECompiledRE) {
        return new ETuple2(ERT.am_ok, obj1);
    }

    ESeq opts = obj2.testSeq();
    Options o = new Options();
    String pattern;

    ETuple4 tup = ETuple4.cast(obj1);
    if (tup != null && tup.elem1 == ECompiledRE.am_re_pattern) {
        EBinary b = tup.elem4.testBinary();
        if (b == null) {
            // The null check used to come after b had already been
            // dereferenced, so a non-binary payload raised a
            // NullPointerException instead of badarg.
            throw ERT.badarg(obj1, obj2);
        }
        byte[] byteArray = b.getByteArray();

        if (b.byteSize() > 0 && b.byteAt(0) == '/') {
            byte[] raw = byteArray;

            // Locate the '/' that terminates the pattern, searching backwards.
            // NOTE(review): byteAt is given 0 above and i*8 here; presumably
            // it takes a bit offset - confirm against EBinary.
            int end = raw.length - 1;
            for (int i = b.byteSize() - 1; i > 0; i--) {
                if (b.byteAt(i * 8) == '/') {
                    end = i;
                    break;
                }
            }

            pattern = new String(raw, 1, end - 1, IO.UTF8);
            // options embedded after the closing '/'; the result of this
            // init call is deliberately not checked (as before)
            o.init(ECompiledRE.decode_options(raw, end + 1));

            if (!o.init(opts)) {
                throw ERT.badarg(obj1, obj2);
            }
        } else if ((pattern = is_special_pattern(byteArray, o)) != null) {
            // ok //
        } else {
            // unknown byte image: dump it so it can be identified, then fail
            System.out.println("byte data[] = { ");
            for (int i = 0; i < byteArray.length; i++) {
                System.out.print(", "+byteArray[i]);
            }
            System.out.println(" } ");

            throw ERT.badarg(obj1, obj2);
        }
    } else {
        if (!o.init(opts)) {
            throw ERT.badarg(obj1, obj2);
        }
        pattern = o.decode(obj1);
    }

    if (pattern == null) {
        throw ERT.badarg(obj1, obj2);
    }

    String stripped_pattern = o.process_erl2java(pattern);

    try {
        Pattern c = Pattern.compile(stripped_pattern, o.flags);
        return new ETuple2(ERT.am_ok, new ECompiledRE(o, c, pattern));
    } catch (PatternSyntaxException e) {
        return new ETuple2(ERT.am_error, new ETuple2(EString.fromString(e
                .getDescription()+" in /"+stripped_pattern+"/"), ERT.box(e.getIndex())));
    }
}
}