Source Code of org.elasticsearch.search.aggregations.bucket.terms.support.IncludeExclude$Parser

/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.elasticsearch.search.aggregations.bucket.terms.support;


import com.carrotsearch.hppc.LongOpenHashSet;
import com.carrotsearch.hppc.LongSet;
import org.apache.lucene.index.RandomAccessOrds;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.*;
import org.elasticsearch.ElasticsearchParseException;
import org.elasticsearch.ExceptionsHelper;
import org.elasticsearch.common.regex.Regex;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.search.aggregations.InternalAggregation;
import org.elasticsearch.search.aggregations.support.ValuesSource;
import org.elasticsearch.search.internal.SearchContext;


import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


/**
 * Defines the include/exclude regular expression filtering for string terms aggregation. In this filtering logic,
 * exclusion has precedence, where the {@code include} is evaluated first and then the {@code exclude}.
 */
public class IncludeExclude {


    // The includeValue and excludeValue ByteRefs which are the result of the parsing 
    // process are converted into a LongFilter when used on numeric fields 
    // in the index.
    public static class LongFilter {
        private LongSet valids;
        private LongSet invalids;


        private LongFilter(int numValids, int numInvalids) {
            if (numValids > 0) {
                valids = new LongOpenHashSet(numValids);
            }
            if (numInvalids > 0) {
                invalids = new LongOpenHashSet(numInvalids);
            }
        }


        public boolean accept(long value) {
            return ((valids == null) || (valids.contains(value))) && ((invalids == null) || (!invalids.contains(value)));
        }


        private void addAccept(long val) {
            valids.add(val);
        }


        private void addReject(long val) {
            invalids.add(val);
        }
    }


    private final Matcher include;
    private final Matcher exclude;
    private final CharsRefBuilder scratch = new CharsRefBuilder();
    private Set<BytesRef> includeValues;
    private Set<BytesRef> excludeValues;
    private final boolean hasRegexTest;


    /**
     * @param include   The regular expression pattern for the terms to be included
     *                  (may only be {@code null} if one of the other arguments is none-null.
     * @param includeValues   The terms to be included
     *                  (may only be {@code null} if one of the other arguments is none-null.
     * @param exclude   The regular expression pattern for the terms to be excluded
     *                  (may only be {@code null} if one of the other arguments is none-null.
     * @param excludeValues   The terms to be excluded
     *                  (may only be {@code null} if one of the other arguments is none-null.
     */
    public IncludeExclude(Pattern include, Pattern exclude, Set<BytesRef> includeValues, Set<BytesRef> excludeValues) {
        assert includeValues != null || include != null || 
                exclude != null || excludeValues != null : "includes & excludes cannot both be null"; // otherwise IncludeExclude object should be null
        this.include = include != null ? include.matcher("") : null;
        this.exclude = exclude != null ? exclude.matcher("") : null;
        hasRegexTest = include != null || exclude != null;
        this.includeValues = includeValues;
        this.excludeValues = excludeValues;
    }


    /**
     * Returns whether the given value is accepted based on the {@code include} & {@code exclude} patterns.
     */
    public boolean accept(BytesRef value) {


        if (hasRegexTest) {
            // We need to perform UTF8 to UTF16 conversion for use in the regex matching
            scratch.copyUTF8Bytes(value);            
        }
        return isIncluded(value, scratch.get()) && !isExcluded(value, scratch.get());
    }
    
    private boolean isIncluded(BytesRef value, CharsRef utf16Chars) {


        if ((includeValues == null) && (include == null)) {
            // No include criteria to be tested.
            return true;
        }
        
        if (include != null) {
            if (include.reset(scratch.get()).matches()) {
                return true;
            }
        }
        if (includeValues != null) {
            if (includeValues.contains(value)) {
                return true;
            }
        }
        // Some include criteria was tested but no match found
        return false;
    }
    
    private boolean isExcluded(BytesRef value, CharsRef utf16Chars) {
        if (exclude != null) {
            if (exclude.reset(scratch.get()).matches()) {
                return true;
            }
        }
        if (excludeValues != null) {
            if (excludeValues.contains(value)) {
                return true;
            }
        }
        // No exclude criteria was tested or no match found
        return false;
    }


    /**
     * Computes which global ordinals are accepted by this IncludeExclude instance.
     */
    public LongBitSet acceptedGlobalOrdinals(RandomAccessOrds globalOrdinals, ValuesSource.Bytes.WithOrdinals valueSource) {
        LongBitSet acceptedGlobalOrdinals = new LongBitSet(globalOrdinals.getValueCount());
        // There are 3 ways of populating this bitset: 
        // 1) Looking up the global ordinals for known "include" terms
        // 2) Looking up the global ordinals for known "exclude" terms
        // 3) Traversing the term enum for all terms and running past regexes
        // Option 3 is known to be very slow in the case of high-cardinality fields and
        // should be avoided if possible.
        if (includeValues != null) {
            // optimize for the case where the set of accepted values is a set
            // of known terms, not a regex that would have to be tested against all terms in the index
            for (BytesRef includeValue : includeValues) {
                // We need to perform UTF8 to UTF16 conversion for use in the regex matching
                scratch.copyUTF8Bytes(includeValue); 
                if (!isExcluded(includeValue, scratch.get())) {
                    long ord = globalOrdinals.lookupTerm(includeValue);
                    if (ord >= 0) {
                        acceptedGlobalOrdinals.set(ord);
                    }
                }
            }
        } else {
            if(hasRegexTest) {
                // We have includeVals that are a regex or only regex excludes - we need to do the potentially 
                // slow option of hitting termsEnum for every term in the index.
                TermsEnum globalTermsEnum = valueSource.globalOrdinalsValues().termsEnum();
                try {
                    for (BytesRef term = globalTermsEnum.next(); term != null; term = globalTermsEnum.next()) {
                        if (accept(term)) {
                            acceptedGlobalOrdinals.set(globalTermsEnum.ord());
                        }
                    }
                } catch (IOException e) {
                    throw ExceptionsHelper.convertToElastic(e);
                }
            } else {
                // we only have a set of known values to exclude - create a bitset with all good values and negate the known bads
                acceptedGlobalOrdinals.set(0, acceptedGlobalOrdinals.length());
                for (BytesRef excludeValue : excludeValues) {
                    long ord = globalOrdinals.lookupTerm(excludeValue);
                    if (ord >= 0) {
                        acceptedGlobalOrdinals.clear(ord);
                    }
                }
                
            }
        }
        return acceptedGlobalOrdinals;
    }


    public static class Parser {


        private final String aggName;
        private final InternalAggregation.Type aggType;
        private final SearchContext context;


        String include = null;
        int includeFlags = 0; // 0 means no flags
        String exclude = null;
        int excludeFlags = 0; // 0 means no flags
        Set<BytesRef> includeValues;
        Set<BytesRef> excludeValues;


        public Parser(String aggName, InternalAggregation.Type aggType, SearchContext context) {
            this.aggName = aggName;
            this.aggType = aggType;
            this.context = context;
        }


        public boolean token(String currentFieldName, XContentParser.Token token, XContentParser parser) throws IOException {


            if (token == XContentParser.Token.VALUE_STRING) {
                if ("include".equals(currentFieldName)) {
                    include = parser.text();
                } else if ("exclude".equals(currentFieldName)) {
                    exclude = parser.text();
                } else {
                    return false;
                }
                return true;
            }
            
            if (token == XContentParser.Token.START_ARRAY) {
                if ("include".equals(currentFieldName)) {
                     includeValues = parseArrayToSet(parser);
                     return true;
                } 
                if ("exclude".equals(currentFieldName)) {
                      excludeValues = parseArrayToSet(parser);
                      return true;
                }
                return false;
            }


            if (token == XContentParser.Token.START_OBJECT) {
                if ("include".equals(currentFieldName)) {
                    while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) {
                        if (token == XContentParser.Token.FIELD_NAME) {
                            currentFieldName = parser.currentName();
                        } else if (token == XContentParser.Token.VALUE_STRING) {
                            if ("pattern".equals(currentFieldName)) {
                                include = parser.text();
                            } else if ("flags".equals(currentFieldName)) {
                                includeFlags = Regex.flagsFromString(parser.text());
                            }
                        } else if (token == XContentParser.Token.VALUE_NUMBER) {
                            if ("flags".equals(currentFieldName)) {
                                includeFlags = parser.intValue();
                            }
                        }
                    }
                } else if ("exclude".equals(currentFieldName)) {
                    while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) {
                        if (token == XContentParser.Token.FIELD_NAME) {
                            currentFieldName = parser.currentName();
                        } else if (token == XContentParser.Token.VALUE_STRING) {
                            if ("pattern".equals(currentFieldName)) {
                                exclude = parser.text();
                            } else if ("flags".equals(currentFieldName)) {
                                excludeFlags = Regex.flagsFromString(parser.text());
                            }
                        } else if (token == XContentParser.Token.VALUE_NUMBER) {
                            if ("flags".equals(currentFieldName)) {
                                excludeFlags = parser.intValue();
                            }
                        }
                    }
                } else {
                    return false;
                }
                return true;
            }


            return false;
        }
        private Set<BytesRef> parseArrayToSet(XContentParser parser) throws IOException {
            final Set<BytesRef> set = new HashSet<>();
            if (parser.currentToken() != XContentParser.Token.START_ARRAY) {
                throw new ElasticsearchParseException("Missing start of array in include/exclude clause");
            }
            while (parser.nextToken() != XContentParser.Token.END_ARRAY) {
                if (!parser.currentToken().isValue()) {
                    throw new ElasticsearchParseException("Array elements in include/exclude clauses should be string values");
                }
                set.add(new BytesRef(parser.text()));
            }
            return set;
        }
        
        public IncludeExclude includeExclude() {
            if (include == null && exclude == null && includeValues == null && excludeValues == null) {
                return null;
            }
            Pattern includePattern =  include != null ? Pattern.compile(include, includeFlags) : null;
            Pattern excludePattern = exclude != null ? Pattern.compile(exclude, excludeFlags) : null;
            return new IncludeExclude(includePattern, excludePattern, includeValues, excludeValues);
        }
    }


    /**
     * Returns whether this filter was built with an include or an exclude regular expression,
     * as opposed to exact term sets only.
     */
    public boolean isRegexBased() {
        return hasRegexTest;
    }


    public LongFilter convertToLongFilter() {
        int numValids = includeValues == null ? 0 : includeValues.size();
        int numInvalids = excludeValues == null ? 0 : excludeValues.size();
        LongFilter result = new LongFilter(numValids, numInvalids);
        if (includeValues != null) {
            for (BytesRef val : includeValues) {
                result.addAccept(Long.parseLong(val.utf8ToString()));
            }
        }
        if (excludeValues != null) {
            for (BytesRef val : excludeValues) {
                result.addReject(Long.parseLong(val.utf8ToString()));
            }
        }
        return result;
    }
    public LongFilter convertToDoubleFilter() {
        int numValids = includeValues == null ? 0 : includeValues.size();
        int numInvalids = excludeValues == null ? 0 : excludeValues.size();
        LongFilter result = new LongFilter(numValids, numInvalids);
        if (includeValues != null) {
            for (BytesRef val : includeValues) {
                double dval=Double.parseDouble(val.utf8ToString());
                result.addAccept( NumericUtils.doubleToSortableLong(dval));
            }
        }
        if (excludeValues != null) {
            for (BytesRef val : excludeValues) {
                double dval=Double.parseDouble(val.utf8ToString());
                result.addReject( NumericUtils.doubleToSortableLong(dval));
            }
        }
        return result;
    }


}
Source Code of org.elasticsearch.search.aggregations.bucket.terms.support.IncludeExclude$Parser

Related Classes of org.elasticsearch.search.aggregations.bucket.terms.support.IncludeExclude$Parser