Package com.ikanow.infinit.e.harvest.enrichment.legacy

Source Code of com.ikanow.infinit.e.harvest.enrichment.legacy.RegexEntityExtractor$RegexEntityConfig

package com.ikanow.infinit.e.harvest.enrichment.legacy;

import java.io.IOException;
import java.lang.reflect.Type;
import java.util.Arrays;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.TreeSet;
import java.util.regex.Pattern;

import au.com.bytecode.opencsv.CSVParser;

import com.google.common.collect.HashMultimap;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.gson.JsonElement;
import com.google.gson.JsonPrimitive;
import com.google.gson.JsonSerializationContext;
import com.google.gson.JsonSerializer;
import com.ikanow.infinit.e.data_model.InfiniteEnums.ExtractorDailyLimitExceededException;
import com.ikanow.infinit.e.data_model.InfiniteEnums.ExtractorDocumentLevelException;
import com.ikanow.infinit.e.data_model.interfaces.harvest.EntityExtractorEnum;
import com.ikanow.infinit.e.data_model.interfaces.harvest.IEntityExtractor;
import com.ikanow.infinit.e.data_model.store.document.DocumentPojo;
import com.ikanow.infinit.e.data_model.store.document.EntityPojo.Dimension;
import com.ikanow.infinit.e.data_model.utils.DimensionUtility;

//TODO: implement and document ... if printRegexThenOutput: true then just exit with the output in a runtime exception

public class RegexEntityExtractor implements IEntityExtractor {

  // When true, the parsing/compression code prints trace lines to System.out and stack traces for
  // caught exceptions (not final: main() sets it before each test)
  private static boolean _DEBUG = false;
 
  /**
   * Returns the name of this extractor.
   *
   * @return the constant string "regex"
   */
  @Override
  public String getName() {
    return "regex";
  }

  /**
   * Not yet implemented: the body is empty, so the document is left untouched and
   * neither declared exception is ever thrown.
   *
   * @param partialDoc the document to extract entities from (currently unused)
   */
  @Override
  public void extractEntities(DocumentPojo partialDoc)
      throws ExtractorDailyLimitExceededException,
      ExtractorDocumentLevelException {
    // TODO Auto-generated method stub

  }

  /**
   * Not yet implemented: the body is empty, so the document is left untouched and
   * neither declared exception is ever thrown.
   *
   * @param partialDoc the document to extract text and entities from (currently unused)
   */
  @Override
  public void extractEntitiesAndText(DocumentPojo partialDoc)
      throws ExtractorDailyLimitExceededException,
      ExtractorDocumentLevelException {
    // TODO Auto-generated method stub

  }

  /**
   * Not yet implemented: always returns null, whatever capability is requested.
   *
   * @param capability the capability being queried (currently unused)
   * @return null
   */
  @Override
  public String getCapability(EntityExtractorEnum capability) {
    // TODO Auto-generated method stub
    return null;
  }

///////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////

  // TOP LEVEL LOGIC - INITIALIZATION
 
  public static class RegexEntityConfig {
    String regexSpec; // Source pattern for regex
    private Pattern regex = null; // Compiled regex object (from regexSpece + flags)
    int flags; // Source flags for regex
   
    Pattern getOrCompileRegex() {
      if (null == regex) {
        return Pattern.compile(regexSpec, flags);
      }
      else {
        return regex;
      }
    }
    String replace; // The replacement string (null if in compressed mode - run over "compressed" to get matches in this case)
    String type; // Entity type
    String dimension; // Entity dimension
    RegexEntityFieldSet fieldSet; // Set of fields to search over
    List<RegexEntityConfig> compressed; // The list of regexes that have been compressed into this object
    int numCompressed = 0;
  }
  /**
   * The set of document fields a regex is applied to: either explicit field names,
   * or patterns that field names are matched against (or both, once sets are combined).
   * Either member may be null if no entry of that kind was specified.
   */
  public static class RegexEntityFieldSet {
    TreeSet<String> fields; // making it a tree set lets us do prefix matching
    HashMap<Integer, Pattern> fieldRegexes; // (more complex - regexes), keyed by the regex flags value they were specified with
  }

  // Both keyed by "<fieldSpec>/<flags>":
  // _fullRegexList has one entry per configured regex (filled in by parseKeyVal),
  // _compressedRegexList is the output of compressRegexes (and the return value of intializeConfig)
  protected HashMultimap<String, RegexEntityConfig> _fullRegexList, _compressedRegexList;
  // The field set (and the spec string it was parsed from) used by keys that don't specify their own fields,
  // set from the "$" config key
  protected RegexEntityFieldSet _defaultFieldSet = null;
  protected String _defaultFieldSpec = null;

  // Matches (in full) the dimension names that are recognized inside a config key
  private static final Pattern DIMENSIONMATCHER = Pattern.compile("Who|Where|What|When");

  ////////////////////////

  protected HashMultimap<String, RegexEntityConfig> intializeConfig(Map<String, String> config) {
    //(probably just for debugging)
    if (null != _fullRegexList) {
      _fullRegexList = null;
    }
    if (null != _compressedRegexList) {
      _compressedRegexList = null;
    }
   
    // Phase 1 ... creste field spec
    for (Map.Entry<String, String> kv: config.entrySet()) {
      try {   
        //DEBUG
        if (_DEBUG) System.out.println("iC1: Phase1: " + kv.getKey() + ": " + kv.getValue());
       
        parseKeyVal(kv.getKey(), kv.getValue(), true);
      }
      catch (Exception e) {}
    }
    if (null == _defaultFieldSet) {
      try {
        parseKeyVal("$", "fullText", true);
      }
      catch (Exception e) {
        if (_DEBUG) e.printStackTrace();               
      }
    }
    // Phase 2 ... create regexes
    for (Map.Entry<String, String> kv: config.entrySet()) {
      try {
        //DEBUG
        if (_DEBUG) System.out.println("iC2: Phase2: " + kv.getKey() + ": " + kv.getValue());
       
        parseKeyVal(kv.getKey(), kv.getValue(), false);
      }
      catch (Exception e) {
        if (_DEBUG) e.printStackTrace();       
      }
    }
    if (null != this._fullRegexList) {
      this.compressRegexes();     
    }
    //DEBUG
    if (_DEBUG) System.out.println("iC3: Compressed from " + _fullRegexList.size() + " to " + _compressedRegexList.size());
   
    return _compressedRegexList;
  }//TESTED (test1 etc)
 
  protected void parseKeyVal(String keyStr, String valueStr, boolean isPhase1) throws IOException
  {
    //step 1) parse key

    String dim = null;
    String fieldSpec = null;
    RegexEntityFieldSet fieldSet = null;
    String type = null;
   
    int entStart = keyStr.lastIndexOf('/');
    if (entStart <= 0) {
      if ('$' == keyStr.charAt(0)) { // saved type, do all these first       
        if (isPhase1) {
          RegexEntityFieldSet savedFieldSet = parseFieldSpec(valueStr, keyStr);
         
          //DEBUG
          if (_DEBUG) System.out.println("pKV1: $ DEFAULT: " + valueStr);
         
          if (null == savedFieldSet) { //(invalid just ignore)
            return;
          }
          if (1 == keyStr.length()) {
            _defaultFieldSet = savedFieldSet;
            _defaultFieldSpec = valueStr;
          }
        }
        return;
      }//TESTED (test1 - default, test2 - fields specified)
      else if (!isPhase1) { // just ent type       
        fieldSet = _defaultFieldSet;
        fieldSpec = _defaultFieldSpec;
        type = keyStr;
       
        //DEBUG
        if (_DEBUG) System.out.println("pKV2: SIMPLE KEY: " + type);             
      }//TESTED (test1)
      else return;
    }
    else { // one of "dim/type" or "fields/dim/type" or "fields/type"
      if (isPhase1) { // phase 1, just saved fields
        return;
      }
      type = keyStr.substring(entStart + 1);
     
      String preEntTypeKeyStr = keyStr.substring(0, entStart);
      int dimStart = preEntTypeKeyStr.lastIndexOf('/');
      if (dimStart > 0) { // 1 or 2 fields (if 1 contains a /)
        //DEBUG
        if (_DEBUG) System.out.println("pKV3a: Most complex key: " + dimStart + " vs " + preEntTypeKeyStr);                     
       
        String candidateDim = preEntTypeKeyStr.substring(dimStart + 1);
        if (DIMENSIONMATCHER.matcher(candidateDim).matches()) { // 2nd field is valid dim
          dim = candidateDim;
          fieldSpec = preEntTypeKeyStr.substring(0, dimStart);
        }
        else { // 2nd field isn't valid
          fieldSpec = preEntTypeKeyStr;           
        }
      }//TODO (TOTEST - both cases TEST4/patt2 - valid dimension)
      else if (DIMENSIONMATCHER.matcher(preEntTypeKeyStr).matches()) { // only 1 field, it's a valid dimension, ie default spec
        dim = preEntTypeKeyStr;
        fieldSet = _defaultFieldSet;
        fieldSpec = _defaultFieldSpec;
      }//TESTED (test1, test2)
      else { // Only 1 field, not a valid dim, so must be a spec
       
        fieldSpec = preEntTypeKeyStr;
      }//TESTED (test4, pattern 1)
      //DEBUG
      if (_DEBUG) System.out.println("pKV3+: MORE COMPLEX KEY " + fieldSpec + " from " + keyStr + " /DIM = " + dim);     
    }
    if (null == fieldSet) {
      fieldSet = parseFieldSpec(fieldSpec, null);
      if (null == fieldSet) { // (invalid)
        return;
      }
    }
    if (isPhase1) {
      return;
    }

    //step 2) parse the value

    CSVParser regexParser = new CSVParser('/', (char)0x0, '\\');

    String[] parsedRegex = regexParser.parseLine(valueStr);

    //DEBUG
    if (_DEBUG) System.out.println("pKV4: parsed regex = " + Arrays.toString(parsedRegex));
   
    //0 is "" or s
    //1 is the regex
    //2 is the replace
    //3 are the flags

    String replace = null;
    int flags = 0;
    if (parsedRegex[0].isEmpty()) { // /regex/flags
      if (parsedRegex.length > 2) {
        flags = parseFlags(parsedRegex[2]);
      }     
    }//TESTED (test2)
    else { // s/regex/replace/flags
      if (parsedRegex.length > 3) {
        flags = parseFlags(parsedRegex[3]);
      }
      if ((parsedRegex.length > 2) && !parsedRegex[2].isEmpty()) {
        replace = parsedRegex[2];
      }
    }//TESTED (test1)
   
    RegexEntityConfig regexConfig = new RegexEntityConfig();
    regexConfig.regexSpec = parsedRegex[1];
    regexConfig.flags = flags;
    regexConfig.replace = replace;
    regexConfig.fieldSet = fieldSet;
    regexConfig.compressed = null;
    regexConfig.type = type;
    if (null == dim) {
      Dimension x = DimensionUtility.getDimensionByType(type);
      if (null == x) {
        x = Dimension.What;
      }
      dim = x.toString();
     
      //DEBUG
      if (_DEBUG) System.out.println("pKV5: Guess dim = " + dim + " from " + type);
    }//TESTED (test1)
    regexConfig.dimension = dim;

    if (null == _fullRegexList) {
      _fullRegexList = HashMultimap.create();
    }
    _fullRegexList.put(fieldSpec + "/" + flags, regexConfig);
  }
 
////////////////////////

  protected final static int MAX_REGEXES_TO_COMPRESS = 10;
  protected void compressRegexes() {
    if (null == _compressedRegexList) {
      _compressedRegexList = HashMultimap.create();
    }

    String prevKey = null;
    RegexEntityConfig regexesWithGroups = null, directRegexes = null;
    for (Map.Entry<String, RegexEntityConfig> kv: this._fullRegexList.entries()) {

      //DEBUG
      if (_DEBUG) System.out.println("cR1: key=" + kv.getKey() + ", spec=" + kv.getValue().regexSpec + ", replace=" + kv.getValue().replace + " | PREV_KEY = " + prevKey);
     
      if ((null != prevKey) && prevKey.equals(kv.getKey())) {
        if (null == kv.getValue().replace) {
          directRegexes = combineRegexConfigs(kv.getKey(), directRegexes, kv.getValue());
        }
        else {
          regexesWithGroups = combineRegexConfigs(kv.getKey(), regexesWithGroups, kv.getValue());         
        }
      }//TODO: TOTEST
      else { // key change
        if ((null != prevKey) && (null != regexesWithGroups) && (null == regexesWithGroups.compressed)) {
          // (didn't manage to compress)
          this._compressedRegexList.put(prevKey, regexesWithGroups);

          //DEBUG
          if (_DEBUG) System.out.println("cR2a: added regexesWithGroups");
        }//TESTED (test1)
        if ((null != prevKey) && (null != directRegexes) && (null == directRegexes.compressed)) {
          // (didn't manage to compress)
          this._compressedRegexList.put(prevKey, directRegexes);
         
          //DEBUG
          if (_DEBUG) System.out.println("cR3a: added directRegexes");
        }
       
        prevKey = kv.getKey();
        directRegexes = null;
        regexesWithGroups = null;
        if (null == kv.getValue().replace) {
          directRegexes = kv.getValue();
        }
        else {
          regexesWithGroups = kv.getValue();
        }
      }
    }//end loop over uncompressed regexes
   
    // Handle any uncompressed keys at the end
    if ((null != prevKey) && (null != regexesWithGroups) && (null == regexesWithGroups.compressed)) {
      // (didn't manage to compress)
      this._compressedRegexList.put(prevKey, regexesWithGroups);

      //DEBUG
      if (_DEBUG) System.out.println("cR2b: added regexesWithGroups");
    }
    if ((null != prevKey) && (null != directRegexes) && (null == directRegexes.compressed)) {
      // (didn't manage to compress)
      this._compressedRegexList.put(prevKey, directRegexes);

      //DEBUG
      if (_DEBUG) System.out.println("cR3b: added directRegexes");
    }//TESTED (test1)
  }
 
////////////////////////

  private static final int MAX_COMPRESSED_REGEXES = 10;
  private RegexEntityConfig combineRegexConfigs(String key, RegexEntityConfig regexList, RegexEntityConfig newRegex) {
    if ((null == regexList.compressed) ||
        (regexList.numCompressed >= MAX_COMPRESSED_REGEXES))
    {
      RegexEntityConfig newRegexList = new RegexEntityConfig();
      newRegexList.compressed = new LinkedList< RegexEntityConfig>();
      newRegexList.replace = regexList.replace; // (just care if it's null or not, ie direct/groups)
      newRegexList.flags = regexList.flags; // (the same across all commpressions by construction of key)
      newRegexList.fieldSet = regexList.fieldSet; // (the same across all commpressions by construction of key)

      _compressedRegexList.put(key, newRegexList);
      if (null == regexList.compressed) {
        newRegexList.regexSpec = regexList.regexSpec;
        newRegexList.compressed.add(regexList);
        regexList.numCompressed = 1;
      }
      else {
        newRegexList.regexSpec = newRegex.regexSpec;
        regexList.numCompressed = 0;
      }
      regexList = newRegex;
    }
    else {
      regexList.regexSpec = regexList.regexSpec + "|" + newRegex.regexSpec;     
    }
    regexList.compressed.add(newRegex);
    regexList.numCompressed++;
    return regexList;
  }//TODO: TOTEST

////////////////////////

  private int parseFlags(String flagsStr) {
    int flags = 0;
    for (int i = 0; i < flagsStr.length(); ++i) {
      switch (flagsStr.charAt(i)) {
      case 'i':
        flags |= Pattern.CASE_INSENSITIVE;
        break;
      case 'x':
        flags |= Pattern.COMMENTS;
        break;
      case 's':
        flags |= Pattern.DOTALL;
        break;
      case 'm':
        flags |= Pattern.MULTILINE;
        break;
      case 'u':
        flags |= Pattern.UNICODE_CASE;
        break;
      case 'd':
        flags |= Pattern.UNIX_LINES;
        break;
      }
    }
    return flags;
  }//TESTED (test1, etc)

  private RegexEntityFieldSet combineFieldSets(RegexEntityFieldSet comboSet, RegexEntityFieldSet set) {
    if (null != set.fields) {
      if (null == comboSet.fields) {
        comboSet.fields = new TreeSet<String>();
      }
      comboSet.fields.addAll(set.fields);
    }
    if (null != set.fieldRegexes) {
      if (null == comboSet.fieldRegexes) {
        comboSet.fieldRegexes = set.fieldRegexes;
      }
      else {
        HashMap<Integer, Pattern> newPatterns = new HashMap<Integer, Pattern>();     
        for (Map.Entry<Integer, Pattern> kv: set.fieldRegexes.entrySet()) {
          Pattern comboPattern = comboSet.fieldRegexes.get(kv.getKey());
          if (null == comboPattern) {
            comboPattern = kv.getValue();
          }
          else {
            comboPattern = Pattern.compile(comboPattern.pattern() + "|" + kv.getValue().pattern(), kv.getKey());
          }
          newPatterns.put(kv.getKey(), comboPattern);
        }
        comboSet.fieldRegexes.putAll(newPatterns);
      }
    }
    return comboSet;
  }//TODO: TOTEST

  private HashMap<String, RegexEntityFieldSet> _cachedFieldSets = new HashMap<String, RegexEntityFieldSet>();
 
  private RegexEntityFieldSet parseFieldSpec(String fieldSpec, String savedName) {
    try {
      //DEBUG
      if (_DEBUG) System.out.println("pFS1: Building for spec " + fieldSpec);
     
      RegexEntityFieldSet cached = null;
      if (null == savedName) {
        cached = _cachedFieldSets.get(fieldSpec);
      }
      if (null != cached) {
        //DEBUG
        if (_DEBUG) System.out.println("pFS2: Found cached spec " + fieldSpec);
       
        return cached;
      }//TODO: TOTEST
      RegexEntityFieldSet fieldSet = new RegexEntityFieldSet();
      if ('/' == fieldSpec.charAt(0)) {
        int secondSlash = fieldSpec.lastIndexOf('/');
        if (0 == secondSlash) {
          return null;
        }
        String flags = fieldSpec.substring(secondSlash + 1);
        int regexFlags = 0;
        if (flags.isEmpty()) {
          regexFlags = this.parseFlags(flags);
        }
        fieldSet.fieldRegexes = new HashMap<Integer, Pattern>();
        fieldSet.fieldRegexes.put(regexFlags, Pattern.compile(fieldSpec.substring(1, secondSlash)));
       
        //DEBUG
        if (_DEBUG) System.out.println("pFS3: ADDED " + regexFlags + ": " + Pattern.compile(fieldSpec.substring(1, secondSlash)));       
      }//TESTED (test3)
      else { // parse fields
        String[] fields = fieldSpec.split("\\s*,\\s*");
        for (String field: fields) {
          //DEBUG
          if (_DEBUG) System.out.println("pFS4: FIELD: " + field);       
         
          if ('$' == field.charAt(0)) {
            cached = _cachedFieldSets.get(field);
            if (null != cached) {
              //DEBUG
              if (_DEBUG) System.out.println("pFS5: Combine");       
             
              combineFieldSets(fieldSet, cached);
            }
          }//TODO: TOTEST
          else { // just a doc field
            if (null == fieldSet.fields) {
              fieldSet.fields = new TreeSet<String>();
            }
            fieldSet.fields.add(field);
          }//TESTED (test2)
        }
        //DEBUG
        if (_DEBUG) if (null != fieldSet.fields) System.out.println("pFS6: fields " + Arrays.toString(fieldSet.fields.toArray()));       
      }
      return fieldSet;
    }
    catch (Exception e) {
      if (_DEBUG) e.printStackTrace();
     
      return null;     
    }
  }//TODO: TOTEST

  /////////////////////////////////////////////////////////////////
  /////////////////////////////////////////////////////////////////
  /////////////////////////////////////////////////////////////////
 
  // TEST CODE

  private static LinkedHashMap<String, String> createEngineConfig(String ...kvs) {
    LinkedHashMap<String, String> config = new LinkedHashMap<String, String>();
    for (int i = 0; i < kvs.length; i += 2) {
      config.put(kvs[i+0], kvs[i+1]);
    }
    return config;
  }
  private static String serializeResult(HashMultimap<String, RegexEntityConfig> result, boolean prettyPrint, boolean encodedForStrings) {
    StringBuffer sb = new StringBuffer();
    GsonBuilder gb = new GsonBuilder();
    if (prettyPrint) {
      gb.setPrettyPrinting();
    }
    gb.registerTypeAdapter(Pattern.class, new RegexSerializer());
    Gson gson = gb.create();
    for (Map.Entry<String, RegexEntityConfig> entry: result.entries()) {
      sb.append(gson.toJson(entry)).append('\n');
    }
    String s = sb.toString();
    if (encodedForStrings) { //put in a format that can be pasted into a string
      return s.replace("\\", "\\\\").replace("\"", "\\\"").replace("\n", "\\n");
    }
    return s;
  }
 
  /**
   * Ad hoc test driver: runs intializeConfig over a few hard-coded configurations (re-using one
   * extractor instance), serializes each result to JSON and compares it against an expected string,
   * printing "(TESTn passed)" or "TESTn FAIL" plus both strings.
   *
   * @param args unused
   */
  public static void main(String[] args) {
    //(tests ascertained that pattern() does not include flags, integrated flags apply everywhere, eg across |s)   
   
    RegexEntityExtractor moduleUnderTest = new RegexEntityExtractor();
   
    Map<String, String> config;
    String result;
    String expectedResult;
    HashMultimap<String, RegexEntityConfig> resultMap;
   
    // TEST 1 - simple regexes
    // (no "$" key, so the default field set is used; one direct regex with a guessed dimension,
    //  one s/regex/replace/flags regex with an explicit dimension)
    RegexEntityExtractor._DEBUG = false;
    //DEBUG
    //RegexEntityExtractor._DEBUG = true;
   
    config = RegexEntityExtractor.createEngineConfig(
        "Sha256Hash", "/[0-9a-fA-F]{64}/",
        "Who/ExternalIp", "s/(?:^|[^0-9a-z])([0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+)($|[^0-9a-z])/hash:$1/i"
        );
   
    resultMap = moduleUnderTest.intializeConfig(config);
   
    result = RegexEntityExtractor.serializeResult(resultMap, true, false);
    expectedResult = "{\n  \"key\": \"fullText/2\",\n  \"value\": {\n    \"regexSpec\": \"(?:^|[^0-9a-z])([0-9]+.[0-9]+.[0-9]+.[0-9]+)($|[^0-9a-z])\",\n    \"flags\": 2,\n    \"replace\": \"hash:$1\",\n    \"type\": \"ExternalIp\",\n    \"dimension\": \"Who\",\n    \"fieldSet\": {\n      \"fields\": [\n        \"fullText\"\n      ]\n    },\n    \"numCompressed\": 0\n  }\n}\n{\n  \"key\": \"fullText/0\",\n  \"value\": {\n    \"regexSpec\": \"[0-9a-fA-F]{64}\",\n    \"flags\": 0,\n    \"type\": \"Sha256Hash\",\n    \"dimension\": \"What\",\n    \"fieldSet\": {\n      \"fields\": [\n        \"fullText\"\n      ]\n    },\n    \"numCompressed\": 0\n  }\n}\n";
    if (!result.equals(expectedResult)) {
      System.out.println("TEST1 FAIL\n" + expectedResult + "\n...VS...\n" + result);     
    }
    else {
      System.out.println("(TEST1 passed)");
    }
   
    // TESTs 2 and 3 - define the default fields over which to search   
    // (test 2: the "$" key is a comma-separated list of field names)
    RegexEntityExtractor._DEBUG = false;
    //DEBUG
    //RegexEntityExtractor._DEBUG = true;
   
    config = RegexEntityExtractor.createEngineConfig(
        "$", "fullText,description,title",
        "Where/StreetAddress", "/[0-9]+ [a-z_-]+ (?:Road|Street|Avenue)/i"
        );
   
    resultMap = moduleUnderTest.intializeConfig(config);
   
    result = RegexEntityExtractor.serializeResult(resultMap, true, false);
    expectedResult = "{\n  \"key\": \"fullText,description,title/2\",\n  \"value\": {\n    \"regexSpec\": \"[0-9]+ [a-z_-]+ (?:Road|Street|Avenue)\",\n    \"flags\": 2,\n    \"type\": \"StreetAddress\",\n    \"dimension\": \"Where\",\n    \"fieldSet\": {\n      \"fields\": [\n        \"description\",\n        \"fullText\",\n        \"title\"\n      ]\n    },\n    \"numCompressed\": 0\n  }\n}\n";
    if (!result.equals(expectedResult)) {
      System.out.println("TEST2 FAIL\n" + expectedResult + "\n...VS...\n" + result);     
    }
    else {
      System.out.println("(TEST2 passed)");
    }
   
    //(TEST3)
    // (the "$" key is a /regex/ over field names instead of a list)
    RegexEntityExtractor._DEBUG = false;
    //DEBUG
    //RegexEntityExtractor._DEBUG = true;
   
    config = RegexEntityExtractor.createEngineConfig(
        "$", "/(?:fullText|description|metadata\\..*\\.address.*)/",
        "Where/StreetAddress", "/[0-9]+ *,? *[a-z_-]+ *(?:Road|Street|Avenue)/i"
        );
   
    resultMap = moduleUnderTest.intializeConfig(config);
   
    result = RegexEntityExtractor.serializeResult(resultMap, true, false);
    expectedResult = "{\n  \"key\": \"/(?:fullText|description|metadata\\\\..*\\\\.address.*)//2\",\n  \"value\": {\n    \"regexSpec\": \"[0-9]+ *,? *[a-z_-]+ *(?:Road|Street|Avenue)\",\n    \"flags\": 2,\n    \"type\": \"StreetAddress\",\n    \"dimension\": \"Where\",\n    \"fieldSet\": {\n      \"fieldRegexes\": {\n        \"0\": \"/(?:fullText|description|metadata\\\\..*\\\\.address.*)/0\"\n      }\n    },\n    \"numCompressed\": 0\n  }\n}\n";
    if (!result.equals(expectedResult)) {
      System.out.println("TEST3 FAIL\n" + expectedResult + "\n...VS...\n" + result);     
    }
    else {
      System.out.println("(TEST3 passed)");
    }
   
    // TEST 4 - Specify different regexes for different fields
    // (work in progress: debug output is switched on and the expected result below is still empty,
    //  so this test reports FAIL whenever the serialized result is non-empty)
    RegexEntityExtractor._DEBUG = false;
    /**/
    //DEBUG
    RegexEntityExtractor._DEBUG = true;
   
    config = RegexEntityExtractor.createEngineConfig(
          "url,sourceUrl/FileType", "s/\\.([a-z]{3})$/$1/i",
          "/[^.]*url$|metadata\\..*filename.*/i/What/FileName", "s/[^\\/]+\\.[a-z]{3}$/i"
        );
   
    resultMap = moduleUnderTest.intializeConfig(config);
   
    result = RegexEntityExtractor.serializeResult(resultMap, true, false);
    expectedResult = "";
    if (!result.equals(expectedResult)) {
      System.out.println("TEST4 FAIL\n" + expectedResult + "\n...VS...\n" + result);     
    }
    else {
      System.out.println("(TEST4 passed)");
    }
   
    //TODO: other tests ... compression big and small
    //TODO: "dimension-like-but-not-dimension-spec"
   
  }
  protected static class RegexSerializer implements JsonSerializer<Pattern>
  {
    @Override
    public JsonElement serialize(Pattern pattern, Type typeOfT, JsonSerializationContext context)
    {
      return new JsonPrimitive("/" + pattern.pattern() + "/" + Integer.toHexString(pattern.flags()));
    }
  }
}
TOP

Related Classes of com.ikanow.infinit.e.harvest.enrichment.legacy.RegexEntityExtractor$RegexEntityConfig

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.