Package com.cloudera.cdk.morphline.stdlib

Source Code of com.cloudera.cdk.morphline.stdlib.GrokBuilder

/*
* Copyright 2013 Cloudera Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.cloudera.cdk.morphline.stdlib;

import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import com.cloudera.cdk.morphline.api.Command;
import com.cloudera.cdk.morphline.api.CommandBuilder;
import com.cloudera.cdk.morphline.api.MorphlineContext;
import com.cloudera.cdk.morphline.api.Record;
import com.cloudera.cdk.morphline.base.AbstractCommand;
import com.cloudera.cdk.morphline.base.Configs;
import com.cloudera.cdk.morphline.base.Validator;
import com.cloudera.cdk.morphline.shaded.com.google.code.regexp.GroupInfo;
import com.cloudera.cdk.morphline.shaded.com.google.code.regexp.Matcher;
import com.typesafe.config.Config;
import com.typesafe.config.ConfigFactory;

/**
* The Grok command uses regular expression pattern matching to extract structured fields from
* unstructured log data.
* <p>
* It is perfect for syslog logs, apache and other webserver logs, mysql logs, and in general, any
* log format that is generally written for humans and not computer consumption.
*/
public final class GrokBuilder implements CommandBuilder {

  /*
   * Uses a shaded version of com.google.code.regexp-0.2.3 to minimize potential dependency issues.
   * See https://github.com/tony19/named-regexp
   */
 
  @Override
  public Collection<String> getNames() {
    return Collections.singletonList("grok");
  }
 
  @Override
  public Command build(Config config, Command parent, Command child, MorphlineContext context) {
    return new Grok(this, config, parent, child, context);
  }
 
 
  ///////////////////////////////////////////////////////////////////////////////
  // Nested classes:
  ///////////////////////////////////////////////////////////////////////////////
  private static final class Grok extends AbstractCommand {

    private final Map<String, Matcher> regexes = new HashMap();
    private final boolean extract;
    private final boolean extractInPlace;
    private final NumRequiredMatches numRequiredMatches;
    private final boolean findSubstrings;
    private final boolean addEmptyStrings;
    private final String firstKey; // cached value
    private final String renderedConfig; // cached value

    public Grok(CommandBuilder builder, Config config, Command parent, Command child, MorphlineContext context) {
      super(builder, config, parent, child, context);
     
      GrokDictionaries dict = new GrokDictionaries(config, getConfigs());
      Config exprConfig = getConfigs().getConfig(config, "expressions", ConfigFactory.empty());
      for (Map.Entry<String, Object> entry : new Configs().getEntrySet(exprConfig)) {
        String expr = entry.getValue().toString();
        this.regexes.put(entry.getKey(), dict.compileExpression(expr).matcher(""));
      }
      this.firstKey = (regexes.size() == 0 ? null : regexes.entrySet().iterator().next().getKey());

      String extractStr = getConfigs().getString(config, "extract", "true");
      this.extractInPlace = extractStr.equals("inplace");
      if (extractInPlace) {
        this.extract = true;
      } else {
        this.extract = getConfigs().getBoolean(config, "extract", true);
      }
     
      this.numRequiredMatches = new Validator<NumRequiredMatches>().validateEnum(
          config,
          getConfigs().getString(config, "numRequiredMatches", NumRequiredMatches.atLeastOnce.toString()),
          NumRequiredMatches.class);
      this.findSubstrings = getConfigs().getBoolean(config, "findSubstrings", false);
      this.addEmptyStrings = getConfigs().getBoolean(config, "addEmptyStrings", false);
      validateArguments();
      this.renderedConfig = config.root().render();
    }
   
    @Override
    protected boolean doProcess(Record inputRecord) {
      Record outputRecord;
      outputRecord = ((extractInPlace || !extract) ? inputRecord : inputRecord.copy());
      if (extractInPlace) {
        // Ensure that we mutate the record inplace only if *all* expressions match.
        // To ensure this we potentially run doMatch() twice: the first time to check, the second
        // time to mutate
        boolean isFast;
        if (regexes.size() == 0) {
          isFast = true;
        } else if (regexes.size() > 1) {
          isFast = false;
        } else if (numRequiredMatches == NumRequiredMatches.atLeastOnce) {
          isFast = true;
        } else { // all or once
          assert regexes.size() == 1;
          assert firstKey != null;
          isFast = (inputRecord.get(firstKey).size() <= 1);
        }
       
        if (!isFast) {
          if (!doMatch(inputRecord, outputRecord, false)) {
            return false;
          }
        } else {
          ; // no need to do anything
          // This is a performance enhancement for some cases with a single expression:
          // By the time we find a regex match we know that the whole command will succeed,
          // so there's really no need to run doMatch() twice.
        }
      }
      if (!doMatch(inputRecord, outputRecord, extract)) {
        return false;
      }
     
      // pass record to next command in chain:
      return super.doProcess(outputRecord);
    }

    private boolean doMatch(Record inputRecord, Record outputRecord, boolean doExtract) {
      for (Map.Entry<String, Matcher> regexEntry : regexes.entrySet()) {
        Matcher matcher = regexEntry.getValue();
        List values = inputRecord.get(regexEntry.getKey());
        int todo = values.size();
        int minMatches = 1;
        int maxMatches = Integer.MAX_VALUE;
        switch (numRequiredMatches) {
          case once : {
            maxMatches = 1;
            break;
          }
          case all : {
            minMatches = todo;
            break;
          }
          default: {
            break;
          }
        }       
        int numMatches = 0;
        for (Object value : values) {
          matcher.reset(value.toString());
          if (!findSubstrings) {
            if (matcher.matches()) {
              numMatches++;
              if (numMatches > maxMatches) {
                LOG.debug("grok failed because it found too many matches for values: {} for grok command: {}",
                          values, renderedConfig);
                return false;
              }
              extract(outputRecord, matcher, doExtract);
            }
          } else {
            int previousNumMatches = numMatches;
            while (matcher.find()) {
              if (numMatches == previousNumMatches) {
                numMatches++;
                if (numMatches > maxMatches) {
                  LOG.debug("grok failed because it found too many matches for values: {} for grok command: {}",
                            values, renderedConfig);
                  return false;
                }
                if (!doExtract && numMatches >= minMatches && maxMatches == Integer.MAX_VALUE) {
                  break; // fast path
                }
              }
              extract(outputRecord, matcher, doExtract);
            }
          }
          todo--;
          if (!doExtract && numMatches >= minMatches && maxMatches == Integer.MAX_VALUE) {
            break; // fast path
          }
        }
        if (numMatches + todo < minMatches) {
          LOG.debug("grok failed because it found too few matches for values: {} for grok command: {}",
                    values, renderedConfig);
          return false;         
        }
      }
      return true;
    }

    private void extract(Record outputRecord, Matcher matcher, boolean doExtract) {
      if (doExtract) {
        extractFast(outputRecord, matcher);
      }
    }

    private void extractFast(Record outputRecord, Matcher matcher) {
      for (Map.Entry<String, List<GroupInfo>> entry : matcher.namedPattern().groupInfo().entrySet()) {
        String groupName = entry.getKey();
        List<GroupInfo> list = entry.getValue();
        int idx = list.get(0).groupIndex();
        int group = idx > -1 ? idx + 1 : -1; // TODO cache that number (perf)?
        String value = matcher.group(group);
        if (value != null && (value.length() > 0 || addEmptyStrings)) {
          outputRecord.put(groupName, value);
        }
      }
    }

   
    ///////////////////////////////////////////////////////////////////////////////
    // Nested classes:
    ///////////////////////////////////////////////////////////////////////////////
    private static enum NumRequiredMatches {
      atLeastOnce,
      once,
      all    
    }    

  }
 
}
TOP

Related Classes of com.cloudera.cdk.morphline.stdlib.GrokBuilder

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.