Package com.linkedin.databus2.ggParser.staxparser.validator

Source Code of com.linkedin.databus2.ggParser.staxparser.validator.XmlFormatTrailParser

/*
*
* Copyright 2013 LinkedIn Corp. All rights reserved
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*   http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied.  See the License for the
* specific language governing permissions and limitations
* under the License.
*
*/
package com.linkedin.databus2.ggParser.staxparser.validator;

import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintStream;
import java.io.RandomAccessFile;
import java.io.SequenceInputStream;
import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.Collections;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicBoolean;

import javax.xml.stream.FactoryConfigurationError;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;

import org.apache.log4j.Logger;

import com.linkedin.databus.core.ConcurrentAppendableCompositeFileInputStream;
import com.linkedin.databus2.ggParser.XmlStateMachine.ColumnState;
import com.linkedin.databus2.ggParser.XmlStateMachine.DbUpdateState;
import com.linkedin.databus2.ggParser.XmlStateMachine.TokenState;

/**
* A parser that reads XMLFORMAT GoldenGate trail file and validates it.
*/
public class XmlFormatTrailParser implements Runnable
{
  private static final long POSITION_REPORT_GAP = 10L * 1024L * 1024;
  private static final long POSITION_REPORT_TIME_MS = 30 * 1000;
  private static final int ERROR_CONTEXT_LEN = 100;

  private static final String TOKEN_XID = "TK-XID";

  public static final String DTD =
      "<!DOCTYPE root [ \n" +
        "<!ELEMENT root (transaction)*> \n" +
        "<!ELEMENT transaction (dbupdate)*> \n" +
        "<!ATTLIST transaction \n" +
        "    timestamp CDATA #REQUIRED \n" +
        ">\n" +
        "<!ELEMENT dbupdate (columns, tokens)> \n" +
        "<!ATTLIST dbupdate \n" +
        "    table CDATA #REQUIRED \n" +
        "    type  (insert|delete|update) #REQUIRED \n" +
        ">\n" +
        "<!ELEMENT columns (column)+>\n" +
        "<!ELEMENT column (#PCDATA)>\n" +
        "<!ATTLIST column \n" +
        "    name   CDATA #REQUIRED \n" +
        "    key    (true) #IMPLIED \n" +
        "    status CDATA #IMPLIED \n" +
        ">\n" +
        "<!ELEMENT tokens (token)+>\n" +
        "<!ELEMENT token (#PCDATA)>\n" +
        "<!ATTLIST token \n" +
        "    name CDATA #REQUIRED \n" +
        ">\n" +
      "]>\n";

  private static enum DbupdateType
  {
    INSERT,
    DELETE,
    UPDATE
  };

  private final Logger _log;
  /** The STAX XML reader factory*/
  private final XMLInputFactory _xmlInputFactory;
  /** The STAX XML reader */
  private final XMLStreamReader _xmlStreamReader;
  /** The input stream that coaslesces all trail files into a single stream*/
  private final ConcurrentAppendableCompositeFileInputStream _inputStream;
  /** A hack to add missing XML root element in the trails */
  private final InputStream _realInputStream;
  /** A flag to shutdown the parsing */
  private final AtomicBoolean _shutdownRequested = new AtomicBoolean(false);
  /** The last observed error */
  private Throwable _lastError = null;
  /** The currently processed trail file name */
  private String _lastFileName = null;
  /** The offset of the last block of data read by the XML reader because it does
   * internal buffering */
  private long _lastPosition = 0;
  /** The state of the parser */
  private String _currentTableName;
  /** A map from dbname.tablename to a set of column names we've seen for that table */
  private Map<String, Set<String>> _tableColumns = new HashMap<String, Set<String>>();
  /** The set of columns we've seen for the current table */
  private Set<String> _curTableColumns = new HashSet<String>();
  /** The set of GG tokens we've seen for the current table */
  private Set<String> _curTableTokens = new HashSet<String>();
  /** The type of the current <dbupdate> */
  private DbupdateType _dbupdateType;
  private final boolean _continueOnError;
  private final PrintStream _errOut;
  private long _errorCount = 0;

  public XmlFormatTrailParser(ConcurrentAppendableCompositeFileInputStream inputStream,
                              boolean validating,
                              Logger log,
                              String errorLogFile) throws XMLStreamException, IOException
  {
    super();
    _log = (null == log) ? Logger.getLogger(XmlFormatTrailParser.class) : log;
    _inputStream = inputStream;
    _realInputStream = wrapStreamWithXmlTags(_inputStream);
    _xmlInputFactory =  createXmlInputFactory(validating);
    _xmlInputFactory.setProperty(XMLInputFactory.IS_COALESCING, Boolean.TRUE);
    _xmlInputFactory.setProperty(XMLInputFactory.SUPPORT_DTD, Boolean.TRUE);
    _xmlStreamReader = _xmlInputFactory.createXMLStreamReader(_realInputStream);
    _continueOnError = null != errorLogFile;
    if (_continueOnError)
    {
      _errOut = new PrintStream(errorLogFile);
    }
    else
    {
      _errOut = null;
    }
  }

  private XMLInputFactory createXmlInputFactory(boolean validating)
      throws FactoryConfigurationError
  {
    XMLInputFactory result = null;
    Throwable createError = null;
    try
    {
      @SuppressWarnings("unchecked")
      Class<XMLInputFactory> woodstoxFactory =
          (Class<XMLInputFactory>)Class.forName("com.ctc.wstx.stax.WstxInputFactory");
      result = woodstoxFactory.newInstance();
      if (validating)
      {
        _log.info("found woodstox library: DTD validation will be enabled");
        result.setProperty(XMLInputFactory.IS_VALIDATING, validating);
      }
    }
    catch (ClassNotFoundException e)
    {
      createError = e;
    }
    catch (InstantiationException e)
    {
      createError = e;
    }
    catch (IllegalAccessException e)
    {
      createError = e;
    }
    catch (RuntimeException e)
    {
      createError = e;
    }

    if (null != createError)
    {
      _log.info("unable to find woodstox library, defaulting to Java: " + createError);
      if (validating)
      {
        _log.warn("default implementation does not support DTD validation");
      }
      result = XMLInputFactory.newInstance();
    }

    return result;
  }

  /** Make the trail files input look like real XML */
  private InputStream wrapStreamWithXmlTags(InputStream compositeInputStream)
  {

    String xmlStart = "<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?>\n" + DTD + "\n<root>";
    String xmlEnd = "</root>";
    _log.info("The xml start tag used is:" + xmlStart);
    List<InputStream> xmlTagsList = Arrays.asList(new InputStream[]
                                         {
                                             new ByteArrayInputStream(xmlStart.getBytes(Charset.forName("ISO-8859-1"))),
                                             compositeInputStream,
                                             new ByteArrayInputStream(xmlEnd.getBytes(Charset.forName("ISO-8859-1"))),
                                         });
    Enumeration<InputStream> streams = Collections.enumeration(xmlTagsList);
    SequenceInputStream seqStream = new SequenceInputStream(streams);
    return seqStream;
  }

  public void shutdownAsyncronously()
  {
    _shutdownRequested.set(true);
  }

  /**
   * @see java.lang.Runnable#run()
   */
  @Override
  public void run()
  {
    long lastReportTs = 0;
    long lastReportedPosition = _lastPosition;
    long savePos = _lastPosition;
    try
    {
      while (! _shutdownRequested.get() && _xmlStreamReader.hasNext())
      {
        int eventType = _xmlStreamReader.next();
        switch (eventType)
        {
        case XMLStreamConstants.START_ELEMENT: processStartElement(); break;
        case XMLStreamConstants.END_ELEMENT: processEndElement(); break;
//        case XMLStreamConstants.ATTRIBUTE: processAttribute(); break;
        default: break; //do nothing -- just log progress
        }

        File curFile = _inputStream.getCurrentFile();
        if (null != curFile && ! _inputStream.isClosed())
        {
          String curFileName = curFile.getName();
          long currentTs = System.currentTimeMillis();
          long curPos = _inputStream.getCurrentPosition();
          if (curPos != savePos)
          {
            //the XML reader seems to do internal buffering which makes it harder to
            //guess the position where a problem happened.
            //At any point the parser is processing between the bytes [_lastPosition, curPos).
            _lastPosition = savePos;
            savePos = curPos;
          }
          if (! curFileName.equals(_lastFileName) ||
              (curPos - lastReportedPosition) >= POSITION_REPORT_GAP ||
              (currentTs - lastReportTs) >= POSITION_REPORT_TIME_MS)
          {
            lastReportedPosition = curPos;
            lastReportTs = currentTs;
            _log.info("file: " + curFileName +  "; pos: " +  curPos);
            logTablesSeen();
          }
          _lastFileName = curFileName;
        }
      }
    }
    catch (XMLStreamException e)
    {
      _log.error("xml stream error: " + e, e);
      _lastError = e;
    }
    catch (RuntimeException e)
    {
      _log.error("runtime error: " + e, e);
      _lastError = e;
    }
  }

  private void processError(String msg, Throwable e)
  {
    ++_errorCount;
    if (_continueOnError)
    {
      _log.error("PARSE ERROR: " + msg);
      _errOut.println("=========================================");
      _errOut.println(String.format("PARSE ERROR %s", msg) );
      try
      {
        printErrorContext(_errOut);
      }
      catch (IOException e1)
      {
        _errOut.println("I/O error: " + e1);
      }
      _errOut.println("=========================================");
    }
    else
    {
      if (null != e)
      {
        throw new RuntimeException(msg);
      }
      else
      {
        throw new RuntimeException(msg, e);
      }
    }
  }

  /** In case an error print out the trail file context. Because of the
   * XML reader buffering, we can only get the block (8K). It has to be
   * visually inspected */
  protected void printErrorContext(PrintStream errOut) throws IOException
  {
    final File file = _inputStream.getCurrentFile();
    final long position = _inputStream.getCurrentPosition();
    File lastFile = new File(file.getParentFile(), getLastFileName());
    errOut.println("error between " + lastFile + " @ " + getLastPosition() +
              " and " + _inputStream.getCurrentFile() +
              " @ " + _inputStream.getCurrentPosition());
    RandomAccessFile f = new RandomAccessFile(file, "r");
    try
    {
      long startPos = lastFile.equals(file) ? getLastPosition() : 0;
      long endPos = position + ERROR_CONTEXT_LEN;
      int contextSize = (int)(endPos - startPos);
      byte[] context = new byte[contextSize];
      f.seek(startPos);
      if (f.read(context, 0, contextSize) > 0)
      {
        errOut.println("context: " + new String(context, "ISO-8859-1"));
      }
      else
      {
        errOut.println("unable to read XML error context");
      }
    }
    finally
    {
      f.close();
    }
  }

  /**
   * Logs the currently discovered tables
   */
  private void logTablesSeen()
  {
    if (_log.isInfoEnabled())
    {
      _log.info("Discovered tables:" + _tableColumns);
    }
  }

  /**
   * Processes a GG token for the current table update
   */
  private void processToken(String tokenName)
  {
    _curTableTokens.add(tokenName);
    if (_log.isDebugEnabled())
    {
      _log.debug("added token " + tokenName + " for table " + _currentTableName);
    }
  }

  /**
   * Processes a new column for the current table update
   */
  private void processColumn(String columnName, boolean isKey)
  {
    if (isKey)
    {
      _curTableColumns.add("*" + columnName + "*");
    }
    else
    {
      _curTableColumns.add(columnName);
    }
    if (_log.isDebugEnabled())
    {
      _log.debug("added column " + columnName + " for table " + _currentTableName);
    }

  }

  /**
   *
   */
  private void processEndElement()
  {
    final String elemName = _xmlStreamReader.getLocalName();
    if (ColumnState.COLUMNSTATE.equals(elemName))
    {
      endColumnElement();
    }
    else if (TokenState.TOKENSTATE.equals(elemName))
    {
      endTokenElement();
    }
    else if (DbUpdateState.DBUPDATE.equals(elemName))
    {
      endDbupdateElement();
    }
  }

  /**
   * End processing a <dbupdate> element
   */
  private void endDbupdateElement()
  {
    validateTokens();
    validateColumns();
    _dbupdateType = null;
  }

  /**
   * Validate the columns we've seen for a table update:
   * (1) GG-specific columns are present
   *
   * <p>The method also keeps track of all columns we've seen for a table
   */
  private void validateColumns()
  {
    if (_log.isDebugEnabled())
    {
      _log.debug("validating columns " + _curTableColumns + " for table " + _currentTableName);
    }
    /** GG_STATUS and GG_MODI_TS may or may not be present for delete operations */
    if (DbupdateType.DELETE != _dbupdateType)
    {
      if (! _curTableColumns.contains("GG_STATUS"))
      {
        processError("missing GG_STATUS column for table " + _currentTableName + " update type:" + _dbupdateType, null);
      }
      if (! _curTableColumns.contains("GG_MODI_TS"))
      {
        processError("missing column GG_MODI_TS for table " + _currentTableName + " update type:" + _dbupdateType, null);
      }
    }

    if (_tableColumns.containsKey(_currentTableName))
    {
      Set<String> cols = _tableColumns.get(_currentTableName);
      cols.addAll(_curTableColumns);
    }
    else
    {
      _tableColumns.put(_currentTableName, new HashSet<String>(_curTableColumns));
    }
  }

  /**
   * Validates the token for the table update:
   * (1) make sure that the transaction id is there
   * (2) make sure that the SCN is there
   */
  private void validateTokens()
  {
    if (_log.isDebugEnabled())
    {
      _log.debug("validating tokens " + _curTableTokens + " for table " + _currentTableName);
    }
    if (! _curTableTokens.contains(TOKEN_XID))
    {
      processError("missing transaction id TK-XID for table "  + _currentTableName, null);
    }
    if (! _curTableTokens.contains(TokenState.TOKENSCN))
    {
      processError("missing SCN TK-CSN for table "  + _currentTableName, null);
    }
  }

  /**
   * End processing a <token> element
   */
  private void endTokenElement()
  {
  }

  /**
   * End processing a <column> element
   */
  private void endColumnElement()
  {
  }

  /** Process the start of an XML element - see if it is a element we care about */
  private void processStartElement()
  {
    final String elemName = _xmlStreamReader.getLocalName();
    if (ColumnState.COLUMNSTATE.equals(elemName))
    {
      startColumnElement();
    }
    else if (TokenState.TOKENSTATE.equals(elemName))
    {
      startTokenElement();
    }
    else if (DbUpdateState.DBUPDATE.equals(elemName))
    {
      startDbupdateElement();
    }
  }

  /**
   * Start processing a <dbupdate> element
   */
  private void startDbupdateElement()
  {
    _curTableColumns.clear();
    _curTableTokens.clear();
    _currentTableName = _xmlStreamReader.getAttributeValue(null, DbUpdateState.TABLEATTR);
    String dbupdateTypeStr = _xmlStreamReader.getAttributeValue(null, DbUpdateState.UPDATEATTRNAME);
    if (null == dbupdateTypeStr)
    {
      processError("missing type for <dbupdate> element for table " + _currentTableName, null);
    }
    try
    {
      _dbupdateType = DbupdateType.valueOf(dbupdateTypeStr.toUpperCase());
    }
    catch (IllegalArgumentException e)
    {
      processError("unknown <dbupdate> type:" + dbupdateTypeStr, null);
    }
  }

  /**
   * Start processing a <token> element
   */
  private void startTokenElement()
  {
    processToken(_xmlStreamReader.getAttributeValue(null, TokenState.TOKENATTRNAME));
  }

  /**
   * Start processing a <column> element
   */
  private void startColumnElement()
  {
    processColumn(_xmlStreamReader.getAttributeValue(null, ColumnState.FIELDNAMEATTR),
                  null != _xmlStreamReader.getAttributeValue(null, ColumnState.KEYNAMEATTR));
  }

  public Logger getLog()
  {
    return _log;
  }

  /**
   * Last parsing or validation error
   */
  public Throwable getLastError()
  {
    return _lastError;
  }

  /**
   * @return the lastPosition
   */
  public long getLastPosition()
  {
    return _lastPosition;
  }

  /**
   * @return the lastFileName
   */
  public String getLastFileName()
  {
    return _lastFileName;
  }

  /**
   * @return the errorCount
   */
  public long getErrorCount()
  {
    return _errorCount;
  }


}
TOP

Related Classes of com.linkedin.databus2.ggParser.staxparser.validator.XmlFormatTrailParser

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.