Package org.apache.uima.cas.impl

Source Code of org.apache.uima.cas.impl.XCASDeserializer$FSInfo

/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*   http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied.  See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.uima.cas.impl;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.uima.UimaContext;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.SofaFS;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.TypeSystem;
import org.apache.uima.internal.util.IntVector;
import org.apache.uima.internal.util.StringUtils;
import org.apache.uima.internal.util.rb_trees.RedBlackTree;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory;

/**
* XCAS Deserializer. Takes an XCAS and reads it into a CAS.
*
*
*/
public class XCASDeserializer {

  private static class FSInfo {

    private int addr;

    private IntVector indexRep;

    private FSInfo(int addr, IntVector indexRep) {
      super();
      this.addr = addr;
      this.indexRep = indexRep;
    }

  }

  private class XCASDeserializerHandler extends DefaultHandler {

    // ///////////////////////////////////////////////////////////////////////
    // Internal states for the parser.

    // Expect the start of the XML document.
    private static final int DOC_STATE = 0;

    // At the top level. Expect a FS, or the document text element, or the end of the
    // XML input.
    private static final int FS_STATE = 1;

    // Inside a FS. Expect features, or the end of the FS.
    private static final int FEAT_STATE = 2;

    // Inside FS. We have seen a _content attribute, and expect text.
    private static final int CONTENT_STATE = 3;

    // Inside a feature element. We expect the feature value.
    private static final int FEAT_CONTENT_STATE = 4;

    // Inside an array element. Expect array element value.
    private static final int ARRAY_ELE_CONTENT_STATE = 5;

    // Inside an array FS. Expect an array element, or the end of the FS.
    private static final int ARRAY_ELE_STATE = 6;

    // Inside the document text element. Expect the doc text.
    private static final int DOC_TEXT_STATE = 7;

    // Inside an Out-Of-Typesystem FS. Expect features, or the end of the FS.
    private static final int OOTS_FEAT_STATE = 8;

    // Inside an Out-Of-Typesystem FS. We have seen a _content attribute,
    // and expect text.
    private static final int OOTS_CONTENT_STATE = 9;

    // Default feature name for contents of an FS element, if not specified by _content attribute.
    private static final String DEFAULT_CONTENT_FEATURE = "value";

    // End parser states.
    // ///////////////////////////////////////////////////////////////////////

    private static final String reservedAttrPrefix = "_";

    // For error message printing, if the Locator object can't provide source
    // of XML input.
    private static final String unknownXMLSource = "<unknown>";

    // private long time;

    // SAX locator. Used for error message generation.
    private Locator locator;

    // The CAS we're filling.
    private CASImpl cas;

    // Store FSs with ID in a search tree (for later reference resolution).
    private RedBlackTree fsTree;

    // FSs that carry no ID, kept in a list in the order they were read.
    private ArrayList idLess;

    // What we expect next.
    private int state;

    // StringBuffer to accumulate text.
    private StringBuffer buffer;

    // The address of the most recently created FS. Needed for array elements
    // and embedded feature values.
    private int currentAddr;

    // The name of the content feature, if we've seen one.
    private String currentContentFeat = DEFAULT_CONTENT_FEATURE;

    // The current position when parsing array elements.
    private int arrayPos;

    // Stores out of type system data (APL)
    private OutOfTypeSystemData outOfTypeSystemData;

    // Current out of type system FS
    private FSData currentOotsFs;

    // SofaFS type
    private int sofaTypeCode;

    // AnnotationBase type
    private Type annotBaseType;

    // Store IndexRepositories in a vector;
    private ArrayList indexRepositories;

    // and Views too
    private ArrayList views;

    // for processing v1.x format XCAS
    // map from sofa int values to id references
    private IntVector sofaRefMap;

    // map incoming _indexed values
    private IntVector indexMap;

    // Next value to append to indexMap when another (non-initial-view) Sofa is read.
    private int nextIndex;

    private XCASDeserializerHandler(CASImpl aCAS, OutOfTypeSystemData ootsData) {
      super();
      this.cas = aCAS.getBaseCAS();
      // Reset the CAS.
      cas.resetNoQuestions();
      this.fsTree = new RedBlackTree();
      this.idLess = new ArrayList();
      this.buffer = new StringBuffer();
      this.outOfTypeSystemData = ootsData;
      this.indexRepositories = new ArrayList();
      this.views = new ArrayList();
      // using the baseCas for indexing Sofas
      indexRepositories.add(this.cas.getBaseIndexRepository());
      // There should always be another index for the Initial View
      indexRepositories.add(this.cas.getView(CAS.NAME_DEFAULT_SOFA).getIndexRepository());
      this.sofaTypeCode = cas.ll_getTypeSystem().ll_getCodeForType(
              cas.getTypeSystem().getType(CAS.TYPE_NAME_SOFA));
      this.annotBaseType = this.cas.getAnnotationType();
      this.sofaRefMap = new IntVector();
      this.indexMap = new IntVector();
      // add entry for baseCAS ... point non-compliant annotations at first Sofa
      sofaRefMap.add(1);
      // add entry for baseCAS ... _indexed=0 stays in 0
      indexMap.add(0);
    }

    private final void resetBuffer() {
      // this.buffer.delete(0, this.buffer.length());
      this.buffer = new StringBuffer();
    }

    /*
     * (non-Javadoc)
     *
     * @see org.xml.sax.ContentHandler#startDocument()
     */
    public void startDocument() throws SAXException {
      // Do setup work in the constructor.
      this.state = DOC_STATE;
      // System.out.println("Starting to read document.");
      // time = System.currentTimeMillis();
    }

    /*
     * (non-Javadoc)
     *
     * @see org.xml.sax.ContentHandler#startElement(java.lang.String, java.lang.String,
     *      java.lang.String, org.xml.sax.Attributes)
     */
    public void startElement(String nameSpaceURI, String localName, String qualifiedName,
            Attributes attrs) throws SAXException {
      // org.apache.vinci.debug.Debug.p("startElement: " + qualifiedName);
      // if (attrs != null) {
      // for (int i=0; i<attrs.getLength(); i++) {
      // org.apache.vinci.debug.Debug.p("a: " + attrs.getQName(i) + " v: " + attrs.getValue(i));
      // }
      // }
      resetBuffer();
      switch (state) {
        case DOC_STATE: {
          if (!qualifiedName.equals(XCASSerializer.casTagName)) {
            throw createException(XCASParsingException.WRONG_ROOT_TAG, qualifiedName);
          }
          this.state = FS_STATE;
          break;
        }
        case FS_STATE: {
          this.currentContentFeat = DEFAULT_CONTENT_FEATURE;
          if (qualifiedName.equals(getDocumentTypeName())) {
            readDocument(attrs);
          } else {
            readFS(qualifiedName, attrs);
          }
          break;
        }
        case ARRAY_ELE_STATE: {
          readArrayElement(qualifiedName, attrs);
          break;
        }
        default: {
          // If we're not in an element expecting state, raise an error.
          throw createException(XCASParsingException.TEXT_EXPECTED, qualifiedName);
        }
      }
    }

    // Get ready to read document text.
    private void readDocument(Attributes attrs) {
      this.state = DOC_TEXT_STATE;
    }

    // Get ready to read array element.
    private void readArrayElement(String eleName, Attributes attrs) throws SAXParseException {
      if (!eleName.equals(XCASSerializer.ARRAY_ELEMENT_TAG)) {
        throw createException(XCASParsingException.ARRAY_ELE_EXPECTED, eleName);
      }
      if (attrs.getLength() > 0) {
        throw createException(XCASParsingException.ARRAY_ELE_ATTRS);
      }
      this.state = ARRAY_ELE_CONTENT_STATE;
      // resetBuffer();
    }

    // Create a new FS.
    private void readFS(String qualifiedName, Attributes attrs) throws SAXParseException {
      String typeName = getCasTypeName(qualifiedName);
      TypeImpl type = (TypeImpl) ts.getType(typeName);
      if (type == null) {
        if (typeName.equals("uima.cas.SofA")) {
          // temporary fix for XCAS written with pre-public version of Sofas
          type = (TypeImpl) ts.getType("uima.cas.Sofa");
        }
      }
      if (type == null) {
        if (this.outOfTypeSystemData == null) {
          throw createException(XCASParsingException.UNKNOWN_TYPE, typeName);
        } else {
          // add this FS to out-of-typesystem data - this also sets the
          // parser state appropriately (APL)
          addToOutOfTypeSystemData(typeName, attrs);
        }
      } else {
        if (cas.isArrayType(type.getCode())) {
          readArray(type, attrs);
          return;
        }
        final int addr = cas.ll_createFS(type.getCode());
        readFS(addr, attrs, true);
      }
    }

    /**
     *
     * @param addr
     * @param attrs
     * @param toIndex
     *          Special hack to accomodate document annotation, which is already in the index.
     * @throws SAXParseException
     */
    private void readFS(final int addr, Attributes attrs, boolean toIndex) throws SAXParseException {
      // Hang on address for setting content feature
      this.currentAddr = addr;
      /**
       * if (this.currentFS==null) { this.currentFS = new FeatureStructureImplC(cas,addr); } else
       * this.currentFS.setUp(cas,addr);
       */
      int id = -1;
      IntVector indexRep = new IntVector(); // empty means not indexed
      String attrName, attrValue;
      final int heapValue = cas.getHeapValue(addr);
      final Type type = cas.ll_getTypeSystem().ll_getTypeForCode(cas.ll_getFSRefType(addr));

      // Special handling for Sofas
      if (sofaTypeCode == heapValue) {
        // create some maps to handle v1 format XCAS ...
        // ... where the sofa feature of annotations was an int not a ref

        // determine if this is the one and only initial view Sofa
        boolean isInitialView = false;
        String sofaID = attrs.getValue(CAS.FEATURE_BASE_NAME_SOFAID);
        if (sofaID.equals("_DefaultTextSofaName")) {
          sofaID = CAS.NAME_DEFAULT_SOFA;
        }
        if (uimaContext != null) {
          // Map incoming SofaIDs
          sofaID = uimaContext.mapToSofaID(sofaID).getSofaID();
        }
        if (sofaID.equals(CAS.NAME_DEFAULT_SOFA)) {
          isInitialView = true;
        }
        // get the sofaNum
        String sofaNum = attrs.getValue(CAS.FEATURE_BASE_NAME_SOFANUM);
        int thisSofaNum = Integer.parseInt(sofaNum);

        // get the sofa's FeatureStructure id
        int sofaFsId = Integer.parseInt(attrs.getValue(XCASSerializer.ID_ATTR_NAME));

        // for v1 and v2 formats, create the index map
        // ***we assume Sofas are always received in Sofanum order***
        // Two scenarios ... the initial view is the first sofa, or not.
        // If not, the _indexed values need to be remapped to leave room for the initial view,
        // which may or may not be in the received CAS.
        if (this.indexMap.size() == 1) {
          if (isInitialView) {
            // the first Sofa an initial view
            if (thisSofaNum == 2) {
              // this sofa was mapped to the initial view
              this.indexMap.add(-1); // for this CAS, there should not be a sofanum = 1
              this.indexMap.add(1); // map 2 to 1
              this.nextIndex = 2;
            } else {
              this.indexMap.add(1);
              this.nextIndex = 2;
            }
          } else {
            if (thisSofaNum > 1) {
              // the first Sofa not initial, but sofaNum > 1
              // must be a v2 format, and sofaNum better be 2
              this.indexMap.add(1);
              assert (thisSofaNum == 2);
              this.indexMap.add(2);
              this.nextIndex = 3;
            } else {
              // must be v1 format
              this.indexMap.add(2);
              this.nextIndex = 3;
            }
          }
        } else {
          // if the new Sofa is the initial view, always map to 1
          if (isInitialView) {
            // the initial view is not the first
            // if v2 format, space already reserved in mapping
            if (this.indexMap.size() == thisSofaNum) {
              // v1 format, add mapping for initial view
              this.indexMap.add(1);
            }
          } else {
            this.indexMap.add(this.nextIndex);
            this.nextIndex++;
          }
        }

        // Now update the mapping from annotation int to ref values
        if (this.sofaRefMap.size() == thisSofaNum) {
          // Sofa received in sofaNum order, add new one
          this.sofaRefMap.add(sofaFsId);
        } else if (this.sofaRefMap.size() > thisSofaNum) {
          // new Sofa has lower sofaNum than last one
          this.sofaRefMap.set(thisSofaNum, sofaFsId);
        } else {
          // new Sofa has skipped ahead more than 1
          this.sofaRefMap.setSize(thisSofaNum + 1);
          this.sofaRefMap.set(thisSofaNum, sofaFsId);
        }

      }

      for (int i = 0; i < attrs.getLength(); i++) {
        attrName = attrs.getQName(i);
        attrValue = attrs.getValue(i);
        if (attrName.startsWith(reservedAttrPrefix)) {
          if (attrName.equals(XCASSerializer.ID_ATTR_NAME)) {
            try {
              id = Integer.parseInt(attrValue);
            } catch (NumberFormatException e) {
              throw createException(XCASParsingException.ILLEGAL_ID, attrValue);
            }
          } else if (attrName.equals(XCASSerializer.CONTENT_ATTR_NAME)) {
            this.currentContentFeat = attrValue;
            // this.state = CONTENT_STATE; APL-6/28/04 - removed, see below
          } else if (attrName.equals(XCASSerializer.INDEXED_ATTR_NAME)) {
            // if (attrValue.equals(XCASSerializer.TRUE_VALUE) && toIndex)
            String[] arrayvals = parseArray(attrValue);
            for (int s = 0; s < arrayvals.length; s++) {
              indexRep.add(Integer.parseInt(arrayvals[s]));
            }
          } else {
            handleFeature(type, addr, attrName, attrValue, false);
          }
        } else {
          if (sofaTypeCode == heapValue) {
            if (attrName.equals(CAS.FEATURE_BASE_NAME_SOFAID)) {
              if (attrValue.equals("_DefaultTextSofaName")) {
                // First change old default Sofa name into the new one
                attrValue = CAS.NAME_DEFAULT_SOFA;
              }
              if (uimaContext != null) {
                // Map incoming SofaIDs
                attrValue = uimaContext.mapToSofaID(attrValue).getSofaID();
              }
            }
          }
          handleFeature(type, addr, attrName, attrValue, false);
        }
      }

      if (sofaTypeCode == heapValue) {
        // If a Sofa, create CAS view to get new indexRepository
        SofaFS sofa = (SofaFS) cas.createFS(addr);
        // also add to indexes so we can retrieve the Sofa later
        cas.getBaseIndexRepository().addFS(sofa);
        CAS view = cas.getView(sofa);
        if (sofa.getSofaRef() == 1) {
          cas.registerInitialSofa();
        } else {
          // add indexRepo for views other than the initial view
          indexRepositories.add(cas.getSofaIndexRepository(sofa));
        }
        ((CASImpl) view).registerView(sofa);
        views.add(view);
      }
      FSInfo fsInfo = new FSInfo(addr, indexRep);
      if (id < 0) {
        idLess.add(fsInfo);
      } else {
        fsTree.put(id, fsInfo);
      }
      // Set the state; we're either expecting features, or _content.
      // APL - 6/28/04 - even if _content attr is not specified, we can still have content, which
      // would
      // be assigned to the "value" feature, as per XCAS spec. FEAT_STATE did not really seem to be
      // working, anyway.
      this.state = CONTENT_STATE;
      // if (this.state != CONTENT_STATE)
      // {
      // this.state = FEAT_STATE;
      // }
    }

    // Create a new array FS.
    private void readArray(TypeImpl type, Attributes attrs) throws SAXParseException {
      String attrName, attrVal;
      // No entries in indexRep means not indexed
      IntVector indexRep = new IntVector();
      int size = 0;
      int id = -1;
      for (int i = 0; i < attrs.getLength(); i++) {
        attrName = attrs.getQName(i);
        attrVal = attrs.getValue(i);
        if (attrName.equals(XCASSerializer.ID_ATTR_NAME)) {
          try {
            id = Integer.parseInt(attrVal);
          } catch (NumberFormatException e) {
            throw createException(XCASParsingException.ILLEGAL_ID, attrVal);
          }
        } else if (attrName.equals(XCASSerializer.ARRAY_SIZE_ATTR)) {
          try {
            size = Integer.parseInt(attrVal);
            if (size < 0) {
              throw createException(XCASParsingException.ILLEGAL_ARRAY_SIZE, attrVal);
            }
          } catch (NumberFormatException e) {
            throw createException(XCASParsingException.INTEGER_EXPECTED, attrVal);
          }
        } else if (attrName.equals(XCASSerializer.INDEXED_ATTR_NAME)) {
          String[] arrayvals = parseArray(attrVal);
          for (int s = 0; s < arrayvals.length; s++) {
            indexRep.add(Integer.parseInt(arrayvals[s]));
          }
        } else {
          throw createException(XCASParsingException.ILLEGAL_ARRAY_ATTR, attrName);
        }
      }
      FeatureStructureImplC fs;
      if (cas.isBooleanArrayType(type)) {
        fs = (FeatureStructureImplC) cas.createBooleanArrayFS(size);
      } else if (cas.isByteArrayType(type)) {
        fs = (FeatureStructureImplC) cas.createByteArrayFS(size);
      } else if (cas.isShortArrayType(type)) {
        fs = (FeatureStructureImplC) cas.createShortArrayFS(size);
      } else if (cas.isLongArrayType(type)) {
        fs = (FeatureStructureImplC) cas.createLongArrayFS(size);
      } else if (cas.isDoubleArrayType(type)) {
        fs = (FeatureStructureImplC) cas.createDoubleArrayFS(size);
      } else {
        int addr = cas.createTempArray(type.getCode(), size);
        fs = new FeatureStructureImplC(cas, addr);
      }

      final int addr = fs.getAddress();
      FSInfo fsInfo = new FSInfo(addr, indexRep);
      if (id >= 0) {
        fsTree.put(id, fsInfo);
      } else {
        idLess.add(fsInfo);
      }
      // Hang on to those for setting array values.
      this.currentAddr = addr;
      this.arrayPos = 0;

      this.state = ARRAY_ELE_STATE;
    }

    // The definition of a null value. Any other value must be in the expected
    // format.
    private final boolean emptyVal(String val) {
      return ((val == null) || (val.length() == 0));
    }

    // Create a feature value from a string representation.
    private void handleFeature(int addr, String featName, String featVal, boolean lenient)
            throws SAXParseException {
      int typeCode = cas.ll_getFSRefType(addr);
      Type type = cas.ll_getTypeSystem().ll_getTypeForCode(typeCode);
      handleFeature(type, addr, featName, featVal, lenient);
    }

    private void handleFeature(final Type type, int addr, String featName, String featVal,
            boolean lenient) throws SAXParseException {
      // The FeatureMap approach is broken because it assumes feature short names
      // are unique. This is my quick fix. -APL
      // final FeatureImpl feat = (FeatureImpl) featureMap.get(featName);

      // handle v1.x format annotations, mapping int to ref values
      if (featName.equals("sofa") && ts.subsumes(this.annotBaseType, type)) {
        featVal = Integer.toString(this.sofaRefMap.get(Integer.parseInt(featVal)));
      }

      // handle v1.x sofanum values, remapping so that _InitialView always == 1
      if (featName.equals(CAS.FEATURE_BASE_NAME_SOFAID)
              && this.sofaTypeCode == cas.getHeapValue(addr)) {
        Type sofaType = ts.ll_getTypeForCode(this.sofaTypeCode);
        final FeatureImpl sofaNumFeat = (FeatureImpl) sofaType
                .getFeatureByBaseName(CAS.FEATURE_BASE_NAME_SOFANUM);
        int sofaNum = cas.getFeatureValue(addr, sofaNumFeat.getCode());
        cas.setFeatureValue(addr, sofaNumFeat.getCode(), this.indexMap.get(sofaNum));
      }

      String realFeatName;
      if (featName.startsWith(XCASSerializer.REF_PREFIX)) {
        realFeatName = featName.substring(XCASSerializer.REF_PREFIX.length());
      } else {
        realFeatName = featName;
      }
      final FeatureImpl feat = (FeatureImpl) type.getFeatureByBaseName(realFeatName);
      // System.out.println("DEBUG - Feature map result: " + featName + " = " + feat.getName());
      if (feat == null) { // feature does not exist in typesystem
        if (outOfTypeSystemData != null) {
          // Add to Out-Of-Typesystem data (APL)
          Integer addrInteger = new Integer(addr);
          List ootsAttrs = (List) outOfTypeSystemData.extraFeatureValues.get(addrInteger);
          if (ootsAttrs == null) {
            ootsAttrs = new ArrayList();
            outOfTypeSystemData.extraFeatureValues.put(addrInteger, ootsAttrs);
          }
          ootsAttrs.add(new String[] { featName, featVal });
        } else if (!lenient) {
          throw createException(XCASParsingException.UNKNOWN_FEATURE, featName);
        }
      } else {
        if (cas.ll_isRefType(ts.range(feat.getCode()))) {
          cas.setFeatureValue(addr, feat.getCode(), Integer.parseInt(featVal));
        } else {
          cas.setFeatureValueFromString(addr, feat.getCode(), featVal);
        }

      }
    }

    /*
     * (non-Javadoc)
     *
     * @see org.xml.sax.ContentHandler#characters(char[], int, int)
     */
    public void characters(char[] chars, int start, int length) throws SAXException {
      switch (this.state) {
        case DOC_TEXT_STATE:
        case CONTENT_STATE:
        case OOTS_CONTENT_STATE:
        case ARRAY_ELE_CONTENT_STATE:
        case FEAT_CONTENT_STATE:
          buffer.append(chars, start, length);
          break;
        default:
      }
    }

    boolean isAllWhitespace(StringBuffer b) {
      final int len = b.length();
      for (int i = 0; i < len; i++) {
        if (!Character.isWhitespace(b.charAt(i))) {
          return false;
        }
      }
      return true;
    }

    /*
     * (non-Javadoc)
     *
     * @see org.xml.sax.ContentHandler#endElement(java.lang.String, java.lang.String,
     *      java.lang.String)
     */
    public void endElement(String nsURI, String localName, String qualifiedName)
            throws SAXException {
      switch (this.state) {
        case DOC_STATE: {
          // Do nothing.
          break;
        }
        case FS_STATE: {
          this.state = DOC_STATE;
          break;
        }
        case FEAT_STATE: {
          this.state = FS_STATE;
          break;
        }
        case CONTENT_STATE: {
          // Set the value of the content feature.
          if (!isAllWhitespace(buffer)) {
            try {
              handleFeature(currentAddr, currentContentFeat, buffer.toString(), true);
            } catch (XCASParsingException x) {
              // Not sure why we are calling handleFeature for WF content
            }
          }
          this.state = FS_STATE;
          break;
        }
        case FEAT_CONTENT_STATE: {
          // Create a feature value from an element.
          handleFeature(currentAddr, qualifiedName, buffer.toString(), false);
          this.state = FEAT_STATE;
          break;
        }
        case ARRAY_ELE_CONTENT_STATE: {
          // Create an array value.
          addArrayElement(buffer.toString());
          this.state = ARRAY_ELE_STATE;
          break;
        }
        case ARRAY_ELE_STATE: {
          this.state = FS_STATE;
          break;
        }
        case DOC_TEXT_STATE: {
          // Assume old style CAS with one text Sofa
          SofaFS newSofa = cas.createInitialSofa("text");
          CASImpl tcas = (CASImpl) cas.getInitialView();
          tcas.registerView(newSofa);
          // Set the document text without creating a documentAnnotation
          tcas.setDocTextFromDeserializtion(buffer.toString());

          // and assume the new Sofa is at location 1!
          int addr = 1;
          int id = 1;
          this.sofaRefMap.add(id);

          // and register the id for this Sofa
          FSInfo fsInfo = new FSInfo(addr, new IntVector());
          fsTree.put(id, fsInfo);

          this.state = FS_STATE;
          break;
        }
        case OOTS_CONTENT_STATE: {
          // Set the value of the content feature.
          if (!isAllWhitespace(buffer)) {
            // Set the value of the content feature.
            currentOotsFs.featVals.put(currentContentFeat, buffer.toString());
          }
          this.state = FS_STATE;
          break;
        }
        case OOTS_FEAT_STATE: {
          this.state = FS_STATE;
          break;
        }
      }
    }

    private void addArrayElement(String content) throws SAXParseException {
      if (arrayPos >= cas.ll_getArraySize(currentAddr)) {
        throw createException(XCASParsingException.EXCESS_ARRAY_ELE);
      }
      try {
        if (!emptyVal(content)) {
          if (cas.isArrayType(cas.getHeap().heap[currentAddr])) {
            cas.setArrayValueFromString(currentAddr, arrayPos, content);
          } else {
            System.out.println(" not a known array type ");
          }
        }
      } catch (NumberFormatException e) {
        throw createException(XCASParsingException.INTEGER_EXPECTED, content);
      }

      ++arrayPos;
    }

    /*
     * (non-Javadoc)
     *
     * @see org.xml.sax.ContentHandler#endDocument()
     */
    public void endDocument() throws SAXException {
      // time = System.currentTimeMillis() - time;
      // System.out.println("Done reading xml data in " + new TimeSpan(time));
      // System.out.println(
      // "Resolving references for id data (" + fsTree.size() + ").");
      // time = System.currentTimeMillis();
      // Resolve references, index.
      Iterator it = fsTree.iterator();
      while (it.hasNext()) {
        finalizeFS((FSInfo) it.next());
      }
      // time = System.currentTimeMillis() - time;
      // System.out.println("Done in " + new TimeSpan(time));
      // System.out.println(
      // "Resolving references for non-id data (" + idLess.size() + ").");
      // time = System.currentTimeMillis();
      for (int i = 0; i < idLess.size(); i++) {
        finalizeFS((FSInfo) idLess.get(i));
      }
      // time = System.currentTimeMillis() - time;
      // System.out.println("Done in " + new TimeSpan(time));

      // also finalize Out-Of-TypeSystem FSs and features (APL)
      if (outOfTypeSystemData != null) {
        it = outOfTypeSystemData.fsList.iterator();
        while (it.hasNext()) {
          finalizeOutOfTypeSystemFS((FSData) it.next());
        }
        finalizeOutOfTypeSystemFeatures();
      }

      for (int i = 0; i < views.size(); i++) {
        ((CASImpl) views.get(i)).updateDocumentAnnotation();
      }
    }

    private void finalizeFS(FSInfo fsInfo) {
      final int addr = fsInfo.addr;
      if (fsInfo.indexRep.size() >= 0) {
        // Now add FS to all specified index repositories
        for (int i = 0; i < fsInfo.indexRep.size(); i++) {
          if (indexMap.size() == 1) {
            ((FSIndexRepositoryImpl) indexRepositories.get(fsInfo.indexRep.get(i))).addFS(addr);
          } else {
            ((FSIndexRepositoryImpl) indexRepositories.get(indexMap.get(fsInfo.indexRep.get(i))))
                    .addFS(addr);
          }
        }
      }
      final int type = cas.getHeapValue(addr);
      if (cas.isArrayType(type)) {
        finalizeArray(type, addr, fsInfo);
        return;
      }
      int[] feats = cas.getTypeSystemImpl().ll_getAppropriateFeatures(type);
      int feat;
      FSInfo fsValInfo;
      for (int i = 0; i < feats.length; i++) {
        feat = feats[i];
        if (cas.ll_isRefType(ts.range(feats[i]))) {
          int featVal = cas.getFeatureValue(addr, feat);
          fsValInfo = (FSInfo) fsTree.get(featVal);
          if (fsValInfo == null) {
            cas.setFeatureValue(addr, feat, CASImpl.NULL);
            // this feature may be a ref to an out-of-typesystem FS.
            // add it to the Out-of-typesystem features list (APL)
            if (featVal != 0 && outOfTypeSystemData != null) {
              Integer addrInteger = new Integer(addr);
              List ootsAttrs = (List) outOfTypeSystemData.extraFeatureValues.get(addrInteger);
              if (ootsAttrs == null) {
                ootsAttrs = new ArrayList();
                outOfTypeSystemData.extraFeatureValues.put(addrInteger, ootsAttrs);
              }
              String featFullName = ts.ll_getFeatureForCode(feat).getName();
              int separatorOffset = featFullName.indexOf(TypeSystem.FEATURE_SEPARATOR);
              String featName = "_ref_" + featFullName.substring(separatorOffset + 1);
              ootsAttrs.add(new String[] { featName, Integer.toString(featVal) });
            }
          } else {
            cas.setFeatureValue(addr, feat, fsValInfo.addr);
          }
        }
      }
    }

    private void finalizeArray(int type, int addr, FSInfo fsInfo) {
      if (!cas.isFSArrayType(type)) {
        // Nothing to do.
        return;
      }
      final int size = cas.ll_getArraySize(addr);
      FSInfo fsValInfo;
      for (int i = 0; i < size; i++) {
        int arrayVal = cas.getArrayValue(addr, i);
        fsValInfo = (FSInfo) fsTree.get(arrayVal);
        if (fsValInfo == null) {
          cas.setArrayValue(addr, i, CASImpl.NULL);
          // this element may be a ref to an out-of-typesystem FS.
          // add it to the Out-of-typesystem array elements list (APL)
          if (arrayVal != 0 && outOfTypeSystemData != null) {
            Integer arrayAddrInteger = new Integer(addr);
            List ootsElements = (List) outOfTypeSystemData.arrayElements.get(arrayAddrInteger);
            if (ootsElements == null) {
              ootsElements = new ArrayList();
              outOfTypeSystemData.arrayElements.put(arrayAddrInteger, ootsElements);
            }
            // the "value" of the refrence is the ID, but we prefix with a letter to indicate
            // that this ID refers to an OOTS FS
            ArrayElement ootsElem = new ArrayElement(i, "a" + Integer.toString(arrayVal));
            ootsElements.add(ootsElem);
          }
        } else {
          cas.setArrayValue(addr, i, fsValInfo.addr);
        }
      }
    }

    /**
     * Finalizes an Out Of Type System FS by assigning a unique ID (prepending a letter) and
     * remapping ID references appropriately (both In-Type-System and Out-Of-TypeSystem refs).
     */
    private void finalizeOutOfTypeSystemFS(FSData aFS) {
      // make ID unique by prefixing a letter
      aFS.id = 'a' + aFS.id;
      // remap ref features
      Iterator it = aFS.featVals.entrySet().iterator();
      while (it.hasNext()) {
        Map.Entry entry = (Map.Entry) it.next();
        String attrName = (String) entry.getKey();
        if (attrName.startsWith("_ref_")) {
          int val = Integer.parseInt((String) entry.getValue());
          if (val >= 0) // negative numbers represent null and are left unchanged
          {
            // attempt to locate target in type system
            FSInfo fsValInfo = (FSInfo) fsTree.get(val);
            if (fsValInfo != null) {
              entry.setValue(Integer.toString(fsValInfo.addr));
            } else
            // out of type system - remap by prepending letter
            {
              entry.setValue("a" + val);
            }
          }
        }

      }
    }

    /**
     * Finalizes the Out Of Type System features (extra features on in-typesystem types).
     */
    private void finalizeOutOfTypeSystemFeatures() {
      // remap ref features
      Iterator it = outOfTypeSystemData.extraFeatureValues.values().iterator();
      while (it.hasNext()) {
        List attrs = (List) it.next();
        Iterator attrIt = attrs.iterator();
        while (attrIt.hasNext()) {
          String[] attr = (String[]) attrIt.next();
          if (attr[0].startsWith("_ref_")) {
            int val = Integer.parseInt(attr[1]);
            if (val >= 0) // negative numbers represent null and are left unchanged
            {
              // attempt to locate target in type system
              FSInfo fsValInfo = (FSInfo) fsTree.get(val);
              if (fsValInfo != null) {
                attr[1] = Integer.toString(fsValInfo.addr);
              } else
              // out of type system - remap by prepending letter
              {
                attr[1] = "a" + val;
              }
            }
          }
        }
      }
    }

    private XCASParsingException createException(int code) {
      XCASParsingException e = new XCASParsingException(code);
      String source = unknownXMLSource;
      String line = unknownXMLSource;
      String col = unknownXMLSource;
      if (locator != null) {
        source = locator.getSystemId();
        if (source == null) {
          source = locator.getPublicId();
        }
        if (source == null) {
          source = unknownXMLSource;
        }
        line = Integer.toString(locator.getLineNumber());
        col = Integer.toString(locator.getColumnNumber());
      }
      e.addArgument(source);
      e.addArgument(line);
      e.addArgument(col);
      return e;
    }

    private XCASParsingException createException(int code, String arg) {
      XCASParsingException e = createException(code);
      e.addArgument(arg);
      return e;
    }

    /*
     * (non-Javadoc)
     *
     * @see org.xml.sax.ErrorHandler#error(org.xml.sax.SAXParseException)
     */
    public void error(SAXParseException e) throws SAXException {
      // Rethrow the exception exactly as received instead of recording or ignoring it.
      throw e;
    }

    /*
     * (non-Javadoc)
     *
     * @see org.xml.sax.ErrorHandler#fatalError(org.xml.sax.SAXParseException)
     */
    public void fatalError(SAXParseException e) throws SAXException {
      // Rethrow the exception exactly as received instead of recording or ignoring it.
      throw e;
    }

    /*
     * (non-Javadoc)
     *
     * @see org.xml.sax.ContentHandler#ignorableWhitespace(char[], int, int)
     */
    public void ignorableWhitespace(char[] arg0, int arg1, int arg2) throws SAXException {
      // Intentionally empty; none of the arguments are looked at.
      // Since we're not validating, we don't need to do anything; this won't
      // be called.
    }

    /*
     * (non-Javadoc)
     *
     * @see org.xml.sax.ContentHandler#setDocumentLocator(org.xml.sax.Locator)
     */
    public void setDocumentLocator(Locator loc) {
      // System.out.println("Setting document locator.");
      this.locator = loc;
    }

    /*
     * (non-Javadoc)
     *
     * @see org.xml.sax.ErrorHandler#warning(org.xml.sax.SAXParseException)
     */
    public void warning(SAXParseException e) throws SAXException {
      // Warnings get no special treatment: the exception is rethrown exactly as received.
      throw e;
    }

    /**
     * Adds a feature structure to the out-of-typesystem data, and sets the parser's state
     * appropriately. (APL)
     */
    private void addToOutOfTypeSystemData(String typeName, Attributes attrs)
            throws XCASParsingException {
      if (this.outOfTypeSystemData != null) {
        FSData fs = new FSData();
        fs.type = typeName;
        fs.indexRep = null; // not indexed
        String attrName, attrValue;
        for (int i = 0; i < attrs.getLength(); i++) {
          attrName = attrs.getQName(i);
          attrValue = attrs.getValue(i);
          if (attrName.startsWith(reservedAttrPrefix)) {
            if (attrName.equals(XCASSerializer.ID_ATTR_NAME)) {
              fs.id = attrValue;
            } else if (attrName.equals(XCASSerializer.CONTENT_ATTR_NAME)) {
              this.currentContentFeat = attrValue;
            } else if (attrName.equals(XCASSerializer.INDEXED_ATTR_NAME)) {
              fs.indexRep = attrValue;
            } else {
              fs.featVals.put(attrName, attrValue);
            }
          } else {
            fs.featVals.put(attrName, attrValue);
          }
        }
        this.outOfTypeSystemData.fsList.add(fs);
        this.currentOotsFs = fs;
        // Set the state; we're ready to accept the "content" feature,
        // if one is specified
        this.state = OOTS_CONTENT_STATE;
      }
    }

    /**
     * Parse a multi-valued attribute into a String array, by splitting on whitespace.
     *
     * @param val
     *          attribute value
     * @return an array with each array value as an element
     */
    private String[] parseArray(String val) {
      String[] arrayVals;
      val = val.trim();
      if (emptyVal(val)) {
        arrayVals = new String[0];
      } else {
        arrayVals = val.split("\\s+");
      }
      return arrayVals;
    }

    /**
     * Gets the CAS type name corresponding to an XCAS tag name. The type name is usually equal to
     * the tag name, but the characters : and - are translated into the sequences _colon_ and
     * _dash_, respectively.
     *
     * @param aTagName
     *          XCAS tag name
     * @return CAS type name corresponding to this tag
     */
    private String getCasTypeName(String aTagName) {
      if (aTagName.indexOf(':') == -1 && aTagName.indexOf('-') == -1) {
        return aTagName;
      } else {
        // Note: This is really slow so we avoid if possible. -- RJB
        return StringUtils.replaceAll(StringUtils.replaceAll(aTagName, ":", "_colon_"), "-",
                "_dash_");
      }
    }
  }

  // Type system of the CASes to be deserialized; assigned by the constructor, which casts the
  // supplied TypeSystem to TypeSystemImpl.
  private TypeSystemImpl ts;

  // UIMA context passed to the two-argument constructor; null when the one-argument
  // constructor is used.
  private UimaContext uimaContext;

  // private HashMap featureMap; -APL
  // ///private int[] featureType;

  // name of tag to contain document text; starts out as the serializer's default and can be
  // changed through setDocumentTypeName()
  private String docTypeName = XCASSerializer.DEFAULT_DOC_TYPE_NAME;

  /**
   * Create a new deserializer from a type system. Note: all CAS arguments later supplied to
   * <code>getXCASHandler()</code> must have this type system as their type system.
   *
   * @param ts
   *          The type system of the CASes to be deserialized.
   * @param uimaContext
   *          The UIMA context to store in this deserializer; may be <code>null</code>.
   */
  public XCASDeserializer(TypeSystem ts, UimaContext uimaContext) {
    super();
    this.ts = (TypeSystemImpl) ts;
    this.uimaContext = uimaContext;
    // this.featureMap = new HashMap(); - APL
  }

  public XCASDeserializer(TypeSystem ts) {
    this(ts, null);
  }

  /**
   * Create a default handler for deserializing an XCAS into the <code>cas</code> parameter.
   * <p>
   * Warning: for efficiency reasons, the deserializer does not do much type checking for features
   * and their values. It is expected that the incoming XCAS conforms to the type system provided.
   * If it doesn't, the results are undefined.
   *
   * @param cas
   *          This CAS will be used to hold the data of the serialized XCAS.
   * @return The <code>DefaultHandler</code> to pass to the SAX parser.
   */
  public DefaultHandler getXCASHandler(CAS cas) {
    return getXCASHandler(cas, null);
  }

  /**
   * Create a default handler for deserializing an XCAS into the <code>cas</code> parameter. This
   * version causes the deserializer to store out-of-typesystem data for later use. (APL)
   * <p>
   * Warning: for efficiency reasons, the deserializer does not do much type checking for features
   * and their values. It is expected that the incoming XCAS conforms to the type system provided.
   * If it doesn't, the results are undefined.
   *
   * @param cas
   *          This CAS will be used to hold the data of the serialized XCAS.
   * @param outOfTypeSystemData
   *          An object that stores FSs that do not conform to the CAS's type system
   * @return The <code>DefaultHandler</code> to pass to the SAX parser.
   */
  public DefaultHandler getXCASHandler(CAS cas, OutOfTypeSystemData outOfTypeSystemData) {
    return new XCASDeserializerHandler((CASImpl) cas, outOfTypeSystemData);
  }

  /**
   * Gets the name of the type representing the document. This will become the name of the XML
   * element that will hold the document text.
   *
   * @return the document type name
   */
  public String getDocumentTypeName() {
    return docTypeName;
  }

  /**
   * Sets the name of the type representing the document. This will become the name of the XML
   * element that will hold the document text. If not set, defaults to
   * {@link XCASSerializer#DEFAULT_DOC_TYPE_NAME XCASSerializer.DEFAULT_DOC_TYPE_NAME}.
   *
   * @param aDocTypeName
   *          the document type name
   */
  public void setDocumentTypeName(String aDocTypeName) {
    docTypeName = aDocTypeName;
  }

  /**
   * Deserializes an XCAS from a stream. By default this is not lenient, meaning that if the XCAS
   * references Types that are not in the Type System, an Exception will be thrown. Use
   * {@link XCASDeserializer#deserialize(InputStream,CAS,boolean)} to turn on lenient mode and
   * ignore any unknown types.
   *
   * @param aStream
   *          input stream from which to read the XCAS XML document
   * @param aCAS
   *          CAS into which to deserialize. This CAS must be set up with a type system that is
   *          compatible with that in the XCAS
   *
   * @throws SAXException
   *           if an XML Parsing error occurs
   * @throws IOException
   *           if an I/O failure occurs
   */
  public static void deserialize(InputStream aStream, CAS aCAS) throws SAXException, IOException {
    XCASDeserializer.deserialize(aStream, aCAS, false);
  }

  /**
   * Deserializes an XCAS from a stream.
   *
   * @param aStream
   *          input stream from which to read the XCAS XML document
   * @param aCAS
   *          CAS into which to deserialize. This CAS must be set up with a type system that is
   *          compatible with that in the XCAS.
   * @param aLenient
   *          if true, unknown Types will be ignored. If false, unknown Types will cause an
   *          exception. The default is false.
   *
   * @throws SAXException
   *           if an XML Parsing error occurs
   * @throws IOException
   *           if an I/O failure occurs
   */
  public static void deserialize(InputStream aStream, CAS aCAS, boolean aLenient)
          throws SAXException, IOException {
    XMLReader xmlReader = XMLReaderFactory.createXMLReader();
    XCASDeserializer deser = new XCASDeserializer(aCAS.getTypeSystem());
    ContentHandler handler;
    if (aLenient) {
      handler = deser.getXCASHandler(aCAS, new OutOfTypeSystemData());
    } else {
      handler = deser.getXCASHandler(aCAS);
    }
    xmlReader.setContentHandler(handler);
    xmlReader.parse(new InputSource(aStream));
  }

}
TOP

Related Classes of org.apache.uima.cas.impl.XCASDeserializer$FSInfo

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.