Package net.sf.cram

Source Code of net.sf.cram.Sam2CramRecordFactory

/*******************************************************************************
* Copyright 2012 EMBL-EBI, Hinxton outstation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*   http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
package net.sf.cram;

import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;

import net.sf.cram.encoding.read_features.BaseQualityScore;
import net.sf.cram.encoding.read_features.Deletion;
import net.sf.cram.encoding.read_features.HardClip;
import net.sf.cram.encoding.read_features.InsertBase;
import net.sf.cram.encoding.read_features.Padding;
import net.sf.cram.encoding.read_features.ReadFeature;
import net.sf.cram.encoding.read_features.RefSkip;
import net.sf.cram.encoding.read_features.SoftClip;
import net.sf.cram.encoding.read_features.Substitution;
import net.sf.cram.mask.RefMaskUtils;
import net.sf.picard.util.Log;
import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMReadGroupRecord;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMRecord.SAMTagAndValue;
import net.sf.samtools.SAMTag;

public class Sam2CramRecordFactory {

  public static final String UNKNOWN_READ_GROUP_ID = "UNKNOWN";
  public static final String UNKNOWN_READ_GROUP_SAMPLE = "UNKNOWN";

  private enum TREAT_TYPE {
    IGNORE, ALIGNMENT, INSERTION, SOFT_CLIP
  };

  /**
   * Reserved for later use.
   */
  private TREAT_TYPE treatSoftClipsAs = TREAT_TYPE.SOFT_CLIP;

  public final static byte QS_asciiOffset = 33;
  public final static byte unsetQualityScore = 32;
  public final static byte ignorePositionsWithQualityScore = -1;

  private byte[] refBases;
  private byte[] refSNPs;
  private RefMaskUtils.RefMask refPile;

  public boolean captureUnmappedBases = true;
  public boolean captureUnmappedScores = false;

  private ByteBuffer insertionBuf = ByteBuffer.allocate(1024);

  private static Log log = Log.getInstance(Sam2CramRecordFactory.class);

  private Map<String, Integer> readGroupMap = new HashMap<String, Integer>();

  private long landedRefMaskScores = 0;
  private long landedPiledScores = 0;
  private long landedTotalScores = 0;

  private boolean captureInsertScores = false;
  private boolean captureSubtitutionScores = false;
  private int uncategorisedQualityScoreCutoff = 0;
  public boolean captureAllTags = false;
  public boolean preserveReadNames = false;
  public Set<String> captureTags = new TreeSet<String>();
  public Set<String> ignoreTags = new TreeSet<String>();
  {
    ignoreTags.add(SAMTag.NM.name());
    ignoreTags.add(SAMTag.MD.name());
    ignoreTags.add(SAMTag.RG.name());
  }

  public boolean losslessQS = false;

  private List<ReadTag> readTagList = new ArrayList<ReadTag>();

  public Sam2CramRecordFactory(byte[] refBases, SAMFileHeader samFileHeader) {
    this.refBases = refBases;

    List<SAMReadGroupRecord> readGroups = samFileHeader.getReadGroups();
    for (int i = 0; i < readGroups.size(); i++) {
      SAMReadGroupRecord readGroupRecord = readGroups.get(i);
      readGroupMap.put(readGroupRecord.getId(), i);
    }
  }

  public CramRecord createCramRecord(SAMRecord record) {
    CramRecord cramRecord = new CramRecord();
    if (record.getReadPairedFlag()) {
      cramRecord.mateAlignmentStart = record.getMateAlignmentStart();
      cramRecord.mateUmapped = record.getMateUnmappedFlag();
      cramRecord.mateNegativeStrand = record.getMateNegativeStrandFlag();
      cramRecord.mateSequnceID = record.getMateReferenceIndex();
    } else cramRecord.mateSequnceID = -1;
    cramRecord.sequenceId = record.getReferenceIndex();
    cramRecord.setReadName(record.getReadName());
    cramRecord.setAlignmentStart(record.getAlignmentStart());

    cramRecord.multiFragment = record.getReadPairedFlag();
    cramRecord.properPair = record.getReadPairedFlag()
        && record.getProperPairFlag();
    cramRecord.segmentUnmapped = record.getReadUnmappedFlag();
    cramRecord.negativeStrand = record.getReadNegativeStrandFlag();
    cramRecord.firstSegment = record.getReadPairedFlag()
        && record.getFirstOfPairFlag();
    cramRecord.lastSegment = record.getReadPairedFlag()
        && record.getSecondOfPairFlag();
    cramRecord.secondaryALignment = record.getNotPrimaryAlignmentFlag();
    cramRecord.vendorFiltered = record.getReadFailsVendorQualityCheckFlag();
    cramRecord.duplicate = record.getDuplicateReadFlag();

    cramRecord.setReadLength(record.getReadLength());
    cramRecord.setMappingQuality(record.getMappingQuality());
    cramRecord.setDuplicate(record.getDuplicateReadFlag());

    cramRecord.templateSize = record.getInferredInsertSize();

    SAMReadGroupRecord readGroup = record.getReadGroup();
    Integer rgIndex = 0;
    if (readGroup != null)
//      rgIndex = readGroupMap.get(readGroup.getId());
      cramRecord.setReadGroupID(readGroupMap.get(readGroup.getId()));
    else
      cramRecord.setReadGroupID(-1);
//      rgIndex = readGroupMap.get(UNKNOWN_READ_GROUP_ID);

//    if (rgIndex == null)
//      throw new RuntimeException("Read group index not found: "
//          + readGroup.getId());

//    cramRecord.setReadGroupID(rgIndex);

    if (!record.getReadPairedFlag())
      cramRecord.setLastFragment(false);
    else {
      if (record.getFirstOfPairFlag())
        cramRecord.setLastFragment(false);
      else if (record.getSecondOfPairFlag())
        cramRecord.setLastFragment(true);
      else
        cramRecord.setLastFragment(true);
    }

    if (!cramRecord.segmentUnmapped) {
      List<ReadFeature> features = checkedCreateVariations(cramRecord,
          record);
      cramRecord.setReadFeatures(features);
    }

    cramRecord.setReadBases(record.getReadBases());
    cramRecord.setQualityScores(record.getBaseQualities());
    landedTotalScores += cramRecord.getReadLength();

    readTagList.clear();
    if (captureAllTags) {
      List<SAMTagAndValue> attributes = record.getAttributes();
      for (SAMTagAndValue tv : attributes) {
        if (ignoreTags.contains(tv.tag))
          continue;
        readTagList.add(ReadTag.deriveTypeFromValue(tv.tag, tv.value));
      }
    } else {
      if (!captureTags.isEmpty()) {
        List<SAMTagAndValue> attributes = record.getAttributes();
        cramRecord.tags = new ReadTag[attributes.size()];
        int i = 0;
        for (SAMTagAndValue tv : attributes) {
          if (captureTags.contains(tv.tag)) {
            readTagList.add(ReadTag.deriveTypeFromValue(tv.tag, tv.value));
          }
        }
      }
    }
    cramRecord.tags = (ReadTag[]) readTagList
        .toArray(new ReadTag[readTagList.size()]);

    cramRecord.vendorFiltered = record.getReadFailsVendorQualityCheckFlag();

    if (preserveReadNames)
      cramRecord.setReadName(record.getReadName());

//    for (ReadFeature rf:cramRecord.getReadFeatures()) {
//      if (rf instanceof Substitution) {
//        Substitution s = (Substitution) rf ;
//        System.out.printf("pos=%d, ref=%c, base=%c, code=%d\n", s.getPosition(), s.getRefernceBase(), s.getBase(), s.getCode());
//      }
//    }

    return cramRecord;
  }

  /**
   * A wrapper method to provide better diagnostics for
   * ArrayIndexOutOfBoundsException.
   *
   * @param cramRecord
   * @param samRecord
   * @return
   */
  private List<ReadFeature> checkedCreateVariations(CramRecord cramRecord,
      SAMRecord samRecord) {
    try {
      return createVariations(cramRecord, samRecord);
    } catch (ArrayIndexOutOfBoundsException e) {
      log.error("Reference bases array length=" + refBases.length);
      log.error("Offensive CRAM record: " + cramRecord.toString());
      log.error("Offensive SAM record: " + samRecord.getSAMString());
      throw e;
    }
  }

  private List<ReadFeature> createVariations(CramRecord cramRecord,
      SAMRecord samRecord) {
    List<ReadFeature> features = new LinkedList<ReadFeature>();
    int zeroBasedPositionInRead = 0;
    int alignmentStartOffset = 0;
    int cigarElementLength = 0;

    List<CigarElement> cigarElements = samRecord.getCigar()
        .getCigarElements();

    byte[] bases = samRecord.getReadBases();
    byte[] qualityScore = samRecord.getBaseQualities();

    for (CigarElement cigarElement : cigarElements) {
      cigarElementLength = cigarElement.getLength();
      CigarOperator operator = cigarElement.getOperator();

      switch (operator) {
      case D:
        features.add(new Deletion(zeroBasedPositionInRead + 1,
            cigarElementLength));
        break;
      case N:
        features.add(new RefSkip(zeroBasedPositionInRead + 1,
            cigarElementLength));
        break;
      case P:
        features.add(new Padding(zeroBasedPositionInRead + 1,
            cigarElementLength));
        break;
      case H:
        addHardClip(features, zeroBasedPositionInRead,
            cigarElementLength, bases, qualityScore);
        break;
      case S:
        addSoftClip(features, zeroBasedPositionInRead,
            cigarElementLength, bases, qualityScore);
        break;
      case I:
        addInsertion(features, zeroBasedPositionInRead,
            cigarElementLength, bases, qualityScore);
        break;
      case M:
      case X:
      case EQ:
        addSubstitutionsAndMaskedBases(cramRecord, features,
            zeroBasedPositionInRead, alignmentStartOffset,
            cigarElementLength, bases, qualityScore);
        break;
      default:
        throw new IllegalArgumentException(
            "Unsupported cigar operator: "
                + cigarElement.getOperator());
      }

      if (cigarElement.getOperator().consumesReadBases())
        zeroBasedPositionInRead += cigarElementLength;
      if (cigarElement.getOperator().consumesReferenceBases())
        alignmentStartOffset += cigarElementLength;
    }

    return features;
  }

  private void addSoftClip(List<ReadFeature> features,
      int zeroBasedPositionInRead, int cigarElementLength, byte[] bases,
      byte[] scores) {
    byte[] insertedBases = Arrays.copyOfRange(bases,
        zeroBasedPositionInRead, zeroBasedPositionInRead
            + cigarElementLength);

    SoftClip v = new SoftClip(zeroBasedPositionInRead + 1, insertedBases);
    features.add(v);
  }

  private void addHardClip(List<ReadFeature> features,
      int zeroBasedPositionInRead, int cigarElementLength, byte[] bases,
      byte[] scores) {
    byte[] insertedBases = Arrays.copyOfRange(bases,
        zeroBasedPositionInRead, zeroBasedPositionInRead
            + cigarElementLength);

    HardClip v = new HardClip(zeroBasedPositionInRead + 1, insertedBases);
    features.add(v);
  }

  private void addInsertion(List<ReadFeature> features,
      int zeroBasedPositionInRead, int cigarElementLength, byte[] bases,
      byte[] scores) {
    byte[] insertedBases = Arrays.copyOfRange(bases,
        zeroBasedPositionInRead, zeroBasedPositionInRead
            + cigarElementLength);

    for (int i = 0; i < insertedBases.length; i++) {
      // single base insertion:
      InsertBase ib = new InsertBase();
      ib.setPosition(zeroBasedPositionInRead + 1 + i);
      ib.setBase(insertedBases[i]);
      features.add(ib);
      if (losslessQS || scores == null || scores.length < bases.length)
        continue;
      boolean qualityMasked = (scores[i] < uncategorisedQualityScoreCutoff);
      if (captureInsertScores || qualityMasked) {
        byte score = (byte) (QS_asciiOffset + scores[zeroBasedPositionInRead
            + i]);
        // if (score >= QS_asciiOffset) {
        features.add(new BaseQualityScore(zeroBasedPositionInRead + 1
            + i, score));
        landedTotalScores++;
        // }
      }
    }
  }

  private void addSubstitutionsAndMaskedBases(CramRecord cramRecord,
      List<ReadFeature> features, int fromPosInRead,
      int alignmentStartOffset, int nofReadBases, byte[] bases,
      byte[] qualityScore) {
    int oneBasedPositionInRead;
    boolean noQS = (qualityScore.length == 0);

    int i = 0;
    boolean qualityAdded = false;
    boolean qualityMasked = false;
    byte refBase;
    for (i = 0; i < nofReadBases; i++) {
      oneBasedPositionInRead = i + fromPosInRead + 1;
      int refCoord = (int) (cramRecord.getAlignmentStart() + i + alignmentStartOffset) - 1;
      qualityAdded = false;
      if (refCoord >= refBases.length)
        refBase = 'N';
      else
        refBase = refBases[refCoord];

      if (bases[i + fromPosInRead] != refBase) {
        Substitution sv = new Substitution();
        sv.setPosition(oneBasedPositionInRead);
        sv.setBase(bases[i + fromPosInRead]);
        sv.setRefernceBase(refBase);
        // sv.setBaseChange(new BaseChange(sv.getRefernceBase(), sv
        // .getBase()));
        sv.setBaseChange(null);

        features.add(sv);

        if (losslessQS || noQS)
          continue;

        if (captureSubtitutionScores) {
          byte score = (byte) (QS_asciiOffset + qualityScore[i
              + fromPosInRead]);
          features.add(new BaseQualityScore(oneBasedPositionInRead,
              score));
          qualityAdded = true;
        }
      }

      if (noQS)
        continue;

      if (!qualityAdded && refSNPs != null) {
        byte snpOrNot = refSNPs[refCoord];
        if (snpOrNot != 0) {
          byte score = (byte) (QS_asciiOffset + qualityScore[i
              + fromPosInRead]);
          features.add(new BaseQualityScore(oneBasedPositionInRead,
              score));
          qualityAdded = true;
          landedRefMaskScores++;
        }
      }

      if (!qualityAdded && refPile != null) {
        if (refPile.shouldStore(refCoord, refBase)) {
          byte score = (byte) (QS_asciiOffset + qualityScore[i
              + fromPosInRead]);
          features.add(new BaseQualityScore(oneBasedPositionInRead,
              score));
          qualityAdded = true;
          landedPiledScores++;
        }
      }

      qualityMasked = (qualityScore[i + fromPosInRead] < uncategorisedQualityScoreCutoff);
      if (!qualityAdded && qualityMasked) {
        byte score = (byte) (QS_asciiOffset + qualityScore[i
            + fromPosInRead]);
        features.add(new BaseQualityScore(oneBasedPositionInRead, score));
        qualityAdded = true;
      }

      if (qualityAdded)
        landedTotalScores++;
    }
  }

  public boolean isCaptureInsertScores() {
    return captureInsertScores;
  }

  public void setCaptureInsertScores(boolean captureInsertScores) {
    this.captureInsertScores = captureInsertScores;
  }

  public boolean isCaptureSubtitutionScores() {
    return captureSubtitutionScores;
  }

  public void setCaptureSubtitutionScores(boolean captureSubtitutionScores) {
    this.captureSubtitutionScores = captureSubtitutionScores;
  }

  public int getUncategorisedQualityScoreCutoff() {
    return uncategorisedQualityScoreCutoff;
  }

  public void setUncategorisedQualityScoreCutoff(
      int uncategorisedQualityScoreCutoff) {
    this.uncategorisedQualityScoreCutoff = uncategorisedQualityScoreCutoff;
  }

  public long getLandedRefMaskScores() {
    return landedRefMaskScores;
  }

  public long getLandedPiledScores() {
    return landedPiledScores;
  }

  public long getLandedTotalScores() {
    return landedTotalScores;
  }

  public boolean isCaptureUnmappedBases() {
    return captureUnmappedBases;
  }

  public void setCaptureUnmappedBases(boolean captureUnmappedBases) {
    this.captureUnmappedBases = captureUnmappedBases;
  }

  public boolean isCaptureUnmappedScores() {
    return captureUnmappedScores;
  }

  public void setCaptureUnmappedScores(boolean captureUnmappedScores) {
    this.captureUnmappedScores = captureUnmappedScores;
  }

  public byte[] getRefBases() {
    return refBases;
  }

  public void setRefBases(byte[] refBases) {
    this.refBases = refBases;
  }

  public byte[] getRefSNPs() {
    return refSNPs;
  }

  public void setRefSNPs(byte[] refSNPs) {
    this.refSNPs = refSNPs;
  }

  public RefMaskUtils.RefMask getRefPile() {
    return refPile;
  }

  public Map<String, Integer> getReadGroupMap() {
    return readGroupMap;
  }

  public void setRefPile(RefMaskUtils.RefMask refPile) {
    this.refPile = refPile;
  }

}
TOP

Related Classes of net.sf.cram.Sam2CramRecordFactory

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.