Package org.bioviz.protannot

Source Code of org.bioviz.protannot.Xml2GenometryParser

package org.bioviz.protannot;

import com.affymetrix.genometryImpl.BioSeq;
import com.affymetrix.genometryImpl.MutableSeqSpan;
import com.affymetrix.genometryImpl.SeqSpan;
import com.affymetrix.genometryImpl.comparator.SeqSymStartComparator;
import com.affymetrix.genometryImpl.span.SimpleMutableSeqSpan;
import com.affymetrix.genometryImpl.span.SimpleSeqSpan;
import com.affymetrix.genometryImpl.symmetry.MutableSeqSymmetry;
import com.affymetrix.genometryImpl.symmetry.SeqSymmetry;
import com.affymetrix.genometryImpl.symmetry.SimpleMutableSeqSymmetry;
import com.affymetrix.genometryImpl.symmetry.SimpleSymWithProps;
import com.affymetrix.genometryImpl.symmetry.SymWithProps;
import com.affymetrix.genometryImpl.symmetry.TypeContainerAnnot;
import com.affymetrix.genometryImpl.util.SeqUtils;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.w3c.dom.DOMException;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.Text;

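/**
 * Converts a parsed XML document (a DOM tree rooted at a {@code dnaseq} element)
 * into a Genometry {@link BioSeq} carrying mRNA, exon, CDS and protein
 * similarity-hit annotations.
 */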

final class Xml2GenometryParser {

    /** When {@code true}, progress diagnostics are printed to {@code System.err}. */
    private static final boolean DEBUG = false;
    /** mRNA sequences created by {@code addSpans}, keyed by mRNA id; reset on every {@code parse} call. */
    private Map<String,BioSeq> mrna_hash;
    /** Protein sequences created by {@code processCDS}, keyed by protein id, and looked up again by {@code processProtein}. */
    private Map<String,BioSeq> prot_hash;
    // instance variables needed during the parse
    /** One {start, end} pair per exon of the mRNA being processed; reset by {@code processMRNA}, filled by {@code processExon}. */
    private List<int[]> transCheckExons;  // used to sanity-check exon translation
  /** Marker appended by {@code getAminoAcid} to the end of a protein sequence. */
  private static final String end_codon = "Z";

  // Element names, attribute names and property keys shared by the parsing methods.
  public static final String STARTSTR = "start";   // attribute read from gene/transcript/mrna/exon/cds; also an exon property key
  public static final String ENDSTR = "end";       // counterpart of STARTSTR
  public static final String TYPESTR = "type";     // descriptor attribute and symmetry property key
  public static final String NAMESTR = "name";     // simhit attribute and symmetry property key
  public static final String EXONSTR = "exon";     // element name; also stored as an exon's TYPESTR value
  public static final String IDSTR = "id";         // attribute read from aaseq elements
  public static final String RESIDUESSTR = "residues"; // element holding the genomic residues
  public static final String MRNASTR = "mrna";     // element name; also used as the id of the mRNA BioSeq
  public static final String STRANDSTR = "strand"; // not referenced by the code in this class
  public static final String CDSSTR = "cds";       // element name
  public static final String METHODSTR = "method"; // attribute read from simsearch/cds; also a property key
  public static final String AA_START = "aa_start";   // property: amino-acid start of a span or hit
  public static final String AA_END = "aa_end";       // property: amino-acid end of a span or hit
  public static final String AA_LENGTH = "aa_length"; // property: AA_END minus AA_START

  /**
   * Create a new BioSeq and add annotations to it.
   * @param doc
   * @return
   * @throws Exception
   */
    BioSeq parse(Document doc) throws Exception{
    mrna_hash = new HashMap<String,BioSeq>();
    prot_hash = new HashMap<String,BioSeq>();

        try {
            BioSeq ret_genomic = processDocument(doc);

      return ret_genomic;

        } catch (Exception ex) {
      ex.printStackTrace();
      throw ex;
        }
    }

    /*
     * Nesting of the gene-model elements in an input document:
     *
     * <dnaseq>
     *   <genesearch>
     *     <gene>
     *       <primarytranscript>
     *         <mrna>
     *           <exon />
     *           <exon />
     *           <cds>
     *             <cdsseg />
     *             <cdsseg />
     *           </cds>
     *         </mrna>
     *       </primarytranscript>
     *     </gene>
     *   </genesearch>
     * </dnaseq>
     */
    /**
     * Takes in Document object to parse it and convert into BioSeq.
     * @param   seqdoc  Document object name
     * @return          Returns BioSeq of given document object.
     * @see     com.affymetrix.genometryImpl.BioSeq
     */
  private BioSeq processDocument(Document seqdoc) {
       Element top_element = seqdoc.getDocumentElement();
    String name = top_element.getTagName();
    if (!name.equalsIgnoreCase("dnaseq")) {
      return null;
    }
    if (DEBUG) {
      System.err.println("processing dna seq");
    }
    String version = "";
    try {
      version = top_element.getAttribute("version");
    } catch (Exception e) {
      // ignore exception
    }
    String seq = "genome";
    try {
      seq = top_element.getAttribute("seq");
    } catch (Exception e) {
      // ignore exception
    }

    BioSeq chrom = buildChromosome(top_element, seq, version);

    processDNASeq(chrom, top_element);

    return chrom;
    }

    private static BioSeq buildChromosome(Element top_element, String seq, String version)
            throws DOMException {
        BioSeq chrom = null;
        NodeList children = top_element.getChildNodes();
        for (int i = 0; i < children.getLength(); i++) {
            Node child = children.item(i);
            String cname = child.getNodeName();
            if (cname != null && cname.equalsIgnoreCase(RESIDUESSTR)) {
                Text resnode = (Text) child.getFirstChild();
                String residues = resnode.getData();
                chrom = new BioSeq(seq, version, residues.length());
                chrom.setResidues(residues);
            }
        }
        return chrom;
    }

    /**
     * Process dna in BioSeq for each child node of element provided.
     * @param   genomic
     * @param   elem        Node in genomic for which dna is to be processed
     * @see     com.affymetrix.genometryImpl.BioSeq
     */
    private void processDNASeq(BioSeq genomic, Element elem) {
        NodeList children = elem.getChildNodes();
        for (int i = 0; i < children.getLength(); i++) {
            Node child = children.item(i);
            String name = child.getNodeName();
            if (name != null) {
                if (name.equalsIgnoreCase("genesearch")) {
                    processGeneSearch(genomic, (Element) child);
                } else {
                    if (name.equalsIgnoreCase(MRNASTR)) {
                        processMRNA(genomic, (Element) child);
                    }
                }
            }
        }

        for (int i = 0; i < children.getLength(); i++) {
            Node child = children.item(i);
            String name = child.getNodeName();
            if (name != null && name.equalsIgnoreCase("aaseq")) {
                processProtein(prot_hash, (Element) child);
            }
        }
    }

    /**
     Process protein in BioSeq for each child node of element provided.
     * @param   elem        Node for which protein is to be processed
     * @see     com.affymetrix.genometryImpl.BioSeq
     */
    private static void processProtein(Map<String,BioSeq> prot_hash, Element elem) {
        String pid = elem.getAttribute(IDSTR);
        BioSeq protein = prot_hash.get(pid);
        if (protein == null) {
            System.err.println("Error: no bioseq matching id: " + pid
                    + ". Skipping it.");
            return;
        }
        if (DEBUG) {
            System.err.println("aaseq: id = " + pid + ",  " + protein);
        }

        NodeList children = elem.getChildNodes();
        for (int i = 0; i < children.getLength(); i++) {
            Node child = children.item(i);
            String name = child.getNodeName();
            if (name != null && name.equalsIgnoreCase("simsearch")) {
                processSimSearch(protein, (Element) child);
            }
        }
    }

    /**
     *
     * @param   query_seq
     * @param   elem
     * @see     com.affymetrix.genometryImpl.BioSeq
     */
    private static void processSimSearch(BioSeq query_seq, Element elem) {
        NodeList children = elem.getChildNodes();
        String method = elem.getAttribute(METHODSTR);
        for (int i = 0; i < children.getLength(); i++) {
            Node child = children.item(i);
            String name = child.getNodeName();
            if (name != null && name.equalsIgnoreCase("simhit")) {
                processSimHit(query_seq, (Element) child, method);
            }
        }
    }

    /**
     *
     * @param   query_seq
     * @param   elem
     * @see     com.affymetrix.genometryImpl.BioSeq
     * @see     com.affymetrix.genometryImpl.MutableSeqSpan
     * @see     com.affymetrix.genometryImpl.SeqSpan
     * @see     com.affymetrix.genometryImpl.symmetry.SeqSymmetry
     * @see     com.affymetrix.genometryImpl.symmetry.SymWithProps
     * @see     com.affymetrix.genometryImpl.symmetry.TypeContainerAnnot
     * @see     com.affymetrix.genometryImpl.util.SeqUtils
     */
    private static void processSimHit(BioSeq query_seq, Element elem, String method) {
        // method can never be null -- if it is, the XML is wrong
        TypeContainerAnnot hitSym = new TypeContainerAnnot(method);
        addDescriptors(elem, hitSym);

        String hit_name = elem.getAttribute(NAMESTR);
        String hit_descr = elem.getAttribute("desc");

        if (hit_name != null && hit_name.length() > 0) {
            hitSym.setProperty(NAMESTR, hit_name);
        }
        if (hit_descr != null && hit_descr.length() > 0) {
            hitSym.setProperty("descr", hit_descr);
        }

        SeqSpan hitSpan = null;
        NodeList children = elem.getChildNodes();
        int num_spans = 0, aa_start = Integer.MAX_VALUE, aa_end = Integer.MIN_VALUE;
        for (int i = 0; i < children.getLength(); i++) {
            Node child = children.item(i);
            String name = child.getNodeName();
            if (child instanceof Element) {
                Element chelem = (Element) child;
                if (name.equalsIgnoreCase("simspan")) {
                    SeqSymmetry spanSym = processSimSpan(query_seq, chelem);
                    ((SymWithProps) spanSym).setProperty(METHODSTR, method);
                    hitSym.addChild(spanSym);
                    SeqSpan spanSpan = spanSym.getSpan(query_seq);
                    if (hitSpan == null) {
                        hitSpan = new SimpleMutableSeqSpan(spanSpan.getMin(), spanSpan.getMax(), query_seq);
                    } else {
                        SeqUtils.encompass(hitSpan, spanSpan, (MutableSeqSpan) hitSpan);
                    }
                    //hitSym.setProperty(TYPESTR, "hitspan");
          int start = Integer.valueOf(((SymWithProps)spanSym).getProperty(AA_START).toString());
          int end = Integer.valueOf(((SymWithProps)spanSym).getProperty(AA_END).toString());
          aa_start = Math.min(aa_start, start);
          aa_end = Math.max(aa_end, end);
                    num_spans++;
                }
            }
        }
        String prop =  (Integer.valueOf(num_spans)).toString();
        hitSym.setProperty("num_spans", prop);
        hitSym.setProperty(TYPESTR, "simHit");
    hitSym.setProperty(AA_START, String.valueOf(aa_start));
    hitSym.setProperty(AA_END, String.valueOf(aa_end));
    hitSym.setProperty(AA_LENGTH, String.valueOf(aa_end - aa_start));
        hitSym.addSpan(hitSpan);
        hitSym.setID("");
        query_seq.addAnnotation(hitSym);
    }

    /**
     * Adds description from elem to sym.
     * @param   elem    Source from which description is to added.
     * @param   sym     Target to which description is added.
     * @see     com.affymetrix.genometryImpl.symmetry.SimpleSymWithProps
     */
    private static void addDescriptors(Element elem, SimpleSymWithProps sym) {

        NodeList children = elem.getChildNodes();
        for (int i = 0; i < children.getLength(); i++) {
            Node child = children.item(i);
            String name = child.getNodeName();
            if (child instanceof Element) {
                Element chelem = (Element) child;
                if (name.equalsIgnoreCase("descriptor")) {
                    String desc_name = chelem.getAttribute(TYPESTR);
                    Text tnode = (Text) chelem.getFirstChild();
                    if (tnode != null) {
                        String desc_text = tnode.getData();
                        sym.setProperty(desc_name, desc_text);
                    }
                }
            }
        }
        Object test = sym.getProperty("domain_pos");
        if (test != null) {
            sym.setProperty(NAMESTR, test);
        }
    }

    /**
     *
     * @param   query_seq
     * @param   elem
     * @return  SeqSymmetry
     * @see     com.affymetrix.genometryImpl.symmetry.SeqSymmetry
     * @see     com.affymetrix.genometryImpl.BioSeq
     * @see     com.affymetrix.genometryImpl.SeqSpan
     * @see     com.affymetrix.genometryImpl.symmetry.SimpleSymWithProps
     */
    private static SeqSymmetry processSimSpan(BioSeq query_seq, Element elem) {
        int start = Integer.parseInt(elem.getAttribute("query_start"));
        int end;
        //  need to standardize on which tag to use!
        try {
            end = Integer.parseInt(elem.getAttribute("query_end"));
        } catch (Exception ex) {
            end = Integer.parseInt(elem.getAttribute("query_stop"));
        }

        SimpleSymWithProps spanSym = new SimpleSymWithProps();
        addDescriptors(elem, spanSym);
        String prop = (Integer.valueOf(start)).toString();
        spanSym.setProperty(AA_START, prop);
        prop = (Integer.valueOf(end)).toString();
        spanSym.setProperty(AA_END, prop);
        prop = (Integer.valueOf(end - start)).toString();
        spanSym.setProperty(AA_LENGTH, prop);
    //Multiplying start and end by 3. Because three letters forms one amino acid.
        SeqSpan qspan = new SimpleSeqSpan((start*3)+query_seq.getMin(), (end*3)+query_seq.getMin(), query_seq);
        spanSym.addSpan(qspan);
        return spanSym;
    }

    /**
     *
     * @param   genomic
     * @param   elem
     * @see     com.affymetrix.genometryImpl.BioSeq
     */
    private void processGeneSearch(BioSeq genomic, Element elem) {
        NodeList children = elem.getChildNodes();
        for (int i = 0; i < children.getLength(); i++) {
            Node child = children.item(i);
            String nodename = child.getNodeName();
            if (null != nodename && nodename.equalsIgnoreCase("gene")) {
                processGene(genomic, (Element) child);
            }
        }
    }

    /**
     *
     * @param   genomic
     * @param   elem
     * @see     com.affymetrix.genometryImpl.BioSeq
     */
    private void processGene(BioSeq genomic, Element elem) {

        if (DEBUG) {
      int start = Integer.parseInt(elem.getAttribute(STARTSTR));
      int end = Integer.parseInt(elem.getAttribute(ENDSTR));
            System.err.println("gene:  start = " + start + "  end = " + end);
        }

        NodeList children = elem.getChildNodes();
        for (int i = 0; i < children.getLength(); i++) {
            Node child = children.item(i);
            String nodename = child.getNodeName();
            if (null != nodename && nodename.equalsIgnoreCase("primarytranscript")) {
                processTranscript(genomic, (Element) child);
            }
        }
    }

    /**
     *
     * @param   genomic
     * @param   elem
     * @see     com.affymetrix.genometryImpl.BioSeq
     */
    private void processTranscript(BioSeq genomic, Element elem) {
        if (DEBUG) {
      int start = Integer.parseInt(elem.getAttribute(STARTSTR));
      int end = Integer.parseInt(elem.getAttribute(ENDSTR));
            System.err.println("transcript:  start = " + start + "  end = " + end);
        }
        NodeList children = elem.getChildNodes();
        for (int i = 0; i < children.getLength(); i++) {
            Node child = children.item(i);
            String nodename = child.getNodeName();
            if (null != nodename && nodename.equalsIgnoreCase(MRNASTR)) {
                processMRNA(genomic, (Element) child);
            }
        }
    }

    /**
     *
     * @param   genomic
     * @param   elem
     * @see     com.affymetrix.genometryImpl.BioSeq
     * @see     com.affymetrix.genometryImpl.symmetry.SimpleSymWithProps
     * @see     com.affymetrix.genometryImpl.symmetry.MutableSeqSymmetry
     * @see     com.affymetrix.genometryImpl.SeqSpan
     * @see     com.affymetrix.genometryImpl.symmetry.SeqSymmetry
     * @see     com.affymetrix.genometryImpl.symmetry.SymWithProps
     * @see     com.affymetrix.genometryImpl.symmetry.TypeContainerAnnot
     * @see     com.affymetrix.genometryImpl.util.SeqUtils
     */
    private void processMRNA(BioSeq genomic, Element elem) {
        int start = Integer.parseInt(elem.getAttribute(STARTSTR));
        int end = Integer.parseInt(elem.getAttribute(ENDSTR));

        if (DEBUG) {
            System.err.println("mrna:  start = " + start + "  end = " + end);
        }
        NodeList children = elem.getChildNodes();
        SeqSpan span = new SimpleSeqSpan(start, end, genomic);

        TypeContainerAnnot m2gSym = new TypeContainerAnnot(elem.getAttribute("method"));
        m2gSym.addSpan(span);
        addDescriptors(elem, m2gSym);
        m2gSym.setProperty(TYPESTR, "mRNA");
        boolean forward = (span.isForward());


    transCheckExons = new ArrayList<int[]>();
        List<SeqSymmetry> exon_list = new ArrayList<SeqSymmetry>();
        List<Node> exon_insert_list = new ArrayList<Node>();
        for (int i = 0; i < children.getLength(); i++) {
            Node child = children.item(i);
            String nodename = child.getNodeName();
            if (nodename != null) {
                if (nodename.equalsIgnoreCase(EXONSTR)) {
                    SymWithProps exSym = processExon(genomic, (Element) child);
                    exSym.setProperty(TYPESTR, EXONSTR);
                    exon_list.add(exSym);
                } else if (nodename.equalsIgnoreCase("exon_insert")) {
                    exon_insert_list.add(child);
                }
            }
        }

        // need to sort exon inserts...
        //    5' to 3' along transcript.  Otherwise, trying to insert a 5'
        //    after a 3' has been inserted ill mess up coordinates of 3'.
        // assuming for now that exon inserts are ordered in the XML

        // sorting exons, so that later position calculations are accurate

        Collections.sort(exon_list, new SeqSymStartComparator( genomic, forward));
        for (SeqSymmetry esym : exon_list) {
            m2gSym.addChild(esym);
        }

    BioSeq mrna = addSpans(m2gSym, genomic, exon_insert_list, start);
   
    String protein_id = determineProteinID(children);

    String amino_acid = getAminoAcid(m2gSym);
    processCDS(children, genomic, m2gSym, mrna, protein_id, amino_acid);

        m2gSym.setID("");
        genomic.addAnnotation(m2gSym);
        mrna.addAnnotation(m2gSym);
    }

  private static String getAminoAcid(TypeContainerAnnot m2gSym){
    String residue = (String) m2gSym.getProperty("protein sequence");

    if(residue == null) {
      return "";
    }
    else {
      residue += end_codon;
    }

    return residue;
  }

  private static String determineProteinID(NodeList children) throws DOMException {
    for (int i = 0; i < children.getLength(); i++) {
      Node child = children.item(i);
      String nodename = child.getNodeName();
      if (nodename != null && nodename.equalsIgnoreCase("descriptor")) {
        Element el = (Element) child;
        String type = el.getAttribute(TYPESTR);
        if (type != null && type.equalsIgnoreCase("protein_product_id")) {
          Text tnode = (Text) el.getFirstChild();
          return tnode.getData();
        }
      }
    }
    return null;
  }


  private BioSeq addSpans(TypeContainerAnnot m2gSym, BioSeq genomic, List exon_insert_list, int start)
      throws NumberFormatException {
    int exoncount = m2gSym.getChildCount();
    int mrnalength = determinemRNALength(exoncount, m2gSym, genomic, exon_insert_list);
    int end = 0;
    String mrna_id = MRNASTR;
    BioSeq mrna = new BioSeq(mrna_id, null, mrnalength);
    mrna.setBounds(start, start+mrnalength);
    mrna_hash.put(mrna_id, mrna);
    SeqSpan mrna_span = new SimpleSeqSpan(mrna.getMin(), mrna.getMax(), mrna);
    m2gSym.addSpan(mrna_span);
    for (int i = 0; i < exoncount; i++) {
      SimpleSymWithProps esym = (SimpleSymWithProps) m2gSym.getChild(i);
      SeqSpan gspan = esym.getSpan(genomic);
      end = start + gspan.getLength();
      List<Element> hit_inserts = new ArrayList<Element>();
      end = determineOverlappingExons(exon_insert_list, gspan, hit_inserts, end);
      SeqSpan tspan = new SimpleSeqSpan(start, end, mrna);
      esym.addSpan(tspan);
      if (!hit_inserts.isEmpty()) {
        processExonInsert((MutableSeqSymmetry) esym, hit_inserts, genomic, mrna);
      }
      start = end;
    }
    return mrna;
  }

  /**
   * check each exon_insert, figure out which (if any) exons it overlaps
   * @param exon_insert_list
   * @param gspan
   * @param hit_inserts
   * @param end
   * @return
   * @throws NumberFormatException
   */
  private static int determineOverlappingExons(List exon_insert_list, SeqSpan gspan, List<Element> hit_inserts, int end) throws NumberFormatException {
    for (int insert_index = 0; insert_index < exon_insert_list.size(); insert_index++) {
      Element iel = (Element) exon_insert_list.get(insert_index);
      int istart = Integer.parseInt(iel.getAttribute("insert_at"));
      int ilength = Integer.parseInt(iel.getAttribute("insert_length"));
      if (SeqUtils.contains(gspan, (SeqSpan) iel)) {
        // need to add children to this exon symmetry to indicate an insertion
        //   (or possibly deletion?) of bases in the transcript relative to the genomic
        //      processExonInsert(esym, istart, ilength);
        System.err.println("insert: insertion_start = " + istart + ", length = " + ilength);
        // remove this exon_insert from list to consider in future passes
        //    need to also decrement the insert_index to make sure removal doesn't cause
        //    next exon_insert to not be considered...
        exon_insert_list.remove(insert_index);
        hit_inserts.add(iel);
        insert_index--;
        end += ilength;
      }
    }
    return end;
  }

  private static int determinemRNALength(int exoncount, TypeContainerAnnot m2gSym, BioSeq genomic, List exon_insert_list) throws NumberFormatException {
    int mrnalength = 0;
    for (int i = 0; i < exoncount; i++) {
      SeqSymmetry esym = m2gSym.getChild(i);
      SeqSpan gspan = esym.getSpan(genomic);
      mrnalength += gspan.getLength();
    }
    for (int i = 0; i < exon_insert_list.size(); i++) {
      Element iel = (Element) exon_insert_list.get(i);
      int ilength = Integer.parseInt(iel.getAttribute("insert_length"));
      mrnalength += ilength;
    }
    return mrnalength;
  }


    /**
     *
     * @param   exonSym
     * @param   hit_inserts
     * @param   genomic
     * @param   mrna
     * @see     com.affymetrix.genometryImpl.symmetry.MutableSeqSymmetry
     * @see     com.affymetrix.genometryImpl.BioSeq
     * @see     com.affymetrix.genometryImpl.SeqSpan
     */
    private static void processExonInsert(MutableSeqSymmetry exonSym, List<Element> hit_inserts,
            BioSeq genomic, BioSeq mrna) {
        // assumes that hit_inserts are in order 5' to 3' along transcript
        // assumes that each exon_insert in hit_inserts actually is contained in the exon
        // assumes that the genomic and transcript spans of the exon are already
        //       part of the exonSym and that the transcript span already correctly takes into account
        //       the additional bases introduced by the exon inserts

        //   map from genomic coords over to transcript coords to figure out where to "split" the
        //       exonSym into children

        SeqSpan egSpan = exonSym.getSpan(genomic);
        SeqSpan etSpan = exonSym.getSpan(mrna);

        int genStart = egSpan.getStart();
        int transStart = etSpan.getStart();

        for (int insert_index = 0; insert_index < hit_inserts.size(); insert_index++) {
            Element iel = hit_inserts.get(insert_index);
            int istart = Integer.parseInt(iel.getAttribute("insert_at"));
            int ilength = Integer.parseInt(iel.getAttribute("insert_length"));
            int genLength = Math.abs(istart - genStart);
            int transEnd = transStart + genLength;

            // split out exon seg between last insert (or start of exon) and current insert
            //   [unless start of exon and the insert is actually at exact beginning of exon]
            if (istart != genStart) {
                MutableSeqSymmetry segSym = new SimpleMutableSeqSymmetry();
                SeqSpan gSpan = new SimpleSeqSpan(genStart, istart, genomic)// start of insert is end of exon seg
                SeqSpan tSpan = new SimpleSeqSpan(transStart, transEnd, mrna);
                segSym.addSpan(gSpan);
                segSym.addSpan(tSpan);
                exonSym.addChild(segSym);
            }
            // now add exon seg for the current insert
            transStart = transEnd;
            transEnd += ilength;
            SeqSpan insert_tspan = new SimpleSeqSpan(transStart, transEnd, mrna);
            SeqSpan insert_gspan = new SimpleSeqSpan(istart, istart, genomic);
            MutableSeqSymmetry isegSym = new SimpleMutableSeqSymmetry();
            isegSym.addSpan(insert_tspan);
            // experimenting with adding a zero-length placeholder for exon insert relative to genomic
            isegSym.addSpan(insert_gspan);
            exonSym.addChild(isegSym);

            // set current genomic start point for next loop to location of current insert
            genStart = istart;
            transStart = transEnd;

        }

        // if last insert is not _exactly_ at end of exon, then need to add last exon seg
        //   after finished looping through inserts
        if (genStart != egSpan.getEnd()) {
            SeqSpan gSpan = new SimpleSeqSpan(genStart, egSpan.getEnd(), genomic);
            SeqSpan tSpan = new SimpleSeqSpan(transStart, etSpan.getEnd(), mrna);
            MutableSeqSymmetry endSym = new SimpleMutableSeqSymmetry();
            endSym.addSpan(gSpan);
            endSym.addSpan(tSpan);
            exonSym.addChild(endSym);
        }
    }

    /**
     *
     * @param   genomic
     * @param   elem
     * @return  SymWithProps
     * @see     com.affymetrix.genometryImpl.BioSeq
     * @see     com.affymetrix.genometryImpl.SeqSpan
     * @see     com.affymetrix.genometryImpl.symmetry.SimpleSymWithProps
     * @see     com.affymetrix.genometryImpl.symmetry.SymWithProps
     */
    private SymWithProps processExon(BioSeq genomic, Element elem) {
         // should not be any nodes underneath exon tags (at least in current pseudo-DTD
        //  GAH 10-6-2001
        int start = Integer.parseInt(elem.getAttribute(STARTSTR));
        int end = Integer.parseInt(elem.getAttribute(ENDSTR));

        transCheckExons.add(new int[]{start,end});

        SeqSpan span = new SimpleSeqSpan(start, end, genomic);
        SimpleSymWithProps exonsym = new SimpleSymWithProps();
        addDescriptors(elem, exonsym);
        exonsym.setProperty(STARTSTR, elem.getAttribute(STARTSTR));
        exonsym.setProperty(ENDSTR, elem.getAttribute(ENDSTR));
    exonsym.setProperty("length", String.valueOf(end - start));
        exonsym.addSpan(span);
        return exonsym;
    }

  private void processCDS(NodeList children, BioSeq genomic, TypeContainerAnnot m2gSym, BioSeq mrna, String protein_id, String amino_acid) {
    for (int i = 0; i < children.getLength(); i++) {
      Node child = children.item(i);
      String nodename = child.getNodeName();
      if (nodename != null && nodename.equalsIgnoreCase(CDSSTR)) {
          processCDS(genomic, (Element) child, m2gSym, mrna, protein_id, amino_acid);
      }
    }
  }


    /**
     *
     * @param   genomic
     * @param   elem
     * @param   m2gSym
     * @param   mrna
     * @param   protein_id
     * @see     com.affymetrix.genometryImpl.BioSeq
     * @see     com.affymetrix.genometryImpl.symmetry.SimpleSymWithProps
     * @see     com.affymetrix.genometryImpl.symmetry.MutableSeqSymmetry
     * @see     com.affymetrix.genometryImpl.SeqSpan
     * @see     com.affymetrix.genometryImpl.symmetry.SeqSymmetry
     * @see     com.affymetrix.genometryImpl.symmetry.TypeContainerAnnot
     * @see     com.affymetrix.genometryImpl.util.SeqUtils
     */
    private void processCDS(BioSeq genomic, Element elem, SimpleSymWithProps m2gSym,
            BioSeq mrna, String protein_id, String amino_acid) {

        String attr = elem.getAttribute("transstart");
        if (attr == null || attr.length() == 0) {
            attr = elem.getAttribute(STARTSTR);
        }
        int start = Integer.parseInt(attr);

    // transstop indicates last base of actual translation
        attr = elem.getAttribute("transstop");
        if (attr == null || attr.length() == 0) {
            attr = elem.getAttribute(ENDSTR);
        }
        int end = Integer.parseInt(attr);

        checkTranslationLength(transCheckExons,start,end);

        // could just do this as a single seq span (start, end, seq), but then would end up recreating
        //   the cds segments, which will get ignored afterwards...
        SeqSpan gstart_point = new SimpleSeqSpan(start, start, genomic);
        SeqSpan gend_point = new SimpleSeqSpan(end, end, genomic);
        SimpleSymWithProps result = new SimpleSymWithProps();
        result.addSpan(gstart_point);
        SeqSymmetry[] m2gPath = new SeqSymmetry[]{m2gSym};
        SeqUtils.transformSymmetry((MutableSeqSymmetry) result, m2gPath);
        SeqSpan mstart_point = result.getSpan(mrna);

    if(mstart_point == null) {
      throw new NullPointerException("Conflict with start and end in processCDS.");
    }

        result = new SimpleSymWithProps();

        result.addSpan(gend_point);
        SeqUtils.transformSymmetry((MutableSeqSymmetry) result, m2gPath);
        SeqSpan mend_point = result.getSpan(mrna);

    if(mend_point == null) {
      throw new NullPointerException("Conflict with start and end in processCDS.");
    }

        TypeContainerAnnot m2pSym = new TypeContainerAnnot(elem.getAttribute(METHODSTR));

        SeqSpan mspan = new SimpleSeqSpan(mstart_point.getStart(), mend_point.getEnd(), mrna);
        BioSeq protein = new BioSeq(protein_id, null, mspan.getLength());
    protein.setResidues(processAminoAcid(amino_acid));
    protein.setBounds(mspan.getMin(), mspan.getMin() + mspan.getLength());

        prot_hash.put(protein_id, protein);
        SeqSpan pspan = new SimpleSeqSpan(protein.getMin(), protein.getMax(), protein);
        if (DEBUG) {
            System.err.println("protein: length = " + pspan.getLength());
        }
        m2pSym.addSpan(mspan);
        m2pSym.addSpan(pspan);

        m2pSym.setID("");
        protein.addAnnotation(m2pSym);
        mrna.addAnnotation(m2pSym);

        // Use genometry manipulations to map cds start/end on genome to cds start/end on transcript
        //    (so that cds becomes mrna2protein symmetry on mrna (and on protein...)

    }

  /**
   * Create String with amino acids, left-justified with spaces versus nucleotides
   * @param residue - String of amino acids
   * @return - left-justified String
   */
  private static String processAminoAcid(String residue){
    if(residue.isEmpty()) {
      return residue;
    }
   
    char[] amino_acid = new char[residue.length()*3];
    for(int i=0; i < amino_acid.length; i++ ){
      if(i % 3 == 0){
        amino_acid[i] = residue.charAt(i/3);
      }else {
        amino_acid[i] = ' ';
      }
    }
    return String.valueOf(amino_acid);
  }

  /**
   * Sanity check on length of translations (the total should be divisible by 3).
   * @param transCheckExons
   * @param start
   * @param end
   */
  private static void checkTranslationLength(List<int[]> transCheckExons, int start ,int end){

        int length = 0;
        for(int[] exon : transCheckExons){
            int exon_start = exon[0];
            int exon_end = exon[1];

      //int old_length = length;
            if(exon_start >= start && exon_end <= end){
        // exon completely in translated region
                length += exon_end - exon_start;
            } else if(exon_start <= start && exon_end >= start){
        // translation start is past beginning of exon
                length += exon_end - start;
            } else if(exon_start <= end && exon_end >= end){
        // translation end is before ending of exon
                length += end - exon_start;
            }
        }

        if(length % 3 != 0) {
      System.out.println("WARNING:  Translation length is " + length + " and remainder modulo 3 is " + length % 3);
    }
    }
}
TOP

Related Classes of org.bioviz.protannot.Xml2GenometryParser

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.