// Source code of net.sf.mzmine.util.MascotParserUtils

/*
 * Copyright 2006-2012 The MZmine 2 Development Team
 * 
 * This file is part of MZmine 2.
 * 
 * MZmine 2 is free software; you can redistribute it and/or modify it under the
 * terms of the GNU General Public License as published by the Free Software
 * Foundation; either version 2 of the License, or (at your option) any later
 * version.
 * 
 * MZmine 2 is distributed in the hope that it will be useful, but WITHOUT ANY
 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
 * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License along with
 * MZmine 2; if not, write to the Free Software Foundation, Inc., 51 Franklin St,
 * Fifth Floor, Boston, MA 02110-1301 USA
 */


package net.sf.mzmine.util;


import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.HashMap;
import java.util.StringTokenizer;
import java.util.Vector;


import net.sf.mzmine.data.DataPoint;
import net.sf.mzmine.data.impl.SimpleDataPoint;
import net.sf.mzmine.modules.peaklistmethods.identification.mascot.data.IonSignificance;
import net.sf.mzmine.modules.peaklistmethods.identification.mascot.data.ModificationPeptide;
import net.sf.mzmine.modules.peaklistmethods.identification.mascot.data.Peptide;
import net.sf.mzmine.modules.peaklistmethods.identification.mascot.data.PeptideFragmentation;
import net.sf.mzmine.modules.peaklistmethods.identification.mascot.data.PeptideIdentityDataFile;
import net.sf.mzmine.modules.peaklistmethods.identification.mascot.data.PeptideIonSerie;
import net.sf.mzmine.modules.peaklistmethods.identification.mascot.data.PeptideScan;
import net.sf.mzmine.modules.peaklistmethods.identification.mascot.data.Protein;
import net.sf.mzmine.modules.peaklistmethods.identification.mascot.data.ProteinSection;
import net.sf.mzmine.modules.peaklistmethods.identification.mascot.data.SerieIonType;


public class MascotParserUtils {


  /*
   * FRAGMENT ION/RULE KEYS: 1 singly charged 2 doubly charged if precursor 2+
   * or higher (not internal or immonium) 3 doubly charged if precursor 3+ or
   * higher (not internal or immonium) 4 immonium 5 a series 6 a - NH3 if a
   * significant and fragment includes RKNQ 7 a - H2O if a significant and
   * fragment includes STED 8 b series 9 b - NH3 if b significant and fragment
   * includes RKNQ 10 b - H2O if b significant and fragment includes STED 11 c
   * series 12 x series 13 y series 14 y - NH3 if y significant and fragment
   * includes RKNQ 15 y - H2O if y significant and fragment includes STED 16 z
   * series 17 internal yb < 700 Da 18 internal ya < 700 Da 19 y or y++ must
   * be significant 20 y or y++ must be highest scoring series 21 z+1 series
   * 22 d and d' series 23 v series 24 w and w' series 25 z+2 series
   * 
   * INSTRUMENT RULES: Default = 1,2,8,9,10,13,14,15 ESI-QUAD-TOF =
   * 1,2,8,9,10,13,14,15 MALDI-TOF-PSD = 1,4,5,6,7,8,9,10,13 ESI-TRAP =
   * 1,2,8,9,10,13,14,15 ESI-QUAD = 1,2,8,9,10,13,14,15 ESI-FTICR =
   * 1,2,8,9,10,13,14,15 MALDI-TOF-TOF =
   * 1,4,5,6,7,8,9,10,13,14,15,17,18,22,23,24 ESI-4SECTOR =
   * 1,2,4,5,8,9,10,13,16,17,18 FTMS-ECD = 1,2,11,13,21,25 ETD-TRAP =
   * 1,2,11,13,21,25 MALDI-QUAD-TOF = 1,2,4,89,10,13,14,15,17,18 MALDI-QIT-TOF
   * = 1,4,5,6,78,9,10,13,14,15,17,18
   */
  public static SerieIonType[] parseFragmentationRules(String sRules) {
    Vector<SerieIonType> ionSeries = new Vector<SerieIonType>();
    StringTokenizer st = new StringTokenizer(sRules, ",");
    int ionRule = 0;
    while (st.hasMoreTokens()) {
      ionRule = Integer.parseInt(st.nextToken());
      switch (ionRule) {
        case 5 : // a-ion
          ionSeries.add(SerieIonType.A_SERIES);
          break;


        case 8 : // b-ion
          ionSeries.add(SerieIonType.B_SERIES);
          break;


        case 11 : // c-ion
          ionSeries.add(SerieIonType.C_SERIES);
          break;


        case 12 : // x-ion
          ionSeries.add(SerieIonType.X_SERIES);
          break;


        case 13 : // y-ion
          ionSeries.add(SerieIonType.Y_SERIES);
          break;


        case 16 : // z-ion
          ionSeries.add(SerieIonType.Z_SERIES);
          break;


        case 21 : // zH-ion
          ionSeries.add(SerieIonType.ZH_SERIES);
          break;


        case 25 : // zHH-ion
          ionSeries.add(SerieIonType.ZHH_SERIES);
          break;


        default :
          break;
      }
    }


    return ionSeries.toArray(new SerieIonType[0]);
  }


  /*
   * ion series
   * 
   * [Mascot 2.2] 0 a 1 place holder 2 a++
   * 
   * 3 b 4 place holder 5 b++
   * 
   * 6 y 7 place holder 8 y++
   * 
   * 9 c 10 c++ 11 x 12 x++ 13 z 14 z++ 15 z+H 16 z+H++ 17 z+2H 18 z+2H++
   * 
   * 0 0 0 0 0 0 0 0 0 0 1 0 0 2 0 0 2 0 1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
   * 15 16 17 18
   */
  public static PeptideIonSerie parseIonSeriesSignificance(
      String seriesIonString) {


    PeptideIonSerie peptideIonSeries = new PeptideIonSerie();
    IonSignificance significance = IonSignificance.NOT_Sign_NOT_Scoring;
    for (int i = 0; i < seriesIonString.length(); i++) {


      significance = IonSignificance.NOT_Sign_NOT_Scoring;
      switch (seriesIonString.charAt(i)) {
        case '1' :
          significance = IonSignificance.NOT_Sign_NOT_Scoring;
          break;


        case '2' :
          significance = IonSignificance.NOT_Sign_NOT_Scoring;
          break;
      }


      switch (i) {
        case 0 : // a-ion
          peptideIonSeries.addSerie(SerieIonType.A_SERIES,
              significance);
          break;
        case 2 : // a-double-ion
          peptideIonSeries.addSerie(SerieIonType.A_DOUBLE_SERIES,
              significance);
          break;


        case 3 : // b-ion
          peptideIonSeries.addSerie(SerieIonType.B_SERIES,
              significance);
          break;


        case 5 : // b-double-ion
          peptideIonSeries.addSerie(SerieIonType.B_DOUBLE_SERIES,
              significance);
          break;


        case 6 : // y-ion
          peptideIonSeries.addSerie(SerieIonType.Y_SERIES,
              significance);
          break;


        case 8 : // y-ion
          peptideIonSeries.addSerie(SerieIonType.Y_DOUBLE_SERIES,
              significance);
          break;


        case 9 : // c-ion
          peptideIonSeries.addSerie(SerieIonType.C_SERIES,
              significance);
          break;


        case 10 : // c-double-ion
          peptideIonSeries.addSerie(SerieIonType.C_DOUBLE_SERIES,
              significance);
          break;


        case 11 : // x-ion
          peptideIonSeries.addSerie(SerieIonType.X_SERIES,
              significance);
          break;


        case 12 : // x-double-ion
          peptideIonSeries.addSerie(SerieIonType.X_DOUBLE_SERIES,
              significance);
          break;


        case 13 : // z-ion
          peptideIonSeries.addSerie(SerieIonType.Z_SERIES,
              significance);
          break;


        case 14 : // z-double-ion
          peptideIonSeries.addSerie(SerieIonType.Z_DOUBLE_SERIES,
              significance);
          break;


        case 15 : // zh-ion
          peptideIonSeries.addSerie(SerieIonType.ZH_SERIES,
              significance);
          break;


        case 16 : // zh-double-ion
          peptideIonSeries.addSerie(SerieIonType.ZH_DOUBLE_SERIES,
              significance);
          break;


        case 17 : // zhh-ion
          peptideIonSeries.addSerie(SerieIonType.ZHH_SERIES,
              significance);
          break;


        case 18 : // zhh-double-ion
          peptideIonSeries.addSerie(SerieIonType.ZHH_DOUBLE_SERIES,
              significance);
          break;


        default :
          break;
      }
    }


    return peptideIonSeries;
  }


  /**
   * This method finds a property, associated by a name.
   * 
   * @param line
   *            String with the line on which the 'KEY=VALUE' pair is to be
   *            found.
   * @param propName
   *            String with the name of the KEY.
   * @return String property
   */
  public static String getProperty(String line, String propName) {
    propName += "=";
    int start = line.indexOf(propName);
    int offset = propName.length();
    // System.out.println("---" + line);
    String found = line.substring(start + offset).trim();
    // Trim away opening and closing '"'.
    if (found.startsWith("\"")) {
      found = found.substring(1);
    }
    if (found.endsWith("\"")) {
      found = found.substring(0, found.length() - 1);
    }
    return found.trim();
  }


  /**
   * This method parses the content of a section into key-value pairs and
   * stores these in a HashMap.
   * 
   * @param aContent
   *            the content of the section to be parsed.
   * @return HashMap with the contents of the section as key-value pairs.
   */
  public static HashMap<String, Object> processSectionToHashMap(
      String aContent) {


    HashMap<String, Object> sectionMap = new HashMap<String, Object>();
    try {
      BufferedReader bufferReader = new BufferedReader(new StringReader(
          aContent));
      String line = null;
      boolean firstLine = true;
      while ((line = bufferReader.readLine()) != null) {
        // KEY=VALUE
        // If this section has XML content (such as the modification /
        // quantitation sections as of Mascot 2.2)
        // The xml section is stored as a whole in the map for the key
        // 'XML'.
        if (firstLine) {
          if (line.startsWith("<?xml version")) {
            // We match a xml section.
            sectionMap.put("XML", aContent); // Now break the while
                              // loop.//
            break;
          }
          firstLine = false;
        }


        // More, for header lines ('hx_text'; holding the protein
        // description)
        // or Protein section names (starting with the accession,
        // encircled by '"')
        // we need to make sure we have the full description line (it
        // might
        // contain an '=', as IPI headers often do)!
        String key = null;
        Object value = null;
        if ((line.startsWith("h") && line.indexOf("_text=") >= 0)
            || (line.startsWith("\"") && line.endsWith("\""))) {
          key = line.substring(0, line.indexOf("=")).trim();
          if (line.length() > line.indexOf("=") + 1) {
            value = line.substring(line.indexOf("=") + 1).trim();
          }
        } else {
          StringTokenizer lst = new StringTokenizer(line, "=");
          key = lst.nextToken().trim();
          value = null;
          if (lst.hasMoreTokens()) {
            value = lst.nextToken().trim();
          }
        }
        sectionMap.put(key, value);
      }
      bufferReader.close();
    } catch (IOException ioe) {
      ioe.printStackTrace();
    }
    return sectionMap;
  }


  /*
   * String "info" is divided into three sections by ";", which the first one
   * is peptide, second protein and third is peptide terminals.
   * 
   * Peptide section must have 11 elements: missed cleavages, peptide mass,
   * delta mass, number of matched ions, aa sequence, peaks used from ion 1,
   * variable modifications (binary code), ion score, ion series, peaks used
   * from ions2, peaks used from ion 3
   * 
   * Protein section must have 5 elements: accesion name, frame number, start,
   * stop and multiplicity
   * 
   * Peptide terminal section must have 2 elements; left aa, right aa of the
   * sequence
   * 
   * Example='
   * 1,743.428955,0.000893,5,KGAAQLR,25,000001000,27.19,0002001000000000000
   * ,0,0;
   * "SPBC839.04":0:23:29:1,"SPBC2F12.07c":0:23:29:1,"SPAC1F7.13c":0:23:29:1;
   * R,T:R,T:R,T '
   */
  public static Peptide[] parsePeptideInfo(int queryNumber, String info,
      double mass, double massExpected, int precursorCharge,
      PeptideIdentityDataFile pepDataFile, double identityThreshold,
      boolean isTopScore) {


    String[] tokens = info.split(";");
    String peptideSection = tokens[0];
    String proteinSection = tokens[1];
    String terminalSection = tokens[2];


    // Terminals
    tokens = terminalSection.split(":");
    String[] peptideInfos = new String[tokens.length];
    for (int i = 0; i < tokens.length; i++) {
      peptideInfos[i] = peptideSection + "," + tokens[i];
    }


    // Proteins
    tokens = proteinSection.split(",");
    String[] proteinInfos = new String[tokens.length];
    for (int i = 0; i < tokens.length; i++) {
      proteinInfos[i] = tokens[i];
    }


    if (proteinInfos.length != peptideInfos.length)
      return null;
    Vector<Peptide> peptides = new Vector<Peptide>();


    // Peptide
    for (int pepIndex = 0; pepIndex < peptideInfos.length; pepIndex++) {


      tokens = peptideInfos[pepIndex].split(",");
      int missedCleavages = Integer.parseInt(tokens[0]);
      double precursorMass = Double.parseDouble(tokens[1]);
      double deltaMass = Double.parseDouble(tokens[2]);
      String sequence = tokens[11] + tokens[4] + tokens[12];
      String modSeries = tokens[6];
      double ionScore = Double.parseDouble(tokens[7]);


      if (ionScore >= identityThreshold)
        continue;


      Peptide peptide = new Peptide(queryNumber, sequence, ionScore,
          mass, massExpected, precursorCharge, precursorMass,
          deltaMass, missedCleavages, null, "Mascot", isTopScore);


      HashMap<Integer, ModificationPeptide> modifications = new HashMap<Integer, ModificationPeptide>();
      ModificationPeptide[] searchedMods = pepDataFile
          .getSearchedModifications();


      // TODO Verify the possibility of two modifications in the same
      // site.
      for (int pos = 0; pos < modSeries.length(); pos++) {
        if (modSeries.charAt(pos) == '1') {
          char aa = sequence.charAt(pos);
          for (int index = 0; index < searchedMods.length; index++) {
            if (searchedMods[index].getSite() == aa) {
              modifications.put(pos, searchedMods[index]);
            }
          }
        }
      }


      peptide.setModifications(modifications);


      // Ion serie
      PeptideIonSerie peptideIonSerie = parseIonSeriesSignificance(tokens[8]);
      peptide.setIonSeries(peptideIonSerie);


      // Calculate fragmentation
      PeptideFragmentation fragmentation = new PeptideFragmentation(
          peptide, pepDataFile);
      peptide.setFragmentation(fragmentation);


      // Protein info
      ProteinSection section;
      tokens = proteinInfos[pepIndex].split(":");
      String sysname = tokens[0].replace("\"", "");


      Protein protein = pepDataFile.getProtein(sysname);
      if (protein == null)
        protein = new Protein(sysname);


      int startRegion = Integer.parseInt(tokens[2]);
      int stopRegion = Integer.parseInt(tokens[3]);
      int multiplicity = Integer.parseInt(tokens[4]);
      section = new ProteinSection(startRegion, stopRegion, multiplicity);


      // Link peptide and protein
      peptide.setProtein(protein);
      protein.addPeptide(peptide, section, isTopScore);


      peptides.add(peptide);


    }


    return peptides.toArray(new Peptide[0]);
  }


  /**
   * Parse a string with information of possible modifications in a peptide's
   * amino acid sequence
   * 
   * @param modString
   * @param section
   * @param fixed
   * @return
   */
  public static ModificationPeptide[] parseModification(String modString,
      HashMap section, boolean fixed) {
    // Example "Deamidation (NQ)"
    Vector<ModificationPeptide> mods = new Vector<ModificationPeptide>();
    double mass = 0.0;
    modString = modString.replace("\n", "");
    String[] tokens = modString.split(" ");
    String name = tokens[0];
    String sites = tokens[1];
    if (sites.endsWith(")")) {
      sites = sites.replace("(", "");
      sites = sites.replace(")", "");
      for (int i = 0; i < sites.length(); i++) {
        mass = Double.parseDouble((String) section.get(name));
        mods.add(new ModificationPeptide(name, mass, sites.charAt(i),
            fixed));
      }
    }
    return mods.toArray(new ModificationPeptide[0]);
  }


  /**
   * Parse the information of data points (MS/MS peaks) and generate a new
   * PeptideScan
   * 
   * @param queryNumber
   * @param HashMap
   *            sectionMap with the data points info
   * @param pepDataFile
   * @return
   */
  public static PeptideScan parseScanIons(int queryNumber,
      HashMap sectionMap, PeptideIdentityDataFile pepDataFile) {


    String titleScan = (String) sectionMap.get("title");
    titleScan = titleScan.replace("%2e", ",");
    String[] tokens = titleScan.split(",");
    String rawFileName = tokens[0];
    int rawScanNumber = Integer.parseInt(tokens[1]);


    PeptideScan scan = new PeptideScan(pepDataFile, rawFileName,
        queryNumber, rawScanNumber);
    Vector<SimpleDataPoint> dataPoints = new Vector<SimpleDataPoint>();
    double mass, intensity;
    String ions = (String) sectionMap.get("Ions1");
    StringTokenizer tokenizer = new StringTokenizer(ions, ",");
    while (tokenizer.hasMoreTokens()) {
      tokens = tokenizer.nextToken().split(":");
      mass = Double.parseDouble(tokens[0]);
      intensity = Double.parseDouble(tokens[1]);
      dataPoints.add(new SimpleDataPoint(mass, intensity));
    }


    scan.setDataPoints(dataPoints.toArray(new SimpleDataPoint[0]));


    return scan;
  }


  /**
   * Compare two arrays of DataPoint and return true if they are equal
   * 
   */
  public static boolean compareDataPointsByMass(DataPoint[] dataPoints,
      DataPoint[] dataPoints2) {


    boolean flag = true;


    Integer[] masses1 = new Integer[dataPoints.length];
    for (int i = 0; i < dataPoints.length; i++)
      masses1[i] = (int) dataPoints[i].getMZ();


    Integer[] masses2 = new Integer[dataPoints2.length];
    for (int i = 0; i < dataPoints2.length; i++)
      masses2[i] = (int) dataPoints2[i].getMZ();


    for (int i = 0; i < masses2.length; i++) {
      if (!CollectionUtils.arrayContains(masses1, masses2[i]))
        flag = false;
    }


    return flag;


  }


  public static HashMap<String, Object> parseModificationMass(String keyPart,
      HashMap<String, Object> section) {


    int count = 1;
    String key = keyPart + count;
    String value;


    while (section.containsKey(key)) {
      value = (String) section.get(key);
      // No variable modifications
      if (value == null)
        break;
      if (value.equalsIgnoreCase("-1"))
        break;


      // Parse mass
      String[] tokens = value.split(",");
      String mass = tokens[0];
      // Oxidation (M)
      int endIndex = tokens[1].lastIndexOf(" ");
      value = tokens[1].substring(0, endIndex);
      value = value.trim();
      section.remove(key);
      section.put(value, mass);
      count++;
      key = keyPart + count;
      value = null;
    }


    return section;
  }


  public static double calculateIdentityThreshold(
      double significanceThreshold, double identityQueryScore) {


    /*
     * This formula comes from Mascot software.
     */
    double identityThreshold = identityQueryScore
        / (significanceThreshold * 20);
    identityThreshold = Math.log(identityThreshold) * 10.0;


    return identityThreshold;
  }


}
// Source code of net.sf.mzmine.util.MascotParserUtils

// Related classes of net.sf.mzmine.util.MascotParserUtils