Source Code of org.broad.igv.data.expression.ExpressionFileParser$FormatDescriptor

/*
 * Copyright (c) 2007-2011 by The Broad Institute of MIT and Harvard.  All Rights Reserved.
 *
 * This software is licensed under the terms of the GNU Lesser General Public License (LGPL),
 * Version 2.1 which is available at http://www.opensource.org/licenses/lgpl-2.1.php.
 *
 * THE SOFTWARE IS PROVIDED "AS IS." THE BROAD AND MIT MAKE NO REPRESENTATIONS OR
 * WARRANTES OF ANY KIND CONCERNING THE SOFTWARE, EXPRESS OR IMPLIED, INCLUDING,
 * WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
 * PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER
 * OR NOT DISCOVERABLE.  IN NO EVENT SHALL THE BROAD OR MIT, OR THEIR RESPECTIVE
 * TRUSTEES, DIRECTORS, OFFICERS, EMPLOYEES, AND AFFILIATES BE LIABLE FOR ANY DAMAGES
 * OF ANY KIND, INCLUDING, WITHOUT LIMITATION, INCIDENTAL OR CONSEQUENTIAL DAMAGES,
 * ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER
 * THE BROAD OR MIT SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT
 * SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
 */
/*
 * ExpressionFileParser.java
 *
 * Parser for GCT and related file formats (e.g. RES).
 *
 * Created on October 18, 2007, 2:33 PM
 *
 * To change this template, choose Tools | Template Manager
 * and open the template in the editor.
 */
package org.broad.igv.data.expression;


//~--- non-JDK imports --------------------------------------------------------


import org.apache.log4j.Logger;
import org.broad.igv.Globals;
import org.broad.igv.exceptions.ParserException;
import org.broad.igv.exceptions.ProbeMappingException;
import org.broad.igv.feature.Locus;
import org.broad.igv.feature.genome.Genome;
import org.broad.igv.tools.StatusMonitor;
import org.broad.igv.track.TrackType;
import org.broad.igv.ui.IGV;
import org.broad.igv.ui.util.MagetabSignalDialog;
import org.broad.igv.util.ParsingUtils;
import org.broad.igv.util.ResourceLocator;
import htsjdk.tribble.readers.AsciiLineReader;


import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.*;


/**
 * TODO -- handle case with probe file
 *
 * @author jrobinso
 */
public class ExpressionFileParser {


    private static Logger log = Logger.getLogger(ExpressionFileParser.class);
    private static final int MAX_ERROR_COUNT = 200;


    public enum FileType {


        RES, GCT, MAPPED, TAB, MET, DCHIP, MAGE_TAB
    }


    ResourceLocator dataFileLocator;


    FileType type;
    Genome genome;


    /**
     * Map chr -> longest feature
     */
    Map<String, Integer> longestProbeMap;


    Map<String, List<Row>> rowMap = new HashMap();
    StatusMonitor statusMonitor;
    GeneToLocusHelper locusHelper;


    // For effecient lookup, data column name -> index
    Map<String, Integer> dataColumnIndexMap;


    int errorCount = 0;


    /**
     * Test to determine if the path referes to a GCT file.  The GCT specification requires that the first line
     * be equal to #1.2
     *
     * @param path
     * @return
     * @throws IOException
     */
    public static boolean isGCT(String path) throws IOException {
        BufferedReader reader = null;
        try {
            reader = ParsingUtils.openBufferedReader(path);
            String firstLine = reader.readLine();
            return firstLine != null && firstLine.trim().equals("#1.2");
        } finally {
            if (reader != null) reader.close();
        }
    }




    public ExpressionFileParser(ResourceLocator resFile, String probeFile, Genome genome) throws IOException {


        this.dataFileLocator = resFile;
        this.type = determineType(resFile);
        this.genome = genome;
        longestProbeMap = new HashMap();
        locusHelper = new GeneToLocusHelper(probeFile);


    }




    public ExpressionFileParser(File resFile, String probeFile, Genome genome) throws IOException {
        this(new ResourceLocator(resFile.getAbsolutePath()), probeFile, genome);
    }


    /*
     */


    public static boolean parsableMAGE_TAB(ResourceLocator file) throws IOException {
        AsciiLineReader reader = null;
        try {
            reader = ParsingUtils.openAsciiReader(file);
            String nextLine = null;


            //skip first row
            reader.readLine();


            //check second row for MAGE_TAB identifiers
            if ((nextLine = reader.readLine()) != null && (nextLine.contains("Reporter REF") ||
                    nextLine.contains("Composite Element REF") || nextLine.contains("Term Source REF") ||
                    nextLine.contains("CompositeElement REF") || nextLine.contains("TermSource REF") ||
                    nextLine.contains("Coordinates REF"))) {
                return true;
            }
        } finally {
            if (reader != null) {
                reader.close();
            }
        }


        return false;
    }


    public ExpressionDataset createDataset() {
        ExpressionDataset dataset = new ExpressionDataset(genome);
        parse(dataset);
        return dataset;
    }


    public static FileType determineType(ResourceLocator dataFileLocator) {
        String fn = dataFileLocator.getPath().toLowerCase();
        if (fn.endsWith(".gz")) {
            int l = fn.length() - 3;
            fn = fn.substring(0, l);
        }
        if (fn.endsWith(".txt")) {
            int l = fn.length() - 4;
            fn = fn.substring(0, l);
        }


        //TODO genomespace hack
        if (dataFileLocator.getPath().contains("?") && dataFileLocator.getPath().contains("dataformat/gct")) {
            fn = ".gct";
        }
        FileType type;


        if (fn.endsWith("res")) {
            type = FileType.RES;
        } else if (fn.endsWith("gct")) {
            type = FileType.GCT;
        } else if (fn.endsWith("mapped")) {
            type = FileType.MAPPED;
        } else if (fn.endsWith("met")) {
            type = FileType.MET;
        } else if (fn.endsWith("dchip")) {
            type = FileType.DCHIP;
        } else if ("mage-tab".equals(dataFileLocator.getType()) || "MAGE_TAB".equals(dataFileLocator.getDescription())) {
            type = FileType.MAGE_TAB;
        } else {
            type = FileType.TAB;
        }
        return type;
    }


    /**
     * Parse the file and return a Dataset
     *
     * @return
     */
    public void parse(ExpressionDataset dataset) {


        // Create a buffer for the string split utility.  We use  a custom utility as opposed
        // to String.split() for performance.


        dataset.setType(TrackType.GENE_EXPRESSION);


        BufferedReader reader = null;
        String nextLine = null;
        int lineCount = 0;
        //String[] columnHeadings = null;
        try {


            reader = ParsingUtils.openBufferedReader(dataFileLocator);


            // Parse the header(s) to determine the precise format.
            FormatDescriptor formatDescriptor = parseHeader(reader, type, dataset);
            final int probeColumn = formatDescriptor.probeColumn;
            final int descriptionColumn = formatDescriptor.descriptionColumn;
            int nDataColumns = formatDescriptor.dataColumns.length;


            dataset.setColumnHeadings(formatDescriptor.dataHeaders);


            dataColumnIndexMap = new HashMap<String, Integer>();
            for (int i = 0; i < formatDescriptor.dataHeaders.length; i++) {
                dataColumnIndexMap.put(formatDescriptor.dataHeaders[i], i);
            }


            // Loop through the data rows


            while ((nextLine = reader.readLine()) != null) {


                String[] tokens = Globals.tabPattern.split(nextLine, -1);
                int nTokens = tokens.length;
                String probeId = new String(tokens[probeColumn]);
                float[] values = new float[nDataColumns];


                String description = (descriptionColumn >= 0) ? tokens[descriptionColumn] : null;


                if (type == FileType.MAGE_TAB && probeId.startsWith("cg")) {
                    // TODO -- this is a very ugly and fragile method to determine data type! Change this!
                    dataset.setType(TrackType.DNA_METHYLATION);
                }


                for (int i = 0; i < nDataColumns; i++) {
                    try {
                        int dataIndex = formatDescriptor.dataColumns[i];


                        // If we are out of value tokens, or the cell is blank, assign NAN to the cell.
                        if ((dataIndex >= nTokens) || (tokens[dataIndex].length() == 0)) {
                            values[i] = Float.NaN;
                        } else {
                            values[i] = Float.parseFloat(tokens[dataIndex]);
                        }


                    } catch (NumberFormatException numberFormatException) {


                        // This s an expected condition.  IGV uses NaN to
                        // indicate non numbers (missing data values)
                        values[i] = Float.NaN;
                    }


                }
                addRow(probeId, description, values);
                lineCount++;


                // This method is designed to be interruptable (canceled by
                // user.  Check every 1000 lines for an interrupt.
                if (lineCount == 1000) {
                    checkForInterrupt();
                    lineCount = 0;
                    if (statusMonitor != null) {
                        statusMonitor.incrementPercentComplete(1);
                    }
                }
            }    // End loop through lines




            // Sort row for each chromosome by start location
            sortRows();


            // Update dataset
            for (String chr : rowMap.keySet()) {
                dataset.setStartLocations(chr, getStartLocations(chr));
                dataset.setEndLocations(chr, getEndLocations(chr));
                dataset.setFeatureNames(chr, getProbes(chr));




                for (String heading : formatDescriptor.dataHeaders) {
                    dataset.setData(heading, chr, getData(heading, chr));
                }
            }


            dataset.setLongestFeatureMap(longestProbeMap);


        } catch (FileNotFoundException ex) {
            throw new RuntimeException(ex);
        } catch (InterruptedException e) {
            throw new RuntimeException("Operation cancelled");
        } catch (Exception e) {
            e.printStackTrace();
            if (nextLine != null && lineCount != 0) {
                throw new ParserException(e.getMessage(), e, lineCount, nextLine);
            } else {
                throw new RuntimeException(e);
            }
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {


                }
            }
        }


    }




    public void addRow(String probeId, String description, float[] values) {
        List<Locus> loci = locusHelper.getLoci(probeId, description, genome.getId());
        if (loci != null) {
            for (Locus locus : loci) {
                if ((locus != null) && locus.isValid()) {
                    List<Row> rows = rowMap.get(locus.getChr());
                    if (rows == null) {
                        rows = new ArrayList();
                        rowMap.put(locus.getChr(), rows);
                    }


                    String chr = locus.getChr();
                    int length = locus.getEnd() - locus.getStart();
                    if (longestProbeMap.containsKey(chr)) {
                        longestProbeMap.put(chr, Math.max(longestProbeMap.get(chr), length));
                    } else {
                        longestProbeMap.put(chr, length);
                    }


                    rows.add(new Row(probeId, locus.getChr(), locus.getStart(), locus.getEnd(), values));


                }
            }
        } else {
            if (errorCount < MAX_ERROR_COUNT) {
                log.info("Probe: '" + probeId + "' could not be mapped to a genomic position.");
            } else if (errorCount == MAX_ERROR_COUNT) {
                log.info("Maximum probe mapping warning count exceeded.  Further mapping errors will not be logged");
            }
            errorCount++;


        }
    }




    /**
     * Sort all row collections by ascending start location
     */
    private void sortRows() {
        Comparator<Row> c = new Comparator<Row>() {


            public int compare(ExpressionFileParser.Row arg0, ExpressionFileParser.Row arg1) {
                return arg0.start - arg1.start;
            }
        };
        for (List<Row> rows : rowMap.values()) {
            Collections.sort(rows, c);
        }
    }


    public String[] getProbes(String chr) {
        List<Row> rows = rowMap.get(chr);
        String[] labels = new String[rows.size()];
        for (int i = 0; i < rows.size(); i++) {
            labels[i] = rows.get(i).feature;
        }
        return labels;


    }




    public int[] getStartLocations(String chr) {


        List<Row> rows = rowMap.get(chr);
        int[] startLocations = new int[rows.size()];
        for (int i = 0; i < rows.size(); i++) {
            startLocations[i] = rows.get(i).start;
        }
        return startLocations;
    }




    public int[] getEndLocations(String chr) {


        List<Row> rows = rowMap.get(chr);
        int[] endLocations = new int[rows.size()];
        for (int i = 0; i < rows.size(); i++) {
            endLocations[i] = rows.get(i).end;
        }
        return endLocations;
    }




    public float[] getData(String heading, String chr) {


        int columnIndex = dataColumnIndexMap.get(heading);
        List<Row> rows = rowMap.get(chr);
        float[] data = new float[rows.size()];
        for (int i = 0; i < rows.size(); i++) {
            data[i] = rows.get(i).values[columnIndex];
        }
        return data;


    }




    /**
     * Note:  This is an exact copy of the method in IGVDatasetParser.  Refactor to merge these
     * two parsers, or share a common base class.
     *
     * @param comment
     * @param dataset
     */
    private static void parseComment(String comment, ExpressionDataset dataset) {


        // If no dataset is supplied there is nothing to do
        if (dataset == null) return;


        String tmp = comment.substring(1, comment.length());
        if (tmp.startsWith("track")) {
            dataset.setTrackLine(tmp);


        } else {
            String[] tokens = tmp.split("=");
            if (tokens.length != 2) {
                return;
            }
            String key = tokens[0].trim().toLowerCase();
            if (key.equals("name")) {
                dataset.setName(tokens[1].trim());
            } else if (key.equals("type")) {


                try {
                    dataset.setType(TrackType.valueOf(tokens[1].trim().toUpperCase()));
                } catch (Exception exception) {
                    // Ignore
                }
            }
        }
    }


    private void checkForInterrupt() throws InterruptedException {
        Thread.sleep(1);    // <- check for interrupted thread
    }


    public static FormatDescriptor parseHeader(BufferedReader reader, FileType type, ExpressionDataset dataset) throws IOException {




        int descriptionColumn = -1;    // Default - no description column
        int dataStartColumn = -1;
        int probeColumn = 0;


        switch (type) {
            case RES:
                dataStartColumn = 2;
                probeColumn = 1;
                descriptionColumn = 0;
                break;
            case GCT:
                dataStartColumn = 2;
                probeColumn = 0;
                descriptionColumn = 1;
                break;
            case MAPPED:
                dataStartColumn = 4;
                probeColumn = 0;
                break;
            case MET:
                dataStartColumn = 4;
                probeColumn = 0;
                break;
            case DCHIP:
                dataStartColumn = 1;
                probeColumn = 0;
                descriptionColumn = -1;
                break;
            case MAGE_TAB:
                descriptionColumn = -1;
                probeColumn = 0;
                break;
            case TAB:
                dataStartColumn = 1;
                probeColumn = 0;
            default:
                // throw exception?
        }




        String headerLine = findHeaderLine(reader, type, dataset);
        String[] firstHeaderRowTokens = headerLine.split("\t");


        List<String> dataHeadingList = new ArrayList(firstHeaderRowTokens.length);
        List<Integer> dataColumnList = new ArrayList<Integer>(firstHeaderRowTokens.length);
        if (type != FileType.MAGE_TAB) {
            int skip = type == FileType.RES ? 2 : 1;
            for (int i = dataStartColumn; i < firstHeaderRowTokens.length; i += skip) {
                String heading = firstHeaderRowTokens[i].replace('\"', ' ').trim();
                dataHeadingList.add(heading);
                dataColumnList.add(i);
            }
        } else {
            //Mage-Tab
            String nextLine = reader.readLine();
            String[] secondHeaderRowTokens = nextLine.split("\t");


            // Remove the label columns.
            for (int i = 1; i < firstHeaderRowTokens.length; i++) {
                if (firstHeaderRowTokens[i].trim().length() > 0) {
                    dataStartColumn = i;
                    break;
                }
            }


            String[] dataHeaderColumns = new String[secondHeaderRowTokens.length - dataStartColumn];
            System.arraycopy(secondHeaderRowTokens, dataStartColumn, dataHeaderColumns, 0, dataHeaderColumns.length);
            String qCol = findSignalColumnHeading(dataHeaderColumns);
            for (int i = 0; i < secondHeaderRowTokens.length; i++) {
                String heading = secondHeaderRowTokens[i].replace('\"', ' ').trim();
                if (heading.toLowerCase().contains(qCol.toLowerCase())) {
                    dataHeadingList.add(firstHeaderRowTokens[i]);
                    dataColumnList.add(i);
                }
            }
        }
        String[] dataHeaders = dataHeadingList.toArray(new String[dataHeadingList.size()]);
        int[] dataColumns = new int[dataColumnList.size()];
        for (int i = 0; i < dataColumnList.size(); i++) dataColumns[i] = dataColumnList.get(i);


        //nColumns = dataHeadings.length;


        // If format is RES skip the two lines following the header
        if (type == FileType.RES) {
            reader.readLine();
            reader.readLine();
        }


        return new FormatDescriptor(probeColumn, descriptionColumn, dataColumns, dataHeaders, firstHeaderRowTokens.length);
    }


    private static String findHeaderLine(BufferedReader reader, FileType type, ExpressionDataset dataset) throws IOException {
        String nextLine;
        String headerLine;
        if (type == FileType.GCT) {
            nextLine = reader.readLine();
            if (nextLine.startsWith("#")) {
                parseComment(nextLine, dataset);
            }
            nextLine = reader.readLine();
            if (nextLine.startsWith("#")) {
                parseComment(nextLine, dataset);
            }
            headerLine = reader.readLine();
        } else if (type != FileType.MAGE_TAB) {
            // Skip meta data, if any
            while ((nextLine = reader.readLine()).startsWith("#") && (nextLine != null)) {
                parseComment(nextLine, dataset);
            }
            headerLine = nextLine;
        } else {
            headerLine = reader.readLine();
        }
        return headerLine;
    }


    private static String findSignalColumnHeading(String[] tokens) {


        //Use a map so comparisons are case insensitive,
        //but returned column heading retains original case
        Map<String, String> columnHeaderSet = new HashMap<String, String>();
        for (String tok : tokens) {
            columnHeaderSet.put(tok.toLowerCase(), tok);
        }


        String[] signals = new String[]{"beta_value", "beta value", "log2 signal", "signal"};


        for (String sig : signals) {
            if (columnHeaderSet.containsKey(sig.toLowerCase())) {
                return columnHeaderSet.get(sig.toLowerCase());
            }
        }


        String qCol = null;


        HashSet<String> uniqueColumns = new HashSet<String>(Arrays.asList(tokens));
        List<String> qColumns = new ArrayList<String>(uniqueColumns);
        if (qColumns.size() == 1) {
            qCol = qColumns.get(0);
        } else if (!Globals.isHeadless()) {
            // Let user choose the signal column
            Collections.sort(qColumns);


            MagetabSignalDialog msDialog = new MagetabSignalDialog(IGV.getMainFrame(), qColumns.toArray(new String[0]));
            msDialog.setVisible(true);


            if (!msDialog.isCanceled()) {
                qCol = msDialog.getQuantitationColumn();
            } else {
                throw new RuntimeException("Could not find any signal columns in the MAGE-TAB file");
            }
        }
        return qCol;
    }


    class Row {


        String feature;
        String chr;
        int start;
        int end;
        float[] values;


        Row(String feature, String chr, int start, int end, float[] values) {
            this.feature = feature;
            this.chr = chr;
            this.start = start;
            this.end = end;
            this.values = values;
        }
    }


    public static class FormatDescriptor {
        private int probeColumn;
        private int descriptionColumn;
        private int[] dataColumns;
        private String[] dataHeaders;
        private int totalColumnCount;


        FormatDescriptor(int probeColumn, int descriptionColumn, int[] dataColumns, String[] dataHeaders, int totalColumnCount) {
            this.probeColumn = probeColumn;
            this.descriptionColumn = descriptionColumn;
            this.dataColumns = dataColumns;
            this.dataHeaders = dataHeaders;
            this.totalColumnCount = totalColumnCount;
        }


        public int getProbeColumn() {
            return probeColumn;
        }


        public int getDescriptionColumn() {
            return descriptionColumn;
        }


        public int[] getDataColumns() {
            return dataColumns;
        }


        public String[] getDataHeaders() {
            return dataHeaders;
        }


        public int getTotalColumnCount() {
            return totalColumnCount;
        }
    }
}
Source Code of org.broad.igv.data.expression.ExpressionFileParser$FormatDescriptor

Related Classes of org.broad.igv.data.expression.ExpressionFileParser$FormatDescriptor