Source Code of org.broad.igv.data.seg.SegmentFileParser

/*
 * Copyright (c) 2007-2012 The Broad Institute, Inc.
 * SOFTWARE COPYRIGHT NOTICE
 * This software and its documentation are the copyright of the Broad Institute, Inc. All rights are reserved.
 *
 * This software is supplied without any warranty or guaranteed support whatsoever. The Broad Institute is not responsible for its use, misuse, or functionality.
 *
 * This software is licensed under the terms of the GNU Lesser General Public License (LGPL),
 * Version 2.1 which is available at http://www.opensource.org/licenses/lgpl-2.1.php.
 */
package org.broad.igv.data.seg;


//~--- non-JDK imports --------------------------------------------------------


import org.apache.log4j.Logger;
import org.broad.igv.Globals;
import org.broad.igv.exceptions.DataLoadException;
import org.broad.igv.exceptions.ParserException;
import org.broad.igv.feature.genome.Genome;
import org.broad.igv.track.TrackType;
import org.broad.igv.util.ParsingUtils;
import org.broad.igv.util.ResourceLocator;
import htsjdk.tribble.readers.AsciiLineReader;


/**
 * Example
 * CCLE_name  chrom  loc.start  loc.end  num.mark  seg.mean
 * A2780_OVARY  1  51598  88465  17  1.0491
 * A2780_OVARY  1  218569  55606710  30512  0.0248
 * A2780_OVARY  1  55606801  61331401  4399  -0.0431
 *
 * @author jrobinso
 */
public class SegmentFileParser implements SegFileParser {


    enum Type {
        SEG, BIRDSUITE, NEXUS
    }


    ;


    private static Logger log = Logger.getLogger(SegmentFileParser.class);


    boolean birdsuite = false;
    int sampleColumn = 0;
    int chrColumn = 1;
    int startColumn = 2;
    int endColumn = 3;
    //int snpCountColumn = 4;    // Default value
    int dataColumn = 5;        // Default value
    ResourceLocator locator;


    /**
     * Constructs ...
     *
     * @param locator
     */
    public SegmentFileParser(ResourceLocator locator) {
        this.locator = locator;
        if (locator.getPath().toLowerCase().endsWith("birdseye_canary_calls")) {
            birdsuite = true;
        }
    }




    /**
     * Return a map of trackId -> segment datasource
     *
     * @return
     */
    public SegmentedAsciiDataSet loadSegments(ResourceLocator locator, Genome genome) {


        SegmentedAsciiDataSet dataset = new SegmentedAsciiDataSet(genome);


        if (birdsuite) {
            dataset.setTrackType(TrackType.CNV);
        }


        AsciiLineReader reader = null;
        String nextLine = null;
        int lineNumber = 0;
        try {
            reader = ParsingUtils.openAsciiReader(locator);


            // Parse comments, if any
            nextLine = reader.readLine();
            while (nextLine.startsWith("#") || (nextLine.trim().length() == 0)) {
                lineNumber++;
                if (nextLine.length() > 0) {
                    parseComment(nextLine, dataset);
                }
                nextLine = reader.readLine();
            }


            // Read column headings
            String[] headings = nextLine.split("\t");




            if (birdsuite) {
                //sample  sample_index  copy_number  chr  start  end  confidence
                sampleColumn = 0;
                dataColumn = 2;
                chrColumn = 3;
                startColumn = 4;
                endColumn = 5;


            } else {
                sampleColumn = 0;
                chrColumn = 1;
                startColumn = 2;
                endColumn = 3;
                dataColumn = headings.length - 1;
            }


            while ((nextLine = reader.readLine()) != null && (nextLine.trim().length() > 0)) {
                lineNumber++;


                String[] tokens = Globals.tabPattern.split(nextLine, -1);
                int nTokens = tokens.length;
                if (nTokens > 4) {
                    int start;
                    int end;
                    try {
                        start = ParsingUtils.parseInt(tokens[startColumn].trim());
                    } catch (NumberFormatException numberFormatException) {
                        throw new ParserException("Column " + (startColumn + 1) + " must contain a numeric value.",
                                lineNumber, nextLine);
                    }
                    try {
                        end = ParsingUtils.parseInt(tokens[endColumn].trim());
                    } catch (NumberFormatException numberFormatException) {
                        throw new ParserException("Column " + (endColumn + 1) + " must contain a numeric value.",
                                lineNumber, nextLine);
                    }


                    String chr = tokens[chrColumn].trim();
                    if (genome != null) {
                        chr = genome.getChromosomeAlias(chr);
                    }




                    String trackId = new String(tokens[sampleColumn].trim());


                    StringBuffer desc = null;
                    if (birdsuite) {
                        desc = new StringBuffer();
                        desc.append("<br>");
                        desc.append(headings[6]);
                        desc.append("=");
                        desc.append(tokens[6]);
                    } else {
                        if (tokens.length > 4) {
                            desc = new StringBuffer();
                            for (int i = 4; i < headings.length - 1; i++) {
                                desc.append("<br>");
                                desc.append(headings[i]);
                                desc.append(": ");
                                desc.append(tokens[i]);
                            }
                        }
                    }




                    try {
                        float value = Float.parseFloat(tokens[dataColumn]);
                        String description = desc == null ? null : desc.toString();
                        dataset.addSegment(trackId, chr, start, end, value, description);
                    } catch (NumberFormatException numberFormatException) {
                        // log.info("Skipping line: " + nextLine);
                    }
                }
            }


        } catch (DataLoadException pe) {
            throw pe;
        } catch (ParserException pe) {
            throw pe;
        } catch (Exception e) {
            if (nextLine != null && lineNumber != 0) {
                throw new ParserException(e.getMessage(), e, lineNumber, nextLine);
            } else {
                throw new RuntimeException(e);
            }
        } finally {
            if (reader != null) {
                reader.close();
            }
        }


        dataset.sortLists();
        return dataset;
    }




    /**
     * Note:  This is an exact copy of the method in ExpressionFileParser.  Refactor to merge these
     * two parsers, or share a common base class.
     *
     * @param comment
     * @param dataset
     */
    private void parseComment(String comment, SegmentedAsciiDataSet dataset) {


        String tmp = comment.substring(1, comment.length());
        if (tmp.startsWith("track")) {
            ParsingUtils.parseTrackLine(tmp, dataset.getTrackProperties());
        } else {
            String[] tokens = tmp.split("=");


            String key = tokens[0].trim().toLowerCase();
            if (key.equals("type")) {
                if (tokens.length != 2) {
                    return;
                }
                try {
                    dataset.setTrackType(TrackType.valueOf(tokens[1].trim().toUpperCase()));
                } catch (Exception exception) {
                    log.error("Unknown track type: " + tokens[1].trim().toUpperCase());
                }
            }


        }
    }
}
Source Code of org.broad.igv.data.seg.SegmentFileParser

Related Classes of org.broad.igv.data.seg.SegmentFileParser