Package org.broadinstitute.gatk.utils.text

Source Code of org.broadinstitute.gatk.utils.text.ListFileUtils$StringConverter

/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

package org.broadinstitute.gatk.utils.text;

import org.broadinstitute.gatk.utils.commandline.ParsingEngine;
import org.broadinstitute.gatk.utils.commandline.RodBinding;
import org.broadinstitute.gatk.utils.commandline.Tags;
import org.broadinstitute.gatk.engine.datasources.reads.SAMReaderID;
import org.broadinstitute.gatk.engine.refdata.tracks.FeatureManager;
import org.broadinstitute.gatk.engine.refdata.utils.RMDTriplet;
import org.broadinstitute.gatk.utils.exceptions.UserException;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.*;
import java.util.regex.Pattern;

/**
* A collection of convenience methods for working with list files.
*/
public class ListFileUtils {
    /**
     * Lines starting with this String in .list files are considered comments.
     */
    public static final String LIST_FILE_COMMENT_START = "#";       

    /**
     * Unpack the bam files to be processed, given a list of files.  That list of files can
     * itself contain entries which are lists of other files to be read (note: you cannot have lists
     * of lists of lists). Lines in .list files containing only whitespace or which begin with
     * LIST_FILE_COMMENT_START are ignored.
     *
     * @param samFiles The sam files, in string format.
     * @param parser Parser
     * @return a flattened list of the bam files provided
     */
    public static List<SAMReaderID> unpackBAMFileList(final List<String> samFiles, final ParsingEngine parser) {
        List<SAMReaderID> unpackedReads = new ArrayList<SAMReaderID>();
        for( String inputFileName: samFiles ) {
            Tags inputFileNameTags = parser.getTags(inputFileName);
            inputFileName = expandFileName(inputFileName);
            if (inputFileName.toLowerCase().endsWith(".list") ) {
                try {
                    for ( String fileName : new XReadLines(new File(inputFileName), true, LIST_FILE_COMMENT_START) ) {
                        unpackedReads.add(new SAMReaderID(fileName,parser.getTags(inputFileName)));
                    }
                }
                catch( FileNotFoundException ex ) {
                    throw new UserException.CouldNotReadInputFile(new File(inputFileName), "Unable to find file while unpacking reads", ex);
                }
            }
            else if(inputFileName.toLowerCase().endsWith(".bam")) {
                unpackedReads.add(new SAMReaderID(inputFileName,inputFileNameTags));
            }
            else if(inputFileName.endsWith("stdin")) {
                unpackedReads.add(new SAMReaderID(inputFileName,inputFileNameTags));
            }
            else {
                throw new UserException.CommandLineException(String.format("The GATK reads argument (-I, --input_file) supports only BAM files with the .bam extension and lists of BAM files " +
                        "with the .list extension, but the file %s has neither extension.  Please ensure that your BAM file or list " +
                        "of BAM files is in the correct format, update the extension, and try again.",inputFileName));
            }
        }
        return unpackedReads;
    }

    /**
     * Convert command-line argument representation of ROD bindings to something more easily understandable by the engine.
     * @param RODBindings a text equivale
     * @param parser Parser
     * @return a list of expanded, bound RODs.
     */
    @Deprecated
    @SuppressWarnings("unused") // TODO: Who is still using this? External walkers?
    public static Collection<RMDTriplet> unpackRODBindingsOldStyle(final Collection<String> RODBindings, final ParsingEngine parser) {
        // todo -- this is a strange home for this code.  Move into ROD system
        Collection<RMDTriplet> rodBindings = new ArrayList<RMDTriplet>();

        for (String fileName: RODBindings) {
            final Tags tags = parser.getTags(fileName);
            fileName = expandFileName(fileName);

            List<String> positionalTags = tags.getPositionalTags();
            if(positionalTags.size() != 2)
                throw new UserException("Invalid syntax for -B (reference-ordered data) input flag.  " +
                        "Please use the following syntax when providing reference-ordered " +
                        "data: -B:<name>,<type> <filename>.");
            // Assume that if tags are present, those tags are name and type.
            // Name is always first, followed by type.
            String name = positionalTags.get(0);
            String type = positionalTags.get(1);

            RMDTriplet.RMDStorageType storageType;
            if(tags.getValue("storage") != null)
                storageType = Enum.valueOf(RMDTriplet.RMDStorageType.class,tags.getValue("storage"));
            else if(fileName.toLowerCase().endsWith("stdin"))
                storageType = RMDTriplet.RMDStorageType.STREAM;
            else
                storageType = RMDTriplet.RMDStorageType.FILE;

            rodBindings.add(new RMDTriplet(name,type,fileName,storageType,tags));
        }

        return rodBindings;
    }

    /**
     * Convert command-line argument representation of ROD bindings to something more easily understandable by the engine.
     * @param RODBindings a text equivale
     * @param parser Parser
     * @return a list of expanded, bound RODs.
     */
    @SuppressWarnings("unchecked")
    public static Collection<RMDTriplet> unpackRODBindings(final Collection<RodBinding> RODBindings, @SuppressWarnings("unused") final ParsingEngine parser) {
        // todo -- this is a strange home for this code.  Move into ROD system
        Collection<RMDTriplet> rodBindings = new ArrayList<RMDTriplet>();
        FeatureManager builderForValidation = new FeatureManager();

        for (RodBinding rodBinding: RODBindings) {
            String argValue = rodBinding.getSource();
            String fileName = expandFileName(argValue);
            String name = rodBinding.getName();
            String type = rodBinding.getTribbleType();

            RMDTriplet.RMDStorageType storageType;
            if(rodBinding.getTags().getValue("storage") != null)
                storageType = Enum.valueOf(RMDTriplet.RMDStorageType.class,rodBinding.getTags().getValue("storage"));
            else if(fileName.toLowerCase().endsWith("stdin"))
                storageType = RMDTriplet.RMDStorageType.STREAM;
            else
                storageType = RMDTriplet.RMDStorageType.FILE;

            RMDTriplet triplet = new RMDTriplet(name,type,fileName,storageType,rodBinding.getTags());

            // validate triplet type
            FeatureManager.FeatureDescriptor descriptor = builderForValidation.getByTriplet(triplet);
            if ( descriptor == null )
                throw new UserException.UnknownTribbleType(rodBinding.getTribbleType(),
                        String.format("Field %s had provided type %s but there's no such Tribble type.  The compatible types are: %n%s",
                                rodBinding.getName(), rodBinding.getTribbleType(), builderForValidation.userFriendlyListOfAvailableFeatures(rodBinding.getType())));
            if ( ! rodBinding.getType().isAssignableFrom(descriptor.getFeatureClass()) )
                throw new UserException.BadArgumentValue(rodBinding.getName(),
                        String.format("Field %s expects Features of type %s, but the input file produces Features of type %s. The compatible types are: %n%s",
                                rodBinding.getName(), rodBinding.getType().getSimpleName(), descriptor.getSimpleFeatureName(),
                                builderForValidation.userFriendlyListOfAvailableFeatures(rodBinding.getType())));


            rodBindings.add(triplet);
        }

        return rodBindings;
    }

    /**
     * Expand any special characters that appear in the filename.  Right now, '-' is expanded to
     * '/dev/stdin' only, but in the future, special characters like '~' and '*' that are passed
     * directly to the command line in some circumstances could be expanded as well.  Be careful
     * when adding UNIX-isms.
     * @param argument the text appearing on the command-line.
     * @return An expanded string suitable for opening by Java/UNIX file handling utilities.
     */
    private static String expandFileName(String argument) {
        if(argument.trim().equals("-"))
            return "/dev/stdin";
        return argument;
    }

    /**
     * Returns a new set of values, containing a final set of values expanded from values
     * <p/>
     * Each element E of values can either be a literal string or a file ending in .list.
     * For each E ending in .list we try to read a file named E from disk, and if possible
     * all lines from that file are expanded into unique values.
     *
     * @param values Original values
     * @return entries from values or the files listed in values
     */
    public static Set<String> unpackSet(Collection<String> values) {
        if (values == null)
            throw new NullPointerException("values cannot be null");
        Set<String> unpackedValues = new LinkedHashSet<String>();
        // Let's first go through the list and see if we were given any files.
        // We'll add every entry in the file to our set, and treat the entries as
        // if they had been specified on the command line.
        for (String value : values) {
            File file = new File(value);
            if (value.toLowerCase().endsWith(".list") && file.exists()) {
                try {
                    unpackedValues.addAll(new XReadLines(file, true, LIST_FILE_COMMENT_START).readLines());
                } catch (IOException e) {
                    throw new UserException.CouldNotReadInputFile(file, e);
                }
            } else {
                unpackedValues.add(value);
            }
        }
        return unpackedValues;
    }

    /**
     * Returns a new set of values including only values listed by filters
     * <p/>
     * Each element E of values can either be a literal string or a file.  For each E,
     * we try to read a file named E from disk, and if possible all lines from that file are expanded
     * into unique names.
     * <p/>
     * Filters may also be a file of filters.
     *
     * @param values     Values or files with values
     * @param filters    Filters or files with filters
     * @param exactMatch If true match filters exactly, otherwise use as both exact and regular expressions
     * @return entries from values or the files listed in values, filtered by filters
     */
    public static Set<String> includeMatching(Collection<String> values, Collection<String> filters, boolean exactMatch) {
        return includeMatching(values, IDENTITY_STRING_CONVERTER, filters, exactMatch);
    }

    /**
     * Converts a type T to a String representation.
     *
     * @param <T> Type to convert to a String.
     */
    public static interface StringConverter<T> {
        String convert(T value);
    }

    /**
     * Returns a new set of values including only values matching filters
     * <p/>
     * Filters may also be a file of filters.
     * <p/>
     * The converter should convert T to a unique String for each value in the set.
     *
     * @param values     Values or files with values
     * @param converter  Converts values to strings
     * @param filters    Filters or files with filters
     * @param exactMatch If true match filters exactly, otherwise use as both exact and regular expressions
     * @return entries from values including only values matching filters
     */
    public static <T> Set<T> includeMatching(Collection<T> values, StringConverter<T> converter, Collection<String> filters, boolean exactMatch) {
        if (values == null)
            throw new NullPointerException("values cannot be null");
        if (converter == null)
            throw new NullPointerException("converter cannot be null");
        if (filters == null)
            throw new NullPointerException("filters cannot be null");

        Set<String> unpackedFilters = unpackSet(filters);
        Set<T> filteredValues = new LinkedHashSet<T>();
        Collection<Pattern> patterns = null;
        if (!exactMatch)
            patterns = compilePatterns(unpackedFilters);
        for (T value : values) {
            String converted = converter.convert(value);
            if (unpackedFilters.contains(converted)) {
                filteredValues.add(value);
            } else if (!exactMatch) {
                for (Pattern pattern : patterns)
                    if (pattern.matcher(converted).find())
                        filteredValues.add(value);
            }
        }
        return filteredValues;
    }
   
    /**
     * Returns a new set of values excluding any values matching filters.
     * <p/>
     * Filters may also be a file of filters.
     * <p/>
     * The converter should convert T to a unique String for each value in the set.
     *
     * @param values     Values or files with values
     * @param converter  Converts values to strings
     * @param filters    Filters or files with filters
     * @param exactMatch If true match filters exactly, otherwise use as both exact and regular expressions
     * @return entries from values exluding any values matching filters
     */
    public static <T> Set<T> excludeMatching(Collection<T> values, StringConverter<T> converter, Collection<String> filters, boolean exactMatch) {
        if (values == null)
            throw new NullPointerException("values cannot be null");
        if (converter == null)
            throw new NullPointerException("converter cannot be null");
        if (filters == null)
            throw new NullPointerException("filters cannot be null");

        Set<String> unpackedFilters = unpackSet(filters);
        Set<T> filteredValues = new LinkedHashSet<T>();
        filteredValues.addAll(values);
        Collection<Pattern> patterns = null;
        if (!exactMatch)
            patterns = compilePatterns(unpackedFilters);
        for (T value : values) {
            String converted = converter.convert(value);
            if (unpackedFilters.contains(converted)) {
                filteredValues.remove(value);
            } else if (!exactMatch) {
                for (Pattern pattern : patterns)
                    if (pattern.matcher(converted).find())
                        filteredValues.remove(value);
            }
        }
        return filteredValues;
    }

    private static Collection<Pattern> compilePatterns(Collection<String> filters) {
        Collection<Pattern> patterns = new ArrayList<Pattern>();
        for (String filter: filters) {
            patterns.add(Pattern.compile(filter));
        }
        return patterns;
    }

    protected static final StringConverter<String> IDENTITY_STRING_CONVERTER = new StringConverter<String>() {
        @Override
        public String convert(String value) {
            return value;
        }
    };
}
TOP

Related Classes of org.broadinstitute.gatk.utils.text.ListFileUtils$StringConverter

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.