package picard.illumina;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.ProcessExecutor;
import htsjdk.samtools.util.StringUtil;
import picard.PicardException;
import picard.cmdline.CommandLineProgram;
import picard.cmdline.CommandLineProgramProperties;
import picard.cmdline.Option;
import picard.cmdline.programgroups.Illumina;
import picard.cmdline.StandardOptionDefinitions;
import picard.illumina.parser.IlluminaDataProviderFactory;
import picard.illumina.parser.IlluminaDataType;
import picard.illumina.parser.IlluminaFileUtil;
import picard.illumina.parser.OutputMapping;
import picard.illumina.parser.ParameterizedFileUtil;
import picard.illumina.parser.ReadStructure;
import java.io.File;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
/**
 * Program to check a lane of an Illumina output directory. This program checks that files exist, are non-zero in length, for every tile/cycle and
 * specified data type. If NO data type is specified then the default data types used by IlluminaBasecallsToSam are used.
 *
 * <p>For each requested lane the expected tiles are read from an {@link IlluminaFileUtil}, optionally restricted to
 * {@code TILE_NUMBERS}, and every failure reported by the per-format file utilities is logged and counted.
 */
@CommandLineProgramProperties(
        usage = "Check that the files to provide the data specified by DATA_TYPES are available, exist, and are reasonably sized for every tile/cycle. " +
                "Reasonably sized means non-zero sized for files that exist per tile and equal size for binary files that exist per cycle/per tile. " +
                "CheckIlluminaDirectory DOES NOT check that the individual records in a file are well-formed.",
        usageShort = "Asserts the validity of the data in the specified Illumina basecalling data",
        programGroup = Illumina.class
)
public class CheckIlluminaDirectory extends CommandLineProgram {
    private static final Log log = Log.getInstance(CheckIlluminaDirectory.class);

    // The following attributes define the command-line arguments
    @Option(doc = "The basecalls output directory. ", shortName = "B")
    public File BASECALLS_DIR;

    @Option(doc = "The data types that should be checked for each tile/cycle. If no values are provided then the data types checked are those " +
            "required by IlluminaBaseCallsToSam (which is a superset of those used in ExtractIlluminaBarcodes). These data types vary slightly depending on " +
            "whether or not the run is barcoded so READ_STRUCTURE should be the same as that which will be passed to IlluminaBasecallsToSam. If this option " +
            "is left unspecified then both ExtractIlluminaBarcodes and IlluminaBaseCallsToSam should complete successfully UNLESS the " +
            "individual records of the files themselves are spurious.",
            shortName = "DT", optional = true)
    public final Set<IlluminaDataType> DATA_TYPES = new TreeSet<IlluminaDataType>();

    @Option(doc = ReadStructure.PARAMETER_DOC + " Note: If you want to check whether or not a future IlluminaBasecallsToSam or ExtractIlluminaBarcodes " +
            "run will fail then be sure to use the exact same READ_STRUCTURE that you would pass to these programs for this run.",
            shortName = "RS")
    public String READ_STRUCTURE;

    @Option(doc = "The number of the lane(s) to check. ", shortName = StandardOptionDefinitions.LANE_SHORT_NAME,
            minElements = 1)
    public List<Integer> LANES;

    @Option(doc = "The number(s) of the tile(s) to check. ", shortName = "T", optional = true)
    public List<Integer> TILE_NUMBERS;

    @Option(doc = "A flag to determine whether or not to create fake versions of the missing files.", shortName = "F",
            optional = true)
    public Boolean FAKE_FILES = false;

    @Option(doc = "A flag to create symlinks to the loc file for the X Ten for each tile.", shortName = "X",
            optional = true)
    public Boolean LINK_LOCS = false;

    /**
     * Required main method implementation.
     */
    public static void main(final String[] argv) {
        new CheckIlluminaDirectory().instanceMainWithExit(argv);
    }

    /**
     * Checks every lane in {@code LANES} and logs a per-lane and overall summary.
     *
     * @return 0 when no failures were found, otherwise the total number of failures across all lanes
     */
    @Override
    protected int doWork() {
        final ReadStructure readStructure = new ReadStructure(READ_STRUCTURE);
        if (DATA_TYPES.isEmpty()) {
            DATA_TYPES.addAll(Arrays.asList(IlluminaBasecallsConverter.DATA_TYPES_NO_BARCODE));
        }

        // The flags are Boolean wrappers; treat a null value as "false" instead of failing on auto-unboxing.
        final boolean linkLocs = Boolean.TRUE.equals(LINK_LOCS);
        final boolean fakeFiles = Boolean.TRUE.equals(FAKE_FILES);

        final List<Integer> failingLanes = new ArrayList<Integer>();
        int totalFailures = 0;

        final int[] expectedCycles = new OutputMapping(readStructure).getOutputCycles();
        log.info("Checking lanes(" + StringUtil.join(",", LANES) + ") in basecalls directory (" + BASECALLS_DIR
                .getAbsolutePath() + ")\n");
        log.info("Expected cycles: " + StringUtil.intValuesToString(expectedCycles));

        for (final Integer lane : LANES) {
            IlluminaFileUtil fileUtil = new IlluminaFileUtil(BASECALLS_DIR, lane);
            final List<Integer> expectedTiles = fileUtil.getExpectedTiles();

            // TILE_NUMBERS is declared without an initializer, so guard against null as well as empty.
            if (TILE_NUMBERS != null && !TILE_NUMBERS.isEmpty()) {
                expectedTiles.retainAll(TILE_NUMBERS);
            }

            if (linkLocs) {
                createLocFileSymlinks(fileUtil, lane);
                //we need to create a new file util because it stores a cache to the files it found on
                //construction and this doesn't include the recently created symlinks
                fileUtil = new IlluminaFileUtil(BASECALLS_DIR, lane);
            }

            log.info("Checking lane " + lane);
            log.info("Expected tiles: " + StringUtil.join(", ", expectedTiles));

            final int numFailures = verifyLane(fileUtil, expectedTiles, expectedCycles, DATA_TYPES, fakeFiles);

            if (numFailures > 0) {
                log.info("Lane " + lane + " FAILED " + " Total Errors: " + numFailures);
                failingLanes.add(lane);
                totalFailures += numFailures;
            } else {
                log.info("Lane " + lane + " SUCCEEDED ");
            }
        }

        int status = 0;
        if (totalFailures == 0) {
            log.info("SUCCEEDED!  All required files are present and non-empty.");
        } else {
            // NOTE(review): the raw failure count is returned as the status. If this value ends up as a
            // process exit code it may be truncated by the OS for large counts -- confirm with callers.
            status = totalFailures;
            log.info("FAILED! There were " + totalFailures + " failures in the following lanes: " + StringUtil
                    .join(", ", failingLanes));
        }

        return status;
    }

    /**
     * Creates, for every expected tile of the lane, a symlink named {@code s_<lane>_<tile>.locs} inside the lane
     * directory next to the basecalls directory, each pointing at the shared {@code s.locs} file located in the
     * parent of {@code BASECALLS_DIR}. The lane directory is created if it does not exist yet.
     *
     * @param fileUtil file util used to obtain the expected tiles for the lane
     * @param lane     the lane number the symlinks are created for
     * @throws PicardException if {@code s.locs} does not exist, the lane directory cannot be created, or the
     *                         {@code ln} command exits with a non-zero status
     */
    private void createLocFileSymlinks(final IlluminaFileUtil fileUtil, final int lane) {
        // Resolve to an absolute file first: getParentFile() on a relative, single-component path
        // (e.g. "BaseCalls") returns null and would otherwise cause a NullPointerException.
        final File intensitiesDir = BASECALLS_DIR.getAbsoluteFile().getParentFile();
        if (intensitiesDir == null) {
            throw new PicardException(String.format("Basecalls directory %s has no parent directory.",
                    BASECALLS_DIR.getAbsolutePath()));
        }

        final File baseFile = new File(intensitiesDir.getAbsolutePath() + File.separator + "s.locs");
        final File newFileBase = new File(baseFile.getParent() + File.separator + IlluminaFileUtil
                .longLaneStr(lane) + File.separator);

        if (!baseFile.exists()) {
            throw new PicardException(String.format("Locations file %s does not exist.", baseFile.getAbsolutePath()));
        }

        if (!newFileBase.exists() && !newFileBase.mkdirs()) {
            throw new PicardException(String.format("Could not create lane directory: %s.", newFileBase.getAbsolutePath()));
        }

        for (final Integer tile : fileUtil.getExpectedTiles()) {
            final String newName =
                    newFileBase + File.separator + String.format("s_%d_%d.locs", lane, tile);
            // "-f" replaces an existing link of the same name, "-s" makes the link symbolic.
            final ProcessExecutor.ExitStatusAndOutput output =
                    ProcessExecutor.executeAndReturnInterleavedOutput(new String[]{"ln", "-fs", baseFile.getAbsolutePath(), newName});
            if (output.exitStatus != 0) {
                throw new PicardException("Could not create symlink: " + output.stdout);
            }
        }
    }

    /**
     * Use fileUtil to find the data types that would be used by IlluminaDataProvider.  Verify that for the expected
     * tiles/cycles/data types that all the files needed to provide their data is present.  This method logs every
     * error that is found (excluding file faking errors) and returns the number of errors found
     *
     * @param fileUtil      A file util parameterized with the directory/lane to check
     * @param expectedTiles The tiles we expect to be available/well-formed
     * @param cycles        The cycles we expect to be available/well-formed
     * @param dataTypes     The data types we expect to be available/well-formed
     * @param fakeFiles     When true, fake files are requested for data types without an available format and
     *                      for formats whose verification reported failures; the failures are still counted
     * @return The number of errors found/logged for this directory/lane
     * @throws PicardException if no tiles or no cycles were supplied
     */
    private static int verifyLane(final IlluminaFileUtil fileUtil, final List<Integer> expectedTiles,
                                  final int[] cycles,
                                  final Set<IlluminaDataType> dataTypes, final boolean fakeFiles) {
        if (expectedTiles.isEmpty()) {
            throw new PicardException(
                    "0 input tiles were specified!  Check to make sure this lane is in the InterOp file!");
        }

        if (cycles.length == 0) {
            throw new PicardException("0 output cycles were specified!");
        }

        int numFailures = 0;

        //find what request IlluminaDataTypes we have files for and select the most preferred file format available for that type
        final Map<IlluminaFileUtil.SupportedIlluminaFormat, Set<IlluminaDataType>> formatToDataTypes =
                IlluminaDataProviderFactory.determineFormats(dataTypes, fileUtil);

        //find if we have any IlluminaDataType with NO available file formats and, if any exist, increase the error count
        final Set<IlluminaDataType> unmatchedDataTypes =
                IlluminaDataProviderFactory.findUnmatchedTypes(dataTypes, formatToDataTypes);

        if (!unmatchedDataTypes.isEmpty()) {
            if (fakeFiles) {
                // Fake the preferred format for each data type that has no files at all.
                for (final IlluminaDataType dataType : unmatchedDataTypes) {
                    final IlluminaFileUtil.SupportedIlluminaFormat format =
                            IlluminaDataProviderFactory.findPreferredFormat(dataType, fileUtil);
                    fileUtil.getUtil(format).fakeFiles(expectedTiles, cycles, format);
                }
            }
            log.info("Could not find a format with available files for the following data types: " + StringUtil
                    .join(", ", new ArrayList<IlluminaDataType>(unmatchedDataTypes)));
            numFailures += unmatchedDataTypes.size();
        }

        for (final IlluminaFileUtil.SupportedIlluminaFormat format : formatToDataTypes.keySet()) {
            final ParameterizedFileUtil util = fileUtil.getUtil(format);
            final List<String> failures = util.verify(expectedTiles, cycles);

            //if we have failures and we want to fake files then fake them now.
            if (!failures.isEmpty() && fakeFiles) {
                util.fakeFiles(expectedTiles, cycles, format);
            }

            numFailures += failures.size();
            for (final String failure : failures) {
                log.info(failure);
            }
        }

        return numFailures;
    }

    /**
     * Asserts that {@code BASECALLS_DIR} is a readable directory and that every lane number is at least 1.
     *
     * @return null when validation passes, otherwise an array holding the validation error message
     */
    @Override
    protected String[] customCommandLineValidation() {
        IOUtil.assertDirectoryIsReadable(BASECALLS_DIR);
        final List<String> errors = new ArrayList<String>();

        for (final Integer lane : LANES) {
            if (lane < 1) {
                // One message listing all lanes is enough; stop at the first offending value.
                errors.add(
                        "LANES must be greater than or equal to 1.  LANES passed in " + StringUtil.join(", ", LANES));
                break;
            }
        }

        if (errors.isEmpty()) {
            return null;
        } else {
            return errors.toArray(new String[errors.size()]);
        }
    }
}