Package au.org.intersect.samifier.parser

Source Code of au.org.intersect.samifier.parser.GenomeParserImpl

package au.org.intersect.samifier.parser;

import au.org.intersect.samifier.domain.GeneInfo;
import au.org.intersect.samifier.domain.GeneSequence;
import au.org.intersect.samifier.domain.Genome;
import au.org.intersect.samifier.domain.VirtualProtein;

import org.apache.log4j.Logger;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class GenomeParserImpl implements GenomeParser {
    private static Logger LOG = Logger.getLogger(GenomeParserImpl.class);
    public static final Pattern GENE_RE = Pattern.compile("^(gene|gene_cassette|pseudogene|transposable_element_gene)$");
    private static final String STRAND_FORWARD = "+";
    private static final Pattern STRAND_RE = Pattern.compile("^([" + STRAND_FORWARD + "]|[-])$");
    public static final String CODING_SEQUENCE = "CDS";
    public static final String INTRON = "intron";
    public static final Pattern SEQUENCE_RE = Pattern.compile("(" + CODING_SEQUENCE + "|" + INTRON + ")");
    //private static final Pattern ID_ATTRIBUTE_RE = Pattern.compile(".*Name=([^_;]+).*");
    private static final Pattern ID_ATTRIBUTE_RE = Pattern.compile(".*Name=([^;]+).*");
    private static final Pattern PARENT_ATTRIBUTE_RE = Pattern.compile(".*Parent=([^;]+).*");
    //private static final Pattern PARENT_ATTRIBUTE_RE = Pattern.compile(".*Parent=([^_;]+).*");
    private static final Pattern VIRTUAL_PROTEIN_ATTRIBUTE_RE = Pattern.compile(".*Virtual_protein=([^;]+).*");
    private static final Pattern BRACKETS = Pattern.compile("\\((.*?)\\)");
   
    private String genomeFileName;
    private int lineNumber = 0;
    private String line;

    public GenomeParserImpl() {

    }

    public Genome parseGenomeFile(File genomeFile)
            throws GenomeFileParsingException {
        try {
            genomeFileName = genomeFile.getAbsolutePath();
            return doParsing(genomeFile);
        } catch (IOException e) {
            throw new GenomeFileParsingException(e.getMessage());
        }

    }

    private Genome doParsing(File genomeFile) throws IOException,
            GenomeFileParsingException {
        Genome genome = new Genome();

        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(genomeFile));

            while ((line = reader.readLine()) != null) {
                lineNumber++;
                if (line.startsWith("##")){
                    continue;
                }
                // chromosome, source, type, start, stop, score, strand, phase,
                // attributes
                String[] parts = line.split("\\t", 9);
                if (parts.length < 9) {
                    // throw new
                    // GenomeFileParsingException("Line "+lineNumber+": not in expected format");
                    LOG.warn("Line " + lineNumber + ": not in expected format");
                    continue;
                }
                String type = parts[2];
                if (type == null) {
                    continue;
                }
                if (GENE_RE.matcher(type).matches()) {
                    GeneInfo gene = parseGene(parts);
                    processGene(genome, gene);
                } else if (SEQUENCE_RE.matcher(type).find()) {
                    GeneSequence sequence = parseSequence(parts);
                    processSequence(genome, parts[CHROMOSOME_PART], sequence);
                }
            }
        } finally {
            if (reader != null) {
                reader.close();
            }
        }
        genome.verify();
        return genome;
    }

    private void throwParsingException(String message)
            throws GenomeFileParsingException {
        throw new GenomeFileParsingException("Error in " + genomeFileName + ":"
                + lineNumber + " " + line + "\n > " + message);
    }

    private int parseStrand(String direction) throws GenomeFileParsingException {
        if (!STRAND_RE.matcher(direction).matches()) {
            throwParsingException("Invalid strand " + direction);
        }
        return STRAND_FORWARD.equals(direction) ? 1 : -1;
    }

    private boolean parseSequenceType(String type)
            throws GenomeFileParsingException {
        return CODING_SEQUENCE.equals(type);
    }

    protected GeneInfo parseGene(String[] parts)
            throws GenomeFileParsingException {
        String chromosome = parts[CHROMOSOME_PART];
        int start = Integer.parseInt(parts[START_PART]);
        int stop = Integer.parseInt(parts[STOP_PART]);
        String direction = parts[STRAND_PART];
        if (start > stop) {
            throwParsingException("Start-stop invalid");
        }
        //parse virtual genes
       
        return new GeneInfo(chromosome, extractId(parts[ATTRIBUTES_PART]), start, stop, parseStrand(direction), parseVirtualProteins(extractId(parts[ATTRIBUTES_PART]), parts[ATTRIBUTES_PART]));
    }

    private List<VirtualProtein> parseVirtualProteins(String geneID, String attributes) {
        List<VirtualProtein> virtualProteins = new ArrayList<VirtualProtein>();
        Matcher m = VIRTUAL_PROTEIN_ATTRIBUTE_RE.matcher(attributes);
        if (!(m.matches())) {
            return virtualProteins;
        }
        String vProteinsString =  m.group(1);
        String[] vProteinArray = vProteinsString.split(",");
        for (String vp : vProteinArray) {
            Matcher matcher = BRACKETS.matcher(vp);
            if (matcher.find()) {
                String[] offsets = matcher.group(1).split("-");
                int startOffset =  Integer.parseInt(offsets[0]);
                int endOffset =  Integer.parseInt(offsets[1]);
                String name = vp.replaceAll(BRACKETS.pattern(), "");
                virtualProteins.add(new VirtualProtein(name, startOffset, endOffset, geneID));
            }
        }
        return virtualProteins;
    }

    protected GeneSequence parseSequence(String[] parts)
            throws GenomeFileParsingException {
        String type = parts[TYPE_PART];
        int start = Integer.parseInt(parts[START_PART]);
        int stop = Integer.parseInt(parts[STOP_PART]);
        String direction = parts[STRAND_PART];
        if (start > stop) {
            throwParsingException("Start-stop invalid");
        }
        return new GeneSequence(extractParent(parts[ATTRIBUTES_PART]), parseSequenceType(type), start, stop, parseStrand(direction), parseVirtualProteins(extractParent(parts[ATTRIBUTES_PART]), parts[ATTRIBUTES_PART]));
    }

    private void processGene(Genome genome, GeneInfo gene) {
        genome.addGene(gene);
    }

    private void processSequence(Genome genome, String chromosome,
            GeneSequence sequence) throws GenomeFileParsingException {
        if (!genome.hasGene(sequence.getParentId())) {
            GeneInfo gene = new GeneInfo(chromosome, sequence.getParentId(),
                    sequence.getStart(), sequence.getStop(),
                    sequence.getDirection(), sequence.getVirtualProteins());
            genome.addGene(gene);
        }
        GeneInfo gene = genome.getGene(sequence.getParentId());
        if (gene.getDirection() != sequence.getDirection()) {
            throwParsingException("A sequence in gene " + gene.getId() + " has inconsistent direction");
        }
        if (gene.getStart() > sequence.getStart()) {
            throwParsingException("Start of sequence in gene " + gene.getId() + " overflows gene");
        }
        if (gene.getStop() < sequence.getStop()) {
            throwParsingException("Stop of sequence in gene " + gene.getId() + " overflows gene");
        }
        genome.getGene(sequence.getParentId()).addLocation(sequence);
    }

    private String extractId(String attributes)
            throws GenomeFileParsingException {
        Matcher m = ID_ATTRIBUTE_RE.matcher(attributes);
        if (m.matches()) {
            return m.group(1);
        }
        throwParsingException("Attribute ID not found");
        return null; // make compiler happy
    }

    private String extractParent(String attributes)
            throws GenomeFileParsingException {
        Matcher m = PARENT_ATTRIBUTE_RE.matcher(attributes);
        if (m.matches()) {
            return m.group(1).replace("_mRNA", "");
        }
        throwParsingException("Attribute Parent not found");
        return null; // make compiler happy
    }
}
TOP

Related Classes of au.org.intersect.samifier.parser.GenomeParserImpl

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.