Package edu.msu.cme.rdp.classifier.train

Source Code of edu.msu.cme.rdp.classifier.train.LineageSequenceParser

/*
* LineageSequenceParser.java
*
* Copyright 2006 Michigan State University Board of Trustees
*
* Created on June 24, 2002, 2:13 PM
*/
package edu.msu.cme.rdp.classifier.train;

import edu.msu.cme.rdp.readseq.readers.Sequence;
import edu.msu.cme.rdp.readseq.readers.SequenceReader;
import java.util.*;
import java.io.*;

/**
* A parser to parse a reader containing the raw sequences.
* @author  wangqion
* @version
*/
public class LineageSequenceParser {

    private SequenceReader seqReader;
    public static final String delimiter = ";";
    private LineageSequence onDeck;
    private LineageSequence curSeq = null;

    /** Creates new RawSequenceParser to parse the input fasta file. */
    public LineageSequenceParser(File inFile) throws IOException {
        seqReader = new SequenceReader(inFile);
    }
   
    /** Creates new RawSequenceParser to parse the input fasta file. */
    public LineageSequenceParser(InputStream is) throws IOException {
        seqReader = new SequenceReader(is);
    }

    /**
     * Closes the reader.
     */
    public void close() throws IOException {
        seqReader.close();
    }

    /** Returns true if there is a parsed sequence available.
     */
    public boolean hasNext() throws IOException {
        if (onDeck != null) {
            return true;
        }
        if ((onDeck = getNextElement()) != null) {
            return true;
        }
        return false;
    }

    /** Returns the next parsed sequence. */
    public LineageSequence next() throws NoSuchElementException, IOException {
        LineageSequence tmp;
        if (onDeck != null) {
            tmp = onDeck;
            onDeck = null;
        } else {
            tmp = getNextElement();
        }
        if (tmp == null) {
            throw new NoSuchElementException();
        }

        return tmp;
    }

    /** Reads from the input stream and returns a parsed sequence.
     * Header format: seqID followed by a tab, followed by a list of ancestor nodes
     * Reads one line for the header and decompose the header into
     * a list of ancestors. Then reads the following lines for the sequence string
     * and modifies the sequence string.
     */
    private LineageSequence getNextElement() throws IOException {
        LineageSequence nextSeq = null;

        String seqstring = "";
        boolean endoffile = true;
        boolean origin = false;

        Sequence seq = seqReader.readNextSequence();
        if (seq == null) {
            return null;
        }

        curSeq = new LineageSequence(seq.getSeqName(), decomposeHeader(seq.getDesc()), modifySequence(seq.getSeqString()));

        LineageSequence retval = curSeq;
        curSeq = nextSeq;

        return retval;
    }

    /** Takes two different formats:
     * the old format is a string of sequence header( ancestors seperated by delimiter, such as ";" in our case).
     * new format: the taxid of the immediate parent taxon
     * It returns an array of ancestors with root ancestor first.
     */
    private List<String> decomposeHeader(String s) {
        List<String> al = new ArrayList();

        String[] values = s.split(delimiter);

        for (int i = 0; i < values.length; i++) {
            al.add(values[i].trim());
        }
        return al;
    }

    /** Modifies the sequence. Removes - and ~. It returns a string.
     */
    private String modifySequence(String s) {
        try {
            StringReader in = new StringReader(s);
            StringWriter out = new StringWriter();
            int c;
            while ((c = in.read()) != -1) {
                if (c == '-' || c == '~') {
                    continue;
                }
                out.write(c);
            }
            in.close();
            out.close();
            return out.toString();
        } catch (IOException e) {
            System.out.println("In StringReader or StringWriter exception : " + e.getMessage());
        }
        return null;
    }
}
TOP

Related Classes of edu.msu.cme.rdp.classifier.train.LineageSequenceParser

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., now owned by Oracle Inc. Contact coftware#gmail.com.