What is the best way to read a DNA sequence?

Question

What is the best way to read a DNA sequence?

I wrote code in python to read the DNA sequence (in order to align motives on them later), however I am looking for a more efficient way to do this.

See below if you can help:

handle = open("a.fas.txt", "r")
a = handle.readlines()[1:]
a = ''.join([x.strip() for x in a])
with open("Output.txt", "w") as text_file:
    text_file.write(a)

f = 0
z = 100
b = ''
while f < len(a):
    b += a[f:z]+'\n'
    f += 1
    z += 1
with open("2.txt", "w") as runner_mtfs:
   runner_mtfs.write(b)

So, I want to do a bunch of analysis on every row b, but I don’t know a more efficient way to do this, and not split every 100 base pairs. The output file is more than 500 megabytes. Any suggestions?

The first part of the code is just a DNA sequence, I put all the lines together and I split 100 base pairs.

+4

python text

Anthaeus Mar 11 '16 at 3:57

source share

3 answers

cge · Answer 1 · 2016-03-11T04:33:29+0000

, , , . . , , , .

, , , , a[x:x+100] x. : . .

, numpy. , - , uint8s, 0,1,2,3 A, C, G, T. , , .

Numpy stride_tricks, :

def rolling_window(a, window):
    shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
    strides = a.strides + (a.strides[-1],)
    return numpy.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)
handle = open("U00096.2.fas.txt", "r")
a = handle.readlines()[1:]
a = ''.join([x.strip() for x in a])
b = numpy.array([x for x in a], dtype=numpy.character)
rolling_window(b,100)

, ints:

def rolling_window(a, window):
    shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
    strides = a.strides + (a.strides[-1],)
    return numpy.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)
handle = open("U00096.2.fas.txt", "r")
a = handle.readlines()[1:]
a = ''.join([x.strip() for x in a])
conv = {'a': 0, 'c': 1, 'g': 2, 't': 3}
b = numpy.array([conv[x] for x in a], dtype=numpy.uint8)
rolling_window(b,100)

, .

merletta · Answer 2 · 2016-03-11T04:53:54+0000

.fasta, , 1 .
python stackoverflow, . , (with open(...) file). .

, . :

def load_fasta(fasta_file_name, sliding_window_size = 100):
  buffer = ''
  with open(fasta_file_name) as f:
    for line in f:
      if line.startswith('>'):
        #skip or get some info from comment line
        buffer = ''
      else:
        #read next line
        buffer += line.strip('\r\n')
        offset = 0 # zero-based offset for current string
        while (offset + sliding_window_size <= len(buffer)):
          next_sliding_window = buffer[offset : offset + sliding_window_size]
          yield(next_sliding_window)
          offset += 1
        buffer = buffer[offset : ]

for str in load_fasta("a.fas.txt", 100):
  #do some processing with sliding window data
  print(str)

100 ( sliding window size), ( ).

biopython.

Austin hastings · Answer 3 · 2016-03-11T05:21:23+0000

Here's a class that does a few things you might want.

"""
Read in genome of E. Coli (or whatever) from given input file,
process it in segments of 100 basepairs at a time.

Usage: 100pairs [-n <pairs>] [-p] <file>

<file>                 Input file.
-n,--numpairs <pairs>  Use <pairs> per iteration. [default: 100]
-p,--partial           Allow partial sequences at end of genome.
"""
import docopt

class GeneBuffer:
    def __init__(self, path, bases=100, partial=True):
        self._buf = None
        self.bases = int(bases)
        self.partial = partial
        self.path = path

    def __enter__(self):
        self._file = open(self.path, 'r')
        self._header = next(self._file)
        return self

    def __exit__(self, *args):
        if self._file:
            self._file.close()

    def __iter__(self):
        return self

    def __next__(self):
        if self._buf is None:
            self._buf = ''

        while self._file and len(self._buf) < self.bases:
            try:
                self._buf += next(self._file).strip()
            except StopIteration:
                self._file.close()
                self._file = None
                break

        if len(self._buf) < self.bases:
            if len(self._buf) == 0 or not self.partial:
                raise StopIteration

        result = self._buf[:self.bases]
        self._buf = self._buf[1:]

        return result

def analyze(basepairs):
    """
    Dammit, Jim! I'm a computer programmer, not a geneticist!
    """
    print(basepairs)

def main(args):
    numpairs = args['--numpairs']
    partial = args['--partial']
    with GeneBuffer(args['<file>'], bases=numpairs, partial=partial) as genome:
        print("Header: ", genome._header)
        count = 0
        for basepairs in genome:
            count += 1
            print(count, end=' ')
            analyze(basepairs)

if __name__ == '__main__':
    args = docopt.docopt(__doc__)
    main(args)

What is the best way to read a DNA sequence?

More articles: