What is the best way to read a DNA sequence?

I wrote code in python to read the DNA sequence (in order to align motives on them later), however I am looking for a more efficient way to do this.

See below if you can help:

handle = open("a.fas.txt", "r")
a = handle.readlines()[1:]
a = ''.join([x.strip() for x in a])
with open("Output.txt", "w") as text_file:
    text_file.write(a)

f = 0
z = 100
b = ''
while f < len(a):
    b += a[f:z]+'\n'
    f += 1
    z += 1
with open("2.txt", "w") as runner_mtfs:
   runner_mtfs.write(b)

So, I want to do a bunch of analysis on every row b, but I don’t know a more efficient way to do this, and not split every 100 base pairs. The output file is more than 500 megabytes. Any suggestions?

The first part of the code is just a DNA sequence, I put all the lines together and I split 100 base pairs.

+4
source share
3 answers

, , , . . , , , .

, , , , a[x:x+100] x. : . .

, numpy. , - , uint8s, 0,1,2,3 A, C, G, T. , , .

Numpy stride_tricks, :

def rolling_window(a, window):
    shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
    strides = a.strides + (a.strides[-1],)
    return numpy.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)
handle = open("U00096.2.fas.txt", "r")
a = handle.readlines()[1:]
a = ''.join([x.strip() for x in a])
b = numpy.array([x for x in a], dtype=numpy.character)
rolling_window(b,100)

, ints:

def rolling_window(a, window):
    shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
    strides = a.strides + (a.strides[-1],)
    return numpy.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)
handle = open("U00096.2.fas.txt", "r")
a = handle.readlines()[1:]
a = ''.join([x.strip() for x in a])
conv = {'a': 0, 'c': 1, 'g': 2, 't': 3}
b = numpy.array([conv[x] for x in a], dtype=numpy.uint8)
rolling_window(b,100)

, .

+1
  • .fasta, , 1 .
  • python stackoverflow, . , (with open(...) file). .
  • , . :

    def load_fasta(fasta_file_name, sliding_window_size = 100):
      buffer = ''
      with open(fasta_file_name) as f:
        for line in f:
          if line.startswith('>'):
            #skip or get some info from comment line
            buffer = ''
          else:
            #read next line
            buffer += line.strip('\r\n')
            offset = 0 # zero-based offset for current string
            while (offset + sliding_window_size <= len(buffer)):
              next_sliding_window = buffer[offset : offset + sliding_window_size]
              yield(next_sliding_window)
              offset += 1
            buffer = buffer[offset : ]
    
    for str in load_fasta("a.fas.txt", 100):
      #do some processing with sliding window data
      print(str)
    

100 ( sliding window size), ( ).

biopython.

+1

Here's a class that does a few things you might want.

"""
Read in genome of E. Coli (or whatever) from given input file,
process it in segments of 100 basepairs at a time.

Usage: 100pairs [-n <pairs>] [-p] <file>

<file>                 Input file.
-n,--numpairs <pairs>  Use <pairs> per iteration. [default: 100]
-p,--partial           Allow partial sequences at end of genome.
"""
import docopt

class GeneBuffer:
    def __init__(self, path, bases=100, partial=True):
        self._buf = None
        self.bases = int(bases)
        self.partial = partial
        self.path = path

    def __enter__(self):
        self._file = open(self.path, 'r')
        self._header = next(self._file)
        return self

    def __exit__(self, *args):
        if self._file:
            self._file.close()

    def __iter__(self):
        return self

    def __next__(self):
        if self._buf is None:
            self._buf = ''

        while self._file and len(self._buf) < self.bases:
            try:
                self._buf += next(self._file).strip()
            except StopIteration:
                self._file.close()
                self._file = None
                break

        if len(self._buf) < self.bases:
            if len(self._buf) == 0 or not self.partial:
                raise StopIteration

        result = self._buf[:self.bases]
        self._buf = self._buf[1:]

        return result

def analyze(basepairs):
    """
    Dammit, Jim! I'm a computer programmer, not a geneticist!
    """
    print(basepairs)

def main(args):
    numpairs = args['--numpairs']
    partial = args['--partial']
    with GeneBuffer(args['<file>'], bases=numpairs, partial=partial) as genome:
        print("Header: ", genome._header)
        count = 0
        for basepairs in genome:
            count += 1
            print(count, end=' ')
            analyze(basepairs)

if __name__ == '__main__':
    args = docopt.docopt(__doc__)
    main(args)
+1
source

All Articles