Search for k-mers in a sliding window

I am trying to solve this bioinformatics problem: https://stepic.org/lesson/An-Explosion-of-Hidden-Messages-4/step/1?course=Bioinformatics-Algorithms-2&unit=8

The specific question is in the fifth window of the link above, and the question is: How many different 9-dimensional forms of (500,3) -clips in the genome of E. coli? (In other words, do not count 9-measures more than once.)

My code is below. This is wrong, and I would like to explain why and how I can improve it (obviously, the efficiency of O is terrible, but I started Python coding a few days ago ...) Thank you very much!

genome = '' #insert e. Coli genome here
k = 4 #length of k-mer
L = 50 #size of sliding window
t = 3 #k-mer appears t times
counter = 0
Count = []


for i in range(0,len(genome)-L): #slide window down the genome
    pattern = genome[i:i+k] #given this k-mer
    for j in range(i,i+L): #calculate k-mer frequency in window of len(L)
        if genome[j:j+k] == pattern:
            counter = counter + 1
    Count.append(counter)
    counter = 0 #IMPORTANT: reset counter after each i

Clump = []
for i in range(0,len(Count)):
    if Count[i] == t: #figure out the window that has k-mers of frequency t
        Clump.append(i)

Output = []
for i in range(0,len(Clump)):
    Output.append(genome[Clump[i]:Clump[i]+k])
print " ".join(list(set(Output))) #remove duplicates if a particular k-mer is found more than once
print len(Output)
print len(list(set(Output))) #total number of Clump(k,L,t)
+4
source share
2 answers

. github . .

ben@nixbox:~/bin$ time python kmers.py ../E-coli.txt 9 500 3
(500, 3)-clumps of 9-mers found in that file: 1904

real    0m15.510s
user    0m14.241s
sys     0m0.956s

( ) /. , , . . ( , : -)).

cats on the internet

(20,3) - 3-: "CAT". ( "AAA" ), , k, L t.

, . , , : (5,3) - 3-.

5-3 clump

5 . , 3-mers ATA, TAA AAA. , ATA , AAA. , TAA , AAA - (5,3) AAA s.

, , , - , ; k-mer . , - ( python, dict s) k-mers . , , k-mer.

, - - - , , list - , deque - , , dict - , Counter - deque. , OrderedDict , ; , 1.

, , , - .

:

def get_clumps(genome, k, L, t):
    kmers = KmerSequence(L-k, t)

    for kmer in sliding_window(genome, k):
        kmers.add(kmer)

    return kmers.clumps

class KmerSequence(object):
    __slots__ = ['order', 'counts', 'limit', 'clumps', 't']

    def __init__(self, limit, threshold):
        self.order = deque()
        self.counts = Counter()
        self.limit = limit
        self.clumps = set()
        self.t = threshold

    def add(self, kmer):
        if len(self.order) > self.limit:
            self._remove_oldest()
        self._add_one(kmer)

    def _add_one(self,kmer):
        self.order.append(kmer)
        new_count = self.counts[kmer] + 1
        self.counts[kmer] = new_count

        if new_count == self.t:
            self.clumps.add(kmer)

    def _remove_oldest(self):
        self.counts[self.order.popleft()] -= 1

:

with open(genomefile) as f:
    genome = f.read()

k = 9
L = 500
t = 3

clumps = get_clumps(genome, k,L,t)

, , , script __main__ - github . ..

+5

def findClumps(genome, k, L, t):
    length = len(genome) - k + 1
    indexes = defaultdict(list)
    result = set()

    for i in range(length):
        kterm = genome[i:i + k]

        # remove positions with distance > L
        while indexes[kterm] and i + k - indexes[kterm][0] > L:
            indexes[kterm].pop(0)

        # add current kterm position
        indexes[kterm].append(i)

        if len(indexes[kterm]) == t:
            result.add(kterm)

    return result

filename = 'E-coli.txt'
file = open(filename)
genome = file.read()
k=9
L=500
t=3

def test():
    print findClumps(genome, k, L, t)

import cProfile

cProfile.run('test()')
0

All Articles