You do not need to use re here; you can use itertools instead and save a lot of memory.
You can first extract all substrings of length 4, then compare each of them with your substring and keep only those that differ from it in at most 2 positions:
try:  # Python 2: izip and xrange are the lazy variants
    from itertools import izip
    _range = xrange
except ImportError:  # Python 3: the built-in zip and range are already lazy
    izip = zip
    _range = range


def sub_findre(s, substring, diffnumber):
    """Yield every window of ``s`` that nearly matches ``substring``.

    A window is a slice of ``s`` with the same length as ``substring``.  It
    is yielded when it differs from ``substring`` in at most ``diffnumber``
    positions.  Windows are produced lazily, in order of their start index,
    and may overlap.

    Args:
        s: The string to scan.
        substring: The pattern every window is compared against.
        diffnumber: Maximum number of mismatching positions allowed.

    Yields:
        Each matching window, as a slice of ``s``.  Nothing is yielded when
        ``substring`` is empty or longer than ``s``.
    """
    sublen = len(substring)
    if not sublen:
        return
    # Stop at the last start index that still leaves room for a full-length
    # window.  Starting any later would compare a truncated tail of ``s``,
    # and izip would silently ignore the missing characters.
    for start in _range(len(s) - sublen + 1):
        window = s[start:start + sublen]
        matches = sum(1 for a, b in izip(substring, window) if a == b)
        if matches >= sublen - diffnumber:
            yield window


# Demo:
s = 'SSPQQQQPSSSSQQQSSQPSPSQSSQPSSQPPSSSSQPSPSQSSQPSSSSQPSPSQSSQPSSSSQPSPSQ'
substring = 'SSQP'
print(list(sub_findre(s, substring, 2)))
# ['SSPQ', 'SPQQ', 'QQQP', 'SSSS', 'SSSQ', 'SSQQ', 'SQQQ', 'SSQP', 'PSQS',
#  'SSQP', 'SSQP', 'SQPP', 'SSSS', 'SSSQ', 'SSQP', 'PSQS', 'SSQP', 'SSSS',
#  'SSSQ', 'SSQP', 'PSQS', 'SSQP', 'SSSS', 'SSSQ', 'SSQP']
If you want to return the indexes instead, you need to include the index in izip; you can use itertools.repeat() to repeat the index once for each character of the substring:
from itertools import repeat

try:  # Python 2: izip and xrange are the lazy variants
    from itertools import izip
    _range = xrange
except ImportError:  # Python 3: the built-in zip and range are already lazy
    izip = zip
    _range = range


def sub_findre(s, substring, diffnumber):
    """Yield the start index of every window of ``s`` near ``substring``.

    A window is a slice of ``s`` with the same length as ``substring``.  Its
    start index is yielded when the window differs from ``substring`` in at
    most ``diffnumber`` positions.  Indexes are produced lazily, in
    increasing order.

    Args:
        s: The string to scan.
        substring: The pattern every window is compared against.
        diffnumber: Maximum number of mismatching positions allowed.

    Yields:
        The start index in ``s`` of each matching window.  Nothing is yielded
        when ``substring`` is empty or longer than ``s``.
    """
    sublen = len(substring)
    if not sublen:
        return
    # One lazy zip per full-length window.  repeat() tags every compared
    # character pair with the window's start index so the index travels with
    # the window.  The range stops at the last index that still leaves room
    # for a full-length window, so a truncated tail of ``s`` is never compared.
    tagged_windows = (
        izip(substring, s[i:i + sublen], repeat(i, sublen))
        for i in _range(len(s) - sublen + 1)
    )
    for tagged in tagged_windows:
        triples = tuple(tagged)
        matches = sum(1 for a, b, _ in triples if a == b)
        if matches >= sublen - diffnumber:
            # Every triple carries the same index; read it from the first.
            yield triples[0][2]


# Demo:
s = 'SSPQQQQPSSSSQQQSSQPSPSQSSQPSSQPPSSSSQPSPSQSSQPSSSSQPSPSQSSQPSSSSQPSPSQ'
substring = 'SSQP'
print(list(sub_findre(s, substring, 2)))
# [0, 1, 4, 8, 9, 10, 11, 15, 20, 23, 27, 28, 32, 33, 34, 39, 42, 46, 47, 48,
#  53, 56, 60, 61, 62]