import random import math def default_column(Seqs): """Return a default profile column""" S = set() S.update(*tuple(Seqs)) return dict((c, 1) for c in S) def profile_for(Seqs, default): """Compute a profile for the given sequence""" k = len(Seqs[0]) # length of sequences P = [dict(default) for i in xrange(k)] # columns of profiles # for every column for i in xrange(k): # for every row for s in Seqs: P[i][s[i]] += 1.0 for i in xrange(k): for c in P[i]: P[i][c] /= (2.0*len(Seqs)) return P def profile_score(P, s): """Compute the score of s under P""" return sum(math.log(P[i][c]) for i, c in enumerate(s)) def gibbs(Seqs, k): """Seqs is a list of strings. Find the best motif.""" default_col = default_column(Seqs) # start with random incdices I = [random.randint(0, len(x) - k) for x in Seqs] LastI = None it = 0 while I != LastI: # print some status information it += 1 B = [x[j : j + k] for x, j in zip(Seqs, I)] print "%d: %s %s" % ( it, str(I), B) #print str(profile_for(B, default_col)) LastI = list(I) # iterate through every string for i in xrange(len(Seqs)): # compute the profile for the sequences except i P = profile_for([ x[j : j + k] for q, (x, j) in enumerate(zip(Seqs, I)) if q != i ], default_col) # find the place the profile matches best best = None for j in xrange(len(Seqs[i]) - k + 1): score = profile_score(P, Seqs[i][j : j + k]) if score > best or best is None: best = score bestpos = j I[i] = bestpos return I, [x[j : j+k] for x,j in zip(Seqs, I)]