Bioinformatics Algorithms

Improved K-mer Count

Back to Index

This script performs the same task as the Counting K-mers script
on page one, but it employees the N2P and P2N functions and uses
a list instead of a dictionary to implement a more efficient algorithm.

TO RUN THIS SCRIPT:

     ./KCOUNT.py < pdic_data.txt

#!/usr/bin/python import math import sys def P2N(p): if p == "": return 0 symbol = LastSymbol(p) prefix = Prefix(p) return 4* P2N(prefix) + S2N(symbol) def LastSymbol(p): index = len(p)-1 return p[index] def Prefix(p): index = len(p)-1 return p[:index] def S2N(s): if s=="A": return 0 elif s=="C": return 1 elif s=="G": return 2 else: return 3 #--------- def N2P(i,k): if k==1: return N2S(i) pref = Q(i,4) r = REM(i,4) symbol = N2S(r) pP = N2P(pref,k-1) return pP + symbol def Q(i,d): return i//d def REM(i,d): return i%d def N2S(r): if r==0: return "A" elif r==1: return "C" elif r==2: return "G" else: return "T" #--------- freqARRAY = [] def ComputingFrequencies(text,k): for n in range(0,int(math.pow(4,k))): freqARRAY.append(0) for i in range(0,len(text)-k+1): pattern = text[i:i+k] j = P2N(pattern) freqARRAY[j] = freqARRAY[j]+1 flist = [] largest = 0 for n in range(0,len(freqARRAY)-1): if freqARRAY[n]>largest: largest=freqARRAY[n] for n in range(0,len(freqARRAY)-1): if freqARRAY[n]==largest: flist.append(N2P(n,k)) print largest return flist lines = sys.stdin.read().splitlines() seq=lines[0] k=int(lines[1]) output = ComputingFrequencies(seq,k) print output

SOURCE CODE:
Improved K-mer Count
Data for KCOUNT

Main Index