01 07 FrequentWordsWithMismatchesSolution
01 07 FrequentWordsWithMismatchesSolution
# be listed multiple times!!
def mutations(word, hamming_distance, charset='ATCG'):
    """Yield variants of `word` with `hamming_distance` positions rewritten.

    Every combination of `hamming_distance` positions is overwritten with
    every tuple of characters drawn from `charset`.  Because a replacement
    may equal the character it replaces, the output covers words at a
    Hamming distance of *at most* `hamming_distance` from `word` (including
    `word` itself), and the same word can be yielded more than once --
    callers that need distinct words should wrap the result in ``set()``.

    Nothing is yielded when `hamming_distance` exceeds ``len(word)``,
    because there is then no way to choose that many positions.
    """
    for indices in itertools.combinations(range(len(word)), hamming_distance):
        for replacements in itertools.product(charset, repeat=hamming_distance):
            mutation = list(word)
            for index, replacement in zip(indices, replacements):
                mutation[index] = replacement
            yield "".join(mutation)


def occurrences(string, sub):
    """Count the occurrences of `sub` in `string`, overlapping ones included."""
    count = start = 0
    while True:
        # find() returns -1 when there is no further match, so `start`
        # becomes 0; otherwise resume one character past the match start so
        # that overlapping matches are counted too.
        start = string.find(sub, start) + 1
        if start > 0:
            count += 1
        else:
            return count


def count_kmers(genome, k):
    """Return a dict mapping each length-`k` substring of `genome` to its count.

    The genome is scanned one character at a time; the result is empty when
    `k` is larger than ``len(genome)``.
    """
    counts = {}
    for i in range(len(genome) - k + 1):
        kmer = genome[i:i + k]
        counts[kmer] = counts.get(kmer, 0) + 1
    return counts


def count_with_mismatches(kmer_counts, d):
    """Spread each k-mer's count over the k-mer and its distinct mutations.

    For every ``kmer -> count`` entry of `kmer_counts`, `count` is added to
    the total of the k-mer itself and to the total of every distinct word
    produced by ``mutations(kmer, d)``.  The returned dict therefore maps
    each candidate word to the summed counts of the k-mers it was derived
    from.
    """
    totals = {}
    for kmer, count in kmer_counts.items():
        totals[kmer] = totals.get(kmer, 0) + count
        # De-duplicate the mutations and drop the k-mer itself so that it is
        # credited exactly once (by the line above).
        neighbours = set(mutations(kmer, d))
        neighbours.discard(kmer)
        for neighbour in neighbours:
            totals[neighbour] = totals.get(neighbour, 0) + count
    return totals


def most_frequent(counts):
    """Return the keys of `counts` that share the highest value.

    Keys are returned in the dict's own iteration order; an empty dict
    gives an empty list.
    """
    if not counts:
        return []
    best = max(counts.values())
    return [key for key in counts if counts[key] == best]


# Only run the file-reading driver when executed as a script, so the helper
# functions above can be imported without needing the data file.
if __name__ == '__main__':
    # Open the input file and read in Genome, k, and d.
    with open('frequent_words_mismatch_data.txt', 'r') as f:
        f.readline()  # header in sample data
        # Strip '\r' as well so a CRLF file does not leak it into a k-mer.
        Genome = f.readline().rstrip('\r\n')
        k = int(f.readline().rstrip('\n'))
        d = int(f.readline().rstrip('\n'))
    print('parameters')
    print(k)
    print(d)

    # Count all kmers in Genome.
    kmerDict = count_kmers(Genome, k)
    print('kmerDict completed')

    # Count of all kmers plus the counts contributed through their d mutations.
    tmpDict = count_with_mismatches(kmerDict, d)
    print('tmpDict completed')

    # Display the answers
    for key in most_frequent(tmpDict):
        print(key + ' ' + str(tmpDict[key]))