Function Solutions
Function Solutions
1 Table of Contents
1 Functions
1.1 Exercise 1
1.2 Exercise 2
1.3 Exercise 3
1.4 Exercise 4
1.5 Exercise 5
1.6 Exercise 6
1.7 Exercise 7
1.8 Exercise 8
1.9 Exercise 9
1.10 Exercise 10
1.11 Exercise 11
1.12 Exercise 12
1.13 Exercise 13
1.14 Exercise 14
2 Case study
2.1 Exercise 15
2.2 Exercise 16
2.3 Exercise 17
2.4 Exercise 18
2.5 Exercise 19
2.6 Exercise 20
2.7 Exercise 21
2.8 A less readable way to write it
2 Functions
2.1 Exercise 1
In [3]: def print_line():
print("This is a function!")
print_line()
This is a function!
1
2.2 Exercise 2
In [4]: def print_line():
return "This is a function!"
result = print_line()
print(result)
print(result[0])
This is a function!
T
2.3 Exercise 3
In [5]: def greet(name1, name2):
return "Hello {} and {}!".format(name1, name2)
print(greet("Björn", "Dag"))
Hello Björn and Dag!
2.4 Exercise 4
In [6]: def greet(name1="?", name2="?"):
return "Hello {} and {}!".format(name1, name2)
print(greet())
print(greet("Petr"))
print(greet("Björn", "Dag"))
Hello ? and ?!
Hello Petr and ?!
Hello Björn and Dag!
2.5 Exercise 5
In [7]: def multiply(nbr1, nbr2):
return nbr1 * nbr2
print(multiply(2,3))
print(add(2,3))
6
5
2
2.6 Exercise 6
In [8]: def multiply(nbr1, nbr2):
return nbr1 * nbr2
print(calculate(2, 3, operation="add"))
print(calculate(2, 3, operation="multiply"))
5
6
2.7 Exercise 7
In [9]: def get_gc(seq_raw):
seq = seq_raw.upper() # Handle both lower- and upper- case
gc_count = seq.count("C") + seq.count("G")
at_count = seq.count("A") + seq.count("T")
return gc_count / (at_count + gc_count)
print(get_gc("CACAGGTT"))
print(get_gc("CAG"))
0.5
0.6666666666666666
2.8 Exercise 8
In [10]: def many_hi(number):
for _ in range(number):
print("Hi!")
many_hi(4)
Hi!
Hi!
Hi!
3
Hi!
2.9 Exercise 9
In [11]: def many_hi(number):
return ["Hi!"] * number
hi_list = many_hi(6)
print(hi_list)
2.10 Exercise 10
In [12]: def many_hi(number, word="Hi"):
return ["{}!".format(word)] * number
hi_list = many_hi(3)
print(hi_list)
word_list = many_hi(4, word="Halloa")
print(word_list)
2.11 Exercise 11
In [13]: seq_dict = {
"header1.1": "ATGCTAGCTAGCTAGCTACG",
"header1.2": "ACGTAGCTAGCTAGCAC",
"header2.1": "AGCTAGCTAGCTATTATCTACT"
}
seq_dict
2.12 Exercise 12
In [14]: n = 1
for key in seq_dict.keys():
value = seq_dict[key]
print("Entry {} has header: {} and sequence: {}".format(n, key, value))
n += 1
4
Entry 1 has header: header1.1 and sequence: ATGCTAGCTAGCTAGCTACG
Entry 2 has header: header1.2 and sequence: ACGTAGCTAGCTAGCAC
Entry 3 has header: header2.1 and sequence: AGCTAGCTAGCTATTATCTACT
2.13 Exercise 13
In [15]: def load_fasta(filepath):
seqs = dict()
with open(filepath, 'r') as in_fh:
key = None
for line in in_fh:
line = line.rstrip()
if line.startswith('>'):
key = line[1:]
else:
seq = line
seqs[key] = seq
return seqs
fasta_dict = load_fasta("test.fa")
print(fasta_dict)
2.14 Exercise 14
In [16]: def calcGC(seq_raw):
seq = seq_raw.upper() # Handle both lower- and upper- case
gc_count = seq.count("C") + seq.count("G")
at_count = seq.count("A") + seq.count("T")
return gc_count / (at_count + gc_count)
sequenceList = list(fasta_dict.values())
ATGCTAGCTAGCTAGCTACG: 0.5
ACGTAGCTAGCTAGCAC: 0.5294117647058824
AGCTAGCTAGCTATTATCTACT: 0.36363636363636365
5
3 Case study
3.1 Exercise 15
In [ ]:
3.2 Exercise 16
In [18]: in_fp = 'data/fungus.gff'
out_fp = 'data/fungus_renamed.gff'
3.3 Exercise 17
In [19]: def parse_fasta_to_dict(filepath):
seqs = dict()
with open(filepath, 'r') as in_fh:
key = None
for line in in_fh:
line = line.rstrip()
if line.startswith('>'):
key = line[1:]
else:
seq = line
seqs[key] = seq
return seqs
fasta_dict = parse_fasta_to_dict("data/fungusAssembly.single.fna")
print(len(fasta_dict.keys()))
print(len(fasta_dict['scaffold_10']))
6
2681
1262682
3.4 Exercise 18
In [20]: def get_feature_coordinates(annot_gff, target_pattern):
feature_coords_list = list()
with open(annot_gff, 'r') as in_fh:
for line in in_fh:
if not line.startswith('##'):
line = line.rstrip()
fields = line.split('\t')
chrom_nbr = fields[0]
id_type = fields[2]
start_pos = int(fields[3])
end_pos = int(fields[4])
if id_type == target_pattern:
feature_coords_list.append((chrom_nbr, start_pos, end_pos))
return feature_coords_list
3.5 Exercise 19
In [21]: def extract_features(genome_dict, feature_coords):
feature_dict = dict()
for coord_tuple in feature_coords:
scaffold = coord_tuple[0]
start_pos = coord_tuple[1]
end_pos = coord_tuple[2]
seq = genome_dict[scaffold][start_pos-1:end_pos]
feature_dict['>{} {}-{}'.format(scaffold, start_pos, end_pos)] = seq
return feature_dict
genome_path = 'data/fungusAssembly.single.fna'
annot_path = 'data/fungus_renamed.gff'
7
genome_dict = parse_fasta_to_dict(genome_path)
feature_coords = get_feature_coordinates(annot_path, 'CDS')
feature_fasta_dict = extract_features(genome_dict, feature_coords)
print(feature_fasta_dict[">scaffold_1 1335-1582"])
print(len(feature_fasta_dict[">scaffold_1 1335-1582"]))
ATGTGTAccccatcaatatcctcgcaaaatattcccagcgatgacaagagccgttcgagttcgcagggctccgaacgatgtggagagacctcgct
248
3.6 Exercise 20
In [22]: def write_fasta_dict(fasta_dict, out_fp):
with open(out_fp, 'w') as out_fh:
for header in sorted(fasta_dict.keys()):
seq = fasta_dict[header]
print('{}\n{}'.format(header, seq), file=out_fh)
print('Written {} entries to {}'.format(len(fasta_dict), out_fp))
write_fasta_dict(feature_fasta_dict, 'data/out_dict.fa')
3.7 Exercise 21
In [23]: fasta_dict = dict()
with open('data/fungusAssembly.single.fna', 'r') as in_fh:
key = None
for line in in_fh:
line = line.rstrip()
if line.startswith('>'):
key = line[1:]
else:
seq = line
fasta_dict[key] = seq
feature_coords = list()
with open('data/fungus_renamed.gff', 'r') as in_fh:
for line in in_fh:
if not line.startswith('##'):
line = line.rstrip()
fields = line.split('\t')
chrom_nbr = fields[0]
id_type = fields[2]
start_pos = int(fields[3])
8
end_pos = int(fields[4])
if id_type == 'CDS':
feature_coords.append((chrom_nbr, start_pos, end_pos))
feature_dict = dict()
for coord_tuple in feature_coords:
scaffold = coord_tuple[0]
start_pos = coord_tuple[1]
end_pos = coord_tuple[2]
seq = genome_dict[scaffold][start_pos-1:end_pos]
feature_dict['>{} {}-{}'.format(scaffold, start_pos, end_pos)] = seq
print(feature_fasta_dict[">scaffold_1 1335-1582"])
print(len(feature_fasta_dict[">scaffold_1 1335-1582"]))
ATGTGTAccccatcaatatcctcgcaaaatattcccagcgatgacaagagccgttcgagttcgcagggctccgaacgatgtggagagacctcgct
248
In [24]: d = dict()
with open('data/fungusAssembly.single.fna', 'r') as file:
key = None
for x in file:
x = x.rstrip()
if x.startswith('>'):
key = x[1:]
else:
x1 = x
d[key] = x1
mylist = list()
with open('data/fungus_renamed.gff', 'r') as file:
for l in file:
if not l.startswith('##'):
l = l.rstrip()
9
f = l.split('\t')
c = f[0]
t = f[2]
p1 = int(f[3])
p2 = int(f[4])
if t == 'CDS':
mylist.append((c, p1, p2))
fd = dict()
for ct in mylist:
s = ct[0]
p1 = ct[1]
p2 = ct[2]
s2 = d[s][p1-1:p2]
fd['>{} {}-{}'.format(s, p1, p2)] = s2
print(fd[">scaffold_1 1335-1582"])
print(len(fd[">scaffold_1 1335-1582"]))
ATGTGTAccccatcaatatcctcgcaaaatattcccagcgatgacaagagccgttcgagttcgcagggctccgaacgatgtggagagacctcgct
248
10