1
2 '''
3 Created 2012
4
5 Contains various helper functions which initialize / translate / preprocess the data
6
7
8 @author: Sven Giese'''
9
10 import cPickle as pickle
11 import random
12
13 ''' INIT DICTIONARIES: standard genetic code, DNA codon -> one-letter amino acid (* = stop codon, NNN -> n) '''
14 genetic_code={'GCT':'A', 'GCC':'A', 'GCA':'A', 'GCG':'A',
15 'CGT':'R', 'CGC':'R', 'CGA':'R', 'CGG':'R', 'AGA':'R', 'AGG':'R',
16 'AAT':'N', 'AAC':'N',
17 'GAT':'D', 'GAC':'D',
18 'TGT':'C', 'TGC':'C',
19 'CAA':'Q', 'CAG':'Q',
20 'GAA':'E', 'GAG':'E',
21 'GGT':'G', 'GGC':'G','GGA':'G', 'GGG':'G',
22 'CAT':'H', 'CAC':'H',
23 'ATT':'I', 'ATC':'I','ATA':'I',
24 'ATG':'M',
25 'TTA':'L', 'TTG':'L', 'CTT':'L', 'CTC':'L', 'CTA':'L', 'CTG':'L',
26 'AAA':'K', 'AAG':'K',
27 'TTT':'F', 'TTC':'F',
28 'CCT':'P', 'CCC':'P','CCA':'P', 'CCG':'P',
29 'TCT':'S', 'TCC':'S', 'TCA':'S', 'TCG':'S', 'AGT':'S', 'AGC':'S',
30 'ACT':'T', 'ACC':'T','ACA':'T', 'ACG':'T',
31 'TGG':'W',
32 'TAT':'Y', 'TAC':'Y',
33 'GTT':'V', 'GTC':'V','GTA':'V', 'GTG':'V',
34 'TAA':'*', 'TGA':'*','TAG':'*','NNN':'n'}
35
36
37
39 """
40 Creates the dictionary of all AA triplets and records the start indices of the
41 triplets found in the given amino acid sequence (windows of 3, starting at index 1).
42
43 @type AAsequence: string
44 @param AAsequence: aminoacid sequence
45 @rtype: dictionary
46 @return: A dictionary with starting positions of each triplet in the given AA sequence
47
48 """
49
50 liste = ["A","R","N","D","C","E","Q","G","H","I","L","K","M","F","P","S","T","W","Y","V","*"]
51 aa_triplets = {}
52
53
54 for i in range(0,len(liste)):
55 for k in range(0,len(liste)):
56 for l in range(0,len(liste)):
57 aa_triplets[liste[i]+liste[k]+liste[l]]= []
58
59
60
61
62 for i in range(1,len(AAsequence),3):
63 if i+3 > len(AAsequence):
64 break
65 if AAsequence[i:i+3] in aa_triplets:
66 aa_triplets[AAsequence[i:i+3]].append(i)
67 return(aa_triplets)
68
69
70
71
73 """
74 Function which checks if a given triplet has a hamming distance of exactly 1
75 to another triplet. Used for generation of possible substitution triplets
76
77 @type codon: string
78 @param codon: nucleotide triplet
79 @type dictentry: string
80 @param dictentry: nucleotide triplet
81 @rtype: bool
82 @return: Boolean value. True if the hamming distance is exactly 1 (two of the three positions match), else False.
83
84 """
85 counter = 0
86
87 for i in range (0,3):
88
89 if codon[i]== dictentry[i]:
90 counter+=1
91 else:
92 continue
93
94 if counter == 2:
95 return (True)
96 else:
97 return (False)
98
100 """
101 Function which translates DNA to AA
102
103 @type DNA: list
104 @param DNA: nucleotide sequence
105 @rtype: prot,rest
106 @return: Translated aminoacid sequence,untranslated nucleotide sequence
107 """
108 protein=[]
109 prot = ""
110 rest=""
111
112 DNA = "".join(DNA)
113 for i in range(0,len(DNA),3):
114
115 if(i+3 > len(DNA)):
116 rest +=DNA[i:i+3]
117
118 break
119
120 if("N" in DNA[i:i+3]):
121 a_a = "n"
122 protein.append(a_a)
123 else:
124
125 codon=DNA[i:i+3]
126
127 a_a=genetic_code[codon]
128 protein.append(a_a)
129
130
131 prot = "".join(protein)
132 return (prot,rest)
133
134 ''' DEBUG HELP FUNCTIONS '''
135
136
138 """
139 Basic pickle function, mainly for debugging and to speed up multiple simulations (makes it possible to reload ORF lists).
140
141 @type dictionary: dictionary
142 @param dictionary: Dictionary containing start and end positions of ORFs.
143 @type outputname: string
144 @param outputname: Filename for saving.
145
146 """
147 pickle.dump( dictionary, open(outputname +".p", "wb" ) )
148 print("Saved .pickle to: " + outputname +".p")
149
151 """
152 Basic pickle function, mainly for debugging and to speed up multiple simulations (makes it possible to load ORF lists).
153
154
155 @type inputname: string
156 @param inputname: Filename for loading.
157 @rtype: dictionary
158 @return: Dictionary containing start and end positions of ORFs.
159 """
160 dictionary= pickle.load( open(inputname ))
161 print("Loaded "+inputname+" pickle!")
162 return (dictionary)
163