Package core :: Module FindOrfs
[hide private]
[frames] | no frames]

Source Code for Module core.FindOrfs

  1  #!/usr/bin/python3
 
  2  
 
  3  
 
  4  """
 
  5  FindOrfs, Created 2012
 
  6  
 
  7  Script to detect start and stop codon of open reading frames in a dna sequence. Note that the algorithm is only rudimentary and does not respect intron-exon structures.
 
  8  
 
  9  """ 
 10  
 
 11  
 
12 -def build_ORF(sequence,file_ORF,pdic):
13 """ 14 Get orf positions (forward/backward) and return them in a dictionary 15 16 @type sequence: string 17 @param sequence: nucleotide sequence 18 @type file_ORF: string 19 @param file_ORF: outputfile 20 @type pdic: dictionary 21 @param pdic: Used to start start / end positions of ORFs. 22 @rtype: dictionary 23 @return: Stored start start / end positions of ORFs 24 """ 25 26 27 START,STOP,STARTrev,STOPrev = find_orfs(sequence,pdic) 28 t=open(file_ORF,'a') 29 print "\t search forward orfs..." 30 orf_counter = 0 31 orf_name = "forw" 32 for i in START: 33 for j in STOP: 34 #3 conditions 35 #first condition: start position needs to be BEFORE the stop position 36 #second condition: our proteins need to be longer than 150 aa AND need to be smaller than the biggest protein we know 37 #third condition: each ORF needs to give a real aa sequence 38 if i<j and (j+3-i)>149 and (j+3-i)<103050 and (j+3-i)%3==0: 39 orf_counter+=1 40 #if all 3 conditions: write the ORF in a file 41 t.write(">"+orf_name+" " + str(orf_counter)) 42 t.write('\n') 43 t.write(sequence[i:j+3]) 44 t.write('\n') 45 pdic[i]="S" # S = START 46 pdic[j]="E" # E = END 47 48 temp = orf_counter 49 50 orf_counter = 0 51 orf_name = "rev" 52 print "\t search backward orfs..." 53 for i in STARTrev: 54 for j in STOPrev: 55 #3 conditions 56 #if j<i and (j+2-i+1)>150 and (j+2-i+1)<103050 and (j+2-i+1)%3==0: <--alt 57 #first condition: start position needs to be BEFORE the stop position 58 #second condition: our proteins need to be longer than 150 aa AND need to be smaller than the biggest protein we know 59 #third condition: each ORF needs to give a real aa sequence 60 if j<i and (i+3-j)>150 and (j+3-i)<103050 and (j+3-i)%3==0: 61 #if all 3 conditions: write the ORF in a file 62 orf_counter+=1 63 t.write(">"+orf_name+" " + str(orf_counter)) 64 t.write('\n') 65 t.write(sequence[j:i+3][::-1]) 66 t.write('\n') 67 pdic[i]="E" 68 pdic[j]="S" 69 print(str(temp+orf_counter)) 70 t.close() 71 72 return (pdic)
73 74
75 -def findstop_help(posLastStop,sequence,codon):
76 """ 77 return the index of the first position of codon in the dna sequence 78 79 @type posLastStop: int 80 @param posLastStop: Position of the last found stop codon. 81 @type sequence: string 82 @param sequence: Nucleotide sequence. 83 @type codon: string 84 @param codon: 3-letter DNA code. 85 @rtype: int 86 @return: The position of the stop codon in the nucleotide sequence. 87 """ 88 try: 89 return(sequence.index(codon,posLastStop)) 90 91 except: 92 return(-1)
93
94 -def find_orfs(genomeSequence,pdic):
95 """ 96 function to identify open reading frames in a dna sequence, careful: intron exon structures are not respected! 97 98 @type genomeSequence: string 99 @param genomeSequence: Nucleotide sequence. 100 @type pdic: dictionary 101 @param pdic: Used to store start / end positions of ORFs. 102 @rtype: dictionary 103 @return: Found start / end positions in the sequence consindering only the ORFs. 104 """ 105 # got a dictionary and a sequence! lets start! 106 # Startcodon = ATG 107 # Stopcodon = ["TAA","TGA","TAG"] 108 109 110 start =[] 111 stop =[] 112 113 posLastATG = 0 114 posLastStop = 3 115 orfList = [] 116 117 # forward 118 print("\t..find forward orfs") 119 while True: 120 foundNew = False 121 # catch error if "ATG" not found! 122 try: 123 124 start.append(genomeSequence.index("ATG",posLastATG)) 125 # retrieve last element 126 posLastATG = start[-1]+1 127 foundNew = True 128 except: 129 pass 130 131 stopSub =[] 132 stopcodons =["TAA","TGA","TAG"] 133 for item in stopcodons: 134 stopSub.append(findstop_help(posLastStop, genomeSequence, item)) 135 136 stopSub.sort() 137 138 if(stopSub[0] > -1): 139 stop.append(stopSub[0]) 140 posLastStop = stop[-1]+1 141 foundNew = True 142 143 elif(stopSub[1] > -1): 144 stop.append(stopSub[1]) 145 posLastStop = stop[-1]+1 146 foundNew = True; 147 148 elif(stopSub[2] > -1): 149 stop.append(stopSub[2]); 150 posLastStop = stop[-1]+1 151 foundNew = True; 152 153 if(foundNew): 154 pass 155 else: 156 break 157 158 # reverse now: 159 # start codon: CAT 160 # stop codons: TTA;TCA;CTA 161 162 startRev = [] 163 stopRev = [] 164 165 posLastCAT = 3 166 posLastStop_rev = 0 167 168 print("\t..find reverse orfs") 169 while True: 170 foundNew_rev = False 171 # catch error if "CAT" not found! 172 try: 173 174 startRev.append(genomeSequence.index("CAT",posLastCAT)) 175 # retrieve last element 176 posLastCAT = startRev[-1]+1 177 foundNew_rev = True 178 except: 179 pass 180 181 stopSub =[] 182 stopcodons =["TTA","TCA","CTA"] 183 for item in stopcodons: 184 stopSub.append(findstop_help(posLastStop_rev, genomeSequence, item)) 185 186 stopSub.sort() 187 188 if(stopSub[0] > -1): 189 stopRev.append(stopSub[0]) 190 posLastStop_rev = stopRev[-1]+1 191 foundNew_rev = True 192 193 elif(stopSub[1] > -1): 194 stopRev.append(stopSub[1]) 195 posLastStop_rev = stopRev[-1]+1 196 foundNew_rev = True 197 198 199 elif(stopSub[2] > -1): 200 stopRev.append(stopSub[2]) 201 posLastStop_rev = stopRev[-1]+1 202 foundNew_rev = True 203 204 205 if(foundNew_rev): 206 pass 207 else: 208 break 209 ###test################## 210 print("START codons : " + str(len(start))) 211 print("STOP codons : " +str(len(stop))) 212 print("revSTART codons : " +str(len(startRev))) 213 print("revSTOP codons : " +str(len(stopRev))) 214 215 216 # FORWARD 217 #vectors are filled, now test if in the same frame 218 # stores start positions without a suitable Stop in frame 219 removeList=[] 220 print("\t..creating forward orfs") 221 for stopPos in stop: 222 foundPartner=False 223 startPos = 0 224 i = 0 225 startPos = start[0] 226 #= start[i] 227 while ((i<len(start)-1) and (start[i] <stopPos)): 228 startPos = start[i] 229 i+=1 230 if((stopPos+3-startPos)% 3 == 0): 231 #Orf found in frame 232 if((foundPartner == False) and (stopPos+3)-startPos >149): 233 #found first matching AT (first ORF+ORF is at least 150bp long 234 foundPartner = True 235 pdic[stopPos+3]="E" 236 pdic[startPos]="S" 237 orfList.append(startPos) 238 orfList.append(stopPos+3) 239 removeList.append(startPos) 240 else: 241 removeList.append(startPos) 242 for item in removeList: 243 # remove all positions of ATGS leading to "sub-ORFs" 244 start.remove(item) 245 removeList =[] 246 247 248 # BACKWARD 249 print("\t..creating reverse orfs") 250 removeList_rev=[] 251 l = len(stopRev)-1 252 253 for r in range(l,-1,-1):# 254 stopPos = stopRev[r] 255 foundPartner=False 256 257 258 259 i = len(startRev)-1 260 261 262 while((i >=0) and (startRev[i]>stopPos)): 263 startPos=startRev[i] 264 i -=1 265 if((startPos+3-stopPos)%3 == 0): 266 # found ORF in frame 267 if((foundPartner != True) and (startPos+3-stopPos > 149)): 268 #found first matching ATG + ORF is at least 150bp long 269 pdic[stopPos]="S" 270 pdic[startPos+3]="E" 271 foundPartner = True 272 orfList.append(startPos) 273 orfList.append(stopPos+3) 274 removeList_rev.append(startPos) 275 else: 276 removeList_rev.append(startPos) 277 278 for item in removeList_rev: 279 # remove all positions of ATGS leading to "sub-ORFs" 280 startRev.remove(item) 281 removeList_rev =[] 282 283 return(pdic)
284