1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24 """
25 Check sequence identity between templates.
26 """
27
28 import os
29 import re
30 import linecache
31 import Numeric as N
32
33 import Biskit.tools as T
34
35 from Biskit import StdLog, EHandler
36
37
39 """
40 This class regroups the different methods to prevent
41 the use of templates with a too high percentage of identities
42 """
43
44 F_INPUT_FOLDER = '/t_coffee'
45
46
47
48 F_INPUT_ALNS= F_INPUT_FOLDER +'/final.pir_aln'
49
50
51 F_OUTPUT_IDENTITIES = '/identities.out'
52 F_OUTPUT_IDENTITIES_INF = '/identities_info.out'
53 F_OUTPUT_IDENTITIES_COV = '/identities_cov.out'
54
55
56 - def __init__(self, outFolder='.', verbose=1):
57 """
58 @param outFolder: base folder
59 @type outFolder: str
60 @param verbose: write intermediary files (default: 0)
61 @type verbose: 1|0
62 """
63 self.outFolder = T.absfile( outFolder )
64 self.verbose = verbose
65 self.sequences_name=["target"]
66
67
69 """
70 Retrieve the lines from an aln file
71
72 @param aln_file: aln file (default: None -> L{F_INPUT_ALNS})
73 @type aln_file: str
74
75 @return: file as list of strings
76 @rtype: [str]
77 """
78 aln_file = aln_file or self.outFolder + self.F_INPUT_ALNS
79 file = open('%s'%aln_file, 'r')
80 string_lines = file.readlines()
81 file.close()
82
83 return string_lines
84
85
87 """
88 This function returns the length an alignment in the
89 file 'final.pir_aln'
90
91 @param string_lines: aln file as list of string
92 @type string_lines: [str]
93
94 @return: length of alignment
95 @rtype: int
96 """
97 end_of_sequence=1
98
99 for x in string_lines:
100 if(x == '*\n'):
101 endof_1st_sequence = end_of_sequence
102 break
103
104 end_of_sequence+=1
105
106 return end_of_sequence - 3
107
108
110 """
111 Create a dictionary with the name of the target (i.e 'target')
112 and sequence from the final output from T-Coffee (final.pir_aln).
113
114 @param string_lines: aln file as list of string
115 @type string_lines: [str]
116 @param aln_length: length of alignment
117 @type aln_length: int
118
119 @return: alignment dictionary
120 e.g. {'name':'target, 'seq': 'sequence of the target'}
121 @rtype: dict
122 """
123 aln_dictionary = {}
124 target_sequence = ""
125
126 y=0
127 for x in string_lines:
128 if(x == '>P1;target\n'):
129 break
130 y+=1
131
132 for s in string_lines[y+2:y+2+aln_length]:
133 target_sequence += s[:-1]
134
135 aln_dictionary["target"] = {'name':'target','seq':target_sequence}
136
137 return aln_dictionary
138
139
141 """
142 Add information about the name of the template sequences and
143 the sequence that is aligned to the template. Data taken from
144 the T-Coffee alignment (final.pir_aln).
145
146 @param string_lines: aln file as list of string
147 @type string_lines: [str]
148 @param aln_dict: alignment dictionary
149 @type aln_dict: dict
150
151 @return: template alignment dictionary
152 e.g. { str :{'name':str, 'seq': str} }
153 @rtype: dict
154 """
155 r1 = re.compile(r'>P1;\d')
156
157 z = 0
158 for string in string_lines[:]:
159 if(r1.match(string)):
160
161 pdb_name =""
162 for w in string[4:]:
163
164 for letters in w:
165 if(letters!='\n'):
166 pdb_name += w
167 else: break
168
169 template_sequence = ""
170 for lines in string_lines[z+2:z+2+aln_length]:
171 template_sequence += lines[:-1]
172
173 self.sequences_name.append("%s"%pdb_name)
174 aln_dict["%s"%pdb_name] = {'name':'%s'%pdb_name,
175 'seq':template_sequence}
176
177 z+=1
178
179 return aln_dict
180
181
183 """
184 Create a dictionary that contains information about all the
185 alignments in the aln_dictionar using pairwise comparisons.
186
187 @param aln_dictionary: alignment dictionary
188 @type aln_dictionary: dict
189
190 @return: a dictionary of dictionaries with the sequence name as the
191 top key. Each sub dictionary then has the keys:
192 - 'name' - str, sequence name
193 - 'seq' - str, sequence of
194 - 'template_info' - list of the same length as the 'key'
195 sequence excluding deletions. The number of sequences
196 in tha multiple alignmentthat contain information at
197 this position.
198 - 'ID' - dict, sequence identity in precent comparing the
199 'key' sequence all other sequences (excluding deletions)
200 - 'info_ID' - dict, same as 'ID' but compared to the template
201 sequence length (i.e excluding deletions and insertions
202 in the 'key' sequence )
203 - 'cov_ID' - dict, same as 'info_ID' but insertions are defined
204 comparing to all template sequences (i.e where
205 'template_info' is zero )
206 @rtype: dict
207 """
208
209 for i in self.sequences_name:
210 template_names = []
211
212
213 for name in self.sequences_name:
214 if(name is not i):
215 template_names.append(name)
216
217
218 info_ID, ID, cov_ID = {}, {}, {}
219 for y in self.sequences_name:
220 identity = 0
221 info_identity = 0
222 cov_identity = 0
223 nb_of_identities = 0
224 nb_of_template = 0
225 template_info = []
226 nb_of_residues = 0
227
228
229 for w in range(len(aln_dictionary["target"]["seq"])):
230
231
232 nb_of_info_res=0
233 if(aln_dictionary[i]["seq"][w] is not '-'):
234 nb_of_residues += 1
235
236
237 if(aln_dictionary[i]["seq"][w] == \
238 aln_dictionary[y]["seq"][w]):
239 nb_of_identities += 1
240
241
242 if(aln_dictionary[y]["seq"][w] is not '-'):
243 nb_of_template += 1
244
245
246 for z in template_names:
247
248
249 if(aln_dictionary[z]["seq"][w] is not '-'):
250 nb_of_info_res += 1
251
252 template_info.append(nb_of_info_res)
253
254
255
256 nb_cov_res = N.sum( N.greater(template_info, 0) )
257
258
259 info_ID[y] = 100. * nb_of_identities / nb_of_template
260 ID[y] = 100. * nb_of_identities / nb_of_residues
261 cov_ID[y] = 100. * nb_of_identities / nb_cov_res
262
263 aln_dictionary[i]["info_ID"] = info_ID
264 aln_dictionary[i]["ID"] = ID
265 aln_dictionary[i]["cov_ID"] = cov_ID
266 aln_dictionary[i]["template_info"] = template_info
267
268 return aln_dictionary
269
270
271 - def __writeId( self, name, dic, key, description ):
272 """
273 Write an sequence identity matrix to file.
274
275 @param name: file name
276 @type name: str
277 @param dic: alignment dictionary
278 @type dic: dict
279 @param key: key in dictionary to write
280 @type key: key
281 @param description: description to go into file (first line)
282 @type description: str
283 """
284 f = open( name, 'w' )
285 f.write( description +'\n\n')
286
287
288 f.write('%s'%(' '*5))
289 for s in self.sequences_name:
290 f.write('%8s'%s)
291 f.write('\n')
292
293
294 for s in self.sequences_name:
295 f.write('%5s'%s)
296 for t in self.sequences_name:
297 f.write("%8.2f"%dic[s][key][t])
298 f.write('\n')
299
300
301 - def output_identities(self, aln_dictionary, identities_file = None,
302 identities_info_file = None,
303 identities_cov_file = None):
304 """
305 Writes three files to disk with identity info about the current
306 multiple alignment.
307
308 @param aln_dictionary: alignment dictionary
309 @type aln_dictionary: dict
310 @param identities_file: name for file with sequence identity
311 in percent comparing a sequence to another
312 (excluding deletions in the first sequence)
313 (default: None -> L{F_OUTPUT_IDENTITIES})
314 @type identities_file: str
315 @param identities_info_file: name for file with sequence identity in
316 percent comparing a sequence to another
317 (excluding deletions and insertions in
318 the first sequence)
319 (default: None -> L{F_OUTPUT_IDENTITIES_INF})
320 @type identities_info_file: str
321 @param identities_cov_file: name for file with sequence identity in
322 percent comparing a sequence to another
323 (excluding deletions and insertions in
324 the first sequence but only when the
325 first sequence doesn't match any other
326 sequence in the multiple alignment)
327 (default: None -> L{F_OUTPUT_IDENTITIES_COV})
328 @type identities_cov_file: str
329 """
330
331 identities_file = identities_file or \
332 self.outFolder + self.F_OUTPUT_IDENTITIES
333 identities_info_file = identities_info_file or \
334 self.outFolder + self.F_OUTPUT_IDENTITIES_INF
335 identities_cov_file = identities_cov_file or \
336 self.outFolder + self.F_OUTPUT_IDENTITIES_COV
337
338 head_ID = "Sequence identity in percent comparing a sequence to another \n(excluding deletions in the first sequence)"
339
340 head_info_ID ="Sequence identity in percent comparing a sequence to another \n(excluding deletions and insertions in the first sequence)"
341
342 head_conv_ID ="Sequence identity in percent comparing a sequence to another \n(excluding deletions and insertions in the first sequence but only \nwhen the first sequence doesn't match any other sequence in the \nmultiple alignment )"
343
344 self.__writeId( identities_file, aln_dictionary,
345 'ID' , head_ID )
346 self.__writeId( identities_info_file, aln_dictionary,
347 'info_ID' , head_info_ID )
348 self.__writeId( identities_cov_file, aln_dictionary,
349 'cov_ID', head_conv_ID )
350
351
352 - def go(self, output_folder = None):
353 """
354 Perform sequence comparison.
355
356 @param output_folder: output folder
357 @type output_folder: str
358 """
359 output_folder = output_folder or self.outFolder
360
361 string_lines = self.get_lines()
362
363 aln_length = self.search_length( string_lines )
364
365
366 aln_dictionary = self.get_aln_sequences( string_lines,
367 aln_length )
368
369
370 aln_dictionary = self.get_aln_templates( string_lines,
371 aln_dictionary,
372 aln_length )
373
374 aln_dictionary = self.identities( aln_dictionary )
375
376 self.output_identities( aln_dictionary )
377
378 return aln_dictionary
379
380
381
382
383
384
385
387 """
388 Test class
389 """
390
391 - def run( self, local=0, validate_testRoot=0 ):
392 """
393 run function test
394
395 @param local: transfer local variables to global and perform
396 other tasks only when run locally
397 @type local: 1|0
398 @param validate_testRoot: check a validation project residing in
399 the test root (default: 0)
400 @type validate_testRoot: 1|0
401
402 @return: 1
403 @rtype: int
404 """
405 import tempfile
406 import shutil
407
408 if validate_testRoot:
409 self.m = Check_Identities( T.testRoot() + '/Mod/project')
410 self.m.go()
411
412 val_root = T.testRoot() + '/Mod/project/validation'
413 folders = os.listdir( val_root )
414
415 for f in folders:
416 self.m = Check_Identities( outFolder = val_root + '/' + f )
417 self.m.go()
418
419 if local:
420 globals().update( locals() )
421
422 else:
423
424 outfolder = tempfile.mkdtemp( '_test_CheckIdentities' )
425 os.mkdir( outfolder +'/t_coffee' )
426
427 shutil.copy( T.testRoot() + '/Mod/project/t_coffee/final.pir_aln',
428 outfolder + '/t_coffee' )
429
430 self.m = Check_Identities( outfolder )
431 self.m.go()
432
433 if local:
434 globals().update( locals() )
435 print """
436 The result from the template comparison can be found in the three files %s, %s and %s that reside in the folder %s"""\
437 %(self.m.F_OUTPUT_IDENTITIES[1:],
438 self.m.F_OUTPUT_IDENTITIES_INF[1:],
439 self.m.F_OUTPUT_IDENTITIES_COV[1:],
440 outfolder )
441
442
443 T.tryRemove( outfolder, tree=1 )
444
445 return 1
446
447
449 """
450 Precalculated result to check for consistent performance.
451
452 @return: 1
453 @rtype: int
454 """
455 return 1
456
457
458 if __name__ == '__main__':
459
460 test = Test()
461
462 assert test.run( local=1 ) == test.expected_result()
463