1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28 """
29 Analyze model quality
30 """
31
32 import Biskit.tools as T
33 from Biskit.PDBModel import PDBModel
34 from Biskit.Mod.Benchmark import Benchmark
35 from Biskit.Mod.ValidationSetup import ValidationSetup as VS
36 from Biskit.Mod.CheckIdentities import CheckIdentities as CI
37 from Biskit.Mod.Modeller import Modeller
38
39 import os
40 from string import *
41 import numpy.oldnumeric as N
42
43
45 """
46 Create a folder named analyse in the project root folder
47 that contains the following results:
48
49 GLOBAL: global rmsd: all atoms, c-alpha only, percentage of
50 identities, Modeller score, and the number of templates
51
52 LOCAL: results of the cross validation, rmsd per residue c-alpha
53 only for each templates and the mean rmsd
54
55 3D structure: pickle down the final.pdb that is the best model of
56 the project and the mean rmsd is set inside (temperature_factor)
57 """
58
59 F_RESULT_FOLDER = '/analyse'
60 F_TEMPLATE_FOLDER = VS.F_RESULT_FOLDER
61
62 F_PDBModels = Benchmark.F_PDBModels_OUT
63 F_MODELS = Modeller.F_RESULT_FOLDER + Modeller.F_PDBModels
64
65 F_INPUT_ALNS= '/t_coffee/final.pir_aln'
66
67 F_INPUT_RMSD = Benchmark.F_RESULT_FOLDER
68 F_RMSD_AA = Benchmark.F_RMSD_AA
69 F_RMSD_CA = Benchmark.F_RMSD_CA
70
71
72 F_OUTPUT_VALUES = F_RESULT_FOLDER + '/global_results.out'
73 F_CROSS_VAL = F_RESULT_FOLDER + '/local_results.out'
74 F_FINAL_PDB = F_RESULT_FOLDER + '/final.pdb'
75
76
77 - def __init__( self, outFolder, log=None ):
78 """
79 @param outFolder: base folder for output
80 @type outFolder: str
81 @param log: None reports to STDOUT
82 @type log: LogFile instance or None
83 """
84 self.outFolder = T.absfile( outFolder )
85 self.log = log
86
87 self.prepareFolders()
88
89
96
97
99 """
100 Parse a identity matrix file
101
102 @param name: file to parse
103 @type name: str
104
105 @return: contents of parsed file
106 @rtype: [[str]]
107 """
108 f = open( name, 'r')
109 result = []
110 lines = f.readlines()
111
112 for l in lines:
113 if not l[0] == '#':
114 r =[]
115 for s in l.split():
116 try:
117 r += [float(s)]
118 except:
119 pass
120 if len(r)>=1:
121 result += [ r ]
122
123 f.close()
124 return result
125
126
128 """
129 List all the files and folders in a directory
130 with the exceprion of ...
131
132 @param path: dir to list
133 @type path: str
134
135 @return: list of files
136 @rtype: [str]
137 """
138 files = os.listdir( path )
139 if 'CVS' in files:
140 files.remove('CVS')
141
142 return files
143
144
145
146
147
148
149
151 """
152 Global RMSD values.
153
154 @param validation_folder: folder vith validation data
155 (defult: None S{->} outFolder/L{F_TEMPLATE_FOLDER})
156 @type validation_folder: str
157
158 @return: two dictionaries:
159 - rmsd_aa_wo_if: global all atom rmsd for each
160 template without iterative fitting
161 - rmsd_aa_if: global all atom rmsd for each
162 templates with iterative fitting
163 @rtype: dict, dict
164 """
165 validation_folder = validation_folder or self.outFolder + \
166 self.F_TEMPLATE_FOLDER
167
168 folders = self.__listDir(validation_folder)
169
170 rmsd_aa_wo_if = {}
171 rmsd_aa_if = {}
172
173 for folder in folders:
174
175 file = "%s/%s"%(validation_folder, folder + self.F_RMSD_AA)
176 lst = self.parseFile( file )
177
178 rmsd_aa_wo_if[folder] = [ lst[0][0] ]
179 rmsd_aa_if[folder] = [ lst[0][1], lst[0][2]*100.]
180
181 return rmsd_aa_wo_if, rmsd_aa_if
182
183
185 """
186 Global RMSD CA values.
187
188 @param validation_folder: folder vith validation data
189 (defult: None S{->} outFolder/L{F_TEMPLATE_FOLDER})
190 @type validation_folder: str
191
192 @return: two dictionaries:
193 - rmsd_ca_wo_if: global CA rmsd for each template
194 without iterative fitting
195 - rmsd_ca_if: global CA rmsd for each template
196 with iterative fitting
197 @rtype: dict, dict
198 """
199 validation_folder = validation_folder or self.outFolder + \
200 self.F_TEMPLATE_FOLDER
201
202 folders = self.__listDir(validation_folder)
203
204 rmsd_ca_wo_if = {}
205 rmsd_ca_if = {}
206
207 for folder in folders:
208
209 file = "%s/%s"%(validation_folder, folder + self.F_RMSD_CA)
210
211 lst = self.parseFile( file )
212
213 rmsd_ca_wo_if[folder] = [ lst[0][0] ]
214 rmsd_ca_if[folder] = [ lst[0][1], lst[0][2]*100.]
215
216 return rmsd_ca_wo_if, rmsd_ca_if
217
218
220 """
221 Calculate the mean of the percentage of identities for each
222 template with the others.
223
224 @param nb_templates: number of templates used in the cross-validation
225 @type nb_templates: int
226 @param validation_folder: folder vith validation data
227 (defult: None S{->} outFolder/L{F_TEMPLATE_FOLDER})
228 @type validation_folder: str
229
230 @return: dictionary with mean percent identities for each template
231 @rtype: {str:float}
232 """
233
234 validation_folder = validation_folder or self.outFolder + \
235 self.F_TEMPLATE_FOLDER
236
237 folders = self.__listDir(validation_folder)
238 identities = {}
239
240 for folder in folders:
241 file = "%s/%s"%(validation_folder, folder + \
242 CI.F_OUTPUT_IDENTITIES_COV)
243
244 lst = self.parseFile( file )
245
246
247 identities[folder] = N.sum(lst[0][1:])/nb_templates
248
249 return identities
250
251
252 - def get_score(self, validation_folder = None):
253 """
254 Get the best global modeller score for each template re-modeled
255
256 @param validation_folder: folder vith validation data
257 (defult: None S{->} outFolder/L{F_TEMPLATE_FOLDER})
258 @type validation_folder: str
259
260 @return: dictionary with modeller score for each template
261 @rtype: {str:float}
262 """
263 validation_folder = validation_folder or self.outFolder + \
264 self.F_TEMPLATE_FOLDER
265
266 folders = self.__listDir(validation_folder)
267 score = {}
268
269 for folder in folders:
270 file = "%s/%s"%(validation_folder, folder + Modeller.F_SCORE_OUT)
271
272 file = open(file, 'r')
273 string_lines = file.readlines()[3]
274 score[folder] = float( split(string_lines)[1] )
275
276 return score
277
278
279 - def output_values(self, rmsd_aa_wo_if, rmsd_aa_if, rmsd_ca_wo_if,
280 rmsd_ca_if, identities, score, nb_templates,
281 output_file = None):
282 """
283 Write result to file.
284
285 @param rmsd_aa_wo_if: Rmsd for heavy atoms and normal fit.
286 Data should be a dictionary mapping pdb
287 codes to a list containing the rmsd value
288 and the percent of discharded atoms in
289 the rmsd calculation.
290 @type rmsd_aa_wo_if: {str:[float,float]}
291 @param rmsd_aa_if: Rmsd for heavy atoms, iterative fit.
292 @type rmsd_aa_if: {str:[float,float]}
293 @param rmsd_ca_wo_if: rmsd for only CA, normal fit.
294 @type rmsd_ca_wo_if: {str:[float,float]}
295 @param rmsd_ca_if: Rmsd for only CA, iterative fit.
296 @type rmsd_ca_if: {str:[float,float]}
297 @param identities: mean identity to template, dictionary
298 mapping pdb codes to identity values
299 @type identities: {str:float}
300 @param score: score calculated by Modeller
301 @type score: float
302 @param nb_templates: number of templates used for re-modeling
303 @type nb_templates: int
304 @param output_file: file to write
305 (default: None S{->} outFolder/L{F_OUTPUT_VALUES})
306 @type output_file: str
307 """
308 output_file = output_file or self.outFolder + self.F_OUTPUT_VALUES
309
310 file = open( output_file, 'w' )
311 file.write("PROPERTIES OF RE-MODELED TEMPLATES:\n\n")
312 file.write(" | NORMAL FIT | ITERATIVE FIT | IDENTITY | SCORE | NR\n" )
313 file.write("PDB | heavy CA | heavy percent CA percent | mean to | mod8 | of\n")
314 file.write("code | rmsd rmsd | rmsd discarded rmsd discarded | templates | | templates\n")
315
316 for key, value in rmsd_aa_wo_if.items():
317
318 file.write("%4s %6.2f %6.2f %6.2f %8.1f %7.2f %7.1f %10.1f %9i %6i\n"%\
319 (key, value[0], rmsd_ca_wo_if[key][0], rmsd_aa_if[key][0],
320 rmsd_aa_if[key][1], rmsd_ca_if[key][0], rmsd_ca_if[key][1],
321 identities[key], score[key], nb_templates))
322
323 file.close()
324
325
326
327
328
330 """
331 Collect alignment information.
332
333 @param output_folder: output folder (default: None S{->} outFolder)
334 @type output_folder: str
335
336 @return: aln_dictionary, contains information from the alignment
337 between the target and its templates
338 e.g. {'name':'target, 'seq': 'sequence of the target'}
339 @rtype: dict
340 """
341 output_folder = output_folder or self.outFolder
342
343 ci = CI(outFolder=output_folder)
344
345 string_lines = ci.get_lines()
346 aln_length = ci.search_length(string_lines)
347 aln_dictionnary = ci.get_aln_sequences(string_lines, aln_length)
348 aln_dictionnary = ci.get_aln_templates(string_lines, aln_dictionnary,
349 aln_length)
350 aln_dictionnary = ci.identities(aln_dictionnary)
351
352 return aln_dictionnary
353
354
356 """
357 Collect RMSD values between all the templates.
358
359 @param templates: name of the different templates
360 @type templates: [str]
361
362 @return: template_rmsd_dic, contains all the rmsd per residues
363 of all the templates
364 @rtype: dict
365 """
366 template_rmsd_dic = {}
367 for template in templates:
368
369 pdb_list = self.outFolder + self.F_TEMPLATE_FOLDER \
370 + "/%s"%template + self.F_PDBModels
371
372 pdb_list = T.Load(pdb_list)
373
374 template_rmsd_dic[template] = \
375 pdb_list[0].compress(pdb_list[0].maskCA()).aProfiles["rmsd2ref_if"]
376
377 return template_rmsd_dic
378
379
381 """
382 Collect RMSD profiles of each template with the target and their %ID.
383
384 @param templates: name of the different templates
385 @type templates: [str]
386 @param aln_dic: contains all the informations between the
387 target and its templates from the alignment
388 @type aln_dic: dict
389 @param template_rmsd_dic: contains all the rmsd per residues of all
390 the templates
391 @type template_rmsd_dic: dict
392
393 @return: template_profiles, contains all the profile rmsd of
394 each template with the target and their %ID
395 @rtype: dict
396 """
397 templates_profiles = {}
398
399 target_aln = aln_dic["target"]["seq"]
400 for template in templates:
401 template_aln = []
402 template_profile = []
403 template_info = {}
404
405 template_rmsd = template_rmsd_dic[template]
406 for key in aln_dic:
407 if(key[:4] == template):
408 template_aln = aln_dic[key]["seq"]
409
410 no_res = -1
411 for i in range(len(target_aln)):
412 if(template_aln[i] is not '-'):
413 no_res += 1
414
415 if(target_aln[i] != '-' and template_aln[i] != '-'):
416 template_profile.append(template_rmsd[no_res])
417
418 if(target_aln[i] != '-' and template_aln[i] == '-'):
419 template_profile.append(-1)
420
421 template_info["rProfile"] = template_profile
422
423 for key in aln_dic["target"]["cov_ID"]:
424 if(key[:4] == template):
425 template_info["cov_ID"] = \
426 aln_dic["target"]["cov_ID"][key]
427
428 templates_profiles[template] = template_info
429
430 return templates_profiles
431
432
433
434 - def output_cross_val(self, aln_dic, templates_profiles,
435 templates, model, output_file=None):
436 """
437 Calculates the mean rmsd of the model to the templates and
438 write the result to a file.
439
440 @param aln_dic: contains all the informations between the
441 target and its templates from the alignment
442 @type aln_dic: dict
443 @param templates_profiles: contains all the profile rmsd of
444 each template with the target and their %ID
445 @type templates_profiles: dict
446 @param templates: name of the different templates
447 @type templates: [str]
448 @param model: model
449 @type model: PDBModel
450 @param output_file: output file
451 (default: None S{->} outFolder/L{F_CROSS_VAL})
452 @type output_file: str
453
454 @return: mean_rmsd, dictionary with the mean rmsd of the model
455 to the templates.
456 @rtype: dict
457 """
458 output_file = output_file or self.outFolder + self.F_CROSS_VAL
459
460 mean, sum, values = 0, 0, 0
461 mean_rmsd = []
462
463 for k,v in aln_dic["target"]["cov_ID"].items():
464
465 if (k != "target"):
466 sum += aln_dic["target"]["cov_ID"][k]
467 values +=1
468
469
470
471 for i in range(len(templates_profiles[templates[0]]["rProfile"])):
472
473 mean = 0
474 sum = 0
475 n_values = 0
476
477 for k in templates_profiles:
478
479 if(templates_profiles[k]["rProfile"][i] != -1):
480 sum += templates_profiles[k]["rProfile"][i]
481 n_values += 1
482
483 if(n_values != 0):
484 mean = float(sum) / float(n_values)
485
486 else: mean = -1
487
488 mean_rmsd.append(mean)
489
490
491 file = open (output_file, 'w')
492 file.write("Mean rmsd of model to templates and the residue rmsd.\n")
493
494
495 file.write(" "*7)
496 for k in templates_profiles.keys():
497 file.write("%6s"%k)
498 file.write(" mean\n")
499
500
501 file.write(" "*7)
502 for k in templates_profiles.keys():
503 file.write("%6.2f"%templates_profiles[k]["cov_ID"])
504 file.write("\n%s\n"%('='*70))
505
506
507 res_nr = model.compress( model.maskCA()).aProfiles['residue_number']
508 res_na = model.compress( model.maskCA()).aProfiles['residue_name']
509 for i in range(len(templates_profiles[templates[0]]["rProfile"])):
510 file.write("%3i %3s"%(res_nr[i],
511 res_na[i]))
512 for k in templates_profiles:
513 file.write("%6.2f"%(templates_profiles[k]["rProfile"][i]))
514
515 file.write("%6.2f\n"%(mean_rmsd[i]))
516
517 file.close()
518
519 return mean_rmsd
520
521
522
523
524
525
527 """
528 pickle down the final.pdb which is judged to be the best model
529 of the project. The mean rmsd to the templates is written to the
530 temperature_factor column.
531
532 @param mean_rmsd_atoms: mean rmsd for each atom of the
533 target's model
534 @type mean_rmsd_atoms: [int]
535 @param model: target's model with the highest modeller score
536 @type model: PDBModel
537 """
538 model['temperature_factor'] = v
539
540 model.writePdb(self.outFolder + self.F_FINAL_PDB)
541
542
543
544
545
546
547
548 - def go(self, output_folder = None, template_folder = None):
549 """
550 Run analysis of models.
551
552 @param output_folder: folder for result files
553 (default: None S{->} outFolder/L{F_RESULT_FOLDER})
554 @type output_folder: str
555 @param template_folder: folder with template structures
556 (default: None S{->} outFolder/L{VS.F_RESULT_FOLDER})
557 @type template_folder: str
558 """
559
560 pdb_list = T.Load(self.outFolder + self.F_MODELS)
561 model = PDBModel(pdb_list[0])
562
563
564 output_folder = output_folder or self.outFolder + self.F_RESULT_FOLDER
565 template_folder = template_folder or self.outFolder +VS.F_RESULT_FOLDER
566
567 templates = self.__listDir(template_folder)
568
569
570 global_rmsd_aa_wo_if, global_rmsd_aa_if = self.global_rmsd_aa()
571 global_rmsd_ca_wo_if, global_rmsd_ca_if = self.global_rmsd_ca()
572 nb_templates = len(templates)-1
573
574 identities = self.get_identities(nb_templates)
575 score = self.get_score()
576
577 self.output_values(global_rmsd_aa_wo_if, global_rmsd_aa_if,
578 global_rmsd_ca_wo_if, global_rmsd_ca_if,
579 identities, score, nb_templates)
580
581
582 aln_dic = self.get_aln_info(output_folder=self.outFolder)
583
584 template_rmsd_dic = self.get_templates_rmsd(templates)
585 templates_profiles = self.templates_profiles(templates,
586 aln_dic,
587 template_rmsd_dic)
588 mean_rmsd = self.output_cross_val(aln_dic, templates_profiles,
589 templates, model)
590
591
592 mean_rmsd_atoms = model.res2atomProfile(mean_rmsd)
593 self.updatePDBs_charge(mean_rmsd_atoms, model)
594
595
596
597
598
599
600 import Biskit.test as BT
601
602 -class Test( BT.BiskitTest ):
603 """
604 Test class
605 """
606
608 import tempfile
609 import shutil
610
611
612 self.outfolder = tempfile.mkdtemp( '_test_Analyse' )
613
614
615 dir = '/Mod/project/validation/'
616 for v in ['1DT7', '1J55']:
617
618 os.makedirs( self.outfolder +'/validation/%s/modeller'%v )
619 shutil.copy( T.testRoot() +dir+'/%s/modeller/Modeller_Score.out'%v,
620 self.outfolder + '/validation/%s/modeller'%v)
621
622 shutil.copy( T.testRoot() + dir + '/%s/identities_cov.out'%v,
623 self.outfolder + '/validation/%s'%v )
624
625 os.mkdir( self.outfolder +'/validation/%s/benchmark'%v )
626 for f in [ 'rmsd_aa.out', 'rmsd_ca.out', 'PDBModels.list' ]:
627 shutil.copy( T.testRoot() + dir + '/%s/benchmark/%s'%(v,f),
628 self.outfolder + '/validation/%s/benchmark'%v )
629
630
631
632 os.mkdir( self.outfolder +'/modeller' )
633 shutil.copy( T.testRoot() + '/Mod/project/modeller/PDBModels.list',
634 self.outfolder + '/modeller' )
635
636 os.mkdir( self.outfolder +'/t_coffee' )
637 shutil.copy( T.testRoot() + '/Mod/project/t_coffee/final.pir_aln',
638 self.outfolder + '/t_coffee' )
639
640
642 """Mod.Analyzer test"""
643
644 self.a = Analyse( outFolder = self.outfolder )
645 self.a.go()
646
647 if self.local and self.DEBUG:
648 self.log.add(
649 'The result from the analysis is in %s/analyse'%self.outfolder)
650
653
654
656 """
657 Test case for analyzing a complete modeling project in test/Mod/project
658 """
659
660
662 """
663 Mod.Analyze full test/Mod/project test
664 """
665 self.outfolder = T.testRoot() + '/Mod/project'
666
667 self.a = Analyse( outFolder = self.outfolder )
668 self.a.go()
669
670 if self.local:
671 print 'The result from the analysis can be found in %s/analyse'%outfolder
672
673 if __name__ == '__main__':
674
675 BT.localTest()
676