Biskit.Mod.Analyse

45 """ 46 Create a folder named analyse in the project root folder 47 that contains the following results: 48 49 GLOBAL: global rmsd: all atoms, c-alpha only, percentage of 50 identities, Modeller score, and the number of templates 51 52 LOCAL: results of the cross validation, rmsd per residue c-alpha 53 only for each templates and the mean rmsd 54 55 3D structure: pickle down the final.pdb that is the best model of 56 the project and the mean rmsd is set inside (temperature_factor) 57 """ 58 59 F_RESULT_FOLDER = '/analyse' 60 F_TEMPLATE_FOLDER = VS.F_RESULT_FOLDER 61 62 F_PDBModels = Benchmark.F_PDBModels_OUT 63 F_MODELS = Modeller.F_RESULT_FOLDER + Modeller.F_PDBModels 64 65 F_INPUT_ALNS= '/t_coffee/final.pir_aln' 66 67 F_INPUT_RMSD = Benchmark.F_RESULT_FOLDER 68 F_RMSD_AA = Benchmark.F_RMSD_AA 69 F_RMSD_CA = Benchmark.F_RMSD_CA 70 71 72 F_OUTPUT_VALUES = F_RESULT_FOLDER + '/global_results.out' 73 F_CROSS_VAL = F_RESULT_FOLDER + '/local_results.out' 74 F_FINAL_PDB = F_RESULT_FOLDER + '/final.pdb' 75 76

77 - def __init__( self, outFolder, log=None ):

78 """ 79 @param outFolder: base folder for output 80 @type outFolder: str 81 @param log: None reports to STDOUT 82 @type log: LogFile instance or None 83 """ 84 self.outFolder = T.absfile( outFolder ) 85 self.log = log 86 87 self.prepareFolders()

88 89

90 - def prepareFolders( self ):

91 """ 92 Create folders needed by this class. 93 """ 94 if not os.path.exists( self.outFolder + self.F_RESULT_FOLDER ): 95 os.mkdir( self.outFolder + self.F_RESULT_FOLDER )

96 97

98 - def parseFile( self, name ):

99 """ 100 Parse a identity matrix file 101 102 @param name: file to parse 103 @type name: str 104 105 @return: contents of parsed file 106 @rtype: [[str]] 107 """ 108 f = open( name, 'r') 109 result = [] 110 lines = f.readlines() 111 112 for l in lines: 113 if not l[0] == '#': 114 r =[] 115 for s in l.split(): 116 try: 117 r += [float(s)] 118 except: 119 pass 120 if len(r)>=1: 121 result += [ r ] 122 123 f.close() 124 return result

125 126

127 - def __listDir( self, path ):

128 """ 129 List all the files and folders in a directory 130 with the exceprion of ... 131 132 @param path: dir to list 133 @type path: str 134 135 @return: list of files 136 @rtype: [str] 137 """ 138 files = os.listdir( path ) 139 if 'CVS' in files: 140 files.remove('CVS') 141 142 return files

143 144 145 ###################################################################### 146 #### GLOBAL RESTULTS: RMSD_AA, RMSD_CA, 147 #### %ID(mean of the templates), 148 #### Nb of Templates 149

150 - def global_rmsd_aa(self, validation_folder= None):

151 """ 152 Global RMSD values. 153 154 @param validation_folder: folder vith validation data 155 (defult: None S{->} outFolder/L{F_TEMPLATE_FOLDER}) 156 @type validation_folder: str 157 158 @return: two dictionaries: 159 - rmsd_aa_wo_if: global all atom rmsd for each 160 template without iterative fitting 161 - rmsd_aa_if: global all atom rmsd for each 162 templates with iterative fitting 163 @rtype: dict, dict 164 """ 165 validation_folder = validation_folder or self.outFolder + \ 166 self.F_TEMPLATE_FOLDER 167 168 folders = self.__listDir(validation_folder) 169 170 rmsd_aa_wo_if = {} 171 rmsd_aa_if = {} 172 173 for folder in folders: 174 175 file = "%s/%s"%(validation_folder, folder + self.F_RMSD_AA) 176 lst = self.parseFile( file ) 177 178 rmsd_aa_wo_if[folder] = [ lst[0][0] ] 179 rmsd_aa_if[folder] = [ lst[0][1], lst[0][2]*100.] 180 181 return rmsd_aa_wo_if, rmsd_aa_if

182 183

184 - def global_rmsd_ca(self, validation_folder= None):

185 """ 186 Global RMSD CA values. 187 188 @param validation_folder: folder vith validation data 189 (defult: None S{->} outFolder/L{F_TEMPLATE_FOLDER}) 190 @type validation_folder: str 191 192 @return: two dictionaries: 193 - rmsd_ca_wo_if: global CA rmsd for each template 194 without iterative fitting 195 - rmsd_ca_if: global CA rmsd for each template 196 with iterative fitting 197 @rtype: dict, dict 198 """ 199 validation_folder = validation_folder or self.outFolder + \ 200 self.F_TEMPLATE_FOLDER 201 202 folders = self.__listDir(validation_folder) 203 204 rmsd_ca_wo_if = {} 205 rmsd_ca_if = {} 206 207 for folder in folders: 208 209 file = "%s/%s"%(validation_folder, folder + self.F_RMSD_CA) 210 211 lst = self.parseFile( file ) 212 213 rmsd_ca_wo_if[folder] = [ lst[0][0] ] 214 rmsd_ca_if[folder] = [ lst[0][1], lst[0][2]*100.] 215 216 return rmsd_ca_wo_if, rmsd_ca_if

217 218

219 - def get_identities(self, nb_templates, validation_folder = None):

220 """ 221 Calculate the mean of the percentage of identities for each 222 template with the others. 223 224 @param nb_templates: number of templates used in the cross-validation 225 @type nb_templates: int 226 @param validation_folder: folder vith validation data 227 (defult: None S{->} outFolder/L{F_TEMPLATE_FOLDER}) 228 @type validation_folder: str 229 230 @return: dictionary with mean percent identities for each template 231 @rtype: {str:float} 232 """ 233 234 validation_folder = validation_folder or self.outFolder + \ 235 self.F_TEMPLATE_FOLDER 236 237 folders = self.__listDir(validation_folder) 238 identities = {} 239 240 for folder in folders: 241 file = "%s/%s"%(validation_folder, folder + \ 242 CI.F_OUTPUT_IDENTITIES_COV) 243 244 lst = self.parseFile( file ) 245 246 ## identity to mean template 247 identities[folder] = N.sum(lst[0][1:])/nb_templates 248 249 return identities

250 251

252 - def get_score(self, validation_folder = None):

253 """ 254 Get the best global modeller score for each template re-modeled 255 256 @param validation_folder: folder vith validation data 257 (defult: None S{->} outFolder/L{F_TEMPLATE_FOLDER}) 258 @type validation_folder: str 259 260 @return: dictionary with modeller score for each template 261 @rtype: {str:float} 262 """ 263 validation_folder = validation_folder or self.outFolder + \ 264 self.F_TEMPLATE_FOLDER 265 266 folders = self.__listDir(validation_folder) 267 score = {} 268 269 for folder in folders: 270 file = "%s/%s"%(validation_folder, folder + Modeller.F_SCORE_OUT) 271 272 file = open(file, 'r') 273 string_lines = file.readlines()[3] 274 score[folder] = float( split(string_lines)[1] ) 275 276 return score

277 278

279 - def output_values(self, rmsd_aa_wo_if, rmsd_aa_if, rmsd_ca_wo_if, 280 rmsd_ca_if, identities, score, nb_templates, 281 output_file = None):

282 """ 283 Write result to file. 284 285 @param rmsd_aa_wo_if: Rmsd for heavy atoms and normal fit. 286 Data should be a dictionary mapping pdb 287 codes to a list containing the rmsd value 288 and the percent of discharded atoms in 289 the rmsd calculation. 290 @type rmsd_aa_wo_if: {str:[float,float]} 291 @param rmsd_aa_if: Rmsd for heavy atoms, iterative fit. 292 @type rmsd_aa_if: {str:[float,float]} 293 @param rmsd_ca_wo_if: rmsd for only CA, normal fit. 294 @type rmsd_ca_wo_if: {str:[float,float]} 295 @param rmsd_ca_if: Rmsd for only CA, iterative fit. 296 @type rmsd_ca_if: {str:[float,float]} 297 @param identities: mean identity to template, dictionary 298 mapping pdb codes to identity values 299 @type identities: {str:float} 300 @param score: score calculated by Modeller 301 @type score: float 302 @param nb_templates: number of templates used for re-modeling 303 @type nb_templates: int 304 @param output_file: file to write 305 (default: None S{->} outFolder/L{F_OUTPUT_VALUES}) 306 @type output_file: str 307 """ 308 output_file = output_file or self.outFolder + self.F_OUTPUT_VALUES 309 310 file = open( output_file, 'w' ) 311 file.write("PROPERTIES OF RE-MODELED TEMPLATES:\n\n") 312 file.write(" | NORMAL FIT | ITERATIVE FIT | IDENTITY | SCORE | NR\n" ) 313 file.write("PDB | heavy CA | heavy percent CA percent | mean to | mod8 | of\n") 314 file.write("code | rmsd rmsd | rmsd discarded rmsd discarded | templates | | templates\n") 315 316 for key, value in rmsd_aa_wo_if.items(): 317 318 file.write("%4s %6.2f %6.2f %6.2f %8.1f %7.2f %7.1f %10.1f %9i %6i\n"%\ 319 (key, value[0], rmsd_ca_wo_if[key][0], rmsd_aa_if[key][0], 320 rmsd_aa_if[key][1], rmsd_ca_if[key][0], rmsd_ca_if[key][1], 321 identities[key], score[key], nb_templates)) 322 323 file.close()

324 325 326 ######################################################################## 327 ######### LOCAL RESULTS: Cross Validation ---- RMSD / Res 328

329 - def get_aln_info(self, output_folder = None):

330 """ 331 Collect alignment information. 332 333 @param output_folder: output folder (default: None S{->} outFolder) 334 @type output_folder: str 335 336 @return: aln_dictionary, contains information from the alignment 337 between the target and its templates 338 e.g. {'name':'target, 'seq': 'sequence of the target'} 339 @rtype: dict 340 """ 341 output_folder = output_folder or self.outFolder 342 343 ci = CI(outFolder=output_folder) 344 345 string_lines = ci.get_lines() 346 aln_length = ci.search_length(string_lines) 347 aln_dictionnary = ci.get_aln_sequences(string_lines, aln_length) 348 aln_dictionnary = ci.get_aln_templates(string_lines, aln_dictionnary, 349 aln_length) 350 aln_dictionnary = ci.identities(aln_dictionnary) 351 352 return aln_dictionnary

353 354

355 - def get_templates_rmsd(self, templates):

356 """ 357 Collect RMSD values between all the templates. 358 359 @param templates: name of the different templates 360 @type templates: [str] 361 362 @return: template_rmsd_dic, contains all the rmsd per residues 363 of all the templates 364 @rtype: dict 365 """ 366 template_rmsd_dic = {} 367 for template in templates: 368 369 pdb_list = self.outFolder + self.F_TEMPLATE_FOLDER \ 370 + "/%s"%template + self.F_PDBModels 371 372 pdb_list = T.Load(pdb_list) 373 374 template_rmsd_dic[template] = \ 375 pdb_list[0].compress(pdb_list[0].maskCA()).aProfiles["rmsd2ref_if"] 376 377 return template_rmsd_dic

378 379

380 - def templates_profiles(self, templates, aln_dic, template_rmsd_dic):

381 """ 382 Collect RMSD profiles of each template with the target and their %ID. 383 384 @param templates: name of the different templates 385 @type templates: [str] 386 @param aln_dic: contains all the informations between the 387 target and its templates from the alignment 388 @type aln_dic: dict 389 @param template_rmsd_dic: contains all the rmsd per residues of all 390 the templates 391 @type template_rmsd_dic: dict 392 393 @return: template_profiles, contains all the profile rmsd of 394 each template with the target and their %ID 395 @rtype: dict 396 """ 397 templates_profiles = {} 398 399 target_aln = aln_dic["target"]["seq"] 400 for template in templates: 401 template_aln = [] 402 template_profile = [] 403 template_info = {} 404 405 template_rmsd = template_rmsd_dic[template] 406 for key in aln_dic: 407 if(key[:4] == template): 408 template_aln = aln_dic[key]["seq"] 409 410 no_res = -1 411 for i in range(len(target_aln)): 412 if(template_aln[i] is not '-'): 413 no_res += 1 414 415 if(target_aln[i] != '-' and template_aln[i] != '-'): 416 template_profile.append(template_rmsd[no_res]) 417 418 if(target_aln[i] != '-' and template_aln[i] == '-'): 419 template_profile.append(-1) 420 421 template_info["rProfile"] = template_profile 422 423 for key in aln_dic["target"]["cov_ID"]: 424 if(key[:4] == template): 425 template_info["cov_ID"] = \ 426 aln_dic["target"]["cov_ID"][key] 427 428 templates_profiles[template] = template_info 429 430 return templates_profiles

431 432 433

434 - def output_cross_val(self, aln_dic, templates_profiles, 435 templates, model, output_file=None):

436 """ 437 Calculates the mean rmsd of the model to the templates and 438 write the result to a file. 439 440 @param aln_dic: contains all the informations between the 441 target and its templates from the alignment 442 @type aln_dic: dict 443 @param templates_profiles: contains all the profile rmsd of 444 each template with the target and their %ID 445 @type templates_profiles: dict 446 @param templates: name of the different templates 447 @type templates: [str] 448 @param model: model 449 @type model: PDBModel 450 @param output_file: output file 451 (default: None S{->} outFolder/L{F_CROSS_VAL}) 452 @type output_file: str 453 454 @return: mean_rmsd, dictionary with the mean rmsd of the model 455 to the templates. 456 @rtype: dict 457 """ 458 output_file = output_file or self.outFolder + self.F_CROSS_VAL 459 460 mean, sum, values = 0, 0, 0 461 mean_rmsd = [] 462 463 for k,v in aln_dic["target"]["cov_ID"].items(): 464 465 if (k != "target"): 466 sum += aln_dic["target"]["cov_ID"][k] 467 values +=1 468 469 ## cov_id_target = float(sum/values) 470 471 for i in range(len(templates_profiles[templates[0]]["rProfile"])): 472 473 mean = 0 474 sum = 0 475 n_values = 0 476 477 for k in templates_profiles: 478 479 if(templates_profiles[k]["rProfile"][i] != -1): 480 sum += templates_profiles[k]["rProfile"][i] 481 n_values += 1 482 483 if(n_values != 0): 484 mean = float(sum) / float(n_values) 485 486 else: mean = -1 487 488 mean_rmsd.append(mean) 489 490 ## write header 491 file = open (output_file, 'w') 492 file.write("Mean rmsd of model to templates and the residue rmsd.\n") 493 494 ## write pdb code 495 file.write(" "*7) 496 for k in templates_profiles.keys(): 497 file.write("%6s"%k) 498 file.write(" mean\n") 499 500 ## write mean rmsd 501 file.write(" "*7) 502 for k in templates_profiles.keys(): 503 file.write("%6.2f"%templates_profiles[k]["cov_ID"]) 504 file.write("\n%s\n"%('='*70)) 505 506 ## write rmsd residue profiles 507 res_nr = model.compress( model.maskCA()).aProfiles['residue_number'] 508 res_na = model.compress( model.maskCA()).aProfiles['residue_name'] 509 for i in range(len(templates_profiles[templates[0]]["rProfile"])): 510 file.write("%3i %3s"%(res_nr[i], 511 res_na[i])) 512 for k in templates_profiles: 513 file.write("%6.2f"%(templates_profiles[k]["rProfile"][i])) 514 515 file.write("%6.2f\n"%(mean_rmsd[i])) 516 517 file.close() 518 519 return mean_rmsd

520 521 522 ################################################# 523 ###### 3D Structure: mean RMSD 524 525

526 - def updatePDBs_charge(self, mean_rmsd_atoms, model):

527 """ 528 pickle down the final.pdb which is judged to be the best model 529 of the project. The mean rmsd to the templates is written to the 530 temperature_factor column. 531 532 @param mean_rmsd_atoms: mean rmsd for each atom of the 533 target's model 534 @type mean_rmsd_atoms: [int] 535 @param model: target's model with the highest modeller score 536 @type model: PDBModel 537 """ 538 model['temperature_factor'] = v 539 540 model.writePdb(self.outFolder + self.F_FINAL_PDB)

541 542 543 544 ###################### 545 ### LAUNCH FUNCTION ## 546 ###################### 547

548 - def go(self, output_folder = None, template_folder = None):

549 """ 550 Run analysis of models. 551 552 @param output_folder: folder for result files 553 (default: None S{->} outFolder/L{F_RESULT_FOLDER}) 554 @type output_folder: str 555 @param template_folder: folder with template structures 556 (default: None S{->} outFolder/L{VS.F_RESULT_FOLDER}) 557 @type template_folder: str 558 """ 559 ## 560 pdb_list = T.Load(self.outFolder + self.F_MODELS) 561 model = PDBModel(pdb_list[0]) 562 563 ## 564 output_folder = output_folder or self.outFolder + self.F_RESULT_FOLDER 565 template_folder = template_folder or self.outFolder +VS.F_RESULT_FOLDER 566 567 templates = self.__listDir(template_folder) 568 569 ## 570 global_rmsd_aa_wo_if, global_rmsd_aa_if = self.global_rmsd_aa() 571 global_rmsd_ca_wo_if, global_rmsd_ca_if = self.global_rmsd_ca() 572 nb_templates = len(templates)-1 573 574 identities = self.get_identities(nb_templates) 575 score = self.get_score() 576 577 self.output_values(global_rmsd_aa_wo_if, global_rmsd_aa_if, 578 global_rmsd_ca_wo_if, global_rmsd_ca_if, 579 identities, score, nb_templates) 580 581 ## 582 aln_dic = self.get_aln_info(output_folder=self.outFolder) 583 584 template_rmsd_dic = self.get_templates_rmsd(templates) 585 templates_profiles = self.templates_profiles(templates, 586 aln_dic, 587 template_rmsd_dic) 588 mean_rmsd = self.output_cross_val(aln_dic, templates_profiles, 589 templates, model) 590 591 ## 592 mean_rmsd_atoms = model.res2atomProfile(mean_rmsd) 593 self.updatePDBs_charge(mean_rmsd_atoms, model)

Source Code for Module Biskit.Mod.Analyse