Biskit.Mod.Modeller

58 """ 59 Run a Modeller job 60 ================== 61 62 Take Alignment from t_coffee and template PDBs. Creates a modeller-script 63 and runs modeller and post-processes the results. The Modeller output ends 64 up in the output folder. 65 66 Example usage 67 ------------- 68 >>> m = Modeller( outfolder ) 69 >>> m.run() 70 71 References 72 ---------- 73 - U{http://www.salilab.org/modeller/} 74 """ 75 76 MODELLER_TEMPLATE = """ 77 # Homology modelling by the MODELLER TOP routine 'model'. 78 79 INCLUDE # Include the predefined TOP routines 80 81 SET OUTPUT_CONTROL = 1 1 1 1 1 # uncomment to produce a large log file 82 SET ALNFILE = '%(f_pir)s' # alignment filename 83 SET KNOWNS = %(template_ids)s 84 SET SEQUENCE = '%(target_id)s' # code of the target 85 SET ATOM_FILES_DIRECTORY = '%(template_folder)s'# directories for input atom files 86 SET STARTING_MODEL= %(starting_model)i # index of the first model 87 SET ENDING_MODEL = %(ending_model)i # index of the last model 88 #(determines how many models to calculate) 89 90 CALL ROUTINE = 'model' # do homology modelling 91 """ 92 93 F_RESULT_FOLDER = '/modeller' 94 F_MOD_SCRIPT = 'modeller.top' 95 96 F_INPUT_FOLDER = F_RESULT_FOLDER 97 98 ## default modeller inp file 99 F_INP = F_RESULT_FOLDER + '/' + F_MOD_SCRIPT 100 101 ## standard file name output for PDBModels 102 F_PDBModels = '/PDBModels.list' 103 104 ## standard file name output for Objective Function 105 F_SCORE_OUT = F_RESULT_FOLDER + '/Modeller_Score.out' 106 107

108 - def __init__( self, outFolder='.', log=None, fasta_target=None, 109 f_pir=None, template_folder=None, fout=None, 110 starting_model=1, ending_model=10, verbose=0, 111 host=None, **kw ):

112 """ 113 @param outFolder: base folder for Modeller output 114 (default: L{F_RESULT_FOLDER}) 115 @type outFolder: str 116 @param log: log file instance, if None, STDOUT is used (default: None) 117 @type log: LogFile 118 @param f_pir: PIR alignment file (default: None -> L{Aligner.F_FINAL_ALN}) 119 @type f_pir: str 120 @param template_folder: folder with template coordinate files 121 (default: None -> L{TC.F_MODELLER}) 122 @type template_folder: str 123 @param fout: name of modeller script (default: None -> L{F_INP}) 124 @type fout: None or str 125 @param starting_model: first model (default: 1) 126 @type starting_model: int 127 @param ending_model: last model (default: 10) 128 @type ending_model: int 129 @param verbose: verbosity level (default: 1) 130 @type verbose: 1|0 131 @param host: host for calculation (None->no ssh) (default: None) 132 @type host: str 133 """ 134 self.outFolder = T.absfile( outFolder ) 135 self.log = log 136 137 self.verbose = verbose 138 139 self.f_inp = None 140 141 self.f_pir = f_pir 142 self.template_folder = template_folder 143 self.starting_model = starting_model 144 self.ending_model = ending_model 145 146 f_top = '%s%s'%(self.outFolder, self.F_INP) 147 148 self.template_ids = '' 149 self.target_id = '' 150 151 ## sequence ids from the last prepared alignment 152 self.pir_ids = [] 153 154 Executor.__init__( self, 'modeller', args=f_top, f_in=f_top, 155 template=self.MODELLER_TEMPLATE, 156 cwd='%s%s'%(self.outFolder, self.F_RESULT_FOLDER), 157 verbose=verbose, node=host, **kw )

158 159

160 - def prepareFolders( self ):

161 """ 162 Create needed output folders if they don't exist. 163 """ 164 if not os.path.exists( self.outFolder + self.F_RESULT_FOLDER ): 165 os.mkdir( self.outFolder + self.F_RESULT_FOLDER ) 166 if self.verbose: 167 self.logWrite( 'Creating '+self.outFolder + self.F_RESULT_FOLDER)

168 169

170 - def prepare( self ):

171 """ 172 Create needed folders and get information needed by Modeller 173 """ 174 self.prepareFolders() 175 176 self.prepare_modeller()

177 178

179 - def generateInp(self):

180 """ 181 Convert the list of template id's into a string in addition 182 L{Executor.generateInp} functionality. 183 184 Replace formatstr place holders in inp by fields of this class. 185 186 @return: input file name OR (if pipes=1) content of input file 187 @rtype: str 188 189 @raise TemplateError: if error while creating template file 190 """ 191 ## we need to convert the list of template id's into a string 192 self.template_ids = [ "'%s'"%id for id in self.template_ids ] 193 self.template_ids = ' '.join( self.template_ids ) 194 195 try: 196 inp = None 197 198 if self.template: 199 inp = self.fillTemplate() 200 201 return self.convertInput( inp ) 202 203 except Exception, why: 204 s = "Error while creating template file." 205 s += "\n template file: " + str( self.template ) 206 s += "\n why: " + str( why ) 207 s += "\n Error:\n " + t.lastError() 208 raise TemplateError, s

209 210

211 - def logWrite( self, msg, force=1 ):

212 """ 213 Write message to log. 214 215 @param msg: message to print 216 @type msg: str 217 """ 218 if self.log: 219 self.log.add( msg ) 220 else: 221 if force: 222 print msg

223 224

225 - def get_template_ids( self ):

226 """ 227 Extract names of all PDB files in a folder (without .PDB) 228 229 @return: list of PDB names 230 @rtype: [str] 231 """ 232 fs = os.listdir( self.template_folder ) 233 return [ f[:-4] for f in fs if f[-4:].upper()=='.PDB' ]

234 235

236 - def get_target_id( self, f_fasta ):

237 """ 238 Extract (first) sequence id from fasta file. Make intelligent guess 239 if there is no exact match between target fasta and alignment file. 240 241 @param f_fasta: fasta file with target sequence 242 @type f_fasta: str 243 244 @raise ModellerError: if target ID is not found in alignment 245 """ 246 ex_fasta_id = re.compile('^>([A-Za-z0-9_]+)') 247 248 title = open( f_fasta ).readline() 249 250 try: 251 id_fasta = ex_fasta_id.findall( title )[0] 252 except IndexError, why: 253 raise ModellerError( "Can't extract sequence id from "+f_fasta ) 254 255 if self.pir_ids: 256 ## the id matches one previously found in the alignment 257 if id_fasta in self.pir_ids: 258 return id_fasta 259 260 guesses = [ pid for pid in self.pir_ids 261 if pid[:len(id_fasta)] == id_fasta ] 262 263 if len( guesses ) == 1: 264 return guesses[0] 265 266 raise ModellerError("Target ID %s is not found in alignment." %\ 267 id_fasta ) 268 269 self.logWrite('Warning: target sequence ID extracted without check'+ 270 ' against alignment file. Use prepare_alignment first.') 271 272 title.close() 273 274 return id_fasta

275 276

277 - def prepare_alignment( self ):

278 """ 279 Convert t_coffee - generated PIR into PIR for Modeller. 280 On the way all sequence ids are extracted into self.pir_ids. 281 282 @raise ModellerError: if can't rename file 283 """ 284 ex_title = re.compile( '^>P1;([A-Za-z0-9_]+)' ) 285 ex_description = re.compile( '^sequence|^structure' ) 286 287 self.pir_ids = [] 288 289 try: 290 bak_fname = self.f_pir + '_original' 291 os.rename( self.f_pir, bak_fname ) 292 except OSError, why: 293 raise ModellerError( "Can't rename " + self.f_pir + " to "+ bak_fname ) 294 295 lines = open( bak_fname ).readlines() 296 297 for i in range( len( lines ) ): 298 current = lines[ i ] 299 300 if ex_title.match( current ): 301 id = ex_title.findall( current )[0] 302 303 self.pir_ids += [ id ] 304 305 prefix = 'sequence' 306 if id in self.template_ids: 307 prefix = 'structure' 308 ## if lines[i + 2][0] != '*': 309 ## lines[i + 2] = '*' + lines[i + 2] 310 311 lines[i + 1] = "%s:%s: : : : : : : : \n" \ 312 % (prefix, id) 313 314 else: 315 if not ex_description.match( current ): 316 lines[ i ] = current.upper() 317 318 open( self.f_pir, 'w' ).writelines( lines )

319 320

321 - def prepare_modeller( self, fasta_target=None, fout=None ):

322 """ 323 Prepare information needed by modeller. 324 325 @param fasta_target: target fasta file 326 (default: None -> L{SS.F_FASTA_TARGET}) 327 @type fasta_target: str 328 @param fout: name of modeller script (default: None -> L{F_INP}) 329 @type fout: None or str 330 """ 331 fasta_target = fasta_target or self.outFolder + SS.F_FASTA_TARGET 332 self.template_folder = self.template_folder or self.outFolder + TC.F_MODELLER 333 self.f_pir = self.f_pir or self.outFolder + Aligner.F_FINAL_ALN 334 335 self.template_ids = self.get_template_ids( ) 336 337 ## add Modeller-style lines and extract sequence ids 338 self.prepare_alignment( ) 339 340 ## guess target seq id within alignment 341 self.target_id = self.get_target_id( fasta_target )

342 343

344 - def finish( self ):

345 """ 346 Overrides Executor method 347 """ 348 Executor.finish( self ) 349 350 self.postProcess()

351 352 353 #################### 354 ## Post processing 355

356 - def pdb_list(self, model_folder):

357 """ 358 Compile a list with the names of all models created by Modeller. 359 360 @param model_folder: folder containing model PDBs 361 (output from Modeller) 362 @type model_folder: str 363 364 @return: pdb files from modeller 365 @rtype: [str] 366 """ 367 return glob.glob('%s/target.B*'%(model_folder))

368 369

370 - def extract_modeller_score(self, f_model, model):

371 """ 372 Extract the modeller score from the models 373 374 @param f_model: file name of model pdb 375 @type f_model: str 376 @param model: model 377 @type model: PDBModel 378 379 @return: modeller score for the models 380 @rtype: float 381 """ 382 r1 = re.compile(r'\w\d.*\d') 383 384 line_of_interest = linecache.getline(f_model, 2) 385 OF = r1.findall(line_of_interest) 386 387 file = open(model.validSource(),'r') 388 string = file.readlines()[:2] 389 model.info["headlines"] = string 390 file.close() 391 392 return float(OF[0])

393 394

395 - def output_score(self, pdb_list, model_folder):

396 """ 397 Write a list of model scores to file L{F_SCORE_OUT} 398 399 @param pdb_list: list of models 400 @type pdb_list: ModelList 401 @param model_folder: ouput folder 402 @type model_folder: str 403 """ 404 file_output = open(self.outFolder+self.F_SCORE_OUT,'w') 405 file_output.write("The models from modeller with their Score:\n") 406 407 for model in pdb_list: 408 409 file_output.write('%s\t%6.2f\n'%(T.stripFilename( 410 model.validSource()), model.info["mod_score"])) 411 412 file_output.close()

413 414

415 - def update_PDB(self, pdb_list, cwd, model_folder):

416 """ 417 Extract number of templates per residue position from alignment. 418 Write new PDBs with number of templates in occupancy column. 419 420 @param pdb_list: list of models 421 @type pdb_list: ModelList 422 @param cwd: current project directory 423 @type cwd: str 424 @param model_folder: folder with models 425 @type model_folder: str 426 """ 427 cwd = cwd or self.outFolder 428 429 ci = CI(cwd) 430 aln_dictionnary = ci.go() 431 432 template_info = aln_dictionnary["target"]["template_info"] 433 434 for model in pdb_list: 435 436 model.setResProfile("n_templates", template_info, 437 comment="number of templates for each residue") 438 439 rmask = pdb_list[0].profile2mask("n_templates", 1,1000) 440 amask = pdb_list[0].res2atomMask(rmask) 441 442 443 for i in range(len(pdb_list)): 444 445 for a,v in zip(pdb_list[i].atoms, amask): 446 a['occupancy'] = v 447 448 t = [] 449 450 for string in pdb_list[i].info["headlines"]: 451 452 tuple = () 453 tuple = string.split()[0],join(string.split()[1:]) 454 t.append(tuple) 455 456 pdb_list[i].writePdb('%s/model_%02i.pdb'%(model_folder,i), 457 headlines = t)

458 459

460 - def update_rProfiles(self, pdb_list):

461 """ 462 Read Modeller score from the 'temperature_factor' column and 463 create a profile from it. 464 465 @param pdb_list: list of models 466 @type pdb_list: ModelList 467 """ 468 for model in pdb_list: 469 470 m = model.compress( model.maskCA()) 471 472 atoms = DictList( m.atoms ) 473 prof = atoms.valuesOf( "temperature_factor") 474 475 model.setResProfile('mod_score', prof)

476 477

478 - def write_PDBModels(self, pdb_list, model_folder = None):

479 """ 480 Dump the list of PDBModels. 481 482 @param pdb_list: list of models 483 @type pdb_list: ModelList 484 @param model_folder: ouput folder (default: None -> L{F_PDBModels}) 485 @type model_folder: str 486 """ 487 T.Dump(pdb_list, '%s'%(model_folder + self.F_PDBModels))

488 489

490 - def postProcess(self, model_folder=None):

491 """ 492 Rename model PDBs to 'model_01.pdb' - 'model_xx.pdb' according to their 493 score, create and dump a ModelList instance containing these models 494 and info-records and profiles with the Modeller score. 495 496 @param model_folder: folder containing model PDBs 497 (default: None -> L{F_INPUT_FOLDER}) 498 @type model_folder: str 499 """ 500 501 model_folder = model_folder or self.outFolder + self.F_INPUT_FOLDER 502 503 model_files = self.pdb_list(model_folder) 504 505 pdb_list = ModelList( model_files ) 506 507 for model in pdb_list: 508 509 model.info["mod_score"] = self.extract_modeller_score( 510 model.validSource(), model) 511 512 pdb_list = pdb_list.sortBy("mod_score") 513 514 self.update_PDB(pdb_list, self.outFolder, model_folder) 515 516 self.output_score(pdb_list,model_folder) 517 518 self.update_rProfiles(pdb_list) 519 520 self.write_PDBModels(pdb_list, model_folder)

Source Code for Module Biskit.Mod.Modeller