Package Biskit :: Package Mod :: Module TemplateSearcher
[hide private]
[frames] | no frames]

Source Code for Module Biskit.Mod.TemplateSearcher

  1  ## 
  2  ## Biskit, a toolkit for the manipulation of macromolecular structures 
  3  ## Copyright (C) 2004-2005 Raik Gruenberg & Johan Leckner 
  4  ## 
  5  ## This program is free software; you can redistribute it and/or 
  6  ## modify it under the terms of the GNU General Public License as 
  7  ## published by the Free Software Foundation; either version 2 of the 
  8  ## License, or any later version. 
  9  ## 
 10  ## This program is distributed in the hope that it will be useful, 
 11  ## but WITHOUT ANY WARRANTY; without even the implied warranty of 
 12  ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 
 13  ## General Public License for more details. 
 14  ## 
 15  ## You find a copy of the GNU General Public License in the file 
 16  ## license.txt along with this program; if not, write to the Free 
 17  ## Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
 18  ## 
 19  ## 
 20  ## last $Author: leckner $ 
 21  ## last $Date: 2006/12/21 09:05:12 $ 
 22  ## $Revision: 2.12 $ 
 23   
 24  """ 
 25  Search for templates. 
 26  """ 
 27   
 28  from SequenceSearcher import SequenceSearcher, BlastError 
 29  import settings 
 30  import Biskit.tools as T 
 31  from Biskit import StdLog, EHandler 
 32   
 33  from Bio import Fasta 
 34  import re 
 35  import os, shutil 
 36  import Numeric 
 37   
 38  import urllib 
 39  import string 
 40  import gzip 
 41  import subprocess 
 42  import types 
 43  from Bio import File 
 44   
45 -class TemplateSearcher( SequenceSearcher ):
46 """ 47 Take a sequence and return a list of files of nonredundant PDB homologues 48 (selecting for best resolution). 49 """ 50 51 ## resolution assigned to NMR structures 52 NMR_RESOLUTION = 3.5 53 54 ## default folder names (created if verbose=1) 55 F_RESULT_FOLDER = '/templates' ## default parent folder 56 57 ## standard file name for unclustered sequences 58 F_FASTA_ALL = F_RESULT_FOLDER + '/all.fasta' 59 60 ## standard file name for non-redundant seqs 61 F_FASTA_NR = F_RESULT_FOLDER + '/nr.fasta' 62 63 ## clusters 64 F_CLUSTER_RAW = F_RESULT_FOLDER + '/cluster_raw.out' 65 F_CLUSTER_LOG = F_RESULT_FOLDER + '/cluster_result.out' 66 67 ## pseudo blast output 68 F_BLAST_OUT = F_RESULT_FOLDER + '/blast.out' 69 F_CLUSTER_BLAST_OUT = F_RESULT_FOLDER + '/cluster_blast.out' 70 71 ## folders for PDB files 72 F_ALL = F_RESULT_FOLDER + '/all'## all PDB homologues 73 F_NR = F_RESULT_FOLDER + '/nr' ## best PDB homologue of each cluster 74 75 ## default name for file : chain dic 76 F_CHAIN_INDEX = '/chain_index.txt' 77 78
79 - def __init__( self, outFolder='.', clusterLimit=20, verbose=1, 80 log=None, silent=0 ):
81 """ 82 @param outFolder: project folder (results are put into subfolder) ['.'] 83 @type outFolder: str 84 @param clusterLimit: maximal number of returned sequence clusters 85 (default: 20) 86 @type clusterLimit: int 87 @param verbose: keep temporary files (default: 1) 88 @type verbose: 1|0 89 @param log: log file instance, if None, STDOUT is used (default: None) 90 @type log: LogFile 91 @param silent: don't print messages to STDOUT is used (default: 0) 92 @type silent: 1|0 93 """ 94 SequenceSearcher.__init__( self, outFolder=outFolder, verbose=1 ) 95 96 self.ex_pdb = re.compile( 'pdb\|([A-Z0-9]{4})\|([A-Z]*)' ) 97 98 self.ex_resolution = re.compile(\ 99 'REMARK 2 RESOLUTION\. *([0-9\.]+|NOT APPLICABLE)' ) 100 101 self.prepareFolders() 102 103 self.verbose = verbose 104 self.silent = silent 105 106 self.log = log or StdLog() 107 108 ## the maximal number of clusters to return 109 self.clusterLimit = clusterLimit
110 111
112 - def prepareFolders( self ):
113 """ 114 Create folders needed by this class. 115 """ 116 SequenceSearcher.prepareFolders( self ) 117 118 if not os.path.exists( self.outFolder + self.F_ALL ): 119 os.mkdir( self.outFolder + self.F_ALL ) 120 if not os.path.exists( self.outFolder + self.F_NR ): 121 os.mkdir( self.outFolder + self.F_NR )
122 123
124 - def getSequenceIDs( self, blast_records ):
125 """ 126 Extract sequence ids (pdb codes and chain ID) from BlastParser result. 127 128 @param blast_records: result from BlastParser 129 @type blast_records: Bio.Blast.Record.Blast 130 131 @return: list of dictionaries mapping pdb codes and chain IDs 132 @rtype: [ {'pdb':str, 'chain':str } ] 133 134 @raise BlastError: if couldn't find ID 135 """ 136 result = [] 137 for a in blast_records.alignments: 138 139 ids = self.ex_pdb.findall( a.title ) 140 141 if not ids: 142 raise BlastError( "Couldn't find ID in " + a.title) 143 144 for id in ids: 145 result += [ { 'pdb':id[0], 'chain':id[1] } ] 146 147 return result
148 149
150 - def fastaFromIds( self, db, id_lst, fastaOut=None ):
151 """ 152 Use:: 153 fastaFromIds( id_lst, fastaOut ) -> { str: Bio.Fasta.Record } 154 155 @param db: database name 156 @type db: str 157 @param id_lst: list of dictionaries with pdb codes and chain IDs 158 @type id_lst: [{'pdb':str, 'chain':str}] 159 160 @return: Dictionary mapping pdb codes to Bio.Fasta.Records. The 161 returned records have an additional field: chain. 162 @rtype: { str: Bio.Fasta.Record } 163 """ 164 result = {} 165 for i in id_lst: 166 try: 167 r = self.fastaRecordFromId( db, i['pdb'] ) 168 r.chain = i['chain'] 169 result[ i['pdb'] ] = r 170 except BlastError, why: 171 T.errWriteln("ERROR (ignored): couldn't fetch "+ str(i) ) 172 173 return result
174 175
176 - def getLocalPDB( self, id, db_path=settings.pdb_path ):
177 """ 178 Get the coordinate file from a local pdb database. 179 180 @param id: pdb code, 4 characters 181 @type id: str 182 @param db_path: path to local pdb database 183 (default: L{settings.pdb_path}) 184 @type db_path: str 185 186 @return: the requested pdb file as a file handle 187 @rtype: open file handle 188 189 @raise BlastError: if couldn't find PDB file 190 """ 191 id = string.lower( id ) 192 filenames = ['%s.pdb' % id, 193 db_path + '/pdb%s.ent' % id, 194 db_path + '/%s/pdb%s.ent.Z' %( id[1:3], id ) ] 195 196 for f in filenames: 197 if os.path.exists( f ): 198 ## gzipped pdb file 199 if f[-3:]=='.gz': 200 return gzip.open(f) 201 ## the gzip module doesn't handle .Z files 202 ## doesn't return open file handle 203 elif f[-2:]=='.Z': 204 p = subprocess.Popen( [ 'gunzip', '-c', f ], 205 stdout=subprocess.PIPE ) 206 return p.communicate()[0] 207 ## uncompressed 208 else: 209 return open(f) 210 211 raise BlastError( "Couldn't find PDB file.")
212 213
214 - def getRemotePDB( self, id, rcsb_url=settings.rcsb_url ):
215 """ 216 Get the coordinate file remotely from the RCSB. 217 218 @param id: pdb code, 4 characters 219 @type id: str 220 @param rcsb_url: template url for pdb download 221 (default: L{settings.rcsb_url}) 222 @type rcsb_url: str 223 224 @return: the requested pdb file as a file handle 225 @rtype: open file handle 226 227 @raise BlastError: if couldn't retrieve PDB file 228 """ 229 handle = urllib.urlopen( rcsb_url% (id,id) ) 230 231 uhandle = File.UndoHandle(handle) 232 233 if not uhandle.peekline(): 234 raise BlastError( "Couldn't retrieve ", rcsb_url ) 235 236 return uhandle
237 238
239 - def __extractPDBInfos( self, handle ):
240 """ 241 Extract extra infos from PDB file. 242 NMR files get resolution 3.5. 243 244 @param handle: open file handle OR string of file to examine 245 @type handle: open file handle OR strings 246 247 @return: pdb file as list of strings, dictionary with resolution 248 @rtype: [str], {'resolution':float } 249 250 @raise BlastError: if couldn't extract PDB Info 251 """ 252 infos = {} 253 if type( handle ) == types.FileType: 254 lines = handle.readlines() 255 elif type( handle ) == types.StringType and len(handle) > 5000: 256 lines = handle.splitlines( True ) 257 elif type( handle ) == types.InstanceType: 258 lines = handle.readlines() 259 else: 260 raise BlastError( "Couldn't extract PDB Info." ) 261 262 for l in lines: 263 found = self.ex_resolution.findall( l ) 264 265 if found: 266 if found[0] == 'NOT APPLICABLE': 267 infos['resolution'] = self.NMR_RESOLUTION 268 else: 269 infos['resolution'] = float( found[0] ) 270 return lines, infos
271 272
273 - def retrievePDBs( self, outFolder=None, pdbCodes=None ):
274 """ 275 Get PDB from local database if it exists, if not try to 276 download the coordinartes drom the RSCB. 277 Write PDBs for given fasta records. Add PDB infos to internal 278 dictionary of fasta records. NMR structures get resolution 3.5. 279 280 @param outFolder: folder to put PDB files into (default: L{F_ALL}) 281 @type outFolder: str OR None 282 @param pdbCodes: list of PDB codes [all previously found templates] 283 @type pdbCodes: [str] 284 285 @return: list of PDB file names 286 @rtype: [str] 287 288 @raise BlastError: if can't write file 289 """ 290 outFolder = outFolder or self.outFolder + self.F_ALL 291 pdbCodes = pdbCodes or self.record_dic.keys() 292 result = [] 293 i = 0 294 if not self.silent: 295 T.flushPrint("retrieving %i PDBs..." % len( pdbCodes ) ) 296 for c in pdbCodes: 297 298 i += 1 299 if i%10 == 0 and not self.silent: 300 T.flushPrint('#') 301 302 fname = '%s/%s.pdb' % (outFolder, c) 303 304 try: 305 if os.path.exists( fname ): 306 h = open( fname, 'r' ) 307 else: 308 h = self.getLocalPDB( c ) 309 except: 310 h = self.getRemotePDB( c ) 311 312 try: 313 lines, infos = self.__extractPDBInfos( h ) 314 infos['file'] = fname 315 316 if c in self.record_dic: 317 self.record_dic[ c ].__dict__.update( infos ) 318 319 ## close if it is a handle 320 try: 321 h.close() 322 except: 323 pass 324 325 if not os.path.exists( fname ): 326 f = open( fname, 'w', 1 ) 327 f.writelines( lines ) 328 f.close() 329 330 result += [ fname ] 331 332 except IOError, why: 333 raise BlastError( "Can't write file "+fname ) 334 335 if not self.silent: 336 T.flushPrint('\n%i files written to %s\n' %(i,outFolder) ) 337 338 return result
339 340
341 - def selectFasta( self, ids_in_cluster ):
342 """ 343 select one member of cluster of sequences. 344 345 @param ids_in_cluster: list of sequence ids defining the cluster 346 @type ids_in_cluster: [str] 347 348 @return: Bio.Fasta.Record 349 @rtype: Bio.Fasta.Record 350 """ 351 resolutions = [] 352 for id in ids_in_cluster: 353 resolutions += [ getattr( self.record_dic[id],'resolution', 99. ) ] 354 355 return ids_in_cluster[ Numeric.argmin( resolutions ) ]
356 357
358 - def reportClustering( self, raw=None ):
359 """ 360 Report the clustering result. 361 362 Writes: 363 - clustering results to L{F_CLUSTER_LOG} 364 - blast records to L{F_BLAST_OUT} 365 - blast records of centers to L{F_CLUSTER_BLAST_OUT} 366 - raw clustering results to L{F_CLUSTER_RAW} if raw not None 367 368 @param raw: write raw clustering result to disk (default: None) 369 @type raw: 1|0 370 """ 371 try: 372 if self.verbose: 373 f = open( self.outFolder +self.F_CLUSTER_LOG, 'w', 1) 374 375 for cluster in self.clusters: 376 f.write( "%i\t" % ( len( cluster ))) 377 for id in cluster: 378 f.write("%s:%.2f "%(id,self.record_dic[id].resolution)) 379 f.write( "\n") 380 381 f.close() 382 383 ## write blast records of centers to disc 384 centers = [ c[0] for c in self.clusters ] 385 386 self.writeClusteredBlastResult( \ 387 self.outFolder + self.F_BLAST_OUT, 388 self.outFolder + self.F_CLUSTER_BLAST_OUT, centers ) 389 390 self.copyClusterOut( raw=raw ) 391 392 except IOError, why: 393 T.errWriteln( "Can't write cluster report." + str(why) )
394 395
396 - def saveClustered( self, outFolder=None ):
397 """ 398 Copy best PDB of each cluster into another folder. 399 Create index file in same folder. 400 The returned dictionary or index file (L{F_CHAIN_INDEX}) is 401 used as input to TemplateCleaner. 402 403 @param outFolder: folder to write files to (default: L{F_NR}) 404 @type outFolder: str OR None 405 406 @return: { str_filename : str_chain_id }, file names and chain ids 407 @rtype: {str, str} 408 409 @raise BlastError: if sequences are not clustered 410 """ 411 result = {} 412 outFolder = outFolder or self.outFolder + self.F_NR 413 414 f_index = open( outFolder +self.F_CHAIN_INDEX, 'w') 415 f_index.write('## template files mapped to matching chain ID\n') 416 417 if not self.bestOfCluster: 418 raise BlastError( 'Sequences are not yet clustered.' ) 419 420 for id in self.bestOfCluster: 421 fn = self.record_dic[ id ].file 422 fnNew = outFolder+'/'+id+'.pdb' 423 shutil.copy( fn, fnNew ) 424 425 self.record_dic[ id ].file = fnNew 426 result[ fnNew ] = self.record_dic[ id ].chain 427 428 f_index.write( '%s\t%s\n' % (fnNew, self.record_dic[ id ].chain) ) 429 430 f_index.close() 431 432 return result
433 434 435 436 ############# 437 ## TESTING 438 ############# 439
440 -class Test:
441 """ 442 Test class 443 """ 444
445 - def run( self, local=0 ):
446 """ 447 run function test 448 449 @param local: transfer local variables to global and perform 450 other tasks only when run locally 451 @type local: 1|0 452 453 @return: 1 454 @rtype: int 455 """ 456 import tempfile 457 import shutil 458 from Biskit.LogFile import LogFile 459 460 query = T.testRoot() + '/Mod/project/target.fasta' 461 outfolder = tempfile.mkdtemp( '_test_TemplateSearcher' ) 462 shutil.copy( query, outfolder ) 463 464 ## log file 465 f_out = outfolder + '/TemplateSearcher.log' 466 l = LogFile( f_out, mode='w') 467 468 f_target = outfolder + '/target.fasta' 469 470 silent = 1 471 if local: silent=0 472 473 searcher = TemplateSearcher( outFolder=outfolder, 474 verbose=1, log=l, 475 silent=silent ) 476 477 db = settings.db_pdbaa 478 searcher.localBlast( f_target, db, 'blastp', alignments=200, e=0.0001) 479 480 ## first tries to collect the pdb files from a local db and if 481 ## that fails it tries to collect them remotely 482 searcher.retrievePDBs() 483 484 searcher.clusterFasta() ## expects all.fasta 485 486 searcher.writeFastaClustered() 487 488 fn = searcher.saveClustered() 489 490 if local: 491 print '\nThe set of clustered template files from the search' 492 print ' can be found in %s/templates'%outfolder 493 print '\nTemplateSearcher log file written to: %s'%f_out 494 globals().update( locals() ) 495 496 ## cleanup 497 T.tryRemove( outfolder, tree=1 ) 498 499 return 1
500 501
502 - def expected_result( self ):
503 """ 504 Precalculated result to check for consistent performance. 505 506 @return: 1 507 @rtype: int 508 """ 509 return 1
510 511 512 513 if __name__ == '__main__': 514 515 test = Test() 516 517 assert test.run( local=1 ) == test.expected_result() 518