1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24 """
25 Search for templates.
26 """
27
28 from SequenceSearcher import SequenceSearcher, BlastError
29 import settings
30 import Biskit.tools as T
31 from Biskit import StdLog, EHandler
32
33 from Bio import Fasta
34 import re
35 import os, shutil
36 import Numeric
37
38 import urllib
39 import string
40 import gzip
41 import subprocess
42 import types
43 from Bio import File
44
46 """
47 Take a sequence and return a list of files of nonredundant PDB homologues
48 (selecting for best resolution).
49 """
50
51
52 NMR_RESOLUTION = 3.5
53
54
55 F_RESULT_FOLDER = '/templates'
56
57
58 F_FASTA_ALL = F_RESULT_FOLDER + '/all.fasta'
59
60
61 F_FASTA_NR = F_RESULT_FOLDER + '/nr.fasta'
62
63
64 F_CLUSTER_RAW = F_RESULT_FOLDER + '/cluster_raw.out'
65 F_CLUSTER_LOG = F_RESULT_FOLDER + '/cluster_result.out'
66
67
68 F_BLAST_OUT = F_RESULT_FOLDER + '/blast.out'
69 F_CLUSTER_BLAST_OUT = F_RESULT_FOLDER + '/cluster_blast.out'
70
71
72 F_ALL = F_RESULT_FOLDER + '/all'
73 F_NR = F_RESULT_FOLDER + '/nr'
74
75
76 F_CHAIN_INDEX = '/chain_index.txt'
77
78
79 - def __init__( self, outFolder='.', clusterLimit=20, verbose=1,
80 log=None, silent=0 ):
81 """
82 @param outFolder: project folder (results are put into subfolder) ['.']
83 @type outFolder: str
84 @param clusterLimit: maximal number of returned sequence clusters
85 (default: 20)
86 @type clusterLimit: int
87 @param verbose: keep temporary files (default: 1)
88 @type verbose: 1|0
89 @param log: log file instance, if None, STDOUT is used (default: None)
90 @type log: LogFile
91 @param silent: don't print messages to STDOUT is used (default: 0)
92 @type silent: 1|0
93 """
94 SequenceSearcher.__init__( self, outFolder=outFolder, verbose=1 )
95
96 self.ex_pdb = re.compile( 'pdb\|([A-Z0-9]{4})\|([A-Z]*)' )
97
98 self.ex_resolution = re.compile(\
99 'REMARK 2 RESOLUTION\. *([0-9\.]+|NOT APPLICABLE)' )
100
101 self.prepareFolders()
102
103 self.verbose = verbose
104 self.silent = silent
105
106 self.log = log or StdLog()
107
108
109 self.clusterLimit = clusterLimit
110
111
113 """
114 Create folders needed by this class.
115 """
116 SequenceSearcher.prepareFolders( self )
117
118 if not os.path.exists( self.outFolder + self.F_ALL ):
119 os.mkdir( self.outFolder + self.F_ALL )
120 if not os.path.exists( self.outFolder + self.F_NR ):
121 os.mkdir( self.outFolder + self.F_NR )
122
123
125 """
126 Extract sequence ids (pdb codes and chain ID) from BlastParser result.
127
128 @param blast_records: result from BlastParser
129 @type blast_records: Bio.Blast.Record.Blast
130
131 @return: list of dictionaries mapping pdb codes and chain IDs
132 @rtype: [ {'pdb':str, 'chain':str } ]
133
134 @raise BlastError: if couldn't find ID
135 """
136 result = []
137 for a in blast_records.alignments:
138
139 ids = self.ex_pdb.findall( a.title )
140
141 if not ids:
142 raise BlastError( "Couldn't find ID in " + a.title)
143
144 for id in ids:
145 result += [ { 'pdb':id[0], 'chain':id[1] } ]
146
147 return result
148
149
151 """
152 Use::
153 fastaFromIds( id_lst, fastaOut ) -> { str: Bio.Fasta.Record }
154
155 @param db: database name
156 @type db: str
157 @param id_lst: list of dictionaries with pdb codes and chain IDs
158 @type id_lst: [{'pdb':str, 'chain':str}]
159
160 @return: Dictionary mapping pdb codes to Bio.Fasta.Records. The
161 returned records have an additional field: chain.
162 @rtype: { str: Bio.Fasta.Record }
163 """
164 result = {}
165 for i in id_lst:
166 try:
167 r = self.fastaRecordFromId( db, i['pdb'] )
168 r.chain = i['chain']
169 result[ i['pdb'] ] = r
170 except BlastError, why:
171 T.errWriteln("ERROR (ignored): couldn't fetch "+ str(i) )
172
173 return result
174
175
177 """
178 Get the coordinate file from a local pdb database.
179
180 @param id: pdb code, 4 characters
181 @type id: str
182 @param db_path: path to local pdb database
183 (default: L{settings.pdb_path})
184 @type db_path: str
185
186 @return: the requested pdb file as a file handle
187 @rtype: open file handle
188
189 @raise BlastError: if couldn't find PDB file
190 """
191 id = string.lower( id )
192 filenames = ['%s.pdb' % id,
193 db_path + '/pdb%s.ent' % id,
194 db_path + '/%s/pdb%s.ent.Z' %( id[1:3], id ) ]
195
196 for f in filenames:
197 if os.path.exists( f ):
198
199 if f[-3:]=='.gz':
200 return gzip.open(f)
201
202
203 elif f[-2:]=='.Z':
204 p = subprocess.Popen( [ 'gunzip', '-c', f ],
205 stdout=subprocess.PIPE )
206 return p.communicate()[0]
207
208 else:
209 return open(f)
210
211 raise BlastError( "Couldn't find PDB file.")
212
213
215 """
216 Get the coordinate file remotely from the RCSB.
217
218 @param id: pdb code, 4 characters
219 @type id: str
220 @param rcsb_url: template url for pdb download
221 (default: L{settings.rcsb_url})
222 @type rcsb_url: str
223
224 @return: the requested pdb file as a file handle
225 @rtype: open file handle
226
227 @raise BlastError: if couldn't retrieve PDB file
228 """
229 handle = urllib.urlopen( rcsb_url% (id,id) )
230
231 uhandle = File.UndoHandle(handle)
232
233 if not uhandle.peekline():
234 raise BlastError( "Couldn't retrieve ", rcsb_url )
235
236 return uhandle
237
238
240 """
241 Extract extra infos from PDB file.
242 NMR files get resolution 3.5.
243
244 @param handle: open file handle OR string of file to examine
245 @type handle: open file handle OR strings
246
247 @return: pdb file as list of strings, dictionary with resolution
248 @rtype: [str], {'resolution':float }
249
250 @raise BlastError: if couldn't extract PDB Info
251 """
252 infos = {}
253 if type( handle ) == types.FileType:
254 lines = handle.readlines()
255 elif type( handle ) == types.StringType and len(handle) > 5000:
256 lines = handle.splitlines( True )
257 elif type( handle ) == types.InstanceType:
258 lines = handle.readlines()
259 else:
260 raise BlastError( "Couldn't extract PDB Info." )
261
262 for l in lines:
263 found = self.ex_resolution.findall( l )
264
265 if found:
266 if found[0] == 'NOT APPLICABLE':
267 infos['resolution'] = self.NMR_RESOLUTION
268 else:
269 infos['resolution'] = float( found[0] )
270 return lines, infos
271
272
274 """
275 Get PDB from local database if it exists, if not try to
276 download the coordinartes drom the RSCB.
277 Write PDBs for given fasta records. Add PDB infos to internal
278 dictionary of fasta records. NMR structures get resolution 3.5.
279
280 @param outFolder: folder to put PDB files into (default: L{F_ALL})
281 @type outFolder: str OR None
282 @param pdbCodes: list of PDB codes [all previously found templates]
283 @type pdbCodes: [str]
284
285 @return: list of PDB file names
286 @rtype: [str]
287
288 @raise BlastError: if can't write file
289 """
290 outFolder = outFolder or self.outFolder + self.F_ALL
291 pdbCodes = pdbCodes or self.record_dic.keys()
292 result = []
293 i = 0
294 if not self.silent:
295 T.flushPrint("retrieving %i PDBs..." % len( pdbCodes ) )
296 for c in pdbCodes:
297
298 i += 1
299 if i%10 == 0 and not self.silent:
300 T.flushPrint('#')
301
302 fname = '%s/%s.pdb' % (outFolder, c)
303
304 try:
305 if os.path.exists( fname ):
306 h = open( fname, 'r' )
307 else:
308 h = self.getLocalPDB( c )
309 except:
310 h = self.getRemotePDB( c )
311
312 try:
313 lines, infos = self.__extractPDBInfos( h )
314 infos['file'] = fname
315
316 if c in self.record_dic:
317 self.record_dic[ c ].__dict__.update( infos )
318
319
320 try:
321 h.close()
322 except:
323 pass
324
325 if not os.path.exists( fname ):
326 f = open( fname, 'w', 1 )
327 f.writelines( lines )
328 f.close()
329
330 result += [ fname ]
331
332 except IOError, why:
333 raise BlastError( "Can't write file "+fname )
334
335 if not self.silent:
336 T.flushPrint('\n%i files written to %s\n' %(i,outFolder) )
337
338 return result
339
340
342 """
343 select one member of cluster of sequences.
344
345 @param ids_in_cluster: list of sequence ids defining the cluster
346 @type ids_in_cluster: [str]
347
348 @return: Bio.Fasta.Record
349 @rtype: Bio.Fasta.Record
350 """
351 resolutions = []
352 for id in ids_in_cluster:
353 resolutions += [ getattr( self.record_dic[id],'resolution', 99. ) ]
354
355 return ids_in_cluster[ Numeric.argmin( resolutions ) ]
356
357
359 """
360 Report the clustering result.
361
362 Writes:
363 - clustering results to L{F_CLUSTER_LOG}
364 - blast records to L{F_BLAST_OUT}
365 - blast records of centers to L{F_CLUSTER_BLAST_OUT}
366 - raw clustering results to L{F_CLUSTER_RAW} if raw not None
367
368 @param raw: write raw clustering result to disk (default: None)
369 @type raw: 1|0
370 """
371 try:
372 if self.verbose:
373 f = open( self.outFolder +self.F_CLUSTER_LOG, 'w', 1)
374
375 for cluster in self.clusters:
376 f.write( "%i\t" % ( len( cluster )))
377 for id in cluster:
378 f.write("%s:%.2f "%(id,self.record_dic[id].resolution))
379 f.write( "\n")
380
381 f.close()
382
383
384 centers = [ c[0] for c in self.clusters ]
385
386 self.writeClusteredBlastResult( \
387 self.outFolder + self.F_BLAST_OUT,
388 self.outFolder + self.F_CLUSTER_BLAST_OUT, centers )
389
390 self.copyClusterOut( raw=raw )
391
392 except IOError, why:
393 T.errWriteln( "Can't write cluster report." + str(why) )
394
395
397 """
398 Copy best PDB of each cluster into another folder.
399 Create index file in same folder.
400 The returned dictionary or index file (L{F_CHAIN_INDEX}) is
401 used as input to TemplateCleaner.
402
403 @param outFolder: folder to write files to (default: L{F_NR})
404 @type outFolder: str OR None
405
406 @return: { str_filename : str_chain_id }, file names and chain ids
407 @rtype: {str, str}
408
409 @raise BlastError: if sequences are not clustered
410 """
411 result = {}
412 outFolder = outFolder or self.outFolder + self.F_NR
413
414 f_index = open( outFolder +self.F_CHAIN_INDEX, 'w')
415 f_index.write('## template files mapped to matching chain ID\n')
416
417 if not self.bestOfCluster:
418 raise BlastError( 'Sequences are not yet clustered.' )
419
420 for id in self.bestOfCluster:
421 fn = self.record_dic[ id ].file
422 fnNew = outFolder+'/'+id+'.pdb'
423 shutil.copy( fn, fnNew )
424
425 self.record_dic[ id ].file = fnNew
426 result[ fnNew ] = self.record_dic[ id ].chain
427
428 f_index.write( '%s\t%s\n' % (fnNew, self.record_dic[ id ].chain) )
429
430 f_index.close()
431
432 return result
433
434
435
436
437
438
439
441 """
442 Test class
443 """
444
445 - def run( self, local=0 ):
446 """
447 run function test
448
449 @param local: transfer local variables to global and perform
450 other tasks only when run locally
451 @type local: 1|0
452
453 @return: 1
454 @rtype: int
455 """
456 import tempfile
457 import shutil
458 from Biskit.LogFile import LogFile
459
460 query = T.testRoot() + '/Mod/project/target.fasta'
461 outfolder = tempfile.mkdtemp( '_test_TemplateSearcher' )
462 shutil.copy( query, outfolder )
463
464
465 f_out = outfolder + '/TemplateSearcher.log'
466 l = LogFile( f_out, mode='w')
467
468 f_target = outfolder + '/target.fasta'
469
470 silent = 1
471 if local: silent=0
472
473 searcher = TemplateSearcher( outFolder=outfolder,
474 verbose=1, log=l,
475 silent=silent )
476
477 db = settings.db_pdbaa
478 searcher.localBlast( f_target, db, 'blastp', alignments=200, e=0.0001)
479
480
481
482 searcher.retrievePDBs()
483
484 searcher.clusterFasta()
485
486 searcher.writeFastaClustered()
487
488 fn = searcher.saveClustered()
489
490 if local:
491 print '\nThe set of clustered template files from the search'
492 print ' can be found in %s/templates'%outfolder
493 print '\nTemplateSearcher log file written to: %s'%f_out
494 globals().update( locals() )
495
496
497 T.tryRemove( outfolder, tree=1 )
498
499 return 1
500
501
503 """
504 Precalculated result to check for consistent performance.
505
506 @return: 1
507 @rtype: int
508 """
509 return 1
510
511
512
513 if __name__ == '__main__':
514
515 test = Test()
516
517 assert test.run( local=1 ) == test.expected_result()
518