1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 """
24 Sequence Alignment
25 """
26
27 import re, os
28
29 from TemplateCleaner import TemplateCleaner as TC
30 from SequenceSearcher import SequenceSearcher as SS
31
32 import Biskit.tools as T
33 from Biskit.Errors import *
34
35 from Biskit import Executor, TemplateError
36
37
39 pass
40
41
43 """
44 Execute a t_coffee command
45 """
46
47 - def __init__(self, cmd, host=None, **kw ):
48 """
49 @param cmd: command sequence
50 @type cmd: str
51 @param host: host name for remote execution
52 (default: None=local execution)
53 @type host: str
54 """
55 Executor.__init__( self, 't_coffee', cmd, catch_out=1,
56 node=host, **kw )
57
58
60 """
61 Take template CA traces and template sequences plus non-template
62 sequences.
63
64 Returns a T-Coffee script or T-Coffee alignment
65 """
66
67 F_RESULT_FOLDER = '/t_coffee'
68
69 F_FAST_LIB = F_RESULT_FOLDER + '/fast_pair.lib'
70 F_LALIGN_LIB = F_RESULT_FOLDER + '/lalign_id_pair.lib'
71 F_SAP_LIB = F_RESULT_FOLDER + '/sap_pair.lib'
72 F_STRUCT_ALN = F_RESULT_FOLDER + '/struct.aln'
73 F_FINAL = F_RESULT_FOLDER + '/final'
74 F_COFFEE_LOG = F_RESULT_FOLDER + '/t_coffee.log'
75 F_COFFEE_INP = F_RESULT_FOLDER + '/t_coffee.inp'
76 F_FAST_TREE = F_RESULT_FOLDER + '/fast_tree.dnd'
77 F_SEQ_TREE = F_RESULT_FOLDER + '/sequence.dnd'
78
79
80 F_FINAL_ALN = F_FINAL + '.pir_aln'
81
82 - def __init__( self, outFolder='.', log=None, verbose=1, sap=1 ):
83 """
84 @param outFolder: base folder for t_coffee output
85 (default: L{F_RESULT_FOLDER})
86 @type outFolder: str
87 @param log: log file instance, if None, STDOUT is used (default: None)
88 @type log: LogFile
89 @param verbose: be verbose
90 @type verbose: 1 | 0
91 @param sap: perform structural alignment
92 @type sap: 1 | 0
93 """
94 self.log = log
95 self.outFolder = T.absfile( outFolder )
96 self.verbose = verbose
97 self.sap = sap
98
99
100 self.ex_inp_types = { 'P' : re.compile('.+\.[Pp][Dd][Bb]$|'+
101 '.+\.[Aa][Ll][Pp][Hh][Aa]$'),
102 'L' : re.compile('.+\.[Ll][Ii][Bb]$'),
103 'S' : re.compile('.*\.[Ff][Aa][Ss][Tt][Aa]$')
104 }
105
106 self.commands = []
107
108 fn = self.outFolder + self.F_RESULT_FOLDER
109 if not os.path.exists( fn ):
110 os.mkdir( fn )
111 if verbose: self.logWrite( 'Creating new output folder '+ fn )
112
113
115 """
116 Write message to log.
117
118 @param msg: message to print
119 @type msg: str
120 """
121 if self.log:
122 self.log.writeln( msg )
123 else:
124 if force:
125 print msg
126
127
129 """
130 Try guessing a t_coffee type code for file name and add type code
131 (P for structurem S for sequence and L for library).
132
133 @param inp: file name
134 @type inp: str
135
136 @return: type_code + file name OR file name unchanged
137 @rtype: str
138 """
139 for code, pattern in self.ex_inp_types.items():
140 if pattern.match( inp ):
141 return code + inp
142 return inp
143
144
145 - def coffee_align_inp( self, input, method=None, out_lib=None,
146 output=None, outfile=None, **kw ):
147 """
148 Create a single t_coffee command.
149
150 @param input: list of input files
151 @type input: [str]
152 @param method: t_coffee method ('M' is added automatically)
153 (default:None)
154 @type method: str
155 @param output: output format(s) (default:None)
156 @type output: [str]
157 @param out_lib: output library file if needed (default:None)
158 @type out_lib: str
159 @param outfile: output file (base) if needed (default:None)
160 @type outfile: str
161 @param kw: additional param=value pairs
162 @type kw: param=value
163
164 @return: t_coffee command
165 @rtype: str
166 """
167 r = ''
168
169 input = [ self.__add_type_code( i ) for i in T.toList(input) ]
170
171 if input or method:
172 r += ' -in'
173 for i in input:
174 r += ' ' + i
175 if method:
176 r += ' M' + method
177
178 if out_lib:
179 r += ' -out_lib ' + out_lib
180
181 if output:
182 r += ' -output'
183 for o in T.toList( output ):
184 r += ' ' + o
185
186 if not outfile:
187 raise AlignerError, 'T-Coffee -output also requires -outfile'
188
189 if outfile:
190 r += ' -outfile ' + outfile
191
192 if not output:
193 r += ' -lib_only'
194
195 r += ' -template_file No'
196
197 for param, value in kw.items():
198 r += ' -'+param + ' ' + str( value )
199
200 return r
201
202
237
238
239 - def align_for_modeller_inp( self, pdbFiles=None, fasta_templates=None,
240 fasta_sequences=None, fasta_target=None,
241 f_fast_tree=None, f_sequence_tree=None ):
242 """
243 Prepare alignment commands for homology modeling.
244
245 @note: If verbose==1, the commands are mirrored to t_coffee.inp.
246
247 @param pdbFiles: template PDBs from L{TC.F_COFFEE} (default:None)
248 @type pdbFiles: [str]
249 @param fasta_templates: template sequences from
250 templates/templates.fasta (default:None)
251 @type fasta_templates: [str]
252 @param fasta_sequences: more sequences from L{SS.F_FASTA_NR}
253 (default:None)
254 @type fasta_sequences: [str]
255 @param fasta_target: fasta target files
256 L{SS.F_FASTA_TARGET} (default:None)
257 @type fasta_target: str
258 @param f_fast_tree: filename of clustalW style guide tree (.dnd)
259 @type f_fast_tree: str
260 @param f_sequence_tree: filename of clustalW style guide tree (.dnd)
261 @type f_sequence_tree: str
262 """
263
264 d = self.__default_input_files( pdbFiles, fasta_templates,
265 fasta_sequences, fasta_target )
266
267 pdbFiles = d['pdbs']
268 f_templates = d['templates']
269 f_sequences = d['sequences']
270 f_target = d['target']
271
272 pdbFiles = [ T.absfile( f ) for f in pdbFiles ]
273
274 f_templates = T.absfile( f_templates )
275 f_sequences = T.absfile( f_sequences )
276
277
278 f_fast_lib = self.outFolder + self.F_FAST_LIB
279 f_lalign_lib = self.outFolder + self.F_LALIGN_LIB
280 f_sap_lib = self.outFolder + self.F_SAP_LIB
281 f_struct_aln = self.outFolder + self.F_STRUCT_ALN
282 f_final_aln = self.outFolder + self.F_FINAL
283 f_coffee_log= self.outFolder + self.F_COFFEE_LOG
284 f_fast_tree= f_fast_tree or self.outFolder + self.F_FAST_TREE
285 f_sequence_tree= f_sequence_tree or self.outFolder + self.F_SEQ_TREE
286
287
288 self.repair_target_fasta( f_target )
289
290 if len( pdbFiles ) == 0:
291 raise AlignerError( "\n No templates avaliable. ABORTING" )
292
293
294 if (not self.sap) or ( len( pdbFiles ) == 1 ):
295 if self.sap:
296 self.logWrite('WARNING! Only one template avaliable:' +\
297 str(pdbFiles) )
298 self.logWrite('Structural alignment (SAP) will not be performed.')
299
300 r = [
301
302
303 self.coffee_align_inp( [ f_sequences, f_templates, f_target ],
304 method='fast_pair',
305 out_lib=f_fast_lib,
306 quiet=f_coffee_log+'_1', convert='' ),
307
308
309 self.coffee_align_inp( [ f_sequences, f_templates, f_target ],
310 method='lalign_id_pair',
311 out_lib=f_lalign_lib,
312 quiet=f_coffee_log+'_2', convert='' ),
313
314
315 self.coffee_align_inp( [ f_fast_lib, f_lalign_lib],
316 clean_aln=0, newtree=f_fast_tree,
317 output=['clustalw','phylip',
318 'score_html', 'pir_aln'],
319
320 outfile=f_final_aln,
321 quiet=f_coffee_log+'_4')
322 ]
323
324
325
326 else:
327 r = [
328
329
330 self.coffee_align_inp( [ f_sequences, f_templates, f_target ],
331 method='fast_pair',
332 out_lib=f_fast_lib,
333 quiet=f_coffee_log+'_1', convert='' ),
334
335
336 self.coffee_align_inp( [ f_sequences, f_templates, f_target ],
337 method='lalign_id_pair',
338 out_lib=f_lalign_lib,
339 quiet=f_coffee_log+'_2', convert='' ),
340
341
342 self.coffee_align_inp( pdbFiles, newtree=f_sequence_tree,
343 method='sap_pair',
344 weight=1000,
345 out_lib=f_sap_lib,
346 output='fasta_aln',
347 outfile=f_struct_aln,
348 quiet=f_coffee_log+'_3' ),
349
350
351 (f_sap_lib, f_struct_aln),
352
353
354 self.coffee_align_inp( [ f_fast_lib, f_lalign_lib, f_sap_lib],
355 clean_aln=0, newtree=f_fast_tree,
356 output=['clustalw','phylip',
357 'score_html',
358 'pir_aln', 'score_ascii'],
359
360 outfile=f_final_aln,
361 quiet=f_coffee_log+'_4')
362 ]
363
364
365 self.commands += r
366
367 if self.verbose:
368 f = open( self.outFolder + self.F_COFFEE_INP, 'w' )
369 for cmd in r:
370 f.write( str(cmd) + '\n\n' )
371 f.close()
372
373
375 """
376 Msap_pair alignments mess up sequence ids of certain
377 (single-chain?) structures.
378 @note: Dirty hack..
379
380 @param fname: Msap_pair file to repair
381 @type fname: str
382 """
383 ex_sapfix = re.compile('_[A-Z]{0,1}_[A-Z]')
384 ex_alpha = re.compile('.alpha')
385 bak_fname = fname+'_original'
386
387 os.rename( fname, bak_fname)
388
389 fout = open( fname, 'w' )
390 for l in open( bak_fname ):
391 l = re.sub( ex_alpha, '', l )
392 fout.write( re.sub( ex_sapfix, '_', l ) )
393
394 fout.close()
395
396
398 """
399 Change target ID to 'target' and remove '|' etc. that mess up
400 things in the alignments.
401
402 @param fname: fasta file to repair
403 @type fname: str
404
405 @raise AlignerError: if cannot fix target sequence file
406 """
407 bak_fname = fname+'_original'
408 os.rename( fname, bak_fname)
409
410 try:
411 fout = open( fname, 'w' )
412 fin = open( bak_fname )
413
414 l0 = fin.readline()
415
416 fout.write( '>target\n' )
417
418 for l in fin:
419 fout.write( l )
420
421 fin.close()
422 fout.close()
423
424 except Exception, why:
425 print T.lastError()
426 raise AlignerError("Cannot fix target sequence file: "+str(why))
427
428
429 - def go( self, host=None ):
430 """
431 Run the previously added commands, delete internal command list.
432
433 @param host: host name for remote execution
434 (default: None=local execution)
435 @type host: str
436
437 @raise AlignerError: if T_Coffee execution failed
438 """
439 try:
440 if self.verbose:
441 self.logWrite('\nALIGNING...\n')
442
443 for cmd in self.commands:
444
445 if type( cmd ) == tuple:
446 for f in cmd:
447 self.repair_lib_file( f )
448 else:
449
450 if host:
451 tc = TCoffee( cmd, host )
452 else:
453 tc = TCoffee( cmd )
454
455 self.logWrite( 'Running t_coffee .. ')
456 self.logWrite( cmd )
457
458
459 output, error, returncod = tc.run()
460
461 self.logWrite( '..done: ' + str( output ) + '\n' )
462 print "DEBUG: ", os.path.exists('1MHO_.template_file')
463
464 except EnvironmentError, why:
465 self.logWrite("ERROR: Can't run t_coffee: %s Error: %s"\
466 %( why, error ) )
467 raise AlignerError( "Can't run t_coffee: %s Error: %"\
468 %( why, error ) )
469
470
471
472
473
474 import Biskit.test as BT
475
476 -class Test(BT.BiskitTest):
477 """
478 Base Test class (does not actually run any test)
479 """
481 import tempfile
482 import shutil
483 from Biskit import LogFile, StdLog
484
485
486
487 self.outfolder = tempfile.mkdtemp( '_test_Aligner' )
488
489
490
491
492
493 os.mkdir( self.outfolder +'/templates' )
494 os.mkdir( self.outfolder +'/sequences/' )
495
496 shutil.copytree( T.testRoot() + '/Mod/project/templates/t_coffee',
497 self.outfolder + '/templates/t_coffee' )
498
499 shutil.copy( T.testRoot() + '/Mod/project/templates/templates.fasta',
500 self.outfolder + '/templates' )
501
502 shutil.copy( T.testRoot() + '/Mod/project/sequences/nr.fasta',
503 self.outfolder + '/sequences/' )
504
505 shutil.copy( T.testRoot() + '/Mod/project/target.fasta',
506 self.outfolder )
507
508 if not self.local:
509 self.a_log = LogFile( self.outfolder + '/Aligner.log' )
510 else:
511 self.a_log = StdLog()
512
513
526
529
530
532 """Test case dry run -- only initialize without running t-coffee"""
533
534 TAGS = []
535
537 """Mod.Aligner dry run test"""
538 self.t_Aligner(run=False)
539
540
542 """Complete test case for Aligner"""
543
544 TAGS = [BT.EXE, BT.LONG]
545
549
550 if __name__ == '__main__':
551
552 BT.localTest(debug=0)
553