1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27 """
28 Search Hmmer Pfam database and retrieve conservation data.
29 """
30
31 import tempfile
32 import re, string
33 import types, os.path
34 import numpy.oldnumeric as N
35 import molUtils
36 import settings
37
38 import Biskit.mathUtils as math
39 from Biskit.Errors import BiskitError
40 import Biskit.tools as T
41 import Biskit.molTools as MT
42 import Biskit.molUtils as MU
43 from Biskit import Executor, TemplateError, PDBModel, StdLog, EHandler
44
45
46
47 hmmpfamExe = settings.hmmpfam_bin
48 hmmfetchExe = settings.hmmfetch_bin
49 hmmalignExe = settings.hmmalign_bin
50 hmmindexExe = settings.hmmindex_bin
51 hmmDatabase = settings.hmm_db
52
53
55 pass
56
57
59 """
60 Checks if the hmm database has been indexed or not,
61 if not indexing will be done.
62 """
63
65 """
66 @param hmmdb: Pfam hmm database
67 @type hmmdb: str
68 """
69 Executor.__init__( self, 'hmmindex', args='%s'%hmmdb, **kw )
70
71 if not os.path.exists(hmmdb+'.ssi'):
72 if self.verbose:
73 self.log.writeln(
74 'HMMINDEX: Indexing hmm database. This will take a while')
75
76 self.run()
77
78
80 """
81 Search hmm database (using the hmmpfam program) with a sequence
82 in fasta format.
83 If the profile names have been provided - skip the search and
84 only write the temporary sequence files.
85 """
86
87 - def __init__( self, target, hmmdb, noSearch=None, **kw ):
88 """
89 @param target: sequence
90 @type target: PDBModel or fasta file
91 @param hmmdb: Pfam hmm database
92 @type hmmdb: str
93 @param noSearch: don't perform a seach
94 @type noSearch: 1 OR None
95 """
96 self.fName = tempfile.mktemp('.fasta')
97 self.hmmdb = hmmdb
98
99 Executor.__init__( self, 'hmmpfam', catch_out=1,
100 args=' %s %s'%(hmmdb, self.fName), **kw )
101
102 self.target = target
103
104 self.fastaID = ''
105
106 if noSearch:
107 if self.verbose:
108 self.log.writeln(
109 'Profiles provided - No search will be performed.')
110
111
113 """
114 Verify that a given file or string is in Fasta format.
115 The definition used for a fasta file here is that:
116 - first line starts with '>'
117 - the following sequence lines are not longer that 80 characters
118 - the characters has to belong to the standard amino acid codes
119
120 @param target: name of fasta file OR file contents as list of strings
121 @type target: str OR [str]
122
123 @return: conforms to the fsata format
124 @rtype: True/False
125 """
126 if not type(target) == types.ListType:
127 if os.path.exists( target ):
128 f = open( target, 'r' )
129 target = f.readlines()
130
131 if not target[0][0] == '>':
132 self.log.writeln('Fasta format does not contain description line.')
133 return False
134
135 for i in range( 1, len(target) ):
136 if len( target[i] ) >= 80:
137 self.log.writeln(
138 'Fasta sequence lines longer that 80 characters')
139 return False
140
141 for j in target[i]:
142 aa_codes = MU.aaDicStandard.values() + [ '\n' ]
143 if not j.upper() in aa_codes:
144 self.log.writeln('Invalid amino acid code: %s'%j.upper())
145 return False
146
147 return True
148
149
151 """
152 If the target id a PDBModel its sequence will be written
153 to disc as a fasta file.
154 If it is a fasta file the path to the file will be passed on.
155 """
156
157 if isinstance( self.target, PDBModel):
158 fastaSeq, self.fastaID = MT.fasta( self.target )
159
160 seq = open( self.fName, 'w' )
161 seq.write( fastaSeq )
162 seq.close()
163
164
165 else:
166 if self.__verify_fasta(self.target):
167 self.fName = self.target
168
169
171 """
172 Parse the output from hmmpfam.
173
174 @return: dictionary witn profile names as keys and a list of
175 lists containing information about the range where the
176 profile matches the sequence
177 @rtype: dict, [list]
178 """
179 matches = {}
180 hits = []
181
182
183 if not os.path.exists( self.f_out ):
184 raise HmmerError,\
185 'Hmmersearch result file %s does not exist.'%self.f_out
186
187 if T.fileLength( self.f_out ) < 10:
188 raise HmmerError,\
189 'Hmmersearch result file %s seems incomplete.'%self.f_out
190
191 try:
192 out = open( self.f_out, 'r' )
193 while 1:
194 l = out.readline()
195
196 if re.match('^-{8}\s{7,8}-{11}.+-{3}$', l):
197 m = string.split( out.readline() )
198 while len(m) != 0:
199 matches[m[0]] = m[1:]
200 m = string.split( out .readline() )
201
202
203 if re.match('^-{8}\s{7,8}-{7}\s-{5}\s-{5}.+-{7}$', l):
204 h = string.split( out.readline() )
205 while len(h) != 0:
206 hits += [ h ]
207 h = string.split( out.readline() )
208 break
209
210 except:
211 raise HmmerError,\
212 'ERROR parsing hmmpfam search result: %s'%self.f_out
213
214 out.close()
215
216 return matches, hits
217
218
225
226
227
229 """
230 Get the hmm profile with name hmmName from hmm databse
231 (using the program hmmfetch).
232 """
233
234 - def __init__( self, hmmName, hmmdb, **kw ):
235 """
236 @param hmmName: hmm profile name
237 @type hmmName: str
238 @param hmmdb: Pfam hmm database
239 @type hmmdb: str
240 """
241 self.hmmName = hmmName
242
243 Executor.__init__( self, 'hmmfetch',
244 args=' %s %s'%(hmmdb, hmmName), **kw )
245
246 self.f_out = tempfile.mktemp('.hmm')
247
248
250 """
251 Extract some information about the profile as well as the
252 match state emmission scores. Keys of the returned dictionary::
253 'AA', 'name', 'NrSeq', 'emmScore', 'accession',
254 'maxAllScale', 'seqNr', 'profLength', 'ent', 'absSum'
255
256 @return: dictionary with warious information about the profile
257 @rtype: dict
258 """
259
260 if not os.path.exists( self.f_out ):
261 raise HmmerError,\
262 'Hmmerfetch result file %s does not exist.'%self.f_out
263
264 if T.fileLength( self.f_out ) < 10:
265 raise HmmerError,\
266 'Hmmerfetch result file %s seems incomplete.'%self.f_out
267
268 profileDic = {}
269
270
271 hmm = open( self.f_out, 'r')
272 out = hmm.read()
273 hmm.close()
274
275
276 profileDic['name'] = self.hmmName
277 profileDic['profLength'] = \
278 int( string.split(re.findall('LENG\s+[0-9]+', out)[0])[1] )
279 profileDic['accession'] = \
280 string.split(re.findall('ACC\s+PF[0-9]+', out)[0])[1]
281 profileDic['NrSeq'] = \
282 int( string.split(re.findall('NSEQ\s+[0-9]+', out)[0])[1] )
283 profileDic['AA'] = \
284 string.split(re.findall('HMM[ ]+' + '[A-Y][ ]+'*20, out)[0] )[1:]
285
286
287 pattern = 'NULE[ ]+' + '[-0-9]+[ ]+'*20
288 nullEmm = [ float(j) for j in string.split(re.findall(pattern, out)[0])[1:] ]
289
290
291 prob=[]
292 for i in range(1, profileDic['profLength']+1):
293 pattern = "[ ]+%i"%i + "[ ]+[-0-9]+"*20
294 e = [ float(j) for j in string.split(re.findall(pattern, out)[0]) ]
295 prob += [ e ]
296
297 profileDic['seqNr'] = N.transpose( N.take( prob, (0,),1 ) )
298 profileDic['emmScore'] = N.array(prob)[:,1:]
299
300
301 emmProb, nullProb = self.hmmEmm2Prob( nullEmm, profileDic['emmScore'])
302
303 ent = [ N.resize( self.entropy(e, nullProb), (1,20) )[0] for e in emmProb ]
304 profileDic['ent'] = N.array(ent)
305
306 profileDic['ent'] = N.array(ent)
307
308
309
310 proba = N.array(prob)[:,1:]
311
312
313
314
315
316
317
318
319
320 p = proba
321 p2 = []
322 for i in range( len(p) ) :
323 p2 += [ N.resize( N.sum( N.absolute( p[i] )), N.shape( p[i] ) ) ]
324 profileDic['absSum'] = p2
325
326
327 p = proba
328 p4 = []
329 for i in range( len(p) ) :
330 p_scale = (p[i] - N.average(p[i]) )/ math.SD(p[i])
331 p4 += [ N.resize( p_scale[N.argmax( N.array(p_scale) )] ,
332 N.shape( p[i] ) ) ]
333 profileDic['maxAllScale'] = p4
334
335 return self.f_out, profileDic
336
337
339 """
340 Convert HMM profile emmisiion scores into emmission probabilities
341
342 @param nullEmm: null scores
343 @type nullEmm: array
344 @param emmScore: emmission scores
345 @type emmScore: array
346
347 @return: null and emmission probabilities, for each amino acid
348 in each position
349 @rtype: array( len_seq x 20 ), array( 1 x 20 )
350 """
351
352 nullProb = N.power( 2, N.array( nullEmm )/1000.0 )*(1./20)
353
354
355 emmProb = nullProb * N.power( 2, ( emmScore/1000.0) )
356
357 return emmProb, nullProb
358
359
360 - def entropy( self, emmProb, nullProb ):
361 """
362 calculate entropy for normalized probabilities scaled by aa freq.
363 emmProb & nullProb is shape 1,len(alphabet)
364
365 @param emmProb: emmission probabilities
366 @type emmProb: array
367 @param nullProb: null probabilities
368 @type nullProb: array
369
370 @return: entropy value
371 @rtype: float
372 """
373
374 emmProb = N.clip(emmProb, 1.e-10, 1.)
375
376 return N.sum( emmProb * N.log(emmProb/nullProb) )
377
378
380 """
381 Clean up after external program has finished (failed or not).
382 Override, but call in child method!
383 """
384 if not self.keep_inp and not self.debug:
385 T.tryRemove( self.f_in )
386
387 if self.f_err and not self.debug:
388 T.tryRemove( self.f_err )
389
390
392 """
393 Called if external program failed, override!
394 """
395 raise HmmerError,\
396 'Hmmerfetch failed retrieving profile: %s'%self.hmmName
397
398
405
406
407
409 """
410 Align fasta formated sequence to hmm profile (using hmmalign).
411 """
412
413 - def __init__( self, hmmFile, fastaFile, fastaID, **kw ):
414 """
415 @param hmmFile: path to hmm file (profile)
416 @type hmmFile: str
417 @param fastaFile: path to fasta search sequence
418 @type fastaFile: str
419 @param fastaID: fasta id of search sequence
420 @type fastaID: str
421 """
422 self.fastaID = fastaID
423
424 Executor.__init__( self, 'hmmalign',
425 args=' -q %s %s'%(hmmFile, fastaFile), **kw )
426
427
429 """
430 Align fasta formated sequence to hmm profile.
431
432 @return: alignment and matching hmm positions with gaps
433 @rtype: str, str
434 """
435
436 if not os.path.exists( self.f_out ):
437 raise HmmerError,\
438 'Hmmeralign result file %s does not exist.'%self.f_out
439
440 if T.fileLength( self.f_out ) < 1:
441 raise HmmerError,\
442 'Hmmeralign result file %s seems incomplete.'%self.f_out
443
444
445 hmm = open( self.f_out, 'r')
446 out = hmm.read()
447 hmm.close()
448
449
450 fastaSeq = re.findall( self.fastaID + '[ ]+[-a-yA-Y]+', out )
451 fastaSeq = string.join([ string.split(i)[1] for i in fastaSeq ], '')
452
453
454 hmmSeq = re.findall( '#=[A-Z]{2}\s[A-Z]{2}\s+[.x]+', out )
455 hmmSeq = string.join([ string.strip( string.split(i)[2] ) for i in hmmSeq ], '')
456
457 return fastaSeq, hmmSeq
458
459
461 """
462 Called if external program failed, override!
463 """
464 raise HmmerError,\
465 'hmmeralign failed aligning sequence file %s with profile %s'\
466 %(self.fastaFile ,self.hmmFile)
467
468
475
476
478 """
479 Search Hmmer Pfam database and retrieve conservation score for model
480 """
481
483 """
484 @param hmmdb: Pfam hmm database
485 @type hmmdb: str
486 @param verbose: verbosity level (default: 1)
487 @type verbose: 1|0
488 @param log: Log file for messages [STDOUT]
489 @type log: Biskit.LogFile
490 """
491 self.hmmdb = hmmdb
492 self.verbose = verbose
493 self.log = log or StdLog()
494 self.tempDir = settings.tempDirShared
495
496 self.fastaID = ''
497
498 self.hmmFile = ''
499 self.fastaFile = tempfile.mktemp('.fasta', dir=self.tempDir)
500 self.sub_fastaFile = tempfile.mktemp('_sub.fasta', dir=self.tempDir)
501
502
504 """
505 Checks if the hmm database has been indexed or not,
506 if not indexing will be done.
507 """
508 idx = HmmerdbIndex( self.hmmdb, verbose=self.verbose, log=self.log )
509
510
512 """
513 Search hmm database with a sequence in fasta format.
514 If the profile names have been provided - skip the search and
515 only write the temporary sequence files.
516
517 @param target: sequence
518 @type target: PDBModel or fasta file
519 @param noSearch: don't perform a seach
520 @type noSearch: 1 OR None
521
522 @return: dictionary witn profile names as keys and a list of
523 lists containing information about the range where the
524 profile matches the sequence
525 @rtype: dict, [list]
526 """
527 if self.verbose:
528 self.log.writeln(
529 '\nSearching hmm database, this will take a while...')
530
531 search = HmmerSearch( target, self.hmmdb, verbose=self.verbose,
532 log=self.log )
533 matches, hits = search.run()
534
535 return matches, hits
536
537
538 - def selectMatches( self, matches, hits, score_cutoff=60 ,
539 eValue_cutoff = 1e-8 ):
540 """
541 select what hmm profiles to use based on score and e-Value cutoff
542
543 @param matches: output from L{searchHmmdb}
544 @type matches: dict
545 @param hits: output from L{searchHmmdb}
546 @type hits: [list]
547 @param score_cutoff: cutoff value for an acceptable score
548 @type score_cutoff: float
549 @param eValue_cutoff: cutoff value for an acceptable e-value
550 @type eValue_cutoff: float
551
552 @return: dictionary with good matches {hmm_name : [[start,stop],[..]]}
553 @rtype: dict
554 """
555
556 if len(matches) == 0:
557 if self.verbose: self.log.writeln('\nNo matching profiles found')
558 return None
559 if len(matches) == 1:
560 if self.verbose:
561 self.log.writeln('\nFound only one matching profile.')
562 if len(matches) > 1:
563 if self.verbose: self.log.writeln('\nFound more than one hit.\n')
564
565 try:
566 if self.verbose:
567 self.log.writeln(
568 '\tName -- Nr hits -- Score -- E-value -- Description')
569 for k in matches.keys():
570 self.log.writeln( '\t'+k+' --'+matches[k][-1] +' -- '+\
571 matches[k][-3]+ ' -- '+ matches[k][-2]+' -- '+\
572 string.join(matches[k][0:-3]))
573 except:
574 pass
575
576
577 hmmHits = {}
578
579 for h in range( len(hits) ):
580
581 hmmName = hits[h][0]
582 score = float( hits[h][-2] )
583 eValue = float( hits[h][-1] )
584
585
586 if score < score_cutoff and eValue > eValue_cutoff \
587 and self.verbose:
588 self.log.writeln('\nWARNING: Bad scores! Profile NOT used.')
589 self.log.writeln('\t Name: %s - Score: %f E-value: %f ' \
590 %( hmmName, score, eValue ) )
591
592
593 else:
594 seqStart = int(hits[h][2])
595 seqEnd = int(hits[h][3])
596 if hmmHits.has_key(hmmName):
597 hmmHits[hmmName] = hmmHits[hmmName] +[[seqStart, seqEnd ]]
598
599 else:
600 hmmHits[hmmName] = [ [ seqStart, seqEnd ] ]
601
602 if self.verbose:
603 self.log.writeln( '\nWill use profile(s):\n '+str( hmmHits ))
604
605 return hmmHits
606
607
609 """
610 Get the hmm profile with name hmmName from hmm databse.
611 Extract some information about the profile as well as the
612 match state emmission scores. Keys of the returned dictionary::
613 'AA', 'name', 'NrSeq', 'emmScore', 'accession',
614 'maxAllScale', 'seqNr', 'profLength', 'ent', 'absSum'
615
616 @param hmmName: hmm profile name
617 @type hmmName: str
618
619 @return: dictionary with warious information about the profile
620 @rtype: dict
621 """
622 profile = HmmerProfile( hmmName, self.hmmdb, verbose=self.verbose,
623 log=self.log)
624 self.hmmFile, profileDic = profile.run()
625
626 return profileDic
627
628
629 - def align( self, model, hits ):
630 """
631 Performs alignment
632 If there is more than one hit with the profile, the sequence will
633 be subdevided and the alignment will be performed on each part.
634 a final merger profile for the profile will be returned.
635
636 @param model: model
637 @type model: PDBModel
638 @param hits: list with matching sections from L{searchHmmdb}
639 @type hits: [[int,int]]
640
641 @return: fastaSeq hmmSeq repete hmmGap::
642 fastaSeq - sequence
643 hmmSeq - matching positions in profile
644 repete - number of repetes of the profile
645 hmmGap - list with gaps (deletions in search sequence) for
646 each repete
647 @rtype: str, str, int, [int]
648 """
649 try:
650
651 fastaSeq, self.fastaID = MT.fasta( model )
652 fastaSeq = fastaSeq.split()[1]
653 l = len(fastaSeq)
654
655
656 j = 0
657 repete = 0
658 hmmGap = []
659
660 for h in hits:
661 start = h[0]-1
662 stop = h[1]
663 sub_fastaSeq, self.fastaID = MT.fasta( model, start, stop )
664
665
666 fName = self.sub_fastaFile
667 sub_seq = open( fName, 'w' )
668 sub_seq.write( sub_fastaSeq )
669 sub_seq.close()
670
671
672 align = HmmerAlign( self.hmmFile, self.sub_fastaFile,
673 self.fastaID, verbose=self.verbose,
674 log=self.log)
675
676 sub_fastaSeq, sub_hmmSeq = align.run()
677
678
679
680 sub_fastaSeq, sub_hmmSeq, del_hmm = \
681 self.removeGapInSeq( sub_fastaSeq, sub_hmmSeq )
682 hmmGap += [ del_hmm ]
683
684
685 sub_hmmSeq = '.'*start + sub_hmmSeq + '.'*(l-stop)
686
687 if j == 0:
688 hmmSeq = sub_hmmSeq
689 if j != 0:
690 hmmSeq = self.mergeHmmSeq( hmmSeq, sub_hmmSeq )
691
692 j+= 1
693 repete += 1
694
695 return fastaSeq, hmmSeq, repete, hmmGap
696
697 except IOError, e:
698 raise HmmerError, "Error creating temporary file %r. "%e.filename+\
699 "Check that Biskit.settings.temDirShared is accessible!\n"+\
700 "See also .biskit/settings.cfg!"
701
702
704 """
705 Removes position scorresponding to insertions in search sequence
706
707 @param fasta: search sequence
708 @type fasta: str
709 @param hmm: sequence, matching hmm positions
710 @type hmm: str
711
712 @return: search sequence and profile match sequence with insertions
713 and deletions removed and a list with the deleted positions
714 @rtype: str, str, [int]
715 """
716 new_hmm = []
717 new_fasta = []
718 del_pos = []
719
720 j = 0
721 for i in range( len(fasta) ):
722 if string.upper( fasta[i] ) in molUtils.allAA():
723 new_hmm += hmm[i]
724 new_fasta += fasta[i]
725
726 if fasta[i] == '-':
727 del_pos += [j]
728
729 if hmm[i] == '.':
730 j -= 1
731 j += 1
732
733 return ''.join(new_fasta), ''.join(new_hmm), del_pos
734
735
737 """
738 Merges two sequence files into one.
739 Multilple hits with one profile cannot overlap!! Overlap == ERROR
740
741 @param seq1: sequence
742 @type seq1: str
743 @param seq2: sequence
744 @type seq2: str
745
746 @return: merged sequence or None
747 @rtype: str OR None
748 """
749 if len(seq1) != len(seq2):
750 EHandler.warning( 'ERR in mergeHmmSeq:\n' +\
751 '\tSequences of different lengths cannot be merged')
752 return None
753 else:
754 result = ''
755 for i in range( len(seq1) ):
756
757 if seq1[i] == seq2[i] == '.':
758 result += '.'
759
760 if seq1[i] > seq2[i]:
761 result += seq1[i]
762
763 if seq1[i] < seq2[i]:
764 result += seq2[i]
765
766 return result
767
768
770 """
771 Align fasta formated sequence to hmm profile.
772
773 @param file: path to hmmalign output file
774 @type file: str
775
776 @return: alignment and matching hmm positions with gaps
777 @rtype: str, str
778 """
779
780 out = os.popen( hmmalignExe + ' -q ' + self.hmmFile + \
781 ' ' + file ).read()
782
783
784 fastaSeq = re.findall( self.fastaID + '[ ]+[-a-yA-Y]+', out )
785 fastaSeq = string.join([ string.split(i)[1] for i in fastaSeq ], '')
786
787
788 hmmSeq = re.findall( '#=[A-Z]{2}\s[A-Z]{2}\s+[.x]+', out )
789 hmmSeq = string.join(
790 [ string.strip( string.split(i)[2] ) for i in hmmSeq ], '')
791
792 return fastaSeq, hmmSeq
793
794
795 - def castHmmDic( self, hmmDic, repete, hmmGap, key ):
796 """
797 Blow up hmmDic to the number of repetes of the profile used.
798 Correct scores for possible deletions in the search sequence.
799
800 @param hmmDic: dictionary from L{getHmmProfile}
801 @type hmmDic: dict
802 @param repete: repete information from L{align}
803 @type repete: int
804 @param hmmGap: information about gaps from L{align}
805 @type hmmGap: [int]
806 @param key: name of scoring method to adjust for gaps and repetes
807 @type key: str
808
809 @return: dictionary with information about the profile
810 @rtype: dict
811 """
812 s = hmmDic[key]
813
814 for i in range( repete ):
815 mask = N.ones( len(s) )
816 N.put( mask, hmmGap[i], 0 )
817 if i == 0:
818 score = N.compress( mask, s, 0 )
819 if i > 0:
820 score = N.concatenate( ( N.compress( mask, s, 0 ), score ) )
821
822 hmmDic[key] = score
823
824 return hmmDic
825
826
827 - def matchScore( self, fastaSeq, hmmSeq, profileDic, key ):
828 """
829 Get match emmision score for residues in search sequence
830
831 @param fastaSeq: search sequence
832 @type fastaSeq: str
833 @param hmmSeq: sequence, matching hmm positions
834 @type hmmSeq: str
835 @param profileDic: from L{castHmmDic}
836 @type profileDic: dict
837 @param key: name of scoring method
838 @type key: str
839
840 @return: list of emmision scores for sequence
841 @rtype: [float]
842 """
843 cons = []
844 hmmShift = 0
845
846 for i in range( len( fastaSeq ) ):
847
848
849 if hmmSeq[i] == '.':
850 cons += [0]
851 hmmShift += 1
852
853
854 if fastaSeq[i] in profileDic['AA'] and hmmSeq[i] != '.':
855 j = profileDic['AA'].index( fastaSeq[i] )
856 cons += [ profileDic[key][i - hmmShift][j] ]
857
858 return cons
859
860
861 - def __score( self, model, key, hmmNames=None ):
862 """
863 Get match emmission scores for search sequence.
864 If profile name(s) are provided, no search will be performed.
865
866 - If names and positions of hmm profiles is B{NOT} provided
867 B{search performed} -> score (array), hmmNames (dictionary)
868
869 - If names and positions of hmm profiles is provided
870 B{NO search performed} -> score (array), hmmNames
871 (dictionary - same as input)
872
873 @param model: model
874 @type model: PDBModel
875 @param key: name of scoring method
876 @type key: str
877 @param hmmNames: profile name OR None (default: None)
878 @type hmmNames: str OR None
879
880 @return: score, hmmNames
881 @rtype: array, dict
882 """
883 if not hmmNames:
884
885 searchResult, searchHits = self.searchHmmdb( model )
886 hmmNames = self.selectMatches( searchResult, searchHits )
887 else:
888 searchResult = self.searchHmmdb( model, noSearch=1 )
889 hmmNames = hmmNames
890
891
892 result = None
893
894 for name in hmmNames.keys():
895
896 hmmDic = self.getHmmProfile( name )
897 if self.verbose:
898 self.log.writeln('Hmm profile ' + str(name) +\
899 ' with accession number ' \
900 + str(hmmDic['accession']) + ' retrieved')
901
902
903 fastaSeq, hmmSeq, repete, hmmGap = self.align( model,
904 hmmNames[ name ] )
905
906 hmmDic = self.castHmmDic( hmmDic, repete, hmmGap, key )
907
908
909 cons = self.matchScore( fastaSeq, hmmSeq, hmmDic, key )
910
911 if result is not None:
912 result = self.mergeProfiles( result, cons )
913 else:
914 result = cons
915
916 return result, hmmNames
917
918
920 return self.__score( model, key='absSum', hmmNames=hmmNames )
921
922
924 return self.__score( model, key='maxAllScale', hmmNames=hmmNames )
925
926
928 return self.__score( model, key='ent', hmmNames=hmmNames )
929
930
932 if type( lstOrAr ) == list:
933 return N.array( lstOrAr )
934 return lstOrAr
935
936
938 """
939 Merge profile p0 with profile p1, as long as they overlap in
940 at most maxOverlap positions
941
942 @param p0: profile
943 @type p0: [float]
944 @param p1: profile
945 @type p1: [float]
946 @param maxOverlap: maximal allowed overlap between profiles
947 @type maxOverlap: int
948
949 @return: array
950 @rtype:
951 """
952 p0 = self.__list2array( p0 )
953 p1 = self.__list2array( p1 )
954
955 overlap = N.greater( N.greater(p0,0) + N.greater(p1,0), 1 )
956
957 if N.sum( overlap ) <= maxOverlap:
958
959
960
961 N.put( p1, N.nonzero( overlap ), 0 )
962 N.put( p0, N.nonzero( overlap ), 0 )
963
964 p0 = p0 + p1
965
966 return p0
967
968
970 """
971 remove temp files
972 """
973 del_lst = [ self.sub_fastaFile, self.hmmFile, self.fastaFile ]
974 for d in del_lst:
975 try:
976 os.remove( d )
977 except:
978 pass
979
980
981
982
983
984 import Biskit.test as BT
985
986 -class Test(BT.BiskitTest):
987 """Hmmer test"""
988
989 TAGS = [ BT.EXE ]
990
992 """Hmmer test """
993
994 from Biskit import PDBModel
995 import Biskit.tools as T
996
997 if self.local: print "Loading PDB...",
998 self.m = PDBModel( T.testRoot()+'/lig/1A19.pdb')
999 self.model = self.m.compress( self.m.maskProtein() )
1000 if self.local: print "Done"
1001
1002
1003 self.hmmer = Hmmer( hmmdb=settings.hmm_db, verbose=self.local,
1004 log=self.log )
1005 self.hmmer.checkHmmdbIndex()
1006
1007
1008 method = [ 'emmScore', 'ent', 'maxAll', 'absSum', 'maxAllScale' ]
1009
1010
1011 searchMatches, searchHits = self.hmmer.searchHmmdb( self.model )
1012 hmmNames = self.hmmer.selectMatches( searchMatches, searchHits )
1013
1014
1015 self.cons = []
1016 self.result = None
1017
1018 for name in hmmNames.keys():
1019
1020
1021 hmmDic = self.hmmer.getHmmProfile( name )
1022
1023
1024 fastaSeq, hmmSeq, repete, hmmGap = \
1025 self.hmmer.align( self.model, hmmNames[ name ] )
1026
1027
1028 hmmDic_cast = \
1029 self.hmmer.castHmmDic( hmmDic, repete, hmmGap, method[0] )
1030
1031
1032 self.cons = self.hmmer.matchScore( fastaSeq, hmmSeq,
1033 hmmDic_cast, method[0] )
1034
1035
1036 if self.result:
1037 self.result = self.hmmer.mergeProfiles( self.result, self.cons )
1038 else:
1039 self.result = self.cons
1040
1041 self.hmmer.cleanup()
1042
1043 self.assertEqual( self.result, self.EXPECTED )
1044
1045
1046
1047 EXPECTED = [2581.0, 3583.0, 1804.0, 2596.0, 3474.0, 2699.0, 3650.0, 2087.0, 2729.0, 2450.0, 2412.0, 2041.0, 3474.0, 1861.0, 2342.0, 2976.0, 5124.0, 2729.0, 2202.0, 2976.0, 3583.0, 2202.0, 2103.0, 2976.0, 1922.0, 2132.0, 4122.0, 2403.0, 4561.0, 4561.0, 3650.0, 2087.0, 4001.0, 2976.0, 3860.0, 3260.0, 2976.0, 6081.0, 3860.0, 5611.0, 2976.0, 3609.0, 3650.0, 6081.0, 3343.0, 2403.0, 3288.0, 4122.0, 2976.0, 2322.0, 2976.0, 1995.0, 4378.0, 2706.0, 2665.0, 4186.0, 3539.0, 2692.0, 3270.0, 2302.0, 2604.0, 2132.0, 2118.0, 2380.0, 2614.0, 2170.0, 3260.0, 2403.0, 1964.0, 3343.0, 2976.0, 2643.0, 3343.0, 2714.0, 2591.0, 3539.0, 3260.0, 2410.0, 1809.0, 3539.0, 2111.0, -774.0, 3860.0, 2450.0, 2063.0, 3474.0, 3474.0, 2057.0, 1861.0]
1048
1049
1050 if __name__ == '__main__':
1051
1052 BT.localTest()
1053