1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27 """
28 Seperate PDB into continuous peptide chains for XPlor. Remove duplicate
29 peptide chains. Required by pdb2xplor.py
30
31 This is vintage code. See L{Biskit.PDBCleaner} for a more recent
32 version (though yet lacking some functions).
33
34 @todo: Create an override for the chain comparison if one wants
35 to keep identical chains (i.e homodimers)
36 """
37
38
39 from molUtils import singleAA
40 import Biskit.tools as T
41 from LogFile import LogFile
42
43 from Scientific.IO.PDB import *
44 import numpy.oldnumeric as N
45 import string
46 from difflib import SequenceMatcher
47 import re
48
50 """
51 Open PDB file; give back one chain whenever next() is
52 called. This class is used by the pdb2xplor script.
53
54 This class constitutes vintage code. See
55 L{Biskit.PDBCleaner} and L{Biskit.Mod.TemplateCleaner} for a more
56 recent implementation of PDB cleaning.
57
58 @todo: The removal of duplicate chains should be transferred to
59 the PDBCleaner so that this class can be retired
60 """
61
62 - def __init__(self, fname, outPath='', chainIdOffset=0,
63 capBreaks=0, chainMask=0, log=None ):
64 """
65 @param fname: pdb filename
66 @type fname: str
67 @param outPath: path for log file
68 @type outPath: str
69 @param chainIdOffset: start chain numbering at this offset
70 @type chainIdOffset: int
71 @param capBreaks: add ACE and NME to N- and C-term. of chain breaks [0]
72 @type capBreaks: 0|1
73 @param chainMask: chain mask for overriding the default sequence identity [None]
74 @type chainMask: [1|0]
75 @param log: LogFile object
76 @type log: object
77 """
78 self.pdb = Structure(fname);
79 self.fname = fname
80 self.outPath = T.absfile( outPath )
81 self.chainIdOffset = chainIdOffset
82 self.capBreaks = capBreaks
83 self.log = LogFile( T.absfile(outPath)+'/' + self.pdbname()+'.log')
84 if log:
85 self.log = log
86
87 self.chains = self.pdb.peptide_chains
88 self.counter = -1
89 self.threshold = 0.9
90 self._expressionCheck(
91 "[^\n].*[Hh][Oo][Mm][Oo].?[Dd][Ii][Mm][eE][Rr].*\n", 'HOMODIMER')
92 self._expressionCheck("[^\n].*[Tt][Rr][Ii][Mm][Ee][Rr].*\n", 'TRIMER')
93 self._hetatomCheck()
94
95 self.log.add("Separate chains: \n------------------")
96 self._removeDuplicateChains(chainMask)
97 self._separateChainBreaks()
98 self._assign_seg_ids()
99
100
102 """
103 Extract pdb code from file name.
104
105 @return: (assumed) pdb code
106 @rtype: str
107 """
108 return T.stripFilename(self.pdb.filename)
109
110
112 """
113 Check and report if the regular expression 'findExpression'
114 exists in the PDB-file. Use this to locate data in the REMARK
115 section of a pdb file. Prints a warning to stdOut if the
116 regular expression is found.
117
118 @param findExpression: regular expression
119 @type findExpression: str
120 @param findClean: clean name of regular expression
121 @type findClean: str
122 """
123 pdb = open(self.fname,'r')
124 pdbFile = pdb.read()
125 searchResult = re.findall(findExpression,pdbFile)
126
127 warningMessage = """
128 WARNINGR! The text string'%s' was found in the PDB-file.
129 If this PDB-file contains a homodimer one of the chains will be
130 deleted by this script. To avoid this prepare the file for Xplor manualy \n""" %\
131 ( findClean )
132 warningMessage2 = """--------------------------------------------\n"""
133
134 if len(searchResult) != 0:
135 self.log.add(warningMessage)
136 self.log.add("String found in line(s): \n")
137 for i in range(0,len(searchResult)):
138 self.log.add(searchResult[i])
139 self.log.add(warningMessage2)
140 pdb.close()
141
142
144 """
145 Check and report if there are any none-water HETATMs in the PDB-file
146 """
147 pdb = open(self.fname,'r')
148 pdbFile = pdb.read()
149 findExpression = "HETATM.*\n"
150 searchResult = re.findall(findExpression,pdbFile)
151 i=0
152 j = len(searchResult)
153 while i<j:
154 if searchResult[i][17:20] == "HOH" or \
155 searchResult[i][0:6] != "HETATM" :
156 del searchResult[i]
157 i=i-1
158 j=j-1
159 i=i+1
160
161 warningMessage = """
162 WARNING! The PDB-file contains coordinates for none water HETATMs.
163 If you want to keep the HETATM - prepare the file for Xplor manualy \n"""
164 warningMessage2 = "\n"+ 80*"-" + "\n"
165 if len(searchResult) != 0:
166 self.log.add(warningMessage)
167 self.log.add("String found in line(s): \n")
168 for i in range(0,len(searchResult)):
169 self.log.add(searchResult[i][0:-1])
170 self.log.add(warningMessage2)
171 pdb.close()
172
173
175 """
176 @param seq1: sequence 1 to compare
177 @type seq1: str
178 @param seq2: sequence 1 to compare
179 @type seq2: str
180 @return: identity (0.0 - 1.0) between the two sequences
181 @rtype : float
182 """
183
184
185
186 matcher = SequenceMatcher( None, ''.join(seq1) , ''.join(seq2) )
187 return matcher.ratio()
188
189
191 """
192 Get rid of identical chains by comparing all chains with Blast2seq.
193
194 @param chainMask: chain mask for overriding the
195 chain identity checking (default: None)
196 @type chainMask: [int]
197
198 @return: number of chains removed
199 @rtype: int
200 """
201 chainCount = len(self.chains)
202 matrix = 1.0 * N.zeros((chainCount,chainCount))
203 chain_ids = []
204
205
206 for i in range(0, chainCount):
207 chain_ids = chain_ids + [self.chains[i].chain_id]
208 for j in range(i, len(self.chains)):
209
210
211 seq1 = singleAA( self.chains[i].sequence() )
212 seq2 = singleAA( self.chains[j].sequence() )
213
214
215
216
217
218
219
220
221
222
223
224 matrix[i,j] = self._compareSequences( seq1, seq2 )
225
226
227 self.log.add("\n Chain ID's of compared chains: "+str(chain_ids))
228 self.log.add(" Cross-Identity between chains:\n"+str(matrix))
229 self.log.add(" Identity threshold used: "+str(self.threshold))
230
231
232
233 if chainMask:
234 if len(chainMask) == chainCount:
235 self.chains = N.compress(chainMask, self.chains)
236 self.log.add("NOTE: chain mask %s used for removing chains.\n"%chainMask)
237
238 else:
239 self.log.add("########## ERROR ###############")
240 self.log.add("# Chain mask is only %i chains long"%len(chainMask))
241 self.log.add("# when a mask of length %i is needed"%chainCount)
242 self.log.add("# No cleaning will be performed.\n")
243
244 if not chainMask:
245
246
247 duplicate = len(self.chains)
248 for offset in range(1,chainCount):
249 diag = N.diagonal(matrix, offset ,0,1)
250
251 avg = 1.0 * N.sum(diag)/len(diag)
252 if (avg >= self.threshold):
253 duplicate = offset
254 break
255 self.chains = self.chains[:duplicate]
256 self.log.add("NOTE: Identity matrix will be used for removing identical chains.")
257
258
259 self.log.add(str(chainCount - len(self.chains))+\
260 " chains have been removed.\n")
261
262
263 return (chainCount - len(self.chains))
264
265
267 """
268 Assign new segment id to each chain.
269 """
270 counter = self.chainIdOffset
271 for chain in self.chains:
272
273
274 chain.segment_id = self.pdbname()[:3] + string.uppercase[counter]
275 counter = counter + 1
276 try:
277 chain_id = chain.chain_id
278 self.log.add("changed segment ID of chain "+chain_id+\
279 " to "+chain.segment_id)
280 except:
281 T.errWriteln("_assign_seg_ids(): logerror")
282
283
285 """
286 Calculate sequential atom-atom distance, report residues with
287 longer distance than cutoff (chain break positions).
288
289 @param chain: Scientific.IO.PDB.PeptideChain object
290 @type chain: object
291 @param cutoff: threshold for reporting gap (chain break)
292 @type cutoff: float
293 @param atom: type of atoms to check (i.e. 'CA')
294 @type atom: str
295
296 @return: list of chain break positions (residue index for each
297 first residue of two that are too distant)
298 @rtype: list of int
299 """
300 distanceList = []
301 v0 = Vector( 0,0,0 )
302 jump = 1
303
304 for res in range(0,len(chain)-2):
305
306 try:
307 v1 = Vector(chain[res][atom].position.array)
308
309
310 if v1 != v0:
311
312 jump = 1
313 v2 = Vector(chain[ res+jump ][atom].position.array)
314
315
316 while v2 == v0 and jump + res < len( chain ):
317 jump += 1
318 v2 = Vector(chain[ res+jump ][atom].position.array)
319
320 if (v1 - v2).length() > cutoff * jump:
321 distanceList = distanceList + [res + jump - 1]
322
323 except:
324 self.log.add(
325 "_sequentialDist():\nError while checking CA-CA distance"+\
326 " between residues "+str(chain[res].name)+\
327 str(chain[res].number)+" and "+\
328 str(chain[res+jump].name)+\
329 str(chain[res+jump].number)+ " in chain "+chain.chain_id)
330 self.log.add("Error: " + T.lastError() )
331
332 return distanceList
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
366 """
367 Separate chains with breaks into 2 chains.
368 The new chain(s) is/are added to the internal PDB instance
369 (self.chains).
370 """
371 fragments = []
372
373 for chain in self.chains:
374
375 breaks = self._sequentialDist(chain, 4.5, 'CA')
376 self.log.add(str(len(breaks)) + " breaks found in chain " +\
377 "(" + str(len(chain)) \
378 + " residues) " + chain.chain_id + ": "+str(breaks))
379
380 previous = 0
381 ncap_next = 0
382 for breakRes in breaks:
383
384 residues = chain.residues[previous:breakRes+1]
385 previous = breakRes + 1
386
387 chainNew = PeptideChain(residues, chain.chain_id,
388 chain.segment_id)
389 if ncap_next:
390 self.__nCap( chainNew )
391 ncap_next = 0
392
393 if self.capBreaks:
394
395 self.__cCap( chainNew )
396
397 ncap_next = 1
398
399 fragments = fragments + [chainNew]
400
401 chainNew = PeptideChain(chain.residues[previous:], chain.chain_id,
402 chain.segment_id)
403 if ncap_next:
404 self.__nCap( chainNew )
405
406 fragments = fragments + [chainNew]
407
408 self.chains = fragments
409
410
411 - def __nCap( self, pep_chain ):
412 """
413 Add acetyl capping to N-terminal of peptide chain
414 """
415 n = (pep_chain[0].number or 1) - 1
416
417 r = AminoAcidResidue('ACE', number=n, atoms=[Atom('CA', Vector(0,0,0),
418 element='C')])
419 pep_chain.residues = [r] + pep_chain.residues
420
421 self.log.add('Capping chain break with ACE %i' % n)
422
423
424 - def __cCap( self, pep_chain ):
425 """
426 Add methyle amine capping to C-terminal of peptide chain
427 """
428 n = (pep_chain[-1].number or len(pep_chain)) + 1
429
430 r = AminoAcidResidue('NME', number=n, atoms=[Atom('CA', Vector(0,0,0),
431 element='C')])
432 pep_chain.residues = pep_chain.residues + [r]
433
434 self.log.add('Capping chain break at with NME %i' % n)
435
436
438 """
439 Write waters into separate pdb file, called |pdbCode|_waters.pdb.
440 """
441 try:
442 fTarget = self.outPath + '/' +\
443 self.pdbname()[:4] + '_waters.pdb'
444 pdb = PDBFile( fTarget, mode='w' )
445
446 waters = []
447 for key in ['HOH', 'DOD']:
448
449 if self.pdb.molecules.has_key( key ):
450
451 waters += self.pdb.molecules[ key ]
452
453 pdb.nextChain(chain_id='', segment_id='1XWW')
454 for w in waters:
455 pdb.nextResidue('TIP3')
456
457 pdb.het_flag = 0
458 pdb.writeAtom('OH2', w.atoms['O'].position)
459
460
461 if len(waters) == 0:
462 try:
463 TIP3_waters = self.pdb.molecules[ 'TIP3' ]
464 except:
465 TIP3_waters = []
466
467 for w in TIP3_waters:
468 pdb.nextResidue('TIP3')
469
470 pdb.het_flag = 0
471 pdb.writeAtom('OH2', w.atoms['OH2'].position)
472 pdb.writeAtom('H1', w.atoms['H1'].position)
473 pdb.writeAtom('H2', w.atoms['H2'].position)
474 pdb.close()
475
476 except:
477 T.errWriteln("Error writing waters to %s: " % fTarget )
478 T.errWriteln( T.lastError() )
479
480
481
483 """
484 Return next 'clean', non-redundant, non-broken chain from PDB
485
486 @return: Scientific.IO.PDB.PeptideChain, completed chain OR
487 if no chain is left
488 @rtype: chain object OR None
489 """
490 self.counter = self.counter + 1
491 if (len(self.chains) > self.counter):
492 return self.chains[self.counter]
493 else:
494 return None
495
496
497
498
499
500 import Biskit.test as BT
501
502 -class Test(BT.BiskitTest):
503 """Test ChainSeparator """
504
506 self.fname = T.testRoot() + '/com/1BGS_original.pdb'
507 self.outPath = T.tempDir()
508
511
513 """ChainSeparator test"""
514 self.sep = ChainSeparator( self.fname, self.outPath, 1)
515
516 self.chain = self.sep.next()
517
518 i=1
519 all_chains = []
520 while self.chain <> None:
521 if self.local:
522 print 'Chain %i:'%i, ''.join(singleAA(self.chain.sequence() ) )
523 all_chains += self.chain.sequence()
524 self.chain = self.sep.next()
525 i += 1
526
527 if self.local:
528 print 'ChainSeparator log file written to: %s'%self.sep.log.fname
529
530 r = ''.join( singleAA( all_chains ) )
531 self.assertEqual(r, self.EXPECTED)
532
533
534 EXPECTED='AQVINTFDGVADYLQTYHKLPDNYITKSEAQALGWVASKGNLADVAPGKSIGGDIFSNREGKLPGKSGRTWREADINYTSGFRNSDRILYSSDWLIYKTTDHYQTFTKIRAQVINTFDGVADYLQTYHKLPDNYITKSEAQALGWVASKGNLADVAPGKSIGGDIFSNREGKLPGKSGRTWREADINYTSGFRNSDRILYSSDWLIYKTTDHYQTFTKIRAQVINTFDGVADYLQTYHKLPDNYITKSEAQALGWVASKGNLADVAPGKSIGGDIFSNREGKLPGKSGRTWREADINYTSGFRNSDRILYSSDWLIYKTTDHYQTFTKIRKKAVINGEQIRSISDLHQTLKKELALPEYYGENLDALWDALTGWVEYPLVLEWRQFEQSKQLTENGAESVLQVFREAKAEGADITIILSKKAVINGEQIRSISDLHQTLKKELALPEYYGENLDALWDALTGWVEYPLVLEWRQFEQSKQLTENGAESVLQVFREAKAEGADITIILSKKAVINGEQIRSISDLHQTLKKELALPEYYGENLDALWDALTGWVEYPLVLEWRQFEQSKQLTENGAESVLQVFREAKAEGADITIILS'
535
536
537
538 if __name__ == '__main__':
539
540 BT.localTest()
541