Biskit.ChainSeparator

50 """ 51 Open PDB file; give back one chain whenever next() is 52 called. This class is used by the pdb2xplor script. 53 54 This class constitutes vintage code. See 55 L{Biskit.PDBCleaner} and L{Biskit.Mod.TemplateCleaner} for a more 56 recent implementation of PDB cleaning. 57 58 @todo: The removal of duplicate chains should be transferred to 59 the PDBCleaner so that this class can be retired 60 """ 61

62 - def __init__(self, fname, outPath='', chainIdOffset=0, 63 capBreaks=0, chainMask=0, log=None ):

64 """ 65 @param fname: pdb filename 66 @type fname: str 67 @param outPath: path for log file 68 @type outPath: str 69 @param chainIdOffset: start chain numbering at this offset 70 @type chainIdOffset: int 71 @param capBreaks: add ACE and NME to N- and C-term. of chain breaks [0] 72 @type capBreaks: 0|1 73 @param chainMask: chain mask for overriding the default sequence identity [None] 74 @type chainMask: [1|0] 75 @param log: LogFile object 76 @type log: object 77 """ 78 self.pdb = Structure(fname); 79 self.fname = fname 80 self.outPath = T.absfile( outPath ) 81 self.chainIdOffset = chainIdOffset 82 self.capBreaks = capBreaks 83 self.log = LogFile( T.absfile(outPath)+'/' + self.pdbname()+'.log') 84 if log: 85 self.log = log 86 87 self.chains = self.pdb.peptide_chains 88 self.counter = -1 89 self.threshold = 0.9 # sequence identity between multiple copies in PDB 90 self._expressionCheck( 91 "[^\n].*[Hh][Oo][Mm][Oo].?[Dd][Ii][Mm][eE][Rr].*\n", 'HOMODIMER') 92 self._expressionCheck("[^\n].*[Tt][Rr][Ii][Mm][Ee][Rr].*\n", 'TRIMER') 93 self._hetatomCheck() 94 95 self.log.add("Separate chains: \n------------------") 96 self._removeDuplicateChains(chainMask) # keep only one copy of molecule 97 self._separateChainBreaks() 98 self._assign_seg_ids() # new segment id for each chain

99 100

101 - def pdbname(self):

102 """ 103 Extract pdb code from file name. 104 105 @return: (assumed) pdb code 106 @rtype: str 107 """ 108 return T.stripFilename(self.pdb.filename)

109 110

111 - def _expressionCheck(self, findExpression, findClean):

112 """ 113 Check and report if the regular expression 'findExpression' 114 exists in the PDB-file. Use this to locate data in the REMARK 115 section of a pdb file. Prints a warning to stdOut if the 116 regular expression is found. 117 118 @param findExpression: regular expression 119 @type findExpression: str 120 @param findClean: clean name of regular expression 121 @type findClean: str 122 """ 123 pdb = open(self.fname,'r') 124 pdbFile = pdb.read() 125 searchResult = re.findall(findExpression,pdbFile) 126 127 warningMessage = """ 128 WARNINGR! The text string'%s' was found in the PDB-file. 129 If this PDB-file contains a homodimer one of the chains will be 130 deleted by this script. To avoid this prepare the file for Xplor manualy \n""" %\ 131 ( findClean ) 132 warningMessage2 = """--------------------------------------------\n""" 133 134 if len(searchResult) != 0: 135 self.log.add(warningMessage) 136 self.log.add("String found in line(s): \n") 137 for i in range(0,len(searchResult)): 138 self.log.add(searchResult[i]) 139 self.log.add(warningMessage2) 140 pdb.close()

141 142

143 - def _hetatomCheck(self):

144 """ 145 Check and report if there are any none-water HETATMs in the PDB-file 146 """ 147 pdb = open(self.fname,'r') 148 pdbFile = pdb.read() 149 findExpression = "HETATM.*\n" 150 searchResult = re.findall(findExpression,pdbFile) 151 i=0 152 j = len(searchResult) 153 while i<j: 154 if searchResult[i][17:20] == "HOH" or \ 155 searchResult[i][0:6] != "HETATM" : 156 del searchResult[i] 157 i=i-1 158 j=j-1 159 i=i+1 160 161 warningMessage = """ 162 WARNING! The PDB-file contains coordinates for none water HETATMs. 163 If you want to keep the HETATM - prepare the file for Xplor manualy \n""" 164 warningMessage2 = "\n"+ 80*"-" + "\n" 165 if len(searchResult) != 0: 166 self.log.add(warningMessage) 167 self.log.add("String found in line(s): \n") 168 for i in range(0,len(searchResult)): 169 self.log.add(searchResult[i][0:-1]) 170 self.log.add(warningMessage2) 171 pdb.close()

172 173

174 - def _compareSequences( self, seq1, seq2 ):

175 """ 176 @param seq1: sequence 1 to compare 177 @type seq1: str 178 @param seq2: sequence 1 to compare 179 @type seq2: str 180 @return: identity (0.0 - 1.0) between the two sequences 181 @rtype : float 182 """ 183 # compare the 2 sequences 184 ## blast = Blast2Seq( seq1, seq2 ) 185 ## id = blast.run() 186 matcher = SequenceMatcher( None, ''.join(seq1) , ''.join(seq2) ) 187 return matcher.ratio()

188 189

190 - def _removeDuplicateChains(self, chainMask=None):

191 """ 192 Get rid of identical chains by comparing all chains with Blast2seq. 193 194 @param chainMask: chain mask for overriding the 195 chain identity checking (default: None) 196 @type chainMask: [int] 197 198 @return: number of chains removed 199 @rtype: int 200 """ 201 chainCount = len(self.chains) 202 matrix = 1.0 * N.zeros((chainCount,chainCount)) 203 chain_ids = [] 204 205 ## create identity matrix for all chains against all chains 206 for i in range(0, chainCount): 207 chain_ids = chain_ids + [self.chains[i].chain_id] # collect for log file 208 for j in range(i, len(self.chains)): 209 210 # convert 3-letter-code res list into 1-letter-code String 211 seq1 = singleAA( self.chains[i].sequence() ) 212 seq2 = singleAA( self.chains[j].sequence() ) 213 214 ## if len(seq1) > len(seq2): # take shorter sequence 215 ## # aln len at least half the len of the shortest sequence 216 ## alnCutoff = len(seq2) * 0.5 217 ## else: 218 ## alnCutoff = len(seq1) * 0.5 219 ## if id['aln_len'] > alnCutoff: 220 ## matrix[i,j] = id['aln_id'] 221 ## else: # aln length too short, ignore 222 ## matrix[i,j] = 0 223 224 matrix[i,j] = self._compareSequences( seq1, seq2 ) 225 226 ## report activity 227 self.log.add("\n Chain ID's of compared chains: "+str(chain_ids)) 228 self.log.add(" Cross-Identity between chains:\n"+str(matrix)) 229 self.log.add(" Identity threshold used: "+str(self.threshold)) 230 231 ## override the automatic chain deletion by supplying a 232 ## chain mask to this function 233 if chainMask: 234 if len(chainMask) == chainCount: 235 self.chains = N.compress(chainMask, self.chains) 236 self.log.add("NOTE: chain mask %s used for removing chains.\n"%chainMask) 237 238 else: 239 self.log.add("########## ERROR ###############") 240 self.log.add("# Chain mask is only %i chains long"%len(chainMask)) 241 self.log.add("# when a mask of length %i is needed"%chainCount) 242 self.log.add("# No cleaning will be performed.\n") 243 244 if not chainMask: 245 ## look at diagonals in "identity matrix" 246 ## (each chain against each) 247 duplicate = len(self.chains) 248 for offset in range(1,chainCount): 249 diag = N.diagonal(matrix, offset ,0,1) 250 # diagonal of 1's mark begin of duplicate 251 avg = 1.0 * N.sum(diag)/len(diag) 252 if (avg >= self.threshold): 253 duplicate = offset 254 break 255 self.chains = self.chains[:duplicate] 256 self.log.add("NOTE: Identity matrix will be used for removing identical chains.") 257 258 ## report activit 259 self.log.add(str(chainCount - len(self.chains))+\ 260 " chains have been removed.\n") 261 262 # how many chains have been removed? 263 return (chainCount - len(self.chains))

264 265

266 - def _assign_seg_ids(self):

267 """ 268 Assign new segment id to each chain. 269 """ 270 counter = self.chainIdOffset 271 for chain in self.chains: 272 273 ## Assemble segid from pdb code + one letter out of A to Z 274 chain.segment_id = self.pdbname()[:3] + string.uppercase[counter] 275 counter = counter + 1 276 try: # report changed segement ids 277 chain_id = chain.chain_id 278 self.log.add("changed segment ID of chain "+chain_id+\ 279 " to "+chain.segment_id) 280 except: 281 T.errWriteln("_assign_seg_ids(): logerror")

282 283

284 - def _sequentialDist(self, chain, cutoff, atom):

285 """ 286 Calculate sequential atom-atom distance, report residues with 287 longer distance than cutoff (chain break positions). 288 289 @param chain: Scientific.IO.PDB.PeptideChain object 290 @type chain: object 291 @param cutoff: threshold for reporting gap (chain break) 292 @type cutoff: float 293 @param atom: type of atoms to check (i.e. 'CA') 294 @type atom: str 295 296 @return: list of chain break positions (residue index for each 297 first residue of two that are too distant) 298 @rtype: list of int 299 """ 300 distanceList = [] 301 v0 = Vector( 0,0,0 ) 302 jump = 1 303 304 for res in range(0,len(chain)-2): 305 306 try: 307 v1 = Vector(chain[res][atom].position.array) 308 309 ## ignore CA with 0,0,0 coordinate 310 if v1 != v0: 311 312 jump = 1 313 v2 = Vector(chain[ res+jump ][atom].position.array) 314 315 ## look for next CA with non-zero coordinate 316 while v2 == v0 and jump + res < len( chain ): 317 jump += 1 318 v2 = Vector(chain[ res+jump ][atom].position.array) 319 320 if (v1 - v2).length() > cutoff * jump: 321 distanceList = distanceList + [res + jump - 1] 322 323 except: 324 self.log.add( 325 "_sequentialDist():\nError while checking CA-CA distance"+\ 326 " between residues "+str(chain[res].name)+\ 327 str(chain[res].number)+" and "+\ 328 str(chain[res+jump].name)+\ 329 str(chain[res+jump].number)+ " in chain "+chain.chain_id) 330 self.log.add("Error: " + T.lastError() ) 331 332 return distanceList

333 334 ## def _sequentialDist(self, chain, cutoff, atom): 335 ## """ 336 ## Calculate sequential atom-atom distance, report residues with 337 ## longer distance than cutoff (chain break positions). 338 ## chain - PDB.PeptideChain 339 ## cutoff - float, threshold for reporting gap (chain break) 340 ## atom - str, type of atoms to check (i.e. 'CA') 341 342 ## -> [int, int, ...], list of chain break positions (residue index 343 ## for each first residue of two that are too distant) 344 ## """ 345 ## distanceList = [] 346 ## for residue in range(0,len(chain)-1): 347 ## # iterate through residue 1 to ter-1 348 ## try: 349 ## vectorAtom1 = Vector(chain[residue][atom].position.array) 350 ## vectorAtom2 = Vector(chain[residue+1][atom].position.array) 351 352 ## if (vectorAtom1 - vectorAtom2).length() > cutoff: 353 ## distanceList = distanceList + [residue] 354 ## except: 355 ## self.log.add( 356 ## "_sequentialDist():\nError while checking CA-CA distance"+ \ 357 ## " between residues "+str(chain[residue].name)+\ 358 ## str(chain[residue].number)+" and "+str(chain[residue+1].name)+\ 359 ## str(chain[residue+1].number)+ " in chain "+chain.chain_id) 360 ## self.log.add("Error: " + T.lastError() ) 361 362 ## return distanceList 363 364

365 - def _separateChainBreaks(self):

366 """ 367 Separate chains with breaks into 2 chains. 368 The new chain(s) is/are added to the internal PDB instance 369 (self.chains). 370 """ 371 fragments = [] 372 373 for chain in self.chains: 374 # res number of residues before a break 375 breaks = self._sequentialDist(chain, 4.5, 'CA') 376 self.log.add(str(len(breaks)) + " breaks found in chain " +\ 377 "(" + str(len(chain)) \ 378 + " residues) " + chain.chain_id + ": "+str(breaks)) 379 380 previous = 0 381 ncap_next = 0 382 for breakRes in breaks: 383 384 residues = chain.residues[previous:breakRes+1] 385 previous = breakRes + 1 386 387 chainNew = PeptideChain(residues, chain.chain_id, 388 chain.segment_id) 389 if ncap_next: 390 self.__nCap( chainNew ) 391 ncap_next = 0 392 393 if self.capBreaks: 394 ## add N-Methyl to c terminal 395 self.__cCap( chainNew ) 396 397 ncap_next = 1 398 399 fragments = fragments + [chainNew] 400 401 chainNew = PeptideChain(chain.residues[previous:], chain.chain_id, 402 chain.segment_id) 403 if ncap_next: 404 self.__nCap( chainNew ) 405 406 fragments = fragments + [chainNew] 407 408 self.chains = fragments

409 410

411 - def __nCap( self, pep_chain ):

412 """ 413 Add acetyl capping to N-terminal of peptide chain 414 """ 415 n = (pep_chain[0].number or 1) - 1 416 417 r = AminoAcidResidue('ACE', number=n, atoms=[Atom('CA', Vector(0,0,0), 418 element='C')]) 419 pep_chain.residues = [r] + pep_chain.residues 420 421 self.log.add('Capping chain break with ACE %i' % n)

422 423

424 - def __cCap( self, pep_chain ):

425 """ 426 Add methyle amine capping to C-terminal of peptide chain 427 """ 428 n = (pep_chain[-1].number or len(pep_chain)) + 1 429 430 r = AminoAcidResidue('NME', number=n, atoms=[Atom('CA', Vector(0,0,0), 431 element='C')]) 432 pep_chain.residues = pep_chain.residues + [r] 433 434 self.log.add('Capping chain break at with NME %i' % n)

435 436

437 - def extractWaters(self):

438 """ 439 Write waters into separate pdb file, called |pdbCode|_waters.pdb. 440 """ 441 try: 442 fTarget = self.outPath + '/' +\ 443 self.pdbname()[:4] + '_waters.pdb' 444 pdb = PDBFile( fTarget, mode='w' ) 445 446 waters = [] 447 for key in ['HOH', 'DOD']: 448 449 if self.pdb.molecules.has_key( key ): 450 451 waters += self.pdb.molecules[ key ] 452 453 pdb.nextChain(chain_id='', segment_id='1XWW') 454 for w in waters: 455 pdb.nextResidue('TIP3') 456 ## XPLOR wants "ATOM" not "HETATM": 457 pdb.het_flag = 0 458 pdb.writeAtom('OH2', w.atoms['O'].position) 459 460 ## keep TIP3 waters as well 461 if len(waters) == 0: 462 try: 463 TIP3_waters = self.pdb.molecules[ 'TIP3' ] 464 except: 465 TIP3_waters = [] 466 467 for w in TIP3_waters: 468 pdb.nextResidue('TIP3') 469 ## XPLOR wants "ATOM" not "HETATM": 470 pdb.het_flag = 0 471 pdb.writeAtom('OH2', w.atoms['OH2'].position) 472 pdb.writeAtom('H1', w.atoms['H1'].position) 473 pdb.writeAtom('H2', w.atoms['H2'].position) 474 pdb.close() 475 476 except: 477 T.errWriteln("Error writing waters to %s: " % fTarget ) 478 T.errWriteln( T.lastError() )

479 480 481

482 - def next(self):

483 """ 484 Return next 'clean', non-redundant, non-broken chain from PDB 485 486 @return: Scientific.IO.PDB.PeptideChain, completed chain OR 487 if no chain is left 488 @rtype: chain object OR None 489 """ 490 self.counter = self.counter + 1 491 if (len(self.chains) > self.counter): 492 return self.chains[self.counter] 493 else: 494 return None

Source Code for Module Biskit.ChainSeparator