1
2 """
3 Older version of difflib, kept here because of compatibility problems.
4 """
5
6 from __future__ import generators
7
8 """
9 Module difflib -- helpers for computing deltas between objects.
10
11 Function get_close_matches(word, possibilities, n=3, cutoff=0.6):
12 Use SequenceMatcher to return list of the best "good enough" matches.
13
14 Function ndiff(a, b):
15 Return a delta: the difference between `a` and `b` (lists of strings).
16
17 Function restore(delta, which):
18 Return one of the two sequences that generated an ndiff delta.
19
20 Class SequenceMatcher:
21 A flexible class for comparing pairs of sequences of any type.
22
23 Class Differ:
24 For producing human-readable deltas from sequences of lines of text.
25 """
26
27 __all__ = ['get_close_matches', 'ndiff', 'restore', 'SequenceMatcher',
28 'Differ', 'IS_CHARACTER_JUNK', 'IS_LINE_JUNK']
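# Added usage sketch (not part of the original module): a compact demo of the
# module-level helpers documented above.  The function name `_usage_example`
# is illustrative only.
def _usage_example():
    """Exercise get_close_matches(), ndiff() and restore() together."""
    # Fuzzy word lookup.
    print get_close_matches("appel", ["ape", "apple", "peach", "puppy"])

    # Line-by-line delta between two small texts, then recover input 1.
    a = 'one\ntwo\nthree\n'.splitlines(1)
    b = 'ore\ntree\nemu\n'.splitlines(1)
    delta = list(ndiff(a, b))
    print ''.join(delta),
    print ''.join(restore(delta, 1)),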
29
30
31 -class SequenceMatcher:
32 """
33 SequenceMatcher is a flexible class for comparing pairs of sequences of
34 any type, so long as the sequence elements are hashable. The basic
35 algorithm predates, and is a little fancier than, an algorithm
36 published in the late 1980's by Ratcliff and Obershelp under the
37 hyperbolic name "gestalt pattern matching". The basic idea is to find
38 the longest contiguous matching subsequence that contains no "junk"
39 elements (R-O doesn't address junk). The same idea is then applied
40 recursively to the pieces of the sequences to the left and to the right
41 of the matching subsequence. This does not yield minimal edit
42 sequences, but does tend to yield matches that "look right" to people.
43
44 SequenceMatcher tries to compute a "human-friendly diff" between two
45 sequences. Unlike e.g. UNIX(tm) diff, the fundamental notion is the
46 longest *contiguous* & junk-free matching subsequence. That's what
47 catches people's eyes. The Windows(tm) windiff has another interesting
48 notion, pairing up elements that appear uniquely in each sequence.
49 That, and the method here, appear to yield more intuitive difference
50 reports than does diff. This method appears to be the least vulnerable
51 to synching up on blocks of "junk lines", though (like blank lines in
52 ordinary text files, or maybe "<P>" lines in HTML files). That may be
53 because this is the only method of the 3 that has a *concept* of
54 "junk" <wink>.
55
56 Example, comparing two strings, and considering blanks to be "junk":
57
58 >>> s = SequenceMatcher(lambda x: x == " ",
59 ... "private Thread currentThread;",
60 ... "private volatile Thread currentThread;")
61 >>>
62
63 .ratio() returns a float in [0, 1], measuring the "similarity" of the
64 sequences. As a rule of thumb, a .ratio() value over 0.6 means the
65 sequences are close matches:
66
67 >>> print round(s.ratio(), 3)
68 0.866
69 >>>
70
71 If you're only interested in where the sequences match,
72 .get_matching_blocks() is handy:
73
74 >>> for block in s.get_matching_blocks():
75 ...     print "a[%d] and b[%d] match for %d elements" % block
76 a[0] and b[0] match for 8 elements
77 a[8] and b[17] match for 6 elements
78 a[14] and b[23] match for 15 elements
79 a[29] and b[38] match for 0 elements
80 >>>
81
82 Note that the last tuple returned by .get_matching_blocks() is always a
83 dummy, (len(a), len(b), 0), and this is the only case in which the last
84 tuple element (number of elements matched) is 0.
85
86 If you want to know how to change the first sequence into the second,
87 use .get_opcodes():
88
89 >>> for opcode in s.get_opcodes():
90 ... print "%6s a[%d:%d] b[%d:%d]" % opcode
91 equal a[0:8] b[0:8]
92 insert a[8:8] b[8:17]
93 equal a[8:14] b[17:23]
94 equal a[14:29] b[23:38]
95
96 See the Differ class for a fancy human-friendly file differencer, which
97 uses SequenceMatcher both to compare sequences of lines, and to compare
98 sequences of characters within similar (near-matching) lines.
99
100 See also function get_close_matches() in this module, which shows how
101 simple code building on SequenceMatcher can be used to do useful work.
102
103 Timing: Basic R-O is cubic time worst case and quadratic time expected
104 case. SequenceMatcher is quadratic time for the worst case and has
105 expected-case behavior dependent in a complicated way on how many
106 elements the sequences have in common; best case time is linear.
107
108 Methods::
109
110 __init__(isjunk=None, a='', b='')
111 Construct a SequenceMatcher.
112
113 set_seqs(a, b)
114 Set the two sequences to be compared.
115
116 set_seq1(a)
117 Set the first sequence to be compared.
118
119 set_seq2(b)
120 Set the second sequence to be compared.
121
122 find_longest_match(alo, ahi, blo, bhi)
123 Find longest matching block in a[alo:ahi] and b[blo:bhi].
124
125 get_matching_blocks()
126 Return list of triples describing matching subsequences.
127
128 get_opcodes()
129 Return list of 5-tuples describing how to turn a into b.
130
131 ratio()
132 Return a measure of the sequences' similarity (float in [0,1]).
133
134 quick_ratio()
135 Return an upper bound on .ratio() relatively quickly.
136
137 real_quick_ratio()
138 Return an upper bound on ratio() very quickly.
139 """
140
141 - def __init__(self, isjunk=None, a='', b=''):
142 """Construct a SequenceMatcher.
143
144 Optional arg isjunk is None (the default), or a one-argument
145 function that takes a sequence element and returns true iff the
146 element is junk. None is equivalent to passing "lambda x: 0", i.e.
147 no elements are considered to be junk. For example, pass
148 lambda x: x in " \\t"
149 if you're comparing lines as sequences of characters, and don't
150 want to synch up on blanks or hard tabs.
151
152 Optional arg a is the first of two sequences to be compared. By
153 default, an empty string. The elements of a must be hashable. See
154 also .set_seqs() and .set_seq1().
155
156 Optional arg b is the second of two sequences to be compared. By
157 default, an empty string. The elements of b must be hashable. See
158 also .set_seqs() and .set_seq2().
159 """
160
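# Quick reference for the instance attributes set up here and by the methods
# below (summary added for readability; details are in the method docstrings):
#   a, b             the two sequences being compared
#   isjunk           the user-supplied junk test, or None
#   b2j              dict mapping each non-junk element of b to the list of
#                    indices where it appears in b (built by __chain_b)
#   isbjunk          fast membership test for junk elements of b
#   fullbcount       element -> number of occurrences in b; built lazily and
#                    used only by quick_ratio()
#   matching_blocks  cached result of get_matching_blocks(), or None
#   opcodes          cached result of get_opcodes(), or None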
198 self.isjunk = isjunk
199 self.a = self.b = None
200 self.set_seqs(a, b)
201
202 - def set_seqs(self, a, b):
203 """Set the two sequences to be compared.
204
205 >>> s = SequenceMatcher()
206 >>> s.set_seqs("abcd", "bcde")
207 >>> s.ratio()
208 0.75
209 """
210
211 self.set_seq1(a)
212 self.set_seq2(b)
213
214 - def set_seq1(self, a):
215 """Set the first sequence to be compared.
216
217 The second sequence to be compared is not changed.
218
219 >>> s = SequenceMatcher(None, "abcd", "bcde")
220 >>> s.ratio()
221 0.75
222 >>> s.set_seq1("bcde")
223 >>> s.ratio()
224 1.0
225 >>>
226
227 SequenceMatcher computes and caches detailed information about the
228 second sequence, so if you want to compare one sequence S against
229 many sequences, use .set_seq2(S) once and call .set_seq1(x)
230 repeatedly for each of the other sequences.
231
232 See also set_seqs() and set_seq2().
233 """
234
235 if a is self.a:
236 return
237 self.a = a
238 self.matching_blocks = self.opcodes = None
239
240 - def set_seq2(self, b):
241 """Set the second sequence to be compared.
242
243 The first sequence to be compared is not changed.
244
245 >>> s = SequenceMatcher(None, "abcd", "bcde")
246 >>> s.ratio()
247 0.75
248 >>> s.set_seq2("abcd")
249 >>> s.ratio()
250 1.0
251 >>>
252
253 SequenceMatcher computes and caches detailed information about the
254 second sequence, so if you want to compare one sequence S against
255 many sequences, use .set_seq2(S) once and call .set_seq1(x)
256 repeatedly for each of the other sequences.
257
258 See also set_seqs() and set_seq1().
259 """
260
261 if b is self.b:
262 return
263 self.b = b
264 self.matching_blocks = self.opcodes = None
265 self.fullbcount = None
266 self.__chain_b()
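# Usage sketch for the caching behaviour described above: to compare one
# target against many candidates, fix the target with set_seq2() once and
# swap only the cheap side with set_seq1().  (`target`, `candidates` and
# `handle_match` below are illustrative names, not part of this module.)
#
#     m = SequenceMatcher(None)
#     m.set_seq2(target)        # analyzed once, cached
#     for cand in candidates:
#         m.set_seq1(cand)      # cheap to change
#         if m.ratio() >= 0.6:
#             handle_match(cand)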
267
278
279 - def __chain_b(self):
280
289
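# __chain_b() builds the b2j index: for each non-junk element x of b, b2j[x]
# is the increasing list of indices at which x occurs in b.  Elements flagged
# by the isjunk test are pulled out into junkdict, so find_longest_match()
# never *starts* a match on junk; isbjunk is a fast membership test for them.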
290 b = self.b
291 self.b2j = b2j = {}
292 self.b2jhas = b2jhas = b2j.has_key
293 for i in xrange(len(b)):
294 elt = b[i]
295 if b2jhas(elt):
296 b2j[elt].append(i)
297 else:
298 b2j[elt] = [i]
299
300
301
302
303
304 isjunk, junkdict = self.isjunk, {}
305 if isjunk:
306 for elt in b2j.keys():
307 if isjunk(elt):
308 junkdict[elt] = 1
309 del b2j[elt]
310
311
312
313
314
315
316 self.isbjunk = junkdict.has_key
318 - def find_longest_match(self, alo, ahi, blo, bhi):
317
319 """
320 Find longest matching block in a[alo:ahi] and b[blo:bhi].
321
322 If isjunk is not defined::
323
324 Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
325 alo <= i <= i+k <= ahi
326 blo <= j <= j+k <= bhi
327 and for all (i',j',k') meeting those conditions,
328 k >= k'
329 i <= i'
330 and if i == i', j <= j'
331
332 In other words, of all maximal matching blocks, return one that
333 starts earliest in a, and of all those maximal matching blocks that
334 start earliest in a, return the one that starts earliest in b.
335
336 >>> s = SequenceMatcher(None, " abcd", "abcd abcd")
337 >>> s.find_longest_match(0, 5, 0, 9)
338 (0, 4, 5)
339
340 If isjunk is defined, first the longest matching block is
341 determined as above, but with the additional restriction that no
342 junk element appears in the block. Then that block is extended as
343 far as possible by matching (only) junk elements on both sides. So
344 the resulting block never matches on junk except as identical junk
345 happens to be adjacent to an "interesting" match.
346
347 Here's the same example as before, but considering blanks to be
348 junk. That prevents " abcd" from matching the " abcd" at the tail
349 end of the second sequence directly. Instead only the "abcd" can
350 match, and matches the leftmost "abcd" in the second sequence:
351
352 >>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")
353 >>> s.find_longest_match(0, 5, 0, 9)
354 (1, 0, 4)
355
356 If no blocks match, return (alo, blo, 0).
357
358 >>> s = SequenceMatcher(None, "ab", "c")
359 >>> s.find_longest_match(0, 2, 0, 1)
360 (0, 0, 0)
361 """
362
374
375 a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.isbjunk
376 besti, bestj, bestsize = alo, blo, 0
377
378
379
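# Dynamic-programming core: for each i, newj2len[j] is the length of the
# longest matching block ending with a[i] == b[j]; it extends j2len[j-1],
# the best block that ended with a[i-1] == b[j-1], by one element.  The best
# (longest, then leftmost in a, then leftmost in b) block seen so far is
# kept in besti, bestj, bestsize.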
380 j2len = {}
381 nothing = []
382 for i in xrange(alo, ahi):
383
384
385 j2lenget = j2len.get
386 newj2len = {}
387 for j in b2j.get(a[i], nothing):
388
389 if j < blo:
390 continue
391 if j >= bhi:
392 break
393 k = newj2len[j] = j2lenget(j-1, 0) + 1
394 if k > bestsize:
395 besti, bestj, bestsize = i-k+1, j-k+1, k
396 j2len = newj2len
397
404
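# The block found above contains no junk.  Widen it now by absorbing
# element-for-element matches of junk on both ends, so identical junk that
# happens to sit next to an interesting match is reported with it.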
405 while besti > alo and bestj > blo and \
406 isbjunk(b[bestj-1]) and \
407 a[besti-1] == b[bestj-1]:
408 besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
409 while besti+bestsize < ahi and bestj+bestsize < bhi and \
410 isbjunk(b[bestj+bestsize]) and \
411 a[besti+bestsize] == b[bestj+bestsize]:
412 bestsize = bestsize + 1
413
414 return besti, bestj, bestsize
415
416 - def get_matching_blocks(self):
417 """Return list of triples describing matching subsequences.
418
419 Each triple is of the form (i, j, n), and means that
420 a[i:i+n] == b[j:j+n]. The triples are monotonically increasing in
421 i and in j.
422
423 The last triple is a dummy, (len(a), len(b), 0), and is the only
424 triple with n==0.
425
426 >>> s = SequenceMatcher(None, "abxcd", "abcd")
427 >>> s.get_matching_blocks()
428 [(0, 0, 2), (3, 2, 2), (5, 4, 0)]
429 >>>
430 """
431
432 if self.matching_blocks is not None:
433 return self.matching_blocks
434 self.matching_blocks = []
435 la, lb = len(self.a), len(self.b)
436 self.__helper(0, la, 0, lb, self.matching_blocks)
437 self.matching_blocks.append( (la, lb, 0) )
438 return self.matching_blocks
439
440
441
442
443 - def __helper(self, alo, ahi, blo, bhi, answer):
444 i, j, k = x = self.find_longest_match(alo, ahi, blo, bhi)
445
446
447
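# Divide and conquer around the longest match: first recurse on the slices
# to the left of the match, then record the match, then recurse on the
# slices to the right.  Triples are therefore appended in increasing i and j.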
448 if k:
449 if alo < i and blo < j:
450 self.__helper(alo, i, blo, j, answer)
451 answer.append(x)
452 if i+k < ahi and j+k < bhi:
453 self.__helper(i+k, ahi, j+k, bhi, answer)
454
455 - def get_opcodes(self):
456 """
457 Return list of 5-tuples describing how to turn a into b.
458
459 Each tuple is of the form (tag, i1, i2, j1, j2). The first tuple
460 has i1 == j1 == 0, and remaining tuples have i1 == the i2 from the
461 tuple preceding it, and likewise for j1 == the previous j2.
462
463 The tags are strings, with these meanings:
464
465 - 'replace': a[i1:i2] should be replaced by b[j1:j2]
466 - 'delete': a[i1:i2] should be deleted.
467 Note that j1==j2 in this case.
468 - 'insert': b[j1:j2] should be inserted at a[i1:i1].
469 Note that i1==i2 in this case.
470 - 'equal': a[i1:i2] == b[j1:j2]
471
472 >>> a = "qabxcd"
473 >>> b = "abycdf"
474 >>> s = SequenceMatcher(None, a, b)
475 >>> for tag, i1, i2, j1, j2 in s.get_opcodes():
476 ... print ("%7s a[%d:%d] (%s) b[%d:%d] (%s)" %
477 ... (tag, i1, i2, a[i1:i2], j1, j2, b[j1:j2]))
478 delete a[0:1] (q) b[0:0] ()
479 equal a[1:3] (ab) b[0:2] (ab)
480 replace a[3:4] (x) b[2:3] (y)
481 equal a[4:6] (cd) b[3:5] (cd)
482 insert a[6:6] () b[5:6] (f)
483 """
484
485 if self.opcodes is not None:
486 return self.opcodes
487 i = j = 0
488 self.opcodes = answer = []
489 for ai, bj, size in self.get_matching_blocks():
490
491
492
493
494
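# Invariant: the opcodes emitted so far turn a[:i] into b[:j].  Anything
# between the end of the previous matching block and the start of this one
# is a replace, delete or insert; the matching block itself is 'equal'.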
495 tag = ''
496 if i < ai and j < bj:
497 tag = 'replace'
498 elif i < ai:
499 tag = 'delete'
500 elif j < bj:
501 tag = 'insert'
502 if tag:
503 answer.append( (tag, i, ai, j, bj) )
504 i, j = ai+size, bj+size
505
506
507 if size:
508 answer.append( ('equal', ai, i, bj, j) )
509 return answer
510
511 - def ratio(self):
512 """Return a measure of the sequences' similarity (float in [0,1]).
513
514 Where T is the total number of elements in both sequences, and
515 M is the number of matches, this is 2.0*M / T.
516 Note that this is 1 if the sequences are identical, and 0 if
517 they have nothing in common.
518
519 .ratio() is expensive to compute if you haven't already computed
520 .get_matching_blocks() or .get_opcodes(), in which case you may
521 want to try .quick_ratio() or .real_quick_ratio() first to get an
522 upper bound.
523
524 >>> s = SequenceMatcher(None, "abcd", "bcde")
525 >>> s.ratio()
526 0.75
527 >>> s.quick_ratio()
528 0.75
529 >>> s.real_quick_ratio()
530 1.0
531 """
532
533 matches = reduce(lambda sum, triple: sum + triple[-1],
534 self.get_matching_blocks(), 0)
535 return 2.0 * matches / (len(self.a) + len(self.b))
536
537 - def quick_ratio(self):
538 """Return an upper bound on ratio() relatively quickly.
539
540 This isn't defined beyond that it is an upper bound on .ratio(), and
541 is faster to compute.
542 """
543
544
545
546
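# Treat a and b as multisets: fullbcount records how many times each element
# occurs in b, and avail tracks how many of those occurrences are still
# unclaimed while scanning a.  Counting every element of a that can still be
# paired with one in b gives an upper bound on .ratio().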
547 if self.fullbcount is None:
548 self.fullbcount = fullbcount = {}
549 for elt in self.b:
550 fullbcount[elt] = fullbcount.get(elt, 0) + 1
551 fullbcount = self.fullbcount
552
553
554 avail = {}
555 availhas, matches = avail.has_key, 0
556 for elt in self.a:
557 if availhas(elt):
558 numb = avail[elt]
559 else:
560 numb = fullbcount.get(elt, 0)
561 avail[elt] = numb - 1
562 if numb > 0:
563 matches = matches + 1
564 return 2.0 * matches / (len(self.a) + len(self.b))
565
566 - def real_quick_ratio(self):
567 """Return an upper bound on ratio() very quickly.
568
569 This isn't defined beyond that it is an upper bound on .ratio(), and
570 is faster to compute than either .ratio() or .quick_ratio().
571 """
572
573 la, lb = len(self.a), len(self.b)
574
575
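# There can't be more matches than the number of elements in the shorter
# sequence.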
576 return 2.0 * min(la, lb) / (la + lb)
577
578 -def get_close_matches(word, possibilities, n=3, cutoff=0.6):
579 """Use SequenceMatcher to return list of the best "good enough" matches.
580
581 word is a sequence for which close matches are desired (typically a
582 string).
583
584 possibilities is a list of sequences against which to match word
585 (typically a list of strings).
586
587 Optional arg n (default 3) is the maximum number of close matches to
588 return. n must be > 0.
589
590 Optional arg cutoff (default 0.6) is a float in [0, 1]. Possibilities
591 that don't score at least that similar to word are ignored.
592
593 The best (no more than n) matches among the possibilities are returned
594 in a list, sorted by similarity score, most similar first.
595
596 >>> get_close_matches("appel", ["ape", "apple", "peach", "puppy"])
597 ['apple', 'ape']
598 >>> import keyword as _keyword
599 >>> get_close_matches("wheel", _keyword.kwlist)
600 ['while']
601 >>> get_close_matches("apple", _keyword.kwlist)
602 []
603 >>> get_close_matches("accept", _keyword.kwlist)
604 ['except']
605 """
606
607 if not n > 0:
608 raise ValueError("n must be > 0: " + `n`)
609 if not 0.0 <= cutoff <= 1.0:
610 raise ValueError("cutoff must be in [0.0, 1.0]: " + `cutoff`)
611 result = []
612 s = SequenceMatcher()
613 s.set_seq2(word)
614 for x in possibilities:
615 s.set_seq1(x)
616 if s.real_quick_ratio() >= cutoff and \
617 s.quick_ratio() >= cutoff and \
618 s.ratio() >= cutoff:
619 result.append((s.ratio(), x))
620
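# Sort ascending by score, keep only the best n, then reverse so the most
# similar possibility comes first.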
621 result.sort()
622
623 result = result[-n:]
624
625 result.reverse()
626
627 return [x for score, x in result]
628
629
630 -def _count_leading(line, ch):
631 """
632 Return number of `ch` characters at the start of `line`.
633
634 Example:
635
636 >>> _count_leading(' abc', ' ')
637 3
638 """
639
640 i, n = 0, len(line)
641 while i < n and line[i] == ch:
642 i += 1
643 return i
644
645 -class Differ:
646 r"""
647 Differ is a class for comparing sequences of lines of text, and
648 producing human-readable differences or deltas. Differ uses
649 SequenceMatcher both to compare sequences of lines, and to compare
650 sequences of characters within similar (near-matching) lines.
651
652 Each line of a Differ delta begins with a two-letter code::
653
654 '- ' line unique to sequence 1
655 '+ ' line unique to sequence 2
656 ' ' line common to both sequences
657 '? ' line not present in either input sequence
658
659 Lines beginning with '? ' attempt to guide the eye to intraline
660 differences, and were not present in either input sequence. These lines
661 can be confusing if the sequences contain tab characters.
662
663 Note that Differ makes no claim to produce a *minimal* diff. To the
664 contrary, minimal diffs are often counter-intuitive, because they synch
665 up anywhere possible, sometimes on accidental matches 100 pages apart.
666 Restricting synch points to contiguous matches preserves some notion of
667 locality, at the occasional cost of producing a longer diff.
668
669 Example: Comparing two texts.
670
671 First we set up the texts, sequences of individual single-line strings
672 ending with newlines (such sequences can also be obtained from the
673 `readlines()` method of file-like objects):
674
675 >>> text1 = ''' 1. Beautiful is better than ugly.
676 ... 2. Explicit is better than implicit.
677 ... 3. Simple is better than complex.
678 ... 4. Complex is better than complicated.
679 ... '''.splitlines(1)
680 >>> len(text1)
681 4
682 >>> text1[0][-1]
683 '\n'
684 >>> text2 = ''' 1. Beautiful is better than ugly.
685 ... 3. Simple is better than complex.
686 ... 4. Complicated is better than complex.
687 ... 5. Flat is better than nested.
688 ... '''.splitlines(1)
689
690 Next we instantiate a Differ object:
691
692 >>> d = Differ()
693
694 Note that when instantiating a Differ object we may pass functions to
695 filter out line and character 'junk'. See Differ.__init__ for details.
696
697 Finally, we compare the two:
698
699 >>> result = list(d.compare(text1, text2))
700
701 'result' is a list of strings, so let's pretty-print it:
702
703 >>> from pprint import pprint as _pprint
704 >>> _pprint(result)
705 [' 1. Beautiful is better than ugly.\n',
706 '- 2. Explicit is better than implicit.\n',
707 '- 3. Simple is better than complex.\n',
708 '+ 3. Simple is better than complex.\n',
709 '? ++\n',
710 '- 4. Complex is better than complicated.\n',
711 '? ^ ---- ^\n',
712 '+ 4. Complicated is better than complex.\n',
713 '? ++++ ^ ^\n',
714 '+ 5. Flat is better than nested.\n']
715
716 As a single multi-line string it looks like this:
717
718 >>> print ''.join(result),
719 1. Beautiful is better than ugly.
720 - 2. Explicit is better than implicit.
721 - 3. Simple is better than complex.
722 + 3. Simple is better than complex.
723 ? ++
724 - 4. Complex is better than complicated.
725 ? ^ ---- ^
726 + 4. Complicated is better than complex.
727 ? ++++ ^ ^
728 + 5. Flat is better than nested.
729
730 Methods::
731
732 __init__(linejunk=None, charjunk=None)
733 Construct a text differencer, with optional filters.
734
735 compare(a, b)
736 Compare two sequences of lines; generate the resulting delta.
737 """
738
739 - def __init__(self, linejunk=None, charjunk=None):
740 """
741 Construct a text differencer, with optional filters.
742
743 The two optional keyword parameters are for filter functions:
744
745 - `linejunk`: A function that should accept a single string argument,
746 and return true iff the string is junk. The module-level function
747 `IS_LINE_JUNK` may be used to filter out lines without visible
748 characters, except for at most one splat ('#').
749
750 - `charjunk`: A function that should accept a string of length 1. The
751 module-level function `IS_CHARACTER_JUNK` may be used to filter out
752 whitespace characters (a blank or tab; **note**: bad idea to include
753 newline in this!).
754 """
755
756 self.linejunk = linejunk
757 self.charjunk = charjunk
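# Construction sketch using the module-level junk filters (both arguments are
# optional and default to None, i.e. nothing is treated as junk; `lines1` and
# `lines2` are illustrative names):
#
#     d = Differ(linejunk=IS_LINE_JUNK, charjunk=IS_CHARACTER_JUNK)
#     delta = list(d.compare(lines1, lines2))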
758
759 - def compare(self, a, b):
760 r"""
761 Compare two sequences of lines; generate the resulting delta.
762
763 Each sequence must contain individual single-line strings ending with
764 newlines. Such sequences can be obtained from the `readlines()` method
765 of file-like objects. The delta generated also consists of newline-
766 terminated strings, ready to be printed as-is via the writelines()
767 method of a file-like object.
768
769 Example:
770
771 >>> print ''.join(Differ().compare('one\ntwo\nthree\n'.splitlines(1),
772 ... 'ore\ntree\nemu\n'.splitlines(1))),
773 - one
774 ? ^
775 + ore
776 ? ^
777 - two
778 - three
779 ? -
780 + tree
781 + emu
782 """
783
784 cruncher = SequenceMatcher(self.linejunk, a, b)
785 for tag, alo, ahi, blo, bhi in cruncher.get_opcodes():
786 if tag == 'replace':
787 g = self._fancy_replace(a, alo, ahi, b, blo, bhi)
788 elif tag == 'delete':
789 g = self._dump('-', a, alo, ahi)
790 elif tag == 'insert':
791 g = self._dump('+', b, blo, bhi)
792 elif tag == 'equal':
793 g = self._dump(' ', a, alo, ahi)
794 else:
795 raise ValueError, 'unknown tag ' + `tag`
796
797 for line in g:
798 yield line
799
800 - def _dump(self, tag, x, lo, hi):
801 """Generate comparison results for a same-tagged range."""
802 for i in xrange(lo, hi):
803 yield '%s %s' % (tag, x[i])
804
805 - def _plain_replace(self, a, alo, ahi, b, blo, bhi):
806 assert alo < ahi and blo < bhi
807
808
809 if bhi - blo < ahi - alo:
810 first = self._dump('+', b, blo, bhi)
811 second = self._dump('-', a, alo, ahi)
812 else:
813 first = self._dump('-', a, alo, ahi)
814 second = self._dump('+', b, blo, bhi)
815
816 for g in first, second:
817 for line in g:
818 yield line
819
820 - def _fancy_replace(self, a, alo, ahi, b, blo, bhi):
821 r"""
822 When replacing one block of lines with another, search the blocks
823 for *similar* lines; the best-matching pair (if any) is used as a
824 synch point, and intraline difference marking is done on the
825 similar pair. Lots of work, but often worth it.
826
827 Example:
828
829 >>> d = Differ()
830 >>> results = d._fancy_replace(['abcDefghiJkl\n'], 0, 1, ['abcdefGhijkl\n'], 0, 1)
831 >>> print ''.join(results),
832 - abcDefghiJkl
833 ? ^ ^ ^
834 + abcdefGhijkl
835 ? ^ ^ ^
836 """
837
838
839
840 best_ratio, cutoff = 0.74, 0.75
841 cruncher = SequenceMatcher(self.charjunk)
842 eqi, eqj = None, None
843
844
845
846
847 for j in xrange(blo, bhi):
848 bj = b[j]
849 cruncher.set_seq2(bj)
850 for i in xrange(alo, ahi):
851 ai = a[i]
852 if ai == bj:
853 if eqi is None:
854 eqi, eqj = i, j
855 continue
856 cruncher.set_seq1(ai)
857
858
859
860
861
862
863 if cruncher.real_quick_ratio() > best_ratio and \
864 cruncher.quick_ratio() > best_ratio and \
865 cruncher.ratio() > best_ratio:
866 best_ratio, best_i, best_j = cruncher.ratio(), i, j
867 if best_ratio < cutoff:
868
869 if eqi is None:
870
871 for line in self._plain_replace(a, alo, ahi, b, blo, bhi):
872 yield line
873 return
874
875 best_i, best_j, best_ratio = eqi, eqj, 1.0
876 else:
877
878 eqi = None
879
880
881
882
883
884 for line in self._fancy_helper(a, alo, best_i, b, blo, best_j):
885 yield line
886
887
888 aelt, belt = a[best_i], b[best_j]
889 if eqi is None:
890
891 atags = btags = ""
892 cruncher.set_seqs(aelt, belt)
893 for tag, ai1, ai2, bj1, bj2 in cruncher.get_opcodes():
894 la, lb = ai2 - ai1, bj2 - bj1
895 if tag == 'replace':
896 atags += '^' * la
897 btags += '^' * lb
898 elif tag == 'delete':
899 atags += '-' * la
900 elif tag == 'insert':
901 btags += '+' * lb
902 elif tag == 'equal':
903 atags += ' ' * la
904 btags += ' ' * lb
905 else:
906 raise ValueError, 'unknown tag ' + `tag`
907 for line in self._qformat(aelt, belt, atags, btags):
908 yield line
909 else:
910
911 yield ' ' + aelt
912
913
914 for line in self._fancy_helper(a, best_i+1, ahi, b, best_j+1, bhi):
915 yield line
916
917 - def _fancy_helper(self, a, alo, ahi, b, blo, bhi):
918 g = []
919 if alo < ahi:
920 if blo < bhi:
921 g = self._fancy_replace(a, alo, ahi, b, blo, bhi)
922 else:
923 g = self._dump('-', a, alo, ahi)
924 elif blo < bhi:
925 g = self._dump('+', b, blo, bhi)
926
927 for line in g:
928 yield line
929
979 import re
980
981 -def IS_LINE_JUNK(line, pat=re.compile(r"\s*#?\s*$").match):
982 r"""
983 Return 1 for ignorable line: iff `line` is blank or contains a single '#'.
984
985 Examples:
986
987 >>> IS_LINE_JUNK('\n')
988 1
989 >>> IS_LINE_JUNK(' # \n')
990 1
991 >>> IS_LINE_JUNK('hello\n')
992 0
993 """
994
995 return pat(line) is not None
996
997 -def IS_CHARACTER_JUNK(ch, ws=" \t"):
998 r"""
999 Return 1 for ignorable character: iff `ch` is a space or tab.
1000
1001 Examples:
1002
1003 >>> IS_CHARACTER_JUNK(' ')
1004 1
1005 >>> IS_CHARACTER_JUNK('\t')
1006 1
1007 >>> IS_CHARACTER_JUNK('\n')
1008 0
1009 >>> IS_CHARACTER_JUNK('x')
1010 0
1011 """
1012
1013 return ch in ws
1014
1015 del re
1016
1017 -def ndiff(a, b, linejunk=IS_LINE_JUNK, charjunk=IS_CHARACTER_JUNK):
1018 r"""
1019 Compare `a` and `b` (lists of strings); return a `Differ`-style delta.
1020
1021 Optional keyword parameters `linejunk` and `charjunk` are for filter
1022 functions (or None):
1023
1024 - linejunk: A function that should accept a single string argument, and
1025 return true iff the string is junk. The default is module-level function
1026 IS_LINE_JUNK, which filters out lines without visible characters, except
1027 for at most one splat ('#').
1028
1029 - charjunk: A function that should accept a string of length 1. The
1030 default is module-level function IS_CHARACTER_JUNK, which filters out
1031 whitespace characters (a blank or tab; note: bad idea to include newline
1032 in this!).
1033
1034 Tools/scripts/ndiff.py is a command-line front-end to this function.
1035
1036 Example:
1037
1038 >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(1),
1039 ... 'ore\ntree\nemu\n'.splitlines(1))
1040 >>> print ''.join(diff),
1041 - one
1042 ? ^
1043 + ore
1044 ? ^
1045 - two
1046 - three
1047 ? -
1048 + tree
1049 + emu
1050 """
1051 return Differ(linejunk, charjunk).compare(a, b)
1052
1053 -def restore(delta, which):
1054 r"""
1055 Generate one of the two sequences that generated a delta.
1056
1057 Given a `delta` produced by `Differ.compare()` or `ndiff()`, extract
1058 lines originating from file 1 or 2 (parameter `which`), stripping off line
1059 prefixes.
1060
1061 Examples:
1062
1063 >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(1),
1064 ... 'ore\ntree\nemu\n'.splitlines(1))
1065 >>> diff = list(diff)
1066 >>> print ''.join(restore(diff, 1)),
1067 one
1068 two
1069 three
1070 >>> print ''.join(restore(diff, 2)),
1071 ore
1072 tree
1073 emu
1074 """
1075 try:
1076 tag = {1: "- ", 2: "+ "}[int(which)]
1077 except KeyError:
1078 raise ValueError, ('unknown delta choice (must be 1 or 2): %r'
1079 % which)
1080 prefixes = (" ", tag)
1081 for line in delta:
1082 if line[:2] in prefixes:
1083 yield line[2:]
1084
1085
1086
1087 import Biskit.test as BT
1088
1089 -class Test(BT.BiskitTest):
1090 """Mock test"""
1091 pass
1092
1093
1094 -def _test():
1095 import doctest, difflib
1096 return doctest.testmod(difflib)
1097
1098 if __name__ == "__main__":
1099 _test()
1100