1
2 """
3 Older version of difflib, kept here because of compatibility problems.
4 """
5
6 from __future__ import generators
7
8 """
9 Module difflib -- helpers for computing deltas between objects.
10
11 Function get_close_matches(word, possibilities, n=3, cutoff=0.6):
12 Use SequenceMatcher to return list of the best "good enough" matches.
13
14 Function ndiff(a, b):
15 Return a delta: the difference between `a` and `b` (lists of strings).
16
17 Function restore(delta, which):
18 Return one of the two sequences that generated an ndiff delta.
19
20 Class SequenceMatcher:
21 A flexible class for comparing pairs of sequences of any type.
22
23 Class Differ:
24 For producing human-readable deltas from sequences of lines of text.
25 """
26
27 __all__ = ['get_close_matches', 'ndiff', 'restore', 'SequenceMatcher',
28 'Differ', 'IS_CHARACTER_JUNK', 'IS_LINE_JUNK']
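# Added usage sketch (not part of the original module): a compact demo of the
# module-level helpers documented above.  The function name `_usage_example`
# is illustrative only.
def _usage_example():
    """Exercise get_close_matches(), ndiff() and restore() together."""
    # Fuzzy word lookup.
    print get_close_matches("appel", ["ape", "apple", "peach", "puppy"])

    # Line-by-line delta between two small texts, then recover input 1.
    a = 'one\ntwo\nthree\n'.splitlines(1)
    b = 'ore\ntree\nemu\n'.splitlines(1)
    delta = list(ndiff(a, b))
    print ''.join(delta),
    print ''.join(restore(delta, 1)),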
29
30
31 -class SequenceMatcher:
32 """
33 SequenceMatcher is a flexible class for comparing pairs of sequences of
34 any type, so long as the sequence elements are hashable. The basic
35 algorithm predates, and is a little fancier than, an algorithm
36 published in the late 1980's by Ratcliff and Obershelp under the
37 hyperbolic name "gestalt pattern matching". The basic idea is to find
38 the longest contiguous matching subsequence that contains no "junk"
39 elements (R-O doesn't address junk). The same idea is then applied
40 recursively to the pieces of the sequences to the left and to the right
41 of the matching subsequence. This does not yield minimal edit
42 sequences, but does tend to yield matches that "look right" to people.
43
44 SequenceMatcher tries to compute a "human-friendly diff" between two
45 sequences. Unlike e.g. UNIX(tm) diff, the fundamental notion is the
46 longest *contiguous* & junk-free matching subsequence. That's what
47 catches people's eyes. The Windows(tm) windiff has another interesting
48 notion, pairing up elements that appear uniquely in each sequence.
49 That, and the method here, appear to yield more intuitive difference
50 reports than does diff. This method appears to be the least vulnerable
51 to synching up on blocks of "junk lines", though (like blank lines in
52 ordinary text files, or maybe "<P>" lines in HTML files). That may be
53 because this is the only method of the 3 that has a *concept* of
54 "junk" <wink>.
55
56 Example, comparing two strings, and considering blanks to be "junk":
57
58 >>> s = SequenceMatcher(lambda x: x == " ",
59 ... "private Thread currentThread;",
60 ... "private volatile Thread currentThread;")
61 >>>
62
63 .ratio() returns a float in [0, 1], measuring the "similarity" of the
64 sequences. As a rule of thumb, a .ratio() value over 0.6 means the
65 sequences are close matches:
66
67 >>> print round(s.ratio(), 3)
68 0.866
69 >>>
70
71 If you're only interested in where the sequences match,
72 .get_matching_blocks() is handy:
73
74 >>> for block in s.get_matching_blocks():
75 ...     print "a[%d] and b[%d] match for %d elements" % block
76 a[0] and b[0] match for 8 elements
77 a[8] and b[17] match for 6 elements
78 a[14] and b[23] match for 15 elements
79 a[29] and b[38] match for 0 elements
80 >>>
81
82 Note that the last tuple returned by .get_matching_blocks() is always a
83 dummy, (len(a), len(b), 0), and this is the only case in which the last
84 tuple element (number of elements matched) is 0.
85
86 If you want to know how to change the first sequence into the second,
87 use .get_opcodes():
88
89 >>> for opcode in s.get_opcodes():
90 ... print "%6s a[%d:%d] b[%d:%d]" % opcode
91 equal a[0:8] b[0:8]
92 insert a[8:8] b[8:17]
93 equal a[8:14] b[17:23]
94 equal a[14:29] b[23:38]
95
96 See the Differ class for a fancy human-friendly file differencer, which
97 uses SequenceMatcher both to compare sequences of lines, and to compare
98 sequences of characters within similar (near-matching) lines.
99
100 See also function get_close_matches() in this module, which shows how
101 simple code building on SequenceMatcher can be used to do useful work.
102
103 Timing: Basic R-O is cubic time worst case and quadratic time expected
104 case. SequenceMatcher is quadratic time for the worst case and has
105 expected-case behavior dependent in a complicated way on how many
106 elements the sequences have in common; best case time is linear.
107
108 Methods::
109
110 __init__(isjunk=None, a='', b='')
111 Construct a SequenceMatcher.
112
113 set_seqs(a, b)
114 Set the two sequences to be compared.
115
116 set_seq1(a)
117 Set the first sequence to be compared.
118
119 set_seq2(b)
120 Set the second sequence to be compared.
121
122 find_longest_match(alo, ahi, blo, bhi)
123 Find longest matching block in a[alo:ahi] and b[blo:bhi].
124
125 get_matching_blocks()
126 Return list of triples describing matching subsequences.
127
128 get_opcodes()
129 Return list of 5-tuples describing how to turn a into b.
130
131 ratio()
132 Return a measure of the sequences' similarity (float in [0,1]).
133
134 quick_ratio()
135 Return an upper bound on .ratio() relatively quickly.
136
137 real_quick_ratio()
138 Return an upper bound on ratio() very quickly.
139 """
140
141 - def __init__(self, isjunk=None, a='', b=''):
142 """Construct a SequenceMatcher.
143
144 Optional arg isjunk is None (the default), or a one-argument
145 function that takes a sequence element and returns true iff the
146 element is junk. None is equivalent to passing "lambda x: 0", i.e.
147 no elements are considered to be junk. For example, pass
148 lambda x: x in " \\t"
149 if you're comparing lines as sequences of characters, and don't
150 want to synch up on blanks or hard tabs.
151
152 Optional arg a is the first of two sequences to be compared. By
153 default, an empty string. The elements of a must be hashable. See
154 also .set_seqs() and .set_seq1().
155
156 Optional arg b is the second of two sequences to be compared. By
157 default, an empty string. The elements of b must be hashable. See
158 also .set_seqs() and .set_seq2().
159 """
160
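# Quick reference for the instance attributes set up here and by the methods
# below (summary added for readability; details are in the method docstrings):
#   a, b             the two sequences being compared
#   isjunk           the user-supplied junk test, or None
#   b2j              dict mapping each non-junk element of b to the list of
#                    indices where it appears in b (built by __chain_b)
#   isbjunk          fast membership test for junk elements of b
#   fullbcount       element -> number of occurrences in b; built lazily and
#                    used only by quick_ratio()
#   matching_blocks  cached result of get_matching_blocks(), or None
#   opcodes          cached result of get_opcodes(), or None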
198 self.isjunk = isjunk
199 self.a = self.b = None
200 self.set_seqs(a, b)
201
202 - def set_seqs(self, a, b):
203 """Set the two sequences to be compared.
204
205 >>> s = SequenceMatcher()
206 >>> s.set_seqs("abcd", "bcde")
207 >>> s.ratio()
208 0.75
209 """
210
211 self.set_seq1(a)
212 self.set_seq2(b)
213
214 - def set_seq1(self, a):
215 """Set the first sequence to be compared.
216
217 The second sequence to be compared is not changed.
218
219 >>> s = SequenceMatcher(None, "abcd", "bcde")
220 >>> s.ratio()
221 0.75
222 >>> s.set_seq1("bcde")
223 >>> s.ratio()
224 1.0
225 >>>
226
227 SequenceMatcher computes and caches detailed information about the
228 second sequence, so if you want to compare one sequence S against
229 many sequences, use .set_seq2(S) once and call .set_seq1(x)
230 repeatedly for each of the other sequences.
231
232 See also set_seqs() and set_seq2().
233 """
234
235 if a is self.a:
236 return
237 self.a = a
238 self.matching_blocks = self.opcodes = None
239
240 - def set_seq2(self, b):
241 """Set the second sequence to be compared.
242
243 The first sequence to be compared is not changed.
244
245 >>> s = SequenceMatcher(None, "abcd", "bcde")
246 >>> s.ratio()
247 0.75
248 >>> s.set_seq2("abcd")
249 >>> s.ratio()
250 1.0
251 >>>
252
253 SequenceMatcher computes and caches detailed information about the
254 second sequence, so if you want to compare one sequence S against
255 many sequences, use .set_seq2(S) once and call .set_seq1(x)
256 repeatedly for each of the other sequences.
257
258 See also set_seqs() and set_seq1().
259 """
260
261 if b is self.b:
262 return
263 self.b = b
264 self.matching_blocks = self.opcodes = None
265 self.fullbcount = None
266 self.__chain_b()
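# Usage sketch for the caching behaviour described above: to compare one
# target against many candidates, fix the target with set_seq2() once and
# swap only the cheap side with set_seq1().  (`target`, `candidates` and
# `handle_match` below are illustrative names, not part of this module.)
#
#     m = SequenceMatcher(None)
#     m.set_seq2(target)        # analyzed once, cached
#     for cand in candidates:
#         m.set_seq1(cand)      # cheap to change
#         if m.ratio() >= 0.6:
#             handle_match(cand)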
267
278
279 - def __chain_b(self):
280
289
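# __chain_b() builds the b2j index: for each non-junk element x of b, b2j[x]
# is the increasing list of indices at which x occurs in b.  Elements flagged
# by the isjunk test are pulled out into junkdict, so find_longest_match()
# never *starts* a match on junk; isbjunk is a fast membership test for them.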
290 b = self.b
291 self.b2j = b2j = {}
292 self.b2jhas = b2jhas = b2j.has_key
293 for i in xrange(len(b)):
294 elt = b[i]
295 if b2jhas(elt):
296 b2j[elt].append(i)
297 else:
298 b2j[elt] = [i]
299
300
301
302
303
304 isjunk, junkdict = self.isjunk, {}
305 if isjunk:
306 for elt in b2j.keys():
307 if isjunk(elt):
308 junkdict[elt] = 1
309 del b2j[elt]
310
311
312
313
314
315
316 self.isbjunk = junkdict.has_key
318 - def find_longest_match(self, alo, ahi, blo, bhi):
317
319 """
320 Find longest matching block in a[alo:ahi] and b[blo:bhi].
321
322 If isjunk is not defined::
323
324 Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
325 alo <= i <= i+k <= ahi
326 blo <= j <= j+k <= bhi
327 and for all (i',j',k') meeting those conditions,
328 k >= k'
329 i <= i'
330 and if i == i', j <= j'
331
332 In other words, of all maximal matching blocks, return one that
333 starts earliest in a, and of all those maximal matching blocks that
334 start earliest in a, return the one that starts earliest in b.
335
336 >>> s = SequenceMatcher(None, " abcd", "abcd abcd")
337 >>> s.find_longest_match(0, 5, 0, 9)
338 (0, 4, 5)
339
340 If isjunk is defined, first the longest matching block is
341 determined as above, but with the additional restriction that no
342 junk element appears in the block. Then that block is extended as
343 far as possible by matching (only) junk elements on both sides. So
344 the resulting block never matches on junk except as identical junk
345 happens to be adjacent to an "interesting" match.
346
347 Here's the same example as before, but considering blanks to be
348 junk. That prevents " abcd" from matching the " abcd" at the tail
349 end of the second sequence directly. Instead only the "abcd" can
350 match, and matches the leftmost "abcd" in the second sequence:
351
352 >>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")
353 >>> s.find_longest_match(0, 5, 0, 9)
354 (1, 0, 4)
355
356 If no blocks match, return (alo, blo, 0).
357
358 >>> s = SequenceMatcher(None, "ab", "c")
359 >>> s.find_longest_match(0, 2, 0, 1)
360 (0, 0, 0)
361 """
362
374
375 a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.isbjunk
376 besti, bestj, bestsize = alo, blo, 0
377
378
379
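# Dynamic-programming core: for each i, newj2len[j] is the length of the
# longest matching block ending with a[i] == b[j]; it extends j2len[j-1],
# the best block that ended with a[i-1] == b[j-1], by one element.  The best
# (longest, then leftmost in a, then leftmost in b) block seen so far is
# kept in besti, bestj, bestsize.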
380 j2len = {}
381 nothing = []
382 for i in xrange(alo, ahi):
383
384
385 j2lenget = j2len.get
386 newj2len = {}
387 for j in b2j.get(a[i], nothing):
388
389 if j < blo:
390 continue
391 if j >= bhi:
392 break
393 k = newj2len[j] = j2lenget(j-1, 0) + 1
394 if k > bestsize:
395 besti, bestj, bestsize = i-k+1, j-k+1, k
396 j2len = newj2len
397
404
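# The block found above contains no junk.  Widen it now by absorbing
# element-for-element matches of junk on both ends, so identical junk that
# happens to sit next to an interesting match is reported with it.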
405 while besti > alo and bestj > blo and \
406 isbjunk(b[bestj-1]) and \
407 a[besti-1] == b[bestj-1]:
408 besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
409 while besti+bestsize < ahi and bestj+bestsize < bhi and \
410 isbjunk(b[bestj+bestsize]) and \
411 a[besti+bestsize] == b[bestj+bestsize]:
412 bestsize = bestsize + 1
413
414 return besti, bestj, bestsize
415
416 - def get_matching_blocks(self):
417 """Return list of triples describing matching subsequences.
418
419 Each triple is of the form (i, j, n), and means that
420 a[i:i+n] == b[j:j+n]. The triples are monotonically increasing in
421 i and in j.
422
423 The last triple is a dummy, (len(a), len(b), 0), and is the only
424 triple with n==0.
425
426 >>> s = SequenceMatcher(None, "abxcd", "abcd")
427 >>> s.get_matching_blocks()
428 [(0, 0, 2), (3, 2, 2), (5, 4, 0)]
429 >>>
430 """
431
432 if self.matching_blocks is not None:
433 return self.matching_blocks
434 self.matching_blocks = []
435 la, lb = len(self.a), len(self.b)
436 self.__helper(0, la, 0, lb, self.matching_blocks)
437 self.matching_blocks.append( (la, lb, 0) )
438 return self.matching_blocks
439
440
441
442
443 - def __helper(self, alo, ahi, blo, bhi, answer):
444 i, j, k = x = self.find_longest_match(alo, ahi, blo, bhi)
445
446
447
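# Divide and conquer around the longest match: first recurse on the slices
# to the left of the match, then record the match, then recurse on the
# slices to the right.  Triples are therefore appended in increasing i and j.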
448 if k:
449 if alo < i and blo < j:
450 self.__helper(alo, i, blo, j, answer)
451 answer.append(x)
452 if i+k < ahi and j+k < bhi:
453 self.__helper(i+k, ahi, j+k, bhi, answer)
454
455 - def get_opcodes(self):
456 """
457 Return list of 5-tuples describing how to turn a into b.
458
459 Each tuple is of the form (tag, i1, i2, j1, j2). The first tuple
460 has i1 == j1 == 0, and remaining tuples have i1 == the i2 from the
461 tuple preceding it, and likewise for j1 == the previous j2.
462
463 The tags are strings, with these meanings:
464
465 - 'replace': a[i1:i2] should be replaced by b[j1:j2]
466 - 'delete': a[i1:i2] should be deleted.
467 Note that j1==j2 in this case.
468 - 'insert': b[j1:j2] should be inserted at a[i1:i1].
469 Note that i1==i2 in this case.
470 - 'equal': a[i1:i2] == b[j1:j2]
471
472 >>> a = "qabxcd"
473 >>> b = "abycdf"
474 >>> s = SequenceMatcher(None, a, b)
475 >>> for tag, i1, i2, j1, j2 in s.get_opcodes():
476 ... print ("%7s a[%d:%d] (%s) b[%d:%d] (%s)" %
477 ... (tag, i1, i2, a[i1:i2], j1, j2, b[j1:j2]))
478 delete a[0:1] (q) b[0:0] ()
479 equal a[1:3] (ab) b[0:2] (ab)
480 replace a[3:4] (x) b[2:3] (y)
481 equal a[4:6] (cd) b[3:5] (cd)
482 insert a[6:6] () b[5:6] (f)
483 """
484
485 if self.opcodes is not None:
486 return self.opcodes
487 i = j = 0
488 self.opcodes = answer = []
489 for ai, bj, size in self.get_matching_blocks():
490
491
492
493
494
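# Invariant: the opcodes emitted so far turn a[:i] into b[:j].  Anything
# between the end of the previous matching block and the start of this one
# is a replace, delete or insert; the matching block itself is 'equal'.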
495 tag = ''
496 if i < ai and j < bj:
497 tag = 'replace'
498 elif i < ai:
499 tag = 'delete'
500 elif j < bj:
501 tag = 'insert'
502 if tag:
503 answer.append( (tag, i, ai, j, bj) )
504 i, j = ai+size, bj+size
505
506
507 if size:
508 answer.append( ('equal', ai, i, bj, j) )
509 return answer
510
511 - def ratio(self):
512 """Return a measure of the sequences' similarity (float in [0,1]).
513
514 Where T is the total number of elements in both sequences, and
515 M is the number of matches, this is 2.0*M / T.
516 Note that this is 1 if the sequences are identical, and 0 if
517 they have nothing in common.
518
519 .ratio() is expensive to compute if you haven't already computed
520 .get_matching_blocks() or .get_opcodes(), in which case you may
521 want to try .quick_ratio() or .real_quick_ratio() first to get an
522 upper bound.
523
524 >>> s = SequenceMatcher(None, "abcd", "bcde")
525 >>> s.ratio()
526 0.75
527 >>> s.quick_ratio()
528 0.75
529 >>> s.real_quick_ratio()
530 1.0
531 """
532
533 matches = reduce(lambda sum, triple: sum + triple[-1],
534 self.get_matching_blocks(), 0)
535 return 2.0 * matches / (len(self.a) + len(self.b))
536
537 - def quick_ratio(self):
538 """Return an upper bound on ratio() relatively quickly.
539
540 This isn't defined beyond that it is an upper bound on .ratio(), and
541 is faster to compute.
542 """
543
544
545
546
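# Treat a and b as multisets: fullbcount records how many times each element
# occurs in b, and avail tracks how many of those occurrences are still
# unclaimed while scanning a.  Counting every element of a that can still be
# paired with one in b gives an upper bound on .ratio().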
547 if self.fullbcount is None:
548 self.fullbcount = fullbcount = {}
549 for elt in self.b:
550 fullbcount[elt] = fullbcount.get(elt, 0) + 1
551 fullbcount = self.fullbcount
552
553
554 avail = {}
555 availhas, matches = avail.has_key, 0
556 for elt in self.a:
557 if availhas(elt):
558 numb = avail[elt]
559 else:
560 numb = fullbcount.get(elt, 0)
561 avail[elt] = numb - 1
562 if numb > 0:
563 matches = matches + 1
564 return 2.0 * matches / (len(self.a) + len(self.b))
565
566 - def real_quick_ratio(self):
567 """Return an upper bound on ratio() very quickly.
568
569 This isn't defined beyond that it is an upper bound on .ratio(), and
570 is faster to compute than either .ratio() or .quick_ratio().
571 """
572
573 la, lb = len(self.a), len(self.b)
574
575
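# There can't be more matches than the number of elements in the shorter
# sequence.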
576 return 2.0 * min(la, lb) / (la + lb)
577
578 -def get_close_matches(word, possibilities, n=3, cutoff=0.6):
579 """Use SequenceMatcher to return list of the best "good enough" matches.
580
581 word is a sequence for which close matches are desired (typically a
582 string).
583
584 possibilities is a list of sequences against which to match word
585 (typically a list of strings).
586
587 Optional arg n (default 3) is the maximum number of close matches to
588 return. n must be > 0.
589
590 Optional arg cutoff (default 0.6) is a float in [0, 1]. Possibilities
591 that don't score at least that similar to word are ignored.
592
593 The best (no more than n) matches among the possibilities are returned
594 in a list, sorted by similarity score, most similar first.
595
596 >>> get_close_matches("appel", ["ape", "apple", "peach", "puppy"])
597 ['apple', 'ape']
598 >>> import keyword as _keyword
599 >>> get_close_matches("wheel", _keyword.kwlist)
600 ['while']
601 >>> get_close_matches("apple", _keyword.kwlist)
602 []
603 >>> get_close_matches("accept", _keyword.kwlist)
604 ['except']
605 """
606
607 if not n > 0:
608 raise ValueError("n must be > 0: " + `n`)
609 if not 0.0 <= cutoff <= 1.0:
610 raise ValueError("cutoff must be in [0.0, 1.0]: " + `cutoff`)
611 result = []
612 s = SequenceMatcher()
613 s.set_seq2(word)
614 for x in possibilities:
615 s.set_seq1(x)
616 if s.real_quick_ratio() >= cutoff and \
617 s.quick_ratio() >= cutoff and \
618 s.ratio() >= cutoff:
619 result.append((s.ratio(), x))
620
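# Sort ascending by score, keep only the best n, then reverse so the most
# similar possibility comes first.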
621 result.sort()
622
623 result = result[-n:]
624
625 result.reverse()
626
627 return [x for score, x in result]
628
629
630 -def _count_leading(line, ch):
631 """
632 Return number of `ch` characters at the start of `line`.
633
634 Example:
635
636 >>> _count_leading(' abc', ' ')
637 3
638 """
639
640 i, n = 0, len(line)
641 while i < n and line[i] == ch:
642 i += 1
643 return i
644
645 -class Differ:
646 r"""
647 Differ is a class for comparing sequences of lines of text, and
648 producing human-readable differences or deltas. Differ uses
649 SequenceMatcher both to compare sequences of lines, and to compare
650 sequences of characters within similar (near-matching) lines.
651
652 Each line of a Differ delta begins with a two-letter code::
653
654 '- ' line unique to sequence 1
655 '+ ' line unique to sequence 2
656 ' ' line common to both sequences
657 '? ' line not present in either input sequence
658
659 Lines beginning with '? ' attempt to guide the eye to intraline
660 differences, and were not present in either input sequence. These lines
661 can be confusing if the sequences contain tab characters.
662
663 Note that Differ makes no claim to produce a *minimal* diff. To the
664 contrary, minimal diffs are often counter-intuitive, because they synch
665 up anywhere possible, sometimes on accidental matches 100 pages apart.
666 Restricting synch points to contiguous matches preserves some notion of
667 locality, at the occasional cost of producing a longer diff.
668
669 Example: Comparing two texts.
670
671 First we set up the texts, sequences of individual single-line strings
672 ending with newlines (such sequences can also be obtained from the
673 `readlines()` method of file-like objects):
674
675 >>> text1 = ''' 1. Beautiful is better than ugly.
676 ... 2. Explicit is better than implicit.
677 ... 3. Simple is better than complex.
678 ... 4. Complex is better than complicated.
679 ... '''.splitlines(1)
680 >>> len(text1)
681 4
682 >>> text1[0][-1]
683 '\n'
684 >>> text2 = ''' 1. Beautiful is better than ugly.
685 ... 3. Simple is better than complex.
686 ... 4. Complicated is better than complex.
687 ... 5. Flat is better than nested.
688 ... '''.splitlines(1)
689
690 Next we instantiate a Differ object:
691
692 >>> d = Differ()
693
694 Note that when instantiating a Differ object we may pass functions to
695 filter out line and character 'junk'. See Differ.__init__ for details.
696
697 Finally, we compare the two:
698
699 >>> result = list(d.compare(text1, text2))
700
701 'result' is a list of strings, so let's pretty-print it:
702
703 >>> from pprint import pprint as _pprint
704 >>> _pprint(result)
705 [' 1. Beautiful is better than ugly.\n',
706 '- 2. Explicit is better than implicit.\n',
707 '- 3. Simple is better than complex.\n',
708 '+ 3. Simple is better than complex.\n',
709 '? ++\n',
710 '- 4. Complex is better than complicated.\n',
711 '? ^ ---- ^\n',
712 '+ 4. Complicated is better than complex.\n',
713 '? ++++ ^ ^\n',
714 '+ 5. Flat is better than nested.\n']
715
716 As a single multi-line string it looks like this:
717
718 >>> print ''.join(result),
719 1. Beautiful is better than ugly.
720 - 2. Explicit is better than implicit.
721 - 3. Simple is better than complex.
722 + 3. Simple is better than complex.
723 ? ++
724 - 4. Complex is better than complicated.
725 ? ^ ---- ^
726 + 4. Complicated is better than complex.
727 ? ++++ ^ ^
728 + 5. Flat is better than nested.
729
730 Methods::
731
732 __init__(linejunk=None, charjunk=None)
733 Construct a text differencer, with optional filters.
734
735 compare(a, b)
736 Compare two sequences of lines; generate the resulting delta.
737 """
738
739 - def __init__(self, linejunk=None, charjunk=None):
740 """
741 Construct a text differencer, with optional filters.
742
743 The two optional keyword parameters are for filter functions:
744
745 - `linejunk`: A function that should accept a single string argument,
746 and return true iff the string is junk. The module-level function
747 `IS_LINE_JUNK` may be used to filter out lines without visible
748 characters, except for at most one splat ('#').
749
750 - `charjunk`: A function that should accept a string of length 1. The
751 module-level function `IS_CHARACTER_JUNK` may be used to filter out
752 whitespace characters (a blank or tab; **note**: bad idea to include
753 newline in this!).
754 """
755
756 self.linejunk = linejunk
757 self.charjunk = charjunk
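# Construction sketch using the module-level junk filters (both arguments are
# optional and default to None, i.e. nothing is treated as junk; `lines1` and
# `lines2` are illustrative names):
#
#     d = Differ(linejunk=IS_LINE_JUNK, charjunk=IS_CHARACTER_JUNK)
#     delta = list(d.compare(lines1, lines2))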
758
759 - def compare(self, a, b):
760 r"""
761 Compare two sequences of lines; generate the resulting delta.
762
763 Each sequence must contain individual single-line strings ending with
764 newlines. Such sequences can be obtained from the `readlines()` method
765 of file-like objects. The delta generated also consists of newline-
766 terminated strings, ready to be printed as-is via the writelines()
767 method of a file-like object.
768
769 Example:
770
771 >>> print ''.join(Differ().compare('one\ntwo\nthree\n'.splitlines(1),
772 ... 'ore\ntree\nemu\n'.splitlines(1))),
773 - one
774 ? ^
775 + ore
776 ? ^
777 - two
778 - three
779 ? -
780 + tree
781 + emu
782 """
783
784 cruncher = SequenceMatcher(self.linejunk, a, b)
785 for tag, alo, ahi, blo, bhi in cruncher.get_opcodes():
786 if tag == 'replace':
787 g = self._fancy_replace(a, alo, ahi, b, blo, bhi)
788 elif tag == 'delete':
789 g = self._dump('-', a, alo, ahi)
790 elif tag == 'insert':
791 g = self._dump('+', b, blo, bhi)
792 elif tag == 'equal':
793 g = self._dump(' ', a, alo, ahi)
794 else:
795 raise ValueError, 'unknown tag ' + `tag`
796
797 for line in g:
798 yield line
799
800 - def _dump(self, tag, x, lo, hi):
801 """Generate comparison results for a same-tagged range."""
802 for i in xrange(lo, hi):
803 yield '%s %s' % (tag, x[i])
804
805 - def _plain_replace(self, a, alo, ahi, b, blo, bhi):
806 assert alo < ahi and blo < bhi
807
808
809 if bhi - blo < ahi - alo:
810 first = self._dump('+', b, blo, bhi)
811 second = self._dump('-', a, alo, ahi)
812 else:
813 first = self._dump('-', a, alo, ahi)
814 second = self._dump('+', b, blo, bhi)
815
816 for g in first, second:
817 for line in g:
818 yield line
819
820 - def _fancy_replace(self, a, alo, ahi, b, blo, bhi):
821 r"""
822 When replacing one block of lines with another, search the blocks
823 for *similar* lines; the best-matching pair (if any) is used as a
824 synch point, and intraline difference marking is done on the
825 similar pair. Lots of work, but often worth it.
826
827 Example:
828
829 >>> d = Differ()
830 >>> results = d._fancy_replace(['abcDefghiJkl\n'], 0, 1, ['abcdefGhijkl\n'], 0, 1)
831 >>> print ''.join(results),
832 - abcDefghiJkl
833 ? ^ ^ ^
834 + abcdefGhijkl
835 ? ^ ^ ^
836 """
837
838
839
840 best_ratio, cutoff = 0.74, 0.75
841 cruncher = SequenceMatcher(self.charjunk)
842 eqi, eqj = None, None
843
844
845
846
847 for j in xrange(blo, bhi):
848 bj = b[j]
849 cruncher.set_seq2(bj)
850 for i in xrange(alo, ahi):
851 ai = a[i]
852 if ai == bj:
853 if eqi is None:
854 eqi, eqj = i, j
855 continue
856 cruncher.set_seq1(ai)
857
858
859
860
861
862
863 if cruncher.real_quick_ratio() > best_ratio and \
864 cruncher.quick_ratio() > best_ratio and \
865 cruncher.ratio() > best_ratio:
866 best_ratio, best_i, best_j = cruncher.ratio(), i, j
867 if best_ratio < cutoff:
868
869 if eqi is None:
870
871 for line in self._plain_replace(a, alo, ahi, b, blo, bhi):
872 yield line
873 return
874
875 best_i, best_j, best_ratio = eqi, eqj, 1.0
876 else:
877
878 eqi = None
879
880
881
882
883
884 for line in self._fancy_helper(a, alo, best_i, b, blo, best_j):
885 yield line
886
887
888 aelt, belt = a[best_i], b[best_j]
889 if eqi is None:
890
891 atags = btags = ""
892 cruncher.set_seqs(aelt, belt)
893 for tag, ai1, ai2, bj1, bj2 in cruncher.get_opcodes():
894 la, lb = ai2 - ai1, bj2 - bj1
895 if tag == 'replace':
896 atags += '^' * la
897 btags += '^' * lb
898 elif tag == 'delete':
899 atags += '-' * la
900 elif tag == 'insert':
901 btags += '+' * lb
902 elif tag == 'equal':
903 atags += ' ' * la
904 btags += ' ' * lb
905 else:
906 raise ValueError, 'unknown tag ' + `tag`
907 for line in self._qformat(aelt, belt, atags, btags):
908 yield line
909 else:
910
911 yield ' ' + aelt
912
913
914 for line in self._fancy_helper(a, best_i+1, ahi, b, best_j+1, bhi):
915 yield line
916
917 - def _fancy_helper(self, a, alo, ahi, b, blo, bhi):
918 g = []
919 if alo < ahi:
920 if blo < bhi:
921 g = self._fancy_replace(a, alo, ahi, b, blo, bhi)
922 else:
923 g = self._dump('-', a, alo, ahi)
924 elif blo < bhi:
925 g = self._dump('+', b, blo, bhi)
926
927 for line in g:
928 yield line
929
979 import re
980
981 -def IS_LINE_JUNK(line, pat=re.compile(r"\s*#?\s*$").match):
982 r"""
983 Return 1 for ignorable line: iff `line` is blank or contains a single '#'.
984
985 Examples:
986
987 >>> IS_LINE_JUNK('\n')
988 1
989 >>> IS_LINE_JUNK(' # \n')
990 1
991 >>> IS_LINE_JUNK('hello\n')
992 0
993 """
994
995 return pat(line) is not None
996
997 -def IS_CHARACTER_JUNK(ch, ws=" \t"):
998 r"""
999 Return 1 for ignorable character: iff `ch` is a space or tab.
1000
1001 Examples:
1002
1003 >>> IS_CHARACTER_JUNK(' ')
1004 1
1005 >>> IS_CHARACTER_JUNK('\t')
1006 1
1007 >>> IS_CHARACTER_JUNK('\n')
1008 0
1009 >>> IS_CHARACTER_JUNK('x')
1010 0
1011 """
1012
1013 return ch in ws
1014
1015 del re
1016
1017 -def ndiff(a, b, linejunk=IS_LINE_JUNK, charjunk=IS_CHARACTER_JUNK):
1018 r"""
1019 Compare `a` and `b` (lists of strings); return a `Differ`-style delta.
1020
1021 Optional keyword parameters `linejunk` and `charjunk` are for filter
1022 functions (or None):
1023
1024 - linejunk: A function that should accept a single string argument, and
1025 return true iff the string is junk. The default is module-level function
1026 IS_LINE_JUNK, which filters out lines without visible characters, except
1027 for at most one splat ('#').
1028
1029 - charjunk: A function that should accept a string of length 1. The
1030 default is module-level function IS_CHARACTER_JUNK, which filters out
1031 whitespace characters (a blank or tab; note: bad idea to include newline
1032 in this!).
1033
1034 Tools/scripts/ndiff.py is a command-line front-end to this function.
1035
1036 Example:
1037
1038 >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(1),
1039 ... 'ore\ntree\nemu\n'.splitlines(1))
1040 >>> print ''.join(diff),
1041 - one
1042 ? ^
1043 + ore
1044 ? ^
1045 - two
1046 - three
1047 ? -
1048 + tree
1049 + emu
1050 """
1051 return Differ(linejunk, charjunk).compare(a, b)
1052
1053 -def restore(delta, which):
1054 r"""
1055 Generate one of the two sequences that generated a delta.
1056
1057 Given a `delta` produced by `Differ.compare()` or `ndiff()`, extract
1058 lines originating from file 1 or 2 (parameter `which`), stripping off line
1059 prefixes.
1060
1061 Examples:
1062
1063 >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(1),
1064 ... 'ore\ntree\nemu\n'.splitlines(1))
1065 >>> diff = list(diff)
1066 >>> print ''.join(restore(diff, 1)),
1067 one
1068 two
1069 three
1070 >>> print ''.join(restore(diff, 2)),
1071 ore
1072 tree
1073 emu
1074 """
1075 try:
1076 tag = {1: "- ", 2: "+ "}[int(which)]
1077 except KeyError:
1078 raise ValueError, ('unknown delta choice (must be 1 or 2): %r'
1079 % which)
1080 prefixes = (" ", tag)
1081 for line in delta:
1082 if line[:2] in prefixes:
1083 yield line[2:]
1084
1085
1086
1087 import Biskit.test as BT
1088
1089 -class Test(BT.BiskitTest):
1090 """Mock test"""
1091 pass
1092
1093
1094 -def _test():
1095 import doctest, difflib
1096 return doctest.testmod(difflib)
1097
1098 if __name__ == "__main__":
1099 _test()
1100