[Python-checkins] CVS: python/dist/src/Tools/scripts ndiff.py,1.7,1.8

Tim Peters tim_one@users.sourceforge.net
Sat, 10 Feb 2001 00:00:55 -0800


Update of /cvsroot/python/python/dist/src/Tools/scripts
In directory usw-pr-cvs1:/tmp/cvs-serv20941/python/dist/src/Tools/scripts

Modified Files:
	ndiff.py 
Log Message:
Moved SequenceMatcher from ndiff into new std library module difflib.py.
Guido told me to do this <wink>.
Greatly expanded docstrings, and fleshed out with examples.
New std test.
Added new get_close_matches() function for ESR.
Needs docs, but LaTeXification of the module docstring is all it needs.
\CVS: ----------------------------------------------------------------------


Index: ndiff.py
===================================================================
RCS file: /cvsroot/python/python/dist/src/Tools/scripts/ndiff.py,v
retrieving revision 1.7
retrieving revision 1.8
diff -C2 -r1.7 -r1.8
*** ndiff.py	2001/01/17 08:48:39	1.7
--- ndiff.py	2001/02/10 08:00:53	1.8
***************
*** 98,101 ****
--- 98,103 ----
  # have been in sys.argv[1:] had the cmd-line form been used.
  
+ from difflib import SequenceMatcher
+ 
  import string
  TRACE = 0
***************
*** 111,406 ****
  
  del re
- 
- class SequenceMatcher:
-     def __init__(self, isjunk=None, a='', b=''):
-         # Members:
-         # a
-         #      first sequence
-         # b
-         #      second sequence; differences are computed as "what do
-         #      we need to do to 'a' to change it into 'b'?"
-         # b2j
-         #      for x in b, b2j[x] is a list of the indices (into b)
-         #      at which x appears; junk elements do not appear
-         # b2jhas
-         #      b2j.has_key
-         # fullbcount
-         #      for x in b, fullbcount[x] == the number of times x
-         #      appears in b; only materialized if really needed (used
-         #      only for computing quick_ratio())
-         # matching_blocks
-         #      a list of (i, j, k) triples, where a[i:i+k] == b[j:j+k];
-         #      ascending & non-overlapping in i and in j; terminated by
-         #      a dummy (len(a), len(b), 0) sentinel
-         # opcodes
-         #      a list of (tag, i1, i2, j1, j2) tuples, where tag is
-         #      one of
-         #          'replace'   a[i1:i2] should be replaced by b[j1:j2]
-         #          'delete'    a[i1:i2] should be deleted
-         #          'insert'    b[j1:j2] should be inserted
-         #          'equal'     a[i1:i2] == b[j1:j2]
-         # isjunk
-         #      a user-supplied function taking a sequence element and
-         #      returning true iff the element is "junk" -- this has
-         #      subtle but helpful effects on the algorithm, which I'll
-         #      get around to writing up someday <0.9 wink>.
-         #      DON'T USE!  Only __chain_b uses this.  Use isbjunk.
-         # isbjunk
-         #      for x in b, isbjunk(x) == isjunk(x) but much faster;
-         #      it's really the has_key method of a hidden dict.
-         #      DOES NOT WORK for x in a!
- 
-         self.isjunk = isjunk
-         self.a = self.b = None
-         self.set_seqs(a, b)
- 
-     def set_seqs(self, a, b):
-         self.set_seq1(a)
-         self.set_seq2(b)
- 
-     def set_seq1(self, a):
-         if a is self.a:
-             return
-         self.a = a
-         self.matching_blocks = self.opcodes = None
- 
-     def set_seq2(self, b):
-         if b is self.b:
-             return
-         self.b = b
-         self.matching_blocks = self.opcodes = None
-         self.fullbcount = None
-         self.__chain_b()
- 
-     # For each element x in b, set b2j[x] to a list of the indices in
-     # b where x appears; the indices are in increasing order; note that
-     # the number of times x appears in b is len(b2j[x]) ...
-     # when self.isjunk is defined, junk elements don't show up in this
-     # map at all, which stops the central find_longest_match method
-     # from starting any matching block at a junk element ...
-     # also creates the fast isbjunk function ...
-     # note that this is only called when b changes; so for cross-product
-     # kinds of matches, it's best to call set_seq2 once, then set_seq1
-     # repeatedly
- 
-     def __chain_b(self):
-         # Because isjunk is a user-defined (not C) function, and we test
-         # for junk a LOT, it's important to minimize the number of calls.
-         # Before the tricks described here, __chain_b was by far the most
-         # time-consuming routine in the whole module!  If anyone sees
-         # Jim Roskind, thank him again for profile.py -- I never would
-         # have guessed that.
-         # The first trick is to build b2j ignoring the possibility
-         # of junk.  I.e., we don't call isjunk at all yet.  Throwing
-         # out the junk later is much cheaper than building b2j "right"
-         # from the start.
-         b = self.b
-         self.b2j = b2j = {}
-         self.b2jhas = b2jhas = b2j.has_key
-         for i in xrange(len(b)):
-             elt = b[i]
-             if b2jhas(elt):
-                 b2j[elt].append(i)
-             else:
-                 b2j[elt] = [i]
- 
-         # Now b2j.keys() contains elements uniquely, and especially when
-         # the sequence is a string, that's usually a good deal smaller
-         # than len(string).  The difference is the number of isjunk calls
-         # saved.
-         isjunk, junkdict = self.isjunk, {}
-         if isjunk:
-             for elt in b2j.keys():
-                 if isjunk(elt):
-                     junkdict[elt] = 1   # value irrelevant; it's a set
-                     del b2j[elt]
- 
-         # Now for x in b, isjunk(x) == junkdict.has_key(x), but the
-         # latter is much faster.  Note too that while there may be a
-         # lot of junk in the sequence, the number of *unique* junk
-         # elements is probably small.  So the memory burden of keeping
-         # this dict alive is likely trivial compared to the size of b2j.
-         self.isbjunk = junkdict.has_key
- 
-     def find_longest_match(self, alo, ahi, blo, bhi):
-         """Find longest matching block in a[alo:ahi] and b[blo:bhi].
- 
-         If isjunk is not defined:
- 
-         Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
-             alo <= i <= i+k <= ahi
-             blo <= j <= j+k <= bhi
-         and for all (i',j',k') meeting those conditions,
-             k >= k'
-             i <= i'
-             and if i == i', j <= j'
-         In other words, of all maximal matching blocks, return one
-         that starts earliest in a, and of all those maximal matching
-         blocks that start earliest in a, return the one that starts
-         earliest in b.
- 
-         If isjunk is defined, first the longest matching block is
-         determined as above, but with the additional restriction that
-         no junk element appears in the block.  Then that block is
-         extended as far as possible by matching (only) junk elements on
-         both sides.  So the resulting block never matches on junk except
-         as identical junk happens to be adjacent to an "interesting"
-         match.
- 
-         If no blocks match, return (alo, blo, 0).
-         """
- 
-         # CAUTION:  stripping common prefix or suffix would be incorrect.
-         # E.g.,
-         #    ab
-         #    acab
-         # Longest matching block is "ab", but if common prefix is
-         # stripped, it's "a" (tied with "b").  UNIX(tm) diff does so
-         # strip, so ends up claiming that ab is changed to acab by
-         # inserting "ca" in the middle.  That's minimal but unintuitive:
-         # "it's obvious" that someone inserted "ac" at the front.
-         # Windiff ends up at the same place as diff, but by pairing up
-         # the unique 'b's and then matching the first two 'a's.
- 
-         a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.isbjunk
-         besti, bestj, bestsize = alo, blo, 0
-         # find longest junk-free match
-         # during an iteration of the loop, j2len[j] = length of longest
-         # junk-free match ending with a[i-1] and b[j]
-         j2len = {}
-         nothing = []
-         for i in xrange(alo, ahi):
-             # look at all instances of a[i] in b; note that because
-             # b2j has no junk keys, the loop is skipped if a[i] is junk
-             j2lenget = j2len.get
-             newj2len = {}
-             for j in b2j.get(a[i], nothing):
-                 # a[i] matches b[j]
-                 if j < blo:
-                     continue
-                 if j >= bhi:
-                     break
-                 k = newj2len[j] = j2lenget(j-1, 0) + 1
-                 if k > bestsize:
-                     besti, bestj, bestsize = i-k+1, j-k+1, k
-             j2len = newj2len
- 
-         # Now that we have a wholly interesting match (albeit possibly
-         # empty!), we may as well suck up the matching junk on each
-         # side of it too.  Can't think of a good reason not to, and it
-         # saves post-processing the (possibly considerable) expense of
-         # figuring out what to do with it.  In the case of an empty
-         # interesting match, this is clearly the right thing to do,
-         # because no other kind of match is possible in the regions.
-         while besti > alo and bestj > blo and \
-               isbjunk(b[bestj-1]) and \
-               a[besti-1] == b[bestj-1]:
-             besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
-         while besti+bestsize < ahi and bestj+bestsize < bhi and \
-               isbjunk(b[bestj+bestsize]) and \
-               a[besti+bestsize] == b[bestj+bestsize]:
-             bestsize = bestsize + 1
- 
-         if TRACE:
-             print "get_matching_blocks", alo, ahi, blo, bhi
-             print "    returns", besti, bestj, bestsize
-         return besti, bestj, bestsize
- 
-     def get_matching_blocks(self):
-         if self.matching_blocks is not None:
-             return self.matching_blocks
-         self.matching_blocks = []
-         la, lb = len(self.a), len(self.b)
-         self.__helper(0, la, 0, lb, self.matching_blocks)
-         self.matching_blocks.append( (la, lb, 0) )
-         if TRACE:
-             print '*** matching blocks', self.matching_blocks
-         return self.matching_blocks
- 
-     # builds list of matching blocks covering a[alo:ahi] and
-     # b[blo:bhi], appending them in increasing order to answer
- 
-     def __helper(self, alo, ahi, blo, bhi, answer):
-         i, j, k = x = self.find_longest_match(alo, ahi, blo, bhi)
-         # a[alo:i] vs b[blo:j] unknown
-         # a[i:i+k] same as b[j:j+k]
-         # a[i+k:ahi] vs b[j+k:bhi] unknown
-         if k:
-             if alo < i and blo < j:
-                 self.__helper(alo, i, blo, j, answer)
-             answer.append(x)
-             if i+k < ahi and j+k < bhi:
-                 self.__helper(i+k, ahi, j+k, bhi, answer)
- 
-     def ratio(self):
-         """Return a measure of the sequences' similarity (float in [0,1]).
- 
-         Where T is the total number of elements in both sequences, and
-         M is the number of matches, this is 2*M / T.
-         Note that this is 1 if the sequences are identical, and 0 if
-         they have nothing in common.
-         """
- 
-         matches = reduce(lambda sum, triple: sum + triple[-1],
-                          self.get_matching_blocks(), 0)
-         return 2.0 * matches / (len(self.a) + len(self.b))
- 
-     def quick_ratio(self):
-         """Return an upper bound on ratio() relatively quickly."""
-         # viewing a and b as multisets, set matches to the cardinality
-         # of their intersection; this counts the number of matches
-         # without regard to order, so is clearly an upper bound
-         if self.fullbcount is None:
-             self.fullbcount = fullbcount = {}
-             for elt in self.b:
-                 fullbcount[elt] = fullbcount.get(elt, 0) + 1
-         fullbcount = self.fullbcount
-         # avail[x] is the number of times x appears in 'b' less the
-         # number of times we've seen it in 'a' so far ... kinda
-         avail = {}
-         availhas, matches = avail.has_key, 0
-         for elt in self.a:
-             if availhas(elt):
-                 numb = avail[elt]
-             else:
-                 numb = fullbcount.get(elt, 0)
-             avail[elt] = numb - 1
-             if numb > 0:
-                 matches = matches + 1
-         return 2.0 * matches / (len(self.a) + len(self.b))
- 
-     def real_quick_ratio(self):
-         """Return an upper bound on ratio() very quickly"""
-         la, lb = len(self.a), len(self.b)
-         # can't have more matches than the number of elements in the
-         # shorter sequence
-         return 2.0 * min(la, lb) / (la + lb)
- 
-     def get_opcodes(self):
-         if self.opcodes is not None:
-             return self.opcodes
-         i = j = 0
-         self.opcodes = answer = []
-         for ai, bj, size in self.get_matching_blocks():
-             # invariant:  we've pumped out correct diffs to change
-             # a[:i] into b[:j], and the next matching block is
-             # a[ai:ai+size] == b[bj:bj+size].  So we need to pump
-             # out a diff to change a[i:ai] into b[j:bj], pump out
-             # the matching block, and move (i,j) beyond the match
-             tag = ''
-             if i < ai and j < bj:
-                 tag = 'replace'
-             elif i < ai:
-                 tag = 'delete'
-             elif j < bj:
-                 tag = 'insert'
-             if tag:
-                 answer.append( (tag, i, ai, j, bj) )
-             i, j = ai+size, bj+size
-             # the list of matching blocks is terminated by a
-             # sentinel with size 0
-             if size:
-                 answer.append( ('equal', ai, i, bj, j) )
-         return answer
  
  # meant for dumping lines
--- 113,116 ----