[Python-checkins] CVS: python/dist/src/Doc/lib libdifflib.tex,1.7,1.8

Mon, 13 Aug 2001 12:32:01 -0700

Update of /cvsroot/python/python/dist/src/Doc/lib
In directory usw-pr-cvs1:/tmp/cvs-serv30267/lib

Modified Files:
	libdifflib.tex 
Log Message:

David Goodger <dgoodger@atsautomation.com>:
Documentation for difflib/ndiff refactoring: more of the ndiff functionality
has been moved to the underlying library (difflib).

This closes SF patch #445413.

Index: libdifflib.tex
===================================================================
RCS file: /cvsroot/python/python/dist/src/Doc/lib/libdifflib.tex,v
retrieving revision 1.7
retrieving revision 1.8
diff -C2 -d -r1.7 -r1.8
*** libdifflib.tex	2001/05/11 15:49:19	1.7
--- libdifflib.tex	2001/08/13 19:31:59	1.8
***************
*** 11,14 ****
--- 11,56 ----

+ \begin{classdesc*}{SequenceMatcher}
+   This is a flexible class for comparing pairs of sequences of any
+   type, so long as the sequence elements are hashable.  The basic
+   algorithm predates, and is a little fancier than, an algorithm
+   published in the late 1980's by Ratcliff and Obershelp under the
+   hyperbolic name ``gestalt pattern matching.''  The idea is to find
+   the longest contiguous matching subsequence that contains no
+   ``junk'' elements (the Ratcliff and Obershelp algorithm doesn't
+   address junk).  The same idea is then applied recursively to the
+   pieces of the sequences to the left and to the right of the matching
+   subsequence.  This does not yield minimal edit sequences, but does
+   tend to yield matches that ``look right'' to people.
+ 
+   \strong{Timing:} The basic Ratcliff-Obershelp algorithm is cubic
+   time in the worst case and quadratic time in the expected case.
+   \class{SequenceMatcher} is quadratic time for the worst case and has
+   expected-case behavior dependent in a complicated way on how many
+   elements the sequences have in common; best case time is linear.
+ \end{classdesc*}
+ 
+ \begin{classdesc*}{Differ}
+   This is a class for comparing sequences of lines of text, and
+   producing human-readable differences or deltas. Differ uses
+   \class{SequenceMatcher} both to compare sequences of lines, and to
+   compare sequences of characters within similar (near-matching)
+   lines.
+ 
+   Each line of a \class{Differ} delta begins with a two-letter code:
+ 
+ \begin{tableii}{l|l}{code}{Code}{Meaning}
+   \lineii{'- '}{line unique to sequence 1}
+   \lineii{'+ '}{line unique to sequence 2}
+   \lineii{'  '}{line common to both sequences}
+   \lineii{'? '}{line not present in either input sequence}
+ \end{tableii}
+ 
+   Lines beginning with `\code{?~}' attempt to guide the eye to
+   intraline differences, and were not present in either input
+   sequence. These lines can be confusing if the sequences contain tab
+   characters.
+ \end{classdesc*}
+ 
  \begin{funcdesc}{get_close_matches}{word, possibilities\optional{,
                   n\optional{, cutoff}}}
***************
*** 41,65 ****
  \end{funcdesc}

! \begin{classdesc*}{SequenceMatcher}
!   This is a flexible class for comparing pairs of sequences of any
!   type, so long as the sequence elements are hashable.  The basic
!   algorithm predates, and is a little fancier than, an algorithm
!   published in the late 1980's by Ratcliff and Obershelp under the
!   hyperbolic name ``gestalt pattern matching.''  The idea is to find
!   the longest contiguous matching subsequence that contains no
!   ``junk'' elements (the Ratcliff and Obershelp algorithm doesn't
!   address junk).  The same idea is then applied recursively to the
!   pieces of the sequences to the left and to the right of the matching
!   subsequence.  This does not yield minimal edit sequences, but does
!   tend to yield matches that ``look right'' to people.

!   \strong{Timing:} The basic Ratcliff-Obershelp algorithm is cubic
!   time in the worst case and quadratic time in the expected case.
!   \class{SequenceMatcher} is quadratic time for the worst case and has
!   expected-case behavior dependent in a complicated way on how many
!   elements the sequences have in common; best case time is linear.
! \end{classdesc*}

  \begin{seealso}
    \seetitle{Pattern Matching: The Gestalt Approach}{Discussion of a
--- 83,167 ----
  \end{funcdesc}

! \begin{funcdesc}{ndiff}{a, b\optional{, linejunk\optional{,
!                  charjunk}}}
!   Compare \var{a} and \var{b} (lists of strings); return a
!   \class{Differ}-style delta.

!   Optional keyword parameters \var{linejunk} and \var{charjunk} are
!   for filter functions (or \code{None}):
! 
!   \var{linejunk}: A function that should accept a single string
!   argument, and return true if the string is junk (or false if it is
!   not). The default is module-level function
!   \function{IS_LINE_JUNK()}, which filters out lines without visible
!   characters, except for at most one pound character (\character{\#}).
! 
!   \var{charjunk}: A function that should accept a string of length 1.
!   The default is module-level function \function{IS_CHARACTER_JUNK()},
!   which filters out whitespace characters (a blank or tab; note: bad
!   idea to include newline in this!).
! 
!   \file{Tools/scripts/ndiff.py} is a command-line front-end to this
!   function.
! 
! \begin{verbatim}
! >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(1),
! ...              'ore\ntree\nemu\n'.splitlines(1)))
! >>> print ''.join(diff),
! - one
! ?  ^  
! + ore
! ?  ^  
! - two
! - three
! ?  -    
! + tree
! + emu
! \end{verbatim}
! \end{funcdesc}

+ \begin{funcdesc}{restore}{sequence, which}
+   Return one of the two sequences that generated a delta.

+   Given a \var{sequence} produced by \method{Differ.compare()} or
+   \function{ndiff()}, extract lines originating from file 1 or 2
+   (parameter \var{which}), stripping off line prefixes.
+ 
+   Example:
+ 
+ \begin{verbatim}
+ >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(1),
+ ...              'ore\ntree\nemu\n'.splitlines(1))
+ >>> print ''.join(restore(diff, 1)),
+ one
+ two
+ three
+ >>> print ''.join(restore(diff, 2)),
+ ore
+ tree
+ emu
+ \end{verbatim}
+ 
+ \end{funcdesc}
+ 
+ 
+ \begin{funcdesc}{IS_LINE_JUNK}{line}:
+ 
+   Return 1 for ignorable line: iff \var{line} is blank or contains a
+   single \character{\#}.  Used as a default for parameter
+   \var{linejunk} in \function{ndiff()}.
+ 
+ \end{funcdesc}
+ 
+ 
+ \begin{funcdesc}{IS_CHARACTER_JUNK}{ch}:
+ 
+   Return 1 for ignorable character: iff \var{ch} is a space or tab.
+   Used as a default for parameter \var{charjunk} in
+   \function{ndiff()}.
+ 
+ \end{funcdesc}
+ 
+ 
  \begin{seealso}
    \seetitle{Pattern Matching: The Gestalt Approach}{Discussion of a
***************
*** 232,238 ****

    Where T is the total number of elements in both sequences, and M is
!   the number of matches, this is 2.0*M / T. Note that this is \code{1.}
!   if the sequences are identical, and \code{0.} if they have nothing in
!   common.

    This is expensive to compute if \method{get_matching_blocks()} or
--- 334,340 ----

    Where T is the total number of elements in both sequences, and M is
!   the number of matches, this is 2.0*M / T. Note that this is
!   \code{1.0} if the sequences are identical, and \code{0.0} if they
!   have nothing in common.

    This is expensive to compute if \method{get_matching_blocks()} or
***************
*** 273,277 ****

! \subsection{Examples \label{difflib-examples}}

--- 375,379 ----

! \subsection{SequenceMatcher Examples \label{sequencematcher-examples}}

***************
*** 322,331 ****
  \end{verbatim}

- See \file{Tools/scripts/ndiff.py} from the Python source distribution
- for a fancy human-friendly file differencer, which uses
- \class{SequenceMatcher} both to view files as sequences of lines, and
- lines as sequences of characters.
- 
  See also the function \function{get_close_matches()} in this module,
  which shows how simple code building on \class{SequenceMatcher} can be
  used to do useful work.
--- 424,544 ----
  \end{verbatim}

  See also the function \function{get_close_matches()} in this module,
  which shows how simple code building on \class{SequenceMatcher} can be
  used to do useful work.
+ 
+ 
+ \subsection{Differ Objects \label{differ-objects}}
+ 
+ Note that \class{Differ}-generated deltas make no claim to be
+ \strong{minimal} diffs. To the contrary, minimal diffs are often
+ counter-intuitive, because they synch up anywhere possible, sometimes
+ accidental matches 100 pages apart. Restricting synch points to
+ contiguous matches preserves some notion of locality, at the
+ occasional cost of producing a longer diff.
+ 
+ The \class{Differ} class has this constructor:
+ 
+ \begin{classdesc}{Differ}{\optional{linejunk\optional{, charjunk}}}
+   Optional keyword parameters \var{linejunk} and \var{charjunk} are
+   for filter functions (or \code{None}):
+ 
+   \var{linejunk}: A function that should accept a single string
+   argument, and return true iff the string is junk. The default is
+   module-level function \function{IS_LINE_JUNK()}, which filters out
+   lines without visible characters, except for at most one pound
+   character (\character{\#}).
+ 
+   \var{charjunk}: A function that should accept a string of length 1.
+   The default is module-level function \function{IS_CHARACTER_JUNK()},
+   which filters out whitespace characters (a blank or tab; note: bad
+   idea to include newline in this!).
+ \end{classdesc}
+ 
+ \class{Differ} objects are used (deltas generated) via a single
+ method:
+ 
+ \begin{methoddesc}{compare}{a, b}
+   Compare two sequences of lines; return the resulting delta (list).
+ 
+   Each sequence must contain individual single-line strings ending
+   with newlines. Such sequences can be obtained from the
+   \method{readlines()} method of file-like objects. The list returned
+   is also made up of newline-terminated strings, and ready to be used
+   with the \method{writelines()} method of a file-like object.
+ \end{methoddesc}
+ 
+ 
+ \subsection{Differ Example \label{differ-examples}}
+ 
+ This example compares two texts. First we set up the texts, sequences
+ of individual single-line strings ending with newlines (such sequences
+ can also be obtained from the \method{readlines()} method of file-like
+ objects):
+ 
+ \begin{verbatim}
+ >>> text1 = '''  1. Beautiful is better than ugly.
+ ...   2. Explicit is better than implicit.
+ ...   3. Simple is better than complex.
+ ...   4. Complex is better than complicated.
+ ... '''.splitlines(1)
+ >>> len(text1)
+ 4
+ >>> text1[0][-1]
+ '\n'
+ >>> text2 = '''  1. Beautiful is better than ugly.
+ ...   3.   Simple is better than complex.
+ ...   4. Complicated is better than complex.
+ ...   5. Flat is better than nested.
+ ... '''.splitlines(1)
+ \end{verbatim}
+ 
+ Next we instantiate a Differ object:
+ 
+ \begin{verbatim}
+ >>> d = Differ()
+ \end{verbatim}
+ 
+ Note that when instantiating a \class{Differ} object we may pass
+ functions to filter out line and character ``junk.''  See the
+ \method{Differ()} constructor for details.
+ 
+ Finally, we compare the two:
+ 
+ \begin{verbatim}
+ >>> result = d.compare(text1, text2)
+ \end{verbatim}
+ 
+ \code{result} is a list of strings, so let's pretty-print it:
+ 
+ \begin{verbatim}
+ >>> from pprint import pprint
+ >>> pprint(result)
+ ['    1. Beautiful is better than ugly.\n',
+  '-   2. Explicit is better than implicit.\n',
+  '-   3. Simple is better than complex.\n',
+  '+   3.   Simple is better than complex.\n',
+  '?     ++                                \n',
+  '-   4. Complex is better than complicated.\n',
+  '?            ^                     ---- ^  \n',
+  '+   4. Complicated is better than complex.\n',
+  '?           ++++ ^                      ^  \n',
+  '+   5. Flat is better than nested.\n']
+ \end{verbatim}
+ 
+ As a single multi-line string it looks like this:
+ 
+ \begin{verbatim}
+ >>> import sys
+ >>> sys.stdout.writelines(result)
+     1. Beautiful is better than ugly.
+ -   2. Explicit is better than implicit.
+ -   3. Simple is better than complex.
+ +   3.   Simple is better than complex.
+ ?     ++
+ -   4. Complex is better than complicated.
+ ?            ^                     ---- ^
+ +   4. Complicated is better than complex.
+ ?           ++++ ^                      ^
+ +   5. Flat is better than nested.
+ \end{verbatim}