[pypy-commit] benchmarks single-run: I think I got it right, but who knows - try to kill the implicit warmup

fijal noreply at buildbot.pypy.org
Wed Aug 7 16:58:52 CEST 2013


Author: Maciej Fijalkowski <fijall at gmail.com>
Branch: single-run
Changeset: r229:4ab1e9967170
Date: 2013-08-07 16:58 +0200
http://bitbucket.org/pypy/benchmarks/changeset/4ab1e9967170/

Log:	I think I got it right, but who knows - try to kill the implicit
	warmup
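
The practical meaning of dropping the implicit warmup: the benchmark scripts no longer run throw-away iterations of their own; each benchmark instead declares an explicit warmup count in bench-data.json, and the reporting code discards that many leading timings. A minimal sketch of that summarisation, purely illustrative and not part of this changeset:

    def summarize(times, warmup=0):
        # minimum over every run, average only over the post-warmup tail,
        # mirroring the CompareMultipleRuns change further down in this diff
        measured = times[warmup:]
        return min(times), sum(measured) / len(measured)

    # summarize([0.30, 0.11, 0.10, 0.09], warmup=1)  ~=  (0.09, 0.10)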

diff --git a/bench-data.json b/bench-data.json
--- a/bench-data.json
+++ b/bench-data.json
@@ -5,6 +5,7 @@
    "description": "Brute force n-queens solver."
  },
  "bm_chameleon": {
+   "total_runs": 500
  },
  "bm_mako": {
  },
@@ -12,7 +13,8 @@
    "description": "Creates chaosgame-like fractals"
  },
  "cpython_doc": {
-   "description": "Run sphinx over cpython documentation"
+   "description": "Run sphinx over cpython documentation",
+   "total_runs": 1
  },
  "crypto_pyaes": {
    "description": "A pure python implementation of AES"
@@ -30,6 +32,10 @@
  "float": {
    "description": "Creates an array of points using circular projection and then normalizes and maximizes them. Floating-point heavy."
  },
+ "gcbench": {
+   "total_runs": 5,
+   "description": "Classic gcbench"
+ },
  "genshi_text": {
    "description": "Genshi template rendering using text, generator heavy"
  },
@@ -43,7 +49,7 @@
  },
  "html5lib": {
    "warmup": 0,
-   "total_runs": 50,
+   "total_runs": 5,
    "description": "Parses the HTML 5 spec using html5lib."
  },
  "json_bench": {
@@ -58,7 +64,8 @@
    "description": "Double-precision N-body simulation. It models the orbits of Jovian planets, using a simple symplectic-integrator."
  },
  "pidigits": {
-   "description": "Computes the digits of PI. Long heavy"
+   "description": "Computes the digits of PI. Long heavy",
+   "total_runs": 5
  },
  "pyflate-fast": {
    "description": "Stand-alone pure-Python DEFLATE (gzip) and bzip2 decoder/decompressor."
@@ -91,31 +98,42 @@
    "description": "Uses the Spitfire template system to build a 1000x1000-cell HTML table; it differs from spitfire in that it uses .join(list) instead of cStringIO."
  },
  "spambayes": {
-   "description": "Spambayes spam classification filter"
+   "description": "Spambayes spam classification filter",
+   "warmup": 1,
+   "total_runs": 51
  },
  "spectral-norm": {
  },
  "spitfire": {
-   "description": "Uses the Spitfire template system to build a 100x100-cell HTML table; it differs from spitfire in that it uses .join(list) instead of cStringIO."
+   "description": "Uses the Spitfire template system to build a 100x100-cell HTML table; it differs from spitfire in that it uses .join(list) instead of cStringIO.",
+   "warmup": 2,
+   "total_runs": 52
  },
  "spitfire_cstringio": {
    "description": "ses the Spitfire template system to build a 1000x1000-cell HTML table, using the cStringIO module."
  },
  "sympy_expand": {
+   "total_runs": 5,
    "description": "Use sympy (pure python symbolic math lib) do to expansion"
  },
  "sympy_integrate": {
+   "total_runs": 5,
    "description": "Use sympy (pure python symbolic math lib) do to integration"
  },
  "sympy_str": {
+   "total_runs": 5,
    "description": "Use sympy (pure python symbolic math lib) do to str() operation"
  },
  "sympy_sum": {
+   "total_runs": 5,
    "description": "Use sympy (pure python symbolic math lib) do to summation"
  },
  "telco": {
    "description": "A small program which is intended to capture the essence of a telephone company billing application, with a realistic balance between Input/Output activity and application calculations."
  },
+ "translate": {
+   "description": "Translation benchmarks"
+ }, 
  "trans2_annotate": {
    "description": "PyPy translation -O2 - annotation"
  },
@@ -132,18 +150,28 @@
    "description": "PyPy translation -O2 - C source"
  },
  "twisted_iteration" : {
-   "description": "Iterates a Twisted reactor as quickly as possible without doing any work."
+   "description": "Iterates a Twisted reactor as quickly as possible without doing any work.",
+   "total_runs": 65,
+   "warmup": 15
  },
  "twisted_names": {
-   "description": "Runs a DNS server with Twisted Names and then issues requests to it over loopback UDP."
+   "description": "Runs a DNS server with Twisted Names and then issues requests to it over loopback UDP.",
+   "total_runs": 65,
+   "warmup": 15
  },
  "twisted_pb": {
-   "description": "Runs a Perspective Broker server with a no-op method and invokes that method over loopback TCP with some strings, dictionaries, and tuples as arguments."
+   "description": "Runs a Perspective Broker server with a no-op method and invokes that method over loopback TCP with some strings, dictionaries, and tuples as arguments.",
+   "total_runs": 65,
+   "warmup": 15
  },
  "twisted_tcp": {
-   "description": "Connects one Twised client to one Twisted server over TCP (on the loopback interface) and then writes bytes as fast as it can."
+   "description": "Connects one Twised client to one Twisted server over TCP (on the loopback interface) and then writes bytes as fast as it can.",
+   "total_runs": 65,
+   "warmup": 15
  },
  "twisted_web": {
-   "description": "Runs twisted web server and connects through twisted HTTP client"
+   "description": "Runs twisted web server and connects through twisted HTTP client",
+   "total_runs": 25,
+   "warmup": 15
  }
 }
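
The per-benchmark knobs added above (total_runs and warmup, plus the legacy_multiplier hook referenced in perf.py further down) are plain JSON and are read with defaults of 50 runs and no warmup. A minimal sketch of looking one entry up, assuming it is run from a checkout that contains bench-data.json:

    import json

    bench_data = json.load(open('bench-data.json'))
    entry = bench_data['html5lib']
    total_runs = entry.get('total_runs', 50)   # same default as MeasureGeneric below
    warmup = entry.get('warmup', 0)            # leading runs excluded from the average
    print('%s: %s runs, %s warmup' % ('html5lib', total_runs, warmup))
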
diff --git a/benchmarks.py b/benchmarks.py
--- a/benchmarks.py
+++ b/benchmarks.py
@@ -8,9 +8,9 @@
     return os.path.join(os.path.dirname(os.path.abspath(__file__)), *args)
 
 def _register_new_bm(name, bm_name, d, **opts):
-    def Measure(python, options):
+    def Measure(python, options, bench_data):
         bm_path = relative('own', name + '.py')
-        return MeasureGeneric(python, options, bm_path, **opts)
+        return MeasureGeneric(python, options, bench_data, bm_path, **opts)
     Measure.func_name = 'Measure' + name.capitalize()
 
     def BM(*args, **kwds):
@@ -20,7 +20,7 @@
     d[BM.func_name] = BM
 
 def _register_new_bm_twisted(name, bm_name, d, **opts):
-    def Measure(python, options):
+    def Measure(python, options, bench_data):
         def parser(line):
             number = float(line.split(" ")[0])
             if name == 'tcp':
@@ -30,7 +30,8 @@
             else:
                 return 100/number
         bm_path = relative('own', 'twisted', name + '.py')
-        return MeasureGeneric(python, options, bm_path, parser=parser, **opts)
+        return MeasureGeneric(python, options, bench_data, bm_path,
+                              parser=parser, **opts)
     Measure.func_name = 'Measure' + name.capitalize()
 
     def BM(*args, **kwds):
@@ -40,9 +41,9 @@
     d[BM.func_name] = BM
 
 def _register_new_bm_base_only(name, bm_name, d, **opts):
-    def benchmark_function(python, options):
+    def benchmark_function(python, options, bench_data):
         bm_path = relative('own', name + '.py')
-        return MeasureGeneric(python, options, bm_path, **opts)
+        return MeasureGeneric(python, options, bench_data, bm_path, **opts)
 
     def BM(python, options, *args, **kwargs):
         try:
@@ -58,19 +59,15 @@
 TWISTED = [relative('lib/twisted-trunk'), relative('lib/zope.interface-3.5.3/src'), relative('own/twisted')]
 
 opts = {
-    'gcbench' : {'iteration_scaling' : .10},
-    'pidigits': {'iteration_scaling' : .10},
     'eparse'  : {'bm_env': {'PYTHONPATH': relative('lib/monte')}},
     'bm_mako' : {'bm_env': {'PYTHONPATH': relative('lib/mako')}},
-    'bm_chameleon': {'bm_env': {'PYTHONPATH': relative('lib/chameleon/src')},
-                     'iteration_scaling': 3},
+    'bm_chameleon': {'bm_env': {'PYTHONPATH': relative('lib/chameleon/src')}},
 }
 
 for name in ['expand', 'integrate', 'sum', 'str']:
     _register_new_bm('bm_sympy', 'sympy_' + name,
                      globals(), bm_env={'PYTHONPATH': relative('lib/sympy')},
-                     extra_args=['--benchmark=' + name],
-                     iteration_scaling=.1)
+                     extra_args=['--benchmark=' + name])
 
 for name in ['xml', 'text']:
     _register_new_bm('bm_genshi', 'genshi_' + name,
@@ -84,13 +81,8 @@
     _register_new_bm(name, name, globals(), **opts.get(name, {}))
 
 for name in ['names', 'iteration', 'tcp', 'pb', ]:#'web']:#, 'accepts']:
-    if name == 'web':
-        iteration_scaling = 0.2
-    else:
-        iteration_scaling = 1.0
     _register_new_bm_twisted(name, 'twisted_' + name,
-                     globals(), bm_env={'PYTHONPATH': ':'.join(TWISTED)},
-                                 iteration_scaling=iteration_scaling)
+                     globals(), bm_env={'PYTHONPATH': ':'.join(TWISTED)})
 
 _register_new_bm('spitfire', 'spitfire', globals(),
     extra_args=['--benchmark=spitfire_o4'])
@@ -141,7 +133,7 @@
         ('database', 0.4)
         ]
 
-def BM_translate(python, options):
+def BM_translate(python, options, bench_data):
     """
     Run translate.py and returns a benchmark result for each of the phases.
     Note that we run it only with ``base_python`` (which corresponds to
@@ -177,7 +169,7 @@
     return result
 BM_translate.benchmark_name = 'trans2'
 
-def BM_cpython_doc(python, options):
+def BM_cpython_doc(python, options, bench_data):
     from unladen_swallow.perf import RawResult
     import subprocess, shutil
 
diff --git a/own/bm_sympy.py b/own/bm_sympy.py
--- a/own/bm_sympy.py
+++ b/own/bm_sympy.py
@@ -1,4 +1,5 @@
 
+import sys
 from sympy import expand, symbols, integrate, tan, summation
 from sympy.core.cache import clear_cache
 import time
@@ -27,6 +28,7 @@
         clear_cache()
         t0 = time.time()
         func()
+        print >>sys.stderr, time.time() - t0
         l.append(time.time() - t0)
     return l
 
diff --git a/own/twisted/benchlib.py b/own/twisted/benchlib.py
--- a/own/twisted/benchlib.py
+++ b/own/twisted/benchlib.py
@@ -54,7 +54,7 @@
         optParameters = [
             ('iterations', 'n', 1, 'number of iterations', int),
             ('duration', 'd', 1, 'duration of each iteration', float),
-            ('warmup', 'w', 15, 'number of warmup iterations', int),
+            ('warmup', 'w', 0, 'number of warmup iterations', int),
         ]
 
     options = BenchmarkOptions()
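
With the driver-side default warmup dropped to 0, the Twisted benchmarks report every iteration they run; the 15 warmup iterations now come from the twisted_* entries in bench-data.json instead. Roughly the command the harness ends up building for twisted_iteration (illustrative only; MeasureGeneric assembles the real one):

    python_cmd = ['python']
    bm_path = 'own/twisted/iteration.py'
    trials = 65                                    # total_runs for twisted_iteration
    command = python_cmd + [bm_path, '-n', trials]
    # the driver no longer warms up on its own (-w defaults to 0); the first 15
    # of the 65 reported timings are discarded later, per bench-data.json
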
diff --git a/runner.py b/runner.py
--- a/runner.py
+++ b/runner.py
@@ -22,8 +22,12 @@
 def run_and_store(benchmark_set, result_filename, path, revision=0,
                   options='', branch='default', args='', upload=False,
                   fast=False, full_store=False):
-    funcs = perf.BENCH_FUNCS.copy()
-    funcs.update(perf._FindAllBenchmarks(benchmarks.__dict__))
+    _funcs = perf.BENCH_FUNCS.copy()
+    _funcs.update(perf._FindAllBenchmarks(benchmarks.__dict__))
+    bench_data = json.load(open('bench-data.json'))
+    funcs = {}
+    for key, value in _funcs.iteritems():
+        funcs[key] = (value, bench_data[key])
     opts = ['-b', ','.join(benchmark_set),
             '--inherit_env=PATH',
             '--no_charts']
@@ -146,7 +150,7 @@
             benchmarks = list(BENCHMARK_SET)
 
     for benchmark in benchmarks:
-        if benchmark not in BENCHMARK_SET:
+        if benchmark not in BENCHMARK_SET and not benchmark.startswith('-'):
             raise WrongBenchmark(benchmark)
 
     path = options.python
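
runner.py now pairs each benchmark function with its bench-data.json entry before handing both to perf, so every benchmark perf.py can discover needs a key in that file. A small self-contained sketch of the pairing, with a dummy function standing in for a real BM_* callable and bench-data.json assumed to be in the working directory:

    import json

    def bm_dummy(python, options, bench_data):       # stands in for a real BM_* function
        return 'would run %s iterations' % bench_data.get('total_runs', 50)

    _funcs = {'html5lib': bm_dummy}                  # normally BENCH_FUNCS + benchmarks.py
    bench_data = json.load(open('bench-data.json'))
    funcs = {}
    for key, value in _funcs.items():
        funcs[key] = (value, bench_data[key])        # KeyError if an entry is missing
    func, data = funcs['html5lib']
    print(func(['python'], None, data))              # new three-argument call
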
diff --git a/unladen_swallow/perf.py b/unladen_swallow/perf.py
--- a/unladen_swallow/perf.py
+++ b/unladen_swallow/perf.py
@@ -405,7 +405,7 @@
 ### Utility functions
 
 def SimpleBenchmark(benchmark_function, python, options,
-                    *args, **kwargs):
+                    bench_data, *args, **kwargs):
     """Abstract out the body for most simple benchmarks.
 
     Example usage:
@@ -426,12 +426,12 @@
         Comes with string_representation method.
     """
     try:
-        data = benchmark_function(python, options,
+        data = benchmark_function(python, options, bench_data,
                                   *args, **kwargs)
     except subprocess.CalledProcessError, e:
         return ResultError(e)
 
-    return CompareBenchmarkData(data, options)
+    return CompareBenchmarkData(data, options, bench_data)
 
 
 def ShortenUrl(url):
@@ -577,7 +577,7 @@
     return fixed_env
 
 
-def CompareMultipleRuns(times, options):
+def CompareMultipleRuns(times, options, bench_data):
     """Compare multiple control vs experiment runs of the same benchmark.
 
     Args:
@@ -596,15 +596,16 @@
         # below.
         return SimpleResult(times[0])
 
-    times = sorted(times)
-
-    min_time = times[0]
-    avg_time = avg(times)
-    std_time = SampleStdDev(times)
+    min_time = sorted(times)[0]
+    warmup = bench_data.get('warmup', 0)
+    if bench_data.get('legacy_multiplier'):
+        times = [time * bench_data['legacy_multiplier'] for time in times]
+    avg_time = avg(times[warmup:])
+    std_time = SampleStdDev(times[warmup:])
 
     return Result(times, min_time, avg_time, std_time)
 
-def CompareBenchmarkData(data, options):
+def CompareBenchmarkData(data, options, bench_data):
     """Compare performance and memory usage.
 
     Args:
@@ -625,7 +626,7 @@
             return CompareMemoryUsage(base_mem, changed_mem, options)
         return "Benchmark does not report memory usage yet"
 
-    return CompareMultipleRuns(times, options)
+    return CompareMultipleRuns(times, options, bench_data)
 
 
 def CallAndCaptureOutput(command, env=None, track_memory=False, inherit_env=[]):
@@ -663,8 +664,8 @@
     return result, mem_usage
 
 
-def MeasureGeneric(python, options, bm_path, bm_env=None,
-                   extra_args=[], iteration_scaling=1, parser=float):
+def MeasureGeneric(python, options, bench_data, bm_path, bm_env=None,
+                   extra_args=[], parser=float):
     """Abstract measurement function for Unladen's bm_* scripts.
 
     Based on the values of options.fast/rigorous, will pass -n {5,50,100} to
@@ -690,12 +691,13 @@
     if bm_env is None:
         bm_env = {}
 
-    trials = 50
+    trials = bench_data.get('total_runs', 50)
+    warmup = bench_data.get('warmup', 0)
     if options.rigorous:
-        trials = 100
+        trials = (trials - warmup) * 2 + warmup
     elif options.fast:
-        trials = 5
-    trials = max(1, int(trials * iteration_scaling))
+        trials = (trials - warmup) // 10 + warmup
+    trials = max(1, trials)
 
     RemovePycs()
     command = python + [bm_path, "-n", trials] + extra_args
@@ -708,84 +710,7 @@
 
 ### Benchmarks
 
-_PY_BENCH_TOTALS_LINE = re.compile("""
-    Totals:\s+(?P<min_base>\d+)ms\s+
-    (?P<min_changed>\d+)ms\s+
-    \S+\s+  # Percent change, which we re-compute
-    (?P<avg_base>\d+)ms\s+
-    (?P<avg_changed>\d+)ms\s+
-    \S+  # Second percent change, also re-computed
-    """, re.X)
-def MungePyBenchTotals(line):
-    m = _PY_BENCH_TOTALS_LINE.search(line)
-    if m:
-        min_base, min_changed, avg_base, avg_changed = map(float, m.group(
-            "min_base", "min_changed", "avg_base", "avg_changed"))
-        delta_min = TimeDelta(min_base, min_changed)
-        delta_avg = TimeDelta(avg_base, avg_changed)
-        return (("Min: %(min_base)d -> %(min_changed)d: %(delta_min)s\n" +
-                 "Avg: %(avg_base)d -> %(avg_changed)d: %(delta_avg)s")
-                % locals())
-    return line
-
-
-def BM_PyBench(base_python, changed_python, options):
-    if options.track_memory:
-        return "Benchmark does not report memory usage yet"
-
-    warp = "10"
-    if options.rigorous:
-        warp = "1"
-    if options.fast:
-        warp = "100"
-
-    PYBENCH_PATH = Relative("performance/pybench/pybench.py")
-    PYBENCH_ENV = BuildEnv({"PYTHONPATH": ""}, inherit_env=options.inherit_env)
-
-    try:
-        with contextlib.nested(open(os.devnull, "wb"),
-                               TemporaryFilename(prefix="baseline."),
-                               TemporaryFilename(prefix="changed.")
-                               ) as (dev_null, base_pybench, changed_pybench):
-            RemovePycs()
-            subprocess.check_call(LogCall(changed_python + [
-                                           PYBENCH_PATH,
-                                           "-w", warp,
-                                           "-f", changed_pybench,
-                                           ]), stdout=dev_null,
-                                           env=PYBENCH_ENV)
-            RemovePycs()
-            subprocess.check_call(LogCall(base_python + [
-                                           PYBENCH_PATH,
-                                           "-w", warp,
-                                           "-f", base_pybench,
-                                           ]), stdout=dev_null,
-                                           env=PYBENCH_ENV)
-            comparer = subprocess.Popen(base_python + [
-                                         PYBENCH_PATH,
-                                         "--debug",
-                                         "-s", base_pybench,
-                                         "-c", changed_pybench,
-                                         ], stdout=subprocess.PIPE,
-                                         stderr=subprocess.PIPE,
-                                         env=PYBENCH_ENV)
-            result, err = comparer.communicate()
-            if comparer.returncode != 0:
-                return "pybench died: " + err
-    except subprocess.CalledProcessError, e:
-        return str(e)
-
-    if options.verbose:
-        return result
-    else:
-        for line in result.splitlines():
-            if line.startswith("Totals:"):
-                return MungePyBenchTotals(line)
-        # The format's wrong...
-        return result
-
-
-def Measure2to3(python, options):
+def Measure2to3(python, options, bench_data):
     FAST_TARGET = Relative("lib/2to3/lib2to3/refactor.py")
     TWO_TO_THREE_PROG = Relative("lib/2to3/2to3")
     TWO_TO_THREE_DIR = Relative("lib/2to3")
@@ -836,24 +761,25 @@
     return times, mem_usage
 
 
-def BM_2to3(*args, **kwargs):
+# XXX we should enable this one
+def _BM_2to3(*args, **kwargs):
     return SimpleBenchmark(Measure2to3, *args, **kwargs)
 
 
 DJANGO_DIR = Relative("lib/django")
 
 
-def MeasureDjango(python, options):
+def MeasureDjango(python, options, bench_data):
     bm_path = Relative("performance/bm_django.py")
     bm_env = {"PYTHONPATH": DJANGO_DIR}
-    return MeasureGeneric(python, options, bm_path, bm_env)
+    return MeasureGeneric(python, options, bench_data, bm_path, bm_env)
 
 
 def BM_Django(*args, **kwargs):
     return SimpleBenchmark(MeasureDjango, *args, **kwargs)
 
 
-def MeasureRietveld(python, options):
+def MeasureRietveld(python, options, bench_data):
     PYTHONPATH = ":".join([DJANGO_DIR,
                            # These paths are lifted from
                            # lib/google_appengine.appcfg.py.  Note that we use
@@ -866,7 +792,7 @@
     bm_path = Relative("performance/bm_rietveld.py")
     bm_env = {"PYTHONPATH": PYTHONPATH, "DJANGO_SETTINGS_MODULE": "settings"}
 
-    return MeasureGeneric(python, options, bm_path, bm_env)
+    return MeasureGeneric(python, options, bench_data, bm_path, bm_env)
 
 
 def BM_Rietveld(*args, **kwargs):
@@ -914,7 +840,7 @@
     return psyco_build_dir
 
 
-def MeasureSpitfire(python, options, env=None, extra_args=[]):
+def MeasureSpitfire(python, options, bench_data, env=None, extra_args=[]):
     """Use Spitfire to test a Python binary's performance.
 
     Args:
@@ -930,7 +856,7 @@
         memory usage samples in kilobytes.
     """
     bm_path = Relative("performance/bm_spitfire.py")
-    return MeasureGeneric(python, options, bm_path, env, extra_args)
+    return MeasureGeneric(python, options, bench_data, bm_path, env, extra_args)
 
 
 def MeasureSpitfireWithPsyco(python, options):
@@ -967,83 +893,19 @@
     return SimpleBenchmark(MeasureSpitfireWithPsyco, *args, **kwargs)
 
 
-def BM_SlowSpitfire(python, options):
+def BM_SlowSpitfire(python, options, bench_data):
     extra_args = ["--disable_psyco"]
     spitfire_env = {"PYTHONPATH": Relative("lib/spitfire")}
 
     try:
-        data = MeasureSpitfire(python, options,
+        data = MeasureSpitfire(python, options, bench_data,
                                spitfire_env, extra_args)
     except subprocess.CalledProcessError, e:
         return str(e)
 
-    return CompareBenchmarkData(data, options)
+    return CompareBenchmarkData(data, options, bench_data)
 
-
-def MeasurePickle(python, options, extra_args):
-    """Test the performance of Python's pickle implementations.
-
-    Args:
-        python: prefix of a command line for the Python binary.
-        options: optparse.Values instance.
-        extra_args: list of arguments to append to the command line.
-
-    Returns:
-        (perf_data, mem_usage), where perf_data is a list of floats, each the
-        time it took to run the pickle test once; mem_usage is a list of
-        memory usage samples in kilobytes.
-    """
-    bm_path = Relative("performance/bm_pickle.py")
-    return MeasureGeneric(python, options, bm_path, extra_args=extra_args)
-
-
-def _PickleBenchmark(base_python, changed_python, options, extra_args):
-    """Test the performance of Python's pickle implementations.
-
-    Args:
-        base_python: prefix of a command line for the reference
-                Python binary.
-        changed_python: prefix of a command line for the
-                experimental Python binary.
-        options: optparse.Values instance.
-        extra_args: list of arguments to append to the command line.
-
-    Returns:
-        Summary of whether the experiemental Python is better/worse than the
-        baseline.
-    """
-    return SimpleBenchmark(MeasurePickle,
-                           base_python, changed_python, options, extra_args)
-
-
-def BM_Pickle(base_python, changed_python, options):
-    args = ["--use_cpickle", "pickle"]
-    return _PickleBenchmark(base_python, changed_python, options, args)
-
-def BM_Unpickle(base_python, changed_python, options):
-    args = ["--use_cpickle", "unpickle"]
-    return _PickleBenchmark(base_python, changed_python, options, args)
-
-def BM_Pickle_List(base_python, changed_python, options):
-    args = ["--use_cpickle", "pickle_list"]
-    return _PickleBenchmark(base_python, changed_python, options, args)
-
-def BM_Unpickle_List(base_python, changed_python, options):
-    args = ["--use_cpickle", "unpickle_list"]
-    return _PickleBenchmark(base_python, changed_python, options, args)
-
-def BM_Pickle_Dict(base_python, changed_python, options):
-    args = ["--use_cpickle", "pickle_dict"]
-    return _PickleBenchmark(base_python, changed_python, options, args)
-
-def BM_SlowPickle(base_python, changed_python, options):
-    return _PickleBenchmark(base_python, changed_python, options, ["pickle"])
-
-def BM_SlowUnpickle(base_python, changed_python, options):
-    return _PickleBenchmark(base_python, changed_python, options, ["unpickle"])
-
-
-def MeasureAi(python, options):
+def MeasureAi(python, options, bench_data):
     """Test the performance of some small AI problem solvers.
 
     Args:
@@ -1056,228 +918,12 @@
         memory usage samples in kilobytes.
     """
     bm_path = Relative("performance/bm_ai.py")
-    return MeasureGeneric(python, options, bm_path)
-
+    return MeasureGeneric(python, options, bench_data, bm_path)
 
 def BM_Ai(*args, **kwargs):
     return SimpleBenchmark(MeasureAi, *args, **kwargs)
 
-
-def _StartupPython(command, mem_usage, track_memory, inherit_env):
-    startup_env = BuildEnv(inherit_env=inherit_env)
-    if not track_memory:
-        subprocess.check_call(command, env=startup_env)
-    else:
-        subproc = subprocess.Popen(command, env=startup_env)
-        future = MemoryUsageFuture(subproc.pid)
-        if subproc.wait() != 0:
-            raise RuntimeError("Startup benchmark died")
-        mem_usage.extend(future.GetMemoryUsage())
-
-
-def MeasureStartup(python, cmd_opts, num_loops, track_memory, inherit_env):
-    times = []
-    work = ""
-    if track_memory:
-        # Without this, Python may start and exit before the memory sampler
-        # thread has time to work. We can't just do 'time.sleep(x)', because
-        # under -S, 'import time' fails.
-        work = "for _ in xrange(200000): pass"
-    command = python + cmd_opts + ["-c", work]
-    mem_usage = []
-    info("Running `%s` %d times", command, num_loops * 20)
-    for _ in xrange(num_loops):
-        t0 = time.time()
-        _StartupPython(command, mem_usage, track_memory, inherit_env)
-        _StartupPython(command, mem_usage, track_memory, inherit_env)
-        _StartupPython(command, mem_usage, track_memory, inherit_env)
-        _StartupPython(command, mem_usage, track_memory, inherit_env)
-        _StartupPython(command, mem_usage, track_memory, inherit_env)
-        _StartupPython(command, mem_usage, track_memory, inherit_env)
-        _StartupPython(command, mem_usage, track_memory, inherit_env)
-        _StartupPython(command, mem_usage, track_memory, inherit_env)
-        _StartupPython(command, mem_usage, track_memory, inherit_env)
-        _StartupPython(command, mem_usage, track_memory, inherit_env)
-        _StartupPython(command, mem_usage, track_memory, inherit_env)
-        _StartupPython(command, mem_usage, track_memory, inherit_env)
-        _StartupPython(command, mem_usage, track_memory, inherit_env)
-        _StartupPython(command, mem_usage, track_memory, inherit_env)
-        _StartupPython(command, mem_usage, track_memory, inherit_env)
-        _StartupPython(command, mem_usage, track_memory, inherit_env)
-        _StartupPython(command, mem_usage, track_memory, inherit_env)
-        _StartupPython(command, mem_usage, track_memory, inherit_env)
-        _StartupPython(command, mem_usage, track_memory, inherit_env)
-        _StartupPython(command, mem_usage, track_memory, inherit_env)
-        t1 = time.time()
-        times.append(t1 - t0)
-    if not track_memory:
-      mem_usage = None
-    return times, mem_usage
-
-
-def BM_normal_startup(base_python, changed_python, options):
-    if options.rigorous:
-        num_loops = 100
-    elif options.fast:
-        num_loops = 5
-    else:
-        num_loops = 50
-
-    opts = []
-    changed_data = MeasureStartup(changed_python, opts, num_loops,
-                                  options.track_memory, options.inherit_env)
-    base_data = MeasureStartup(base_python, opts, num_loops,
-                               options.track_memory, options.inherit_env)
-
-    return CompareBenchmarkData(base_data, changed_data, options)
-
-
-def BM_startup_nosite(base_python, changed_python, options):
-    if options.rigorous:
-        num_loops = 200
-    elif options.fast:
-        num_loops = 10
-    else:
-        num_loops = 100
-
-    opts = ["-S"]
-    changed_data = MeasureStartup(changed_python, opts, num_loops,
-                                  options.track_memory, options.inherit_env)
-    base_data = MeasureStartup(base_python, opts, num_loops,
-                               options.track_memory, options.inherit_env)
-
-    return CompareBenchmarkData(base_data, changed_data, options)
-
-
-def MeasureRegexPerformance(python, options, bm_path):
-    """Test the performance of Python's regex engine.
-
-    Args:
-        python: prefix of a command line for the Python binary.
-        options: optparse.Values instance.
-        bm_path: relative path; which benchmark script to run.
-
-    Returns:
-        (perf_data, mem_usage), where perf_data is a list of floats, each the
-        time it took to run all the regexes routines once; mem_usage is a list
-        of memory usage samples in kilobytes.
-    """
-    return MeasureGeneric(python, options, Relative(bm_path))
-
-
-def RegexBenchmark(base_python, changed_python, options, bm_path):
-    return SimpleBenchmark(MeasureRegexPerformance,
-                           base_python, changed_python, options, bm_path)
-
-
-def BM_regex_v8(base_python, changed_python, options):
-    bm_path = "performance/bm_regex_v8.py"
-    return RegexBenchmark(base_python, changed_python, options, bm_path)
-
-
-def BM_regex_effbot(base_python, changed_python, options):
-    bm_path = "performance/bm_regex_effbot.py"
-    return RegexBenchmark(base_python, changed_python, options, bm_path)
-
-
-def BM_regex_compile(base_python, changed_python, options):
-    bm_path = "performance/bm_regex_compile.py"
-    return RegexBenchmark(base_python, changed_python, options, bm_path)
-
-
-def MeasureThreading(python, options, bm_name):
-    """Test the performance of Python's threading support.
-
-    Args:
-        python: prefix of a command line for the Python binary.
-        options: optparse.Values instance.
-        bm_name: name of the threading benchmark to run.
-
-    Returns:
-        (perf_data, mem_usage), where perf_data is a list of floats, each the
-        time it took to run the threading benchmark once; mem_usage is a list
-        of memory usage samples in kilobytes.
-    """
-    bm_path = Relative("performance/bm_threading.py")
-    return MeasureGeneric(python, options, bm_path, extra_args=[bm_name])
-
-
-def ThreadingBenchmark(base_python, changed_python, options, bm_name):
-    return SimpleBenchmark(MeasureThreading,
-                           base_python, changed_python, options, bm_name)
-
-
-def BM_threaded_count(base_python, changed_python, options):
-    bm_name = "threaded_count"
-    return ThreadingBenchmark(base_python, changed_python, options, bm_name)
-
-
-def BM_iterative_count(base_python, changed_python, options):
-    bm_name = "iterative_count"
-    return ThreadingBenchmark(base_python, changed_python, options, bm_name)
-
-
-def MeasureUnpackSequence(python, options):
-    """Test the performance of sequence unpacking.
-
-    Args:
-        python: prefix of a command line for the Python binary.
-        options: optparse.Values instance.
-
-    Returns:
-        (perf_data, mem_usage), where perf_data is a list of floats, each the
-        time it took to run the threading benchmark once; mem_usage is a list
-        of memory usage samples in kilobytes.
-    """
-    bm_path = Relative("performance/bm_unpack_sequence.py")
-    return MeasureGeneric(python, options, bm_path, iteration_scaling=1000)
-
-
-def BM_unpack_sequence(*args, **kwargs):
-    return SimpleBenchmark(MeasureUnpackSequence, *args, **kwargs)
-
-
-def MeasureCallSimple(python, options):
-    """Test the performance of simple function calls.
-
-    Args:
-        python: prefix of a command line for the Python binary.
-        options: optparse.Values instance.
-
-    Returns:
-        (perf_data, mem_usage), where perf_data is a list of floats, each the
-        time it took to run the threading benchmark once; mem_usage is a list
-        of memory usage samples in kilobytes.
-    """
-    bm_path = Relative("performance/bm_call_simple.py")
-    return MeasureGeneric(python, options, bm_path)
-
-
-def BM_call_simple(*args, **kwargs):
-    return SimpleBenchmark(MeasureCallSimple, *args, **kwargs)
-
-
-def MeasureNbody(python, options):
-    """Test the performance of math operations using an n-body benchmark.
-
-    Args:
-        python: prefix of a command line for the Python binary.
-        options: optparse.Values instance.
-
-    Returns:
-        (perf_data, mem_usage), where perf_data is a list of floats, each the
-        time it took to run the benchmark loop once; mem_usage is a list
-        of memory usage samples in kilobytes.
-    """
-    bm_path = Relative("performance/bm_nbody.py")
-    return MeasureGeneric(python, options, bm_path)
-
-
-def BM_nbody(*args, **kwargs):
-    return SimpleBenchmark(MeasureNbody, *args, **kwargs)
-
-
-def MeasureSpamBayes(python, options):
+def MeasureSpamBayes(python, options, bench_data):
     """Test the performance of the SpamBayes spam filter and its tokenizer.
 
     Args:
@@ -1292,14 +938,14 @@
     pypath = ":".join([Relative("lib/spambayes"), Relative("lib/lockfile")])
     bm_path = Relative("performance/bm_spambayes.py")
     bm_env = {"PYTHONPATH": pypath}
-    return MeasureGeneric(python, options, bm_path, bm_env)
+    return MeasureGeneric(python, options, bench_data, bm_path, bm_env)
 
 
 def BM_spambayes(*args, **kwargs):
     return SimpleBenchmark(MeasureSpamBayes, *args, **kwargs)
 
 
-def MeasureHtml5lib(python, options):
+def MeasureHtml5lib(python, options, bench_data):
     """Test the performance of the html5lib HTML 5 parser.
 
     Args:
@@ -1313,16 +959,14 @@
     """
     bm_path = Relative("performance/bm_html5lib.py")
     bm_env = {"PYTHONPATH": Relative("lib/html5lib")}
-    return MeasureGeneric(python, options, bm_path, bm_env,
-                          iteration_scaling=0.10)
-
+    return MeasureGeneric(python, options, bench_data, bm_path, bm_env)
 
 def BM_html5lib(*args, **kwargs):
     return SimpleBenchmark(MeasureHtml5lib, *args, **kwargs)
 
-def MeasureRichards(python, options):
+def MeasureRichards(python, options, bench_data):
     bm_path = Relative("performance/bm_richards.py")
-    return MeasureGeneric(python, options, bm_path)
+    return MeasureGeneric(python, options, bench_data, bm_path)
 
 def BM_richards(*args, **kwargs):
     return SimpleBenchmark(MeasureRichards, *args, **kwargs)
@@ -1341,8 +985,8 @@
 # If you update the default group, be sure to update the module docstring, too.
 # An "all" group which includes every benchmark perf.py knows about is generated
 # automatically.
-BENCH_GROUPS = {"default": ["2to3", "django", "nbody", "slowspitfire",
-                            "slowpickle", "slowunpickle", "spambayes"],
+BENCH_GROUPS = {"default": ["2to3", "django", "slowspitfire",
+                            "spambayes"],
                 "startup": ["normal_startup", "startup_nosite"],
                 "regex": ["regex_v8", "regex_effbot", "regex_compile"],
                 "threading": ["threaded_count", "iterative_count"],
@@ -1389,7 +1033,7 @@
 
     should_run = set()
     if not positive_benchmarks:
-        should_run = set(_ExpandBenchmarkName("default", bench_groups))
+        should_run = set(bench_groups['all'])
 
     for name in positive_benchmarks:
         for bm in _ExpandBenchmarkName(name, bench_groups):
@@ -1482,11 +1126,11 @@
 
     results = []
     for name in sorted(should_run):
-        func = bench_funcs[name]
+        func, bench_data = bench_funcs[name]
         print "Running %s..." % name
         # PyPy specific modification: let the func to return a list of results
         # for sub-benchmarks
-        bench_result = func(base_cmd_prefix, options)
+        bench_result = func(base_cmd_prefix, options, bench_data)
         name = getattr(func, 'benchmark_name', name)
         if isinstance(bench_result, list):
             for subname, subresult in bench_result:
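
With iteration_scaling gone from MeasureGeneric, the run count comes straight from total_runs in bench-data.json, and --fast/--rigorous rescale only the measured portion while leaving the warmup iterations alone. A condensed sketch of that arithmetic, using the spitfire entry above (total_runs 52, warmup 2) as a worked example:

    def pick_trials(total_runs, warmup, fast=False, rigorous=False):
        # condensed version of the new MeasureGeneric logic: only the measured
        # part is scaled, the warmup iterations are kept as-is
        trials = total_runs
        if rigorous:
            trials = (trials - warmup) * 2 + warmup
        elif fast:
            trials = (trials - warmup) // 10 + warmup
        return max(1, trials)

    # spitfire (total_runs=52, warmup=2):
    #   default     -> 52
    #   --fast      -> (52 - 2) // 10 + 2 = 7
    #   --rigorous  -> (52 - 2) * 2 + 2 = 102
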
diff --git a/unladen_swallow/performance/bm_ai.py b/unladen_swallow/performance/bm_ai.py
--- a/unladen_swallow/performance/bm_ai.py
+++ b/unladen_swallow/performance/bm_ai.py
@@ -70,10 +70,6 @@
 
 
 def test_n_queens(iterations):
-    # Warm-up runs.
-    list(n_queens(8))
-    list(n_queens(8))
-
     times = []
     for _ in xrange(iterations):
         t0 = time.time()
diff --git a/unladen_swallow/performance/bm_django.py b/unladen_swallow/performance/bm_django.py
--- a/unladen_swallow/performance/bm_django.py
+++ b/unladen_swallow/performance/bm_django.py
@@ -36,10 +36,6 @@
     table = [xrange(150) for _ in xrange(150)]
     context = Context({"table": table})
 
-    # Warm up Django.
-    DJANGO_TMPL.render(context)
-    DJANGO_TMPL.render(context)
-
     times = []
     for _ in xrange(count):
         t0 = time.time()
diff --git a/unladen_swallow/performance/bm_nbody.py b/unladen_swallow/performance/bm_nbody.py
--- a/unladen_swallow/performance/bm_nbody.py
+++ b/unladen_swallow/performance/bm_nbody.py
@@ -116,10 +116,6 @@
 
 
 def test_nbody(iterations):
-    # Warm-up runs.
-    report_energy()
-    advance(0.01, 20000)
-    report_energy()
 
     times = []
     for _ in xrange(iterations):
diff --git a/unladen_swallow/performance/bm_richards.py b/unladen_swallow/performance/bm_richards.py
--- a/unladen_swallow/performance/bm_richards.py
+++ b/unladen_swallow/performance/bm_richards.py
@@ -21,7 +21,6 @@
 def test_richards(iterations):
     # Warm-up
     r = richards.Richards()
-    r.run(iterations=2)
 
     times = []
     for _ in xrange(iterations):
diff --git a/unladen_swallow/performance/bm_rietveld.py b/unladen_swallow/performance/bm_rietveld.py
--- a/unladen_swallow/performance/bm_rietveld.py
+++ b/unladen_swallow/performance/bm_rietveld.py
@@ -89,43 +89,11 @@
 
 def test_rietveld(count, tmpl, context):
     # Warm up Django.
-    tmpl.render(context)
-    tmpl.render(context)
-
     times = []
     for _ in xrange(count):
         t0 = time.time()
         # 30 calls to render, so that we don't measure loop overhead.
         tmpl.render(context)
-        tmpl.render(context)
-        tmpl.render(context)
-        tmpl.render(context)
-        tmpl.render(context)
-        tmpl.render(context)
-        tmpl.render(context)
-        tmpl.render(context)
-        tmpl.render(context)
-        tmpl.render(context)
-        tmpl.render(context)
-        tmpl.render(context)
-        tmpl.render(context)
-        tmpl.render(context)
-        tmpl.render(context)
-        tmpl.render(context)
-        tmpl.render(context)
-        tmpl.render(context)
-        tmpl.render(context)
-        tmpl.render(context)
-        tmpl.render(context)
-        tmpl.render(context)
-        tmpl.render(context)
-        tmpl.render(context)
-        tmpl.render(context)
-        tmpl.render(context)
-        tmpl.render(context)
-        tmpl.render(context)
-        tmpl.render(context)
-        tmpl.render(context)
         t1 = time.time()
         times.append(t1 - t0)
     return times
diff --git a/unladen_swallow/performance/bm_spambayes.py b/unladen_swallow/performance/bm_spambayes.py
--- a/unladen_swallow/performance/bm_spambayes.py
+++ b/unladen_swallow/performance/bm_spambayes.py
@@ -22,10 +22,6 @@
 
 
 def test_spambayes(iterations, messages, ham_classifier):
-    # Prime the pump. This still leaves some hot functions uncompiled; these
-    # will be noticed as hot during the timed loops below.
-    for msg in messages:
-        ham_classifier.score(msg)
 
     times = []
     for _ in xrange(iterations):
diff --git a/unladen_swallow/performance/bm_spitfire.py b/unladen_swallow/performance/bm_spitfire.py
--- a/unladen_swallow/performance/bm_spitfire.py
+++ b/unladen_swallow/performance/bm_spitfire.py
@@ -53,10 +53,6 @@
 
     table = [xrange(1000) for _ in xrange(1000)]
 
-    # Warm up Spitfire.
-    spitfire_tmpl_o4(search_list=[{"table": table}]).main()
-    spitfire_tmpl_o4(search_list=[{"table": table}]).main()
-
     times = []
     for _ in xrange(count):
         t0 = time.time()

