[pypy-commit] pypy vecopt: doc additions, reenabled the int8 expand test (passes now)

Wed Jun 24 13:58:25 CEST 2015

Author: Richard Plangger <rich at pasra.at>
Branch: vecopt
Changeset: r78292:e70ae41089d7
Date: 2015-06-24 13:58 +0200
http://bitbucket.org/pypy/pypy/changeset/e70ae41089d7/

Log:	doc additions, reenabled the int8 expand test (passes now)

diff --git a/pypy/module/micronumpy/test/test_zjit.py b/pypy/module/micronumpy/test/test_zjit.py
--- a/pypy/module/micronumpy/test/test_zjit.py
+++ b/pypy/module/micronumpy/test/test_zjit.py
@@ -274,10 +274,9 @@
         sum(d)
         """
     def test_int8_expand(self):
-        py.test.skip("TODO implement assembler")
         result = self.run("int8_expand")
-        assert int(result) == 8*8 + sum(range(0,17))
-        self.check_vectorized(3, 2) # TODO sum at the end
+        assert int(result) == 17*8 + sum(range(0,17))
+        self.check_vectorized(3, 1) # TODO sum at the end
 
     def define_int32_add_const():
         return """
diff --git a/rpython/doc/jit/vectorization.rst b/rpython/doc/jit/vectorization.rst
--- a/rpython/doc/jit/vectorization.rst
+++ b/rpython/doc/jit/vectorization.rst
@@ -2,7 +2,10 @@
 Vectorization
 =============
 
-TBA
+To find parallel instructions the tracer must provide enough information about
+memory load/store operations. They must be adjacent in memory. The requirement for
+that is that they use the same index variable and the offset can be expressed as a
+linear or affine combination.
 
 Features
 --------
@@ -13,6 +16,9 @@
 * int8/int16/int32/int64 arithmetic: add, substract, multiply, negate, absolute
 * int8/int16/int32/int64 logical: and, or, xor
 
+Reduction
+---------
+
 Reduction is implemented:
 
 * sum
@@ -21,10 +27,13 @@
 
 * all, any, prod, min, max
 
-To find parallel instructions the tracer must provide enough information about
-memory load/store operations. They must be adjacent in memory. The requirement for
-that is that they use the same index variable and offset can be expressed as a
-a linear or affine combination.
+Constant & Variable Expansion
+-----------------------------
+
+Packed arithmetic operations expand scalar variables or constants into vector registers.
+
+Guard Strengthening
+-------------------
 
 Unrolled guards are strengthend on a arithmetical level (See GuardStrengthenOpt).
 The resulting vector trace will only have one guard that checks the index.
diff --git a/rpython/jit/backend/x86/rx86.py b/rpython/jit/backend/x86/rx86.py
--- a/rpython/jit/backend/x86/rx86.py
+++ b/rpython/jit/backend/x86/rx86.py
@@ -746,6 +746,7 @@
     PSHUFLW_xxi = xmminsn('\xF2', rex_nw, '\x0F\x70', register(1,8), register(2), '\xC0', immediate(3, 'b'))
     PSHUFB_xx = xmminsn('\x66', rex_nw, '\x0F\x38\x00', register(1,8), register(2), '\xC0')
     PSHUFB_xm = xmminsn('\x66', rex_nw, '\x0F\x38\x00', register(1,8), mem_reg_plus_const(2))
+    PSHUFB_xj = xmminsn('\x66', rex_nw, '\x0F\x38\x00', register(1,8), abs_(2))
 
     # SSE3
     HADDPD_xx = xmminsn('\x66', rex_nw, '\x0F\x7C', register(1,8), register(2), '\xC0')