[pypy-commit] pypy vecopt: preventing int signext from >32 -> <32

Wed Jun 24 12:00:37 CEST 2015

Author: Richard Plangger <rich at pasra.at>
Branch: vecopt
Changeset: r78287:3b569b13ba22
Date: 2015-06-24 12:00 +0200
http://bitbucket.org/pypy/pypy/changeset/3b569b13ba22/

Log:	Prevent int signext from >32 -> <32 bits. Prevent packed int mul for
	64 bit: it cannot be done with an SSE opcode (see assembler comment).
	Interestingly, SSE seems to support float/double quite well, but not
	int (other than add, sub, logicals).

diff --git a/rpython/jit/backend/x86/assembler.py b/rpython/jit/backend/x86/assembler.py
--- a/rpython/jit/backend/x86/assembler.py
+++ b/rpython/jit/backend/x86/assembler.py
@@ -2598,10 +2598,12 @@
             self.mc.PMULLW(loc0, loc1)
         elif itemsize == 4:
             self.mc.PMULLD(loc0, loc1)
-        elif itemsize == 8:
-            self.mc.PMULDQ(loc0, loc1) # TODO
         else:
-            raise NotImplementedError("did not implement integer mul")
+            # NOTE see http://stackoverflow.com/questions/8866973/can-long-integer-routines-benefit-from-sse/8867025#8867025
+            # There is no 64x64 bit packed mul and I did not find one
+            # for 8 bit either. It is questionable if it gives any benefit
+            # for 8 bit.
+            raise NotImplementedError("")
 
     def genop_vec_int_add(self, op, arglocs, resloc):
         loc0, loc1, size_loc = arglocs
diff --git a/rpython/jit/metainterp/jitexc.py b/rpython/jit/metainterp/jitexc.py
--- a/rpython/jit/metainterp/jitexc.py
+++ b/rpython/jit/metainterp/jitexc.py
@@ -61,6 +61,14 @@
             self.green_int, self.green_ref, self.green_float,
             self.red_int, self.red_ref, self.red_float)
 
+class NotAVectorizeableLoop(JitException):
+    def __str__(self):
+        return 'NotAVectorizeableLoop()'
+
+class NotAProfitableLoop(JitException):
+    def __str__(self):
+        return 'NotAProfitableLoop()'
+
 
 def _get_standard_error(rtyper, Class):
     exdata = rtyper.exceptiondata
diff --git a/rpython/jit/metainterp/optimizeopt/schedule.py b/rpython/jit/metainterp/optimizeopt/schedule.py
--- a/rpython/jit/metainterp/optimizeopt/schedule.py
+++ b/rpython/jit/metainterp/optimizeopt/schedule.py
@@ -6,6 +6,7 @@
         MemoryRef, Node, IndexVar)
 from rpython.jit.metainterp.optimizeopt.util import Renamer
 from rpython.rlib.objectmodel import we_are_translated
+from rpython.jit.metainterp.jitexc import NotAProfitableLoop
 
 
 class SchedulerData(object):
@@ -238,12 +239,31 @@
         self.input_type = self.determine_input_type(op0)
         self.output_type = self.determine_output_type(op0)
 
+    def check_if_pack_supported(self, pack):
+        op0 = pack.operations[0].getoperation()
+        insize = self.input_type.getsize()
+        if op0.casts_box():
+            # sign extensions that cast to int16/int8 are not packed:
+            # _prevent_signext raises NotAProfitableLoop for them.
+            _, outsize = op0.cast_to()
+            self._prevent_signext(outsize, insize)
+        if op0.getopnum() == rop.INT_MUL:
+            if insize == 8 or insize == 1:
+                # no packed 64/8 bit int mul, see the x86 assembler comment
+                raise NotAProfitableLoop
+
+    def _prevent_signext(self, outsize, insize):
+        if outsize < 4 and insize != outsize:
+            raise NotAProfitableLoop
+
     def as_vector_operation(self, pack, sched_data, oplist):
         self.sched_data = sched_data
         self.preamble_ops = oplist
         self.costmodel = sched_data.costmodel
         self.update_input_output(pack)
         #
+        self.check_if_pack_supported(pack)
+        #
         off = 0
         stride = self.split_pack(pack, self.sched_data.vec_reg_size)
         left = len(pack.operations)
@@ -370,6 +390,7 @@
 
     def extend_int(self, vbox, newtype):
         vbox_cloned = newtype.new_vector_box(vbox.item_count)
+        self._prevent_signext(newtype.getsize(), vbox.getsize())
         op = ResOperation(rop.VEC_INT_SIGNEXT, 
                           [vbox, ConstInt(newtype.getsize())],
                           vbox_cloned)
diff --git a/rpython/jit/metainterp/optimizeopt/vectorize.py b/rpython/jit/metainterp/optimizeopt/vectorize.py
--- a/rpython/jit/metainterp/optimizeopt/vectorize.py
+++ b/rpython/jit/metainterp/optimizeopt/vectorize.py
@@ -9,7 +9,7 @@
 import time
 
 from rpython.jit.metainterp.resume import Snapshot
-from rpython.jit.metainterp.jitexc import JitException
+from rpython.jit.metainterp.jitexc import NotAVectorizeableLoop, NotAProfitableLoop
 from rpython.jit.metainterp.optimizeopt.unroll import optimize_unroll
 from rpython.jit.metainterp.compile import ResumeAtLoopHeaderDescr, invent_fail_descr_for_op
 from rpython.jit.metainterp.history import (ConstInt, VECTOR, FLOAT, INT,
@@ -44,14 +44,6 @@
             else:
                 print ""
 
-class NotAVectorizeableLoop(JitException):
-    def __str__(self):
-        return 'NotAVectorizeableLoop()'
-
-class NotAProfitableLoop(JitException):
-    def __str__(self):
-        return 'NotAProfitableLoop()'
-
 def optimize_vector(metainterp_sd, jitdriver_sd, loop, optimizations,
                     inline_short_preamble, start_state, cost_threshold):
     optimize_unroll(metainterp_sd, jitdriver_sd, loop, optimizations,
@@ -623,7 +615,7 @@
                     else:
                         # store only has an input
                         return Pair(lnode, rnode, ptype, None)
-                if self.profitable_pack(lnode, rnode, origin_pack):
+                if self.profitable_pack(lnode, rnode, origin_pack, forward):
                     input_type = origin_pack.output_type
                     output_type = determine_output_type(lnode, input_type)
                     return Pair(lnode, rnode, input_type, output_type)
@@ -640,33 +632,29 @@
                 return True
         return False
 
-    def profitable_pack(self, lnode, rnode, origin_pack):
+    def profitable_pack(self, lnode, rnode, origin_pack, forward):
         lpacknode = origin_pack.left
-        if self.prohibit_packing(origin_pack, lpacknode.getoperation(), lnode.getoperation()):
+        if self.prohibit_packing(origin_pack,
+                                 lpacknode.getoperation(),
+                                 lnode.getoperation(),
+                                 forward):
             return False
         rpacknode = origin_pack.right
-        if self.prohibit_packing(origin_pack, rpacknode.getoperation(), rnode.getoperation()):
+        if self.prohibit_packing(origin_pack,
+                                 rpacknode.getoperation(),
+                                 rnode.getoperation(),
+                                 forward):
             return False
 
         return True
 
-    def prohibit_packing(self, pack, packed, inquestion):
+    def prohibit_packing(self, pack, packed, inquestion, forward):
         """ Blocks the packing of some operations """
         if inquestion.vector == -1:
             return True
         if packed.is_raw_array_access():
             if packed.getarg(1) == inquestion.result:
                 return True
-        if inquestion.casts_box():
-            # prohibit the packing of signext calls that
-            # cast to int16/int8.
-            input_type = pack.output_type
-            if input_type:
-                py.test.set_trace()
-                insize = input_type.getsize()
-                outtype,outsize = inquestion.cast_to()
-                if outsize < 4 and insize != outsize:
-                    return True
         return False
 
     def combine(self, i, j):
diff --git a/rpython/jit/metainterp/resoperation.py b/rpython/jit/metainterp/resoperation.py
--- a/rpython/jit/metainterp/resoperation.py
+++ b/rpython/jit/metainterp/resoperation.py
@@ -214,7 +214,6 @@
         if self.casts[3] == 0:
             if self.getopnum() == rop.INT_SIGNEXT:
                 arg = self.getarg(1)
-                assert isinstance(arg, ConstInt)
                 return (to_type,arg.value)
             else:
                 raise NotImplementedError