[pypy-commit] pypy stmgc-c8: import stmgc/f1272b890ba0 and further simplify the code written by the jit
arigo
noreply at buildbot.pypy.org
Mon Mar 2 22:23:29 CET 2015
Author: Armin Rigo <arigo at tunes.org>
Branch: stmgc-c8
Changeset: r76226:001a2796489f
Date: 2015-03-02 21:45 +0100
http://bitbucket.org/pypy/pypy/changeset/001a2796489f/
Log: import stmgc/f1272b890ba0 and further simplify the code written by
the jit for stm_write_card
diff --git a/rpython/jit/backend/x86/assembler.py b/rpython/jit/backend/x86/assembler.py
--- a/rpython/jit/backend/x86/assembler.py
+++ b/rpython/jit/backend/x86/assembler.py
@@ -2356,58 +2356,82 @@
loc_index = None
if card_marking:
if stm:
- # see stm_write_card() in stmgc.h
+ # see stm_write_card() in stmgc.h.
#
- # implementation idea:
- # mov r11, loc_base # the object
- # and r11, ~15 # align
- # lea r11, [loc_index + r11<<(card_bits-4)]
+ # If loc_base and loc_index are both registers:
+ # lea r11, [loc_index + loc_base<<(card_bits-4)]
# shr r11, card_bits
# cmp [r11+1], card_marked
#
- # This assumes that the value computed by the "lea" fits
- # in 64 bits. It clearly does, because (card_bits-4) is
- # at most 3 and both loc_base and loc_index cannot come
- # anywhere close to 2 ** 60.
+ # If loc_base is a register but loc_index an immediate:
+ # mov r11, loc_base
+ # shr r11, 4
+ # cmp [r11+(loc_index>>card_bits)+1], card_marked
#
- if rstm.CARD_SIZE == 32:
- card_bits = 5
- elif rstm.CARD_SIZE == 64:
- card_bits = 6
- elif rstm.CARD_SIZE == 128:
- card_bits = 7
- else:
- raise AssertionError("CARD_SIZE should be 32/64/128")
+ # If the value above does not fit 32 bits, we do instead
+ # mov r11, loc_index
+ # (then the rest like the register-register case)
+ #
+ # If loc_base is an immediate but loc_index a register:
+ # mov r11, loc_base<<(card_bits-4) + (1<<card_bits)
+ # add r11, loc_index
+ # shr r11, card_bits
+ # cmp [r11], card_marked
+ #
+ # If both are immediates:
+ # mov r11, (loc_base>>4)+(loc_index>>card_bits)+1
+ # cmp [r11], card_marked
+ #
+ card_bits = rstm.CARD_BITS
+ assert 5 <= card_bits <= 7
r11 = X86_64_SCRATCH_REG
loc_index = arglocs[1]
- if isinstance(loc_index, RegLoc):
- if isinstance(loc_base, RegLoc):
- mc.MOV_rr(r11.value, loc_base.value)
- mc.AND_ri(r11.value, ~15)
+ if isinstance(loc_base, RegLoc):
+ if isinstance(loc_index, RegLoc):
+ loc_index_reg = loc_index
+ add_constant = 1
else:
- assert isinstance(loc_base, ImmedLoc)
- initial_value = loc_base.value & ~15
- mc.MOV_ri(r11.value, initial_value)
- mc.LEA_ra(r11.value, (self.SEGMENT_NO,
- loc_index.value,
- r11.value,
- card_bits - 4,
- 0))
- mc.SHR_ri(r11.value, card_bits)
+ assert isinstance(loc_index, ImmedLoc)
+ add_constant = (loc_index.value >> card_bits) + 1
+ if rx86.fits_in_32bits(add_constant):
+ mc.MOV_rr(r11.value, loc_base.value)
+ mc.SHR_ri(r11.value, 4)
+ loc_index_reg = None
+ else:
+ mc.MOV_ri(r11.value, loc_index.value)
+ loc_index_reg = r11
+ add_constant = 1
+ if loc_index_reg is not None:
+ mc.LEA_ra(r11.value, (self.SEGMENT_NO,
+ loc_index_reg.value,
+ loc_base.value,
+ card_bits - 4,
+ 0))
+ mc.SHR_ri(r11.value, card_bits)
else:
- assert isinstance(loc_index, ImmedLoc)
- initial_value = (loc_index.value >> card_bits) << 4
- if isinstance(loc_base, RegLoc):
- mc.MOV_ri(r11.value, initial_value)
- mc.ADD_rr(r11.value, loc_base.value)
- mc.SHR_ri(r11.value, 4)
+ # xxx we could try to know statically if loc_base
+ # points to a large object or not, and produce a
+ # non-card-marking version of the barrier if not
+ assert isinstance(loc_base, ImmedLoc)
+ load_value = loc_base.value << (card_bits - 4)
+ load_value += (1 << card_bits)
+ if isinstance(loc_index, RegLoc):
+ add_constant = load_value >> card_bits
+ if rx86.fits_in_32bits(add_constant):
+ mc.MOV_rr(r11.value, loc_index.value)
+ else:
+ add_constant = 0
+ mc.MOV_ri(r11.value, load_value)
+ mc.ADD_rr(r11.value, loc_index.value)
+ mc.SHR_ri(r11.value, card_bits)
else:
- assert isinstance(loc_base, ImmedLoc)
- initial_value += loc_base.value
- initial_value >>= 4
- mc.MOV_ri(r11.value, initial_value)
+ assert isinstance(loc_index, ImmedLoc)
+ load_value += loc_index.value
+ load_value >>= card_bits
+ mc.MOV_ri(r11.value, load_value)
+ add_constant = 0
#
- mc.CMP8_mi((self.SEGMENT_GC, r11.value, 1),
+ mc.CMP8_mi((self.SEGMENT_GC, r11.value, add_constant),
rstm.CARD_MARKED)
mc.J_il8(rx86.Conditions['E'], 0) # patched later
js_location = mc.get_relative_pos()
diff --git a/rpython/rlib/rstm.py b/rpython/rlib/rstm.py
--- a/rpython/rlib/rstm.py
+++ b/rpython/rlib/rstm.py
@@ -32,6 +32,7 @@
CFlexSymbolic('((long)&_stm_write_slowpath_card)'))
CARD_MARKED = CFlexSymbolic('_STM_CARD_MARKED')
+CARD_BITS = CFlexSymbolic('_STM_CARD_BITS')
CARD_SIZE = CFlexSymbolic('_STM_CARD_SIZE')
GCFLAG_CARDS_SET = CFlexSymbolic('_STM_GCFLAG_CARDS_SET')
diff --git a/rpython/translator/stm/src_stm/revision b/rpython/translator/stm/src_stm/revision
--- a/rpython/translator/stm/src_stm/revision
+++ b/rpython/translator/stm/src_stm/revision
@@ -1,1 +1,1 @@
-cba4ee0e9be6
+f1272b890ba0
diff --git a/rpython/translator/stm/src_stm/stm/core.c b/rpython/translator/stm/src_stm/stm/core.c
--- a/rpython/translator/stm/src_stm/stm/core.c
+++ b/rpython/translator/stm/src_stm/stm/core.c
@@ -961,6 +961,9 @@
a direct way to know the length. We know that it is smaller
than the size in bytes. */
assert(index < size);
+ /* this object was allocated with allocate_outside_nursery_large(),
+ which returns addresses aligned to 16 bytes */
+ assert((((uintptr_t)obj) & 15) == 0);
#endif
/* Write into the card's lock. This is used by the next minor
diff --git a/rpython/translator/stm/src_stm/stm/gcpage.c b/rpython/translator/stm/src_stm/stm/gcpage.c
--- a/rpython/translator/stm/src_stm/stm/gcpage.c
+++ b/rpython/translator/stm/src_stm/stm/gcpage.c
@@ -52,10 +52,14 @@
static stm_char *allocate_outside_nursery_large(uint64_t size)
{
- /* Allocate the object with largemalloc.c from the lower addresses. */
- char *addr = _stm_large_malloc(size);
+ /* Allocate the object with largemalloc.c from the lower
+ addresses. Round up the size to a multiple of 16, rather than
+ 8, as a quick way to simplify the code in stm_write_card().
+ */
+ char *addr = _stm_large_malloc((size + 15) & ~15);
if (addr == NULL)
stm_fatalerror("not enough memory!");
+ assert((((uintptr_t)addr) & 15) == 0); /* alignment check */
if (LIKELY(addr + size <= uninitialized_page_start)) {
dprintf(("allocate_outside_nursery_large(%lu): %p, page=%lu\n",
diff --git a/rpython/translator/stm/src_stm/stmgc.h b/rpython/translator/stm/src_stm/stmgc.h
--- a/rpython/translator/stm/src_stm/stmgc.h
+++ b/rpython/translator/stm/src_stm/stmgc.h
@@ -79,7 +79,8 @@
#define _STM_CARD_MARKED 1 /* should always be 1... */
#define _STM_GCFLAG_CARDS_SET 0x8
-#define _STM_CARD_SIZE 32 /* must be >= 32 */
+#define _STM_CARD_BITS 5 /* must be 5/6/7 for the pypy jit */
+#define _STM_CARD_SIZE (1 << _STM_CARD_BITS)
#define _STM_MIN_CARD_COUNT 17
#define _STM_MIN_CARD_OBJ_SIZE (_STM_CARD_SIZE * _STM_MIN_CARD_COUNT)
@@ -213,10 +214,22 @@
array doesn't actually use card marking, the following read
is a bit nonsensical, but in a way that should never return
CARD_MARKED by mistake.
+
+ The computation of the card marker is further optimized by
+ assuming that large objects are allocated to multiples of
+ 16 (rather than just 8, as all objects are). Under this
+ assumption the following code is equivalent to:
+
+ (obj >> 4) + (index / _STM_CARD_SIZE) + 1
+
+ The code below however takes only a couple of assembler
+ instructions. It also assumes that the intermediate value
+ fits in a 64-bit value, which it clearly does (all values
+ are much smaller than 2 ** 60).
*/
- stm_read_marker_t *card = (stm_read_marker_t *)(((uintptr_t)obj) >> 4);
- card += (index / _STM_CARD_SIZE) + 1; /* get_index_to_card_index() */
- if (card->rm != _STM_CARD_MARKED) {
+ uintptr_t v = (((uintptr_t)obj) << (_STM_CARD_BITS - 4)) + index;
+ stm_read_marker_t *card1 = (stm_read_marker_t *)(v >> _STM_CARD_BITS);
+ if (card1[1].rm != _STM_CARD_MARKED) {
/* slow path. */
_stm_write_slowpath_card(obj, index);
More information about the pypy-commit mailing list