[pypy-commit] stmgc default: add a valgrind target to the makefile

Remi Meier noreply at buildbot.pypy.org
Tue Jan 21 10:14:30 CET 2014


Author: Remi Meier
Branch: 
Changeset: r650:266b6fe74c32
Date: 2014-01-10 12:52 +0100
http://bitbucket.org/pypy/stmgc/changeset/266b6fe74c32/

Log:	add a valgrind target to the makefile

diff --git a/c4/Makefile b/c4/Makefile
--- a/c4/Makefile
+++ b/c4/Makefile
@@ -36,5 +36,9 @@
 release-%: %.c ${H_FILES} ${C_FILES} stmgc.c
 	gcc -pthread -DNDEBUG -O2 -g $< -o release-$* -Wall stmgc.c -lrt
 
+valgrind-%: %.c ${H_FILES} ${C_FILES} stmgc.c
+	gcc -pthread -DNDEBUG -O1 -g $< -o valgrind-$* -Wall stmgc.c -lrt
+
+
 test-%:
 	./$* 2>/dev/null | grep "check ok"
diff --git a/c7/core.c b/c7/core.c
new file mode 100644
--- /dev/null
+++ b/c7/core.c
@@ -0,0 +1,721 @@
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <sys/syscall.h>
+#include <asm/prctl.h>
+#include <sys/prctl.h>
+
+#include "core.h"
+#include "list.h"
+#include "pagecopy.h"
+
+
+/* number of pages per thread: */
+#define NB_PAGES            (256*256)    // 256MB
+
+#define NB_THREADS          2
+#define MAP_PAGES_FLAGS     (MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE)
+#define LARGE_OBJECT_WORDS  36
+
+#if defined(__i386__) || defined(__x86_64__)
+#  define HAVE_FULL_EXCHANGE_INSN
+#endif
+
+
+typedef TLPREFIX char localchar_t;
+typedef TLPREFIX struct alloc_for_size_s alloc_for_size_t;
+typedef TLPREFIX struct _thread_local2_s _thread_local2_t;
+
+
+struct alloc_for_size_s {
+    localchar_t *next;
+    uint16_t start, stop;
+    bool flag_partial_page;
+};
+
+struct _thread_local2_s {
+    struct _thread_local1_s _tl1;
+    int thread_num;
+    char *thread_base;
+    struct stm_list_s *modified_objects;
+    struct stm_list_s *new_object_ranges;
+    struct alloc_for_size_s alloc[LARGE_OBJECT_WORDS];
+};
+#define _STM_TL2            ((_thread_local2_t *)_STM_TL1)
+
+/* Logical page number (offset) must be offset by thread_num*NB_PAGES to get
+   the real page number */
+enum {
+    /* the (logical) page is shared read-only between the threads */
+    SHARED_PAGE=0,
+    /* the page is currently being made private for both (2) threads */
+    REMAPPING_PAGE,
+    /* the page is already private for both (2) threads */
+    PRIVATE_PAGE
+};  /* values of flag_page_private[] */
+
+
+/* all pages for all threads: */
+static char *object_pages;
+/* pages for the undo-log that contains copies for objs modified by the leader */
+static char *undo_log_pages;
+static char *undo_log_current;
+
+static int num_threads_started;
+/* the thread which may be the current leader (also check global_history != NULL) */
+static int leader_thread_num;
+/* next free page to allocate objs from */
+static uintptr_t index_page_never_used;
+/* the next global write version; incremented at each transaction start,
+   reset by major collections */
+static int next_write_version;
+/* protects the undo log */
+static int undo_lock;
+/* list of objs modified by the leader */
+static struct stm_list_s *global_history;
+/* approximate range to check if an obj needs to be added to the undo_log 
+   because it may be in the global_history */
+static uint16_t gh_write_version_first;
+static uint16_t gh_write_version_last;
+/* stores the state of a page (xxx_PAGE constants above) */
+static uint8_t flag_page_private[NB_PAGES];
+
+
+/************************************************************/
+
+static void spin_loop(void)
+{
+    asm("pause" : : : "memory");
+}
+
+static void acquire_lock(int *lock)
+{
+    while (__sync_lock_test_and_set(lock, 1) != 0) {
+        while (*lock != 0)
+            spin_loop();
+    }
+}
+
+#define ACQUIRE_LOCK_IF(lock, condition)                \
+({                                                      \
+    bool _acquired = false;                             \
+    while (condition) {                                 \
+        if (__sync_lock_test_and_set(lock, 1) == 0) {   \
+            if (condition)                              \
+                _acquired = true;                       \
+            else                                        \
+                __sync_lock_release(lock);              \
+            break;                                      \
+        }                                               \
+        spin_loop();                                    \
+    }                                                   \
+    _acquired;                                          \
+})
+
+
+static void release_lock(int *lock)
+{
+    __sync_lock_release(lock);
+}
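
/* Illustrative sketch only (not part of the commit): how a caller might use
   ACQUIRE_LOCK_IF.  The names 'example_lock' and 'pending_work' are made up.
   The macro keeps the lock only if the condition still holds once the lock
   has been acquired. */
static int example_lock = 0;
static int pending_work = 0;

__attribute__((unused))
static void example_consume_pending(void)
{
    if (ACQUIRE_LOCK_IF(&example_lock, pending_work != 0)) {
        /* here the lock is held and 'pending_work != 0' was re-checked
           after acquisition */
        pending_work--;
        release_lock(&example_lock);
    }
    /* otherwise the lock is not held: the condition was false to begin
       with, or became false while acquiring */
}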
+
+static void write_fence(void)
+{
+#if defined(__amd64__) || defined(__i386__)
+    asm("" : : : "memory");
+#else
+#  error "Define write_fence() for your architecture"
+#endif
+}
+
+/* check if obj was read in current transaction */
+static bool _stm_was_read(object_t *obj)
+{
+    read_marker_t *marker = (read_marker_t *)(((uintptr_t)obj) >> 4);
+    return (marker->rm == _STM_TL1->transaction_read_version);
+}
+
+
+/* 2-thread version to privatize a page. A (logical) page is either shared
+   by the 2 threads, or private for both. Needs more logic (e.g. ref-count)
+   for more threads. */
+static void _stm_privatize(uintptr_t pagenum)
+{
+    /* pagenum is a logical pagenum < NB_PAGES */
+
+    if (flag_page_private[pagenum] == PRIVATE_PAGE)
+        return;
+
+#ifdef HAVE_FULL_EXCHANGE_INSN
+    /* use __sync_lock_test_and_set() as a cheaper alternative to
+       __sync_bool_compare_and_swap(). */
+    int previous = __sync_lock_test_and_set(&flag_page_private[pagenum],
+                                            REMAPPING_PAGE);
+    if (previous == PRIVATE_PAGE) {
+        flag_page_private[pagenum] = PRIVATE_PAGE;
+        return;
+    }
+    bool was_shared = (previous == SHARED_PAGE);
+#else
+    bool was_shared = __sync_bool_compare_and_swap(&flag_page_private[pagenum],
+                                                  SHARED_PAGE, REMAPPING_PAGE);
+#endif
+    if (!was_shared) {
+        while (flag_page_private[pagenum] == REMAPPING_PAGE)
+            spin_loop();
+        return;
+    }
+
+    /* 2 threads for now: thread_num = 0 or 1 */
+    ssize_t pgoff1 = pagenum;
+    ssize_t pgoff2 = pagenum + NB_PAGES;
+    ssize_t localpgoff = pgoff1 + NB_PAGES * _STM_TL2->thread_num;
+    ssize_t otherpgoff = pgoff1 + NB_PAGES * (1 - _STM_TL2->thread_num);
+
+    void *localpg = object_pages + localpgoff * 4096UL;
+    void *otherpg = object_pages + otherpgoff * 4096UL;
+
+    int res = remap_file_pages(localpg, 4096, 0, pgoff2, 0);
+    if (res < 0) {
+        perror("remap_file_pages");
+        abort();
+    }
+    pagecopy(localpg, otherpg);
+    write_fence();
+    assert(flag_page_private[pagenum] == REMAPPING_PAGE);
+    flag_page_private[pagenum] = PRIVATE_PAGE;
+}
+
+
+#define REAL_ADDRESS(object_pages, src)   ((object_pages) + (uintptr_t)(src))
+
+static char *real_address(uintptr_t src)
+{
+    return REAL_ADDRESS(_STM_TL2->thread_base, src);
+}
+
+static char *get_thread_base(long thread_num)
+{
+    return object_pages + thread_num * (NB_PAGES * 4096UL);
+}
+
+void stm_abort_transaction(void);
+
+enum detect_conflicts_e { CANNOT_CONFLICT, CAN_CONFLICT };
+
+/* copy current versions of objs from the leader's object space */
+static void update_to_current_version(enum detect_conflicts_e check_conflict)
+{
+    /* XXX this can be done by acquiring the undo_lock for much less time,
+       but it needs to be carefully synchronized with _stm_write_slowpath().
+       For now it must be called with the undo_lock acquired. */
+
+    /* Loop over objects in 'global_history': if they have been
+       read by the current transaction, the current transaction must
+       abort; then copy them out of the leader's object space ---
+       which may have been modified by the leader's uncommitted
+       transaction; this case will be fixed afterwards.
+    */
+    bool conflict_found_or_dont_check = (check_conflict == CANNOT_CONFLICT);
+    char *local_base = _STM_TL2->thread_base;
+    char *remote_base = get_thread_base(1 - _STM_TL2->thread_num);
+    struct stm_list_s *gh, *gh_next;
+
+    assert(leader_thread_num != _STM_TL2->thread_num);
+
+    for (gh = global_history; gh != NULL; gh = gh_next) {
+
+        STM_LIST_FOREACH(gh, ({
+
+            if (!conflict_found_or_dont_check)
+                conflict_found_or_dont_check = _stm_was_read(item);
+
+            char *dst = REAL_ADDRESS(local_base, item);
+            char *src = REAL_ADDRESS(remote_base, item);
+            char *src_rebased = src - (uintptr_t)local_base;
+            size_t size = stm_object_size_rounded_up((object_t *)src_rebased);
+
+            memcpy(dst + sizeof(char *),
+                   src + sizeof(char *),
+                   size - sizeof(char *));
+        }));
+
+        gh_next = gh->nextlist;
+        stm_list_free(gh);
+    }
+    global_history = NULL;
+    gh_write_version_first = 0xffff;
+    gh_write_version_last = 0;
+
+    /* Finally, loop over objects modified by the leader,
+       and copy them out of the undo log.
+    */
+    char *undo = undo_log_pages;
+    char *undo_end = undo_log_current;
+
+    while (undo < undo_end) {
+
+        char *src = undo;
+        char *dst = *(char **)src;      /* the target object, as an offset */
+        dst = REAL_ADDRESS(local_base, dst);
+        char *src_rebased = src - (uintptr_t)local_base;
+
+        *(char **)src = *(char **)dst; /* fix the first word of the copy in
+                                          the undo log, for stm_object_size() */
+        size_t size = stm_object_size_rounded_up((object_t *)src_rebased);
+
+        memcpy(dst + sizeof(char *),
+               src + sizeof(char *),
+               size - sizeof(char *));
+
+        undo += size;
+    }
+    undo_log_current = undo_log_pages;   /* make empty again */
+
+    if (conflict_found_or_dont_check && check_conflict == CAN_CONFLICT) {
+        release_lock(&undo_lock);
+        stm_abort_transaction();
+    }
+}
+
+
+/* if we are not leader and there is a global_history, we check
+   for conflicts and update our pages */
+static void maybe_update(enum detect_conflicts_e check_conflict)
+{
+    if (leader_thread_num != _STM_TL2->thread_num && global_history != NULL) {
+        acquire_lock(&undo_lock);
+        update_to_current_version(check_conflict);
+        release_lock(&undo_lock);
+    }
+}
+
+
+void _stm_write_slowpath(object_t *obj)
+{
+    maybe_update(CAN_CONFLICT);
+
+    _stm_privatize(((uintptr_t)obj) / 4096);
+
+    stm_read(obj);
+
+    _STM_TL2->modified_objects = stm_list_append(
+        _STM_TL2->modified_objects, obj);
+
+    uint16_t wv = obj->write_version;
+    obj->write_version = _STM_TL1->transaction_write_version;
+
+    /* We only need to store a copy of the current version of the object if:
+       - we are the leader;
+       - the object is present in the global_history.
+       The second condition is approximated by the following range check.
+       Storing a few more objects than strictly needed is not really a problem.
+    */
+    /* XXX this can be done without acquiring the undo_lock at all,
+       but we need more care in update_to_current_version(). */
+
+    /* XXX can we avoid writing an unbounded number of copies of the
+       same object in case we run a lot of transactions while the other
+       thread is busy?  Unlikely case but in theory annoying.  Should
+       we anyway bound the undo log's size to much less than NB_PAGES,
+       and if full here, sleep?  Should the bound also count the size
+       taken by the global_history lists? */
+    if (ACQUIRE_LOCK_IF(&undo_lock,
+            wv <= gh_write_version_last && wv >= gh_write_version_first
+                && leader_thread_num == _STM_TL2->thread_num)) {
+        /* record in the undo log a copy of the content of the object */
+        size_t size = stm_object_size_rounded_up(obj);
+        char *source = real_address((uintptr_t)obj);
+        char *undo = undo_log_current;
+        *((object_t **)undo) = obj;
+        memcpy(undo + sizeof(object_t *),
+               source + sizeof(object_t *),
+               size - sizeof(object_t *));
+        /*write_fence();*/
+        undo_log_current = undo + size;
+        release_lock(&undo_lock);
+    }
+}
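
/* Illustrative sketch only: the layout of one undo-log record written by the
   code above.  The real code manipulates raw bytes; records are packed back
   to back in 'undo_log_pages', and 'undo_log_current' points just past the
   last one. */
struct undo_record_sketch {
    object_t *obj;      /* which object this is a saved copy of */
    char rest[];        /* the remaining 'size - sizeof(object_t *)' bytes
                           of the object's content at the time of saving */
};
/* update_to_current_version() walks these records in order; to know where
   the next record starts it restores the overwritten first word of the copy
   and then asks stm_object_size_rounded_up() for the object's size. */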
+
+
+uintptr_t _stm_reserve_page(void)
+{
+    /* Grab a free page, initially shared between the threads. */
+
+    // XXX look in some free list first
+
+    /* Return the index'th object page, which is so far never used. */
+    uintptr_t index = __sync_fetch_and_add(&index_page_never_used, 1);
+    if (index >= NB_PAGES) {
+        fprintf(stderr, "Out of mmap'ed memory!\n");
+        abort();
+    }
+    return index;
+}
+
+#define TO_RANGE(range, start, stop)                                    \
+    ((range) = (object_t *)((start) | (((uintptr_t)(stop)) << 16)))
+
+#define FROM_RANGE(start, stop, range)          \
+    ((start) = (uint16_t)(uintptr_t)(range),    \
+     (stop) = ((uintptr_t)(range)) >> 16)
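
/* Quick illustration of the two macros above, with arbitrary values: the low
   16 bits of 'start' and 'stop' are packed into a single pointer-sized word,
   so that a range fits into one list slot.  Sketch only, not used by the
   library. */
__attribute__((unused))
static void example_range_roundtrip(void)
{
    object_t *range;
    uint16_t start = 0x1020, stop = 0x1fe0;

    TO_RANGE(range, start, stop);        /* range == (object_t *)0x1fe01020 */

    uint16_t start2, stop2;
    FROM_RANGE(start2, stop2, range);
    assert(start2 == 0x1020 && stop2 == 0x1fe0);
}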
+
+localchar_t *_stm_alloc_next_page(size_t i)
+{
+    /* 'alloc->next' points to where the next allocation should go.  The
+       present function is called instead when this next allocation is
+       equal to 'alloc->stop'.  As we know that 'start', 'next' and
+       'stop' are always nearby pointers, we play tricks and only store
+       the lower 16 bits of 'start' and 'stop', so that the three
+       variables plus some flags fit in 16 bytes.
+
+       'flag_partial_page' is *cleared* to mean that the 'alloc'
+       describes a complete page, so that it need not be listed inside
+       'new_object_ranges'.  In all other cases it is *set*.
+    */
+    uintptr_t page;
+    localchar_t *result;
+    alloc_for_size_t *alloc = &_STM_TL2->alloc[i];
+    size_t size = i * 8;
+
+    if (alloc->flag_partial_page) {
+        /* record this range in 'new_object_ranges' */
+        localchar_t *ptr1 = alloc->next - size - 1;
+        object_t *range;
+        TO_RANGE(range, alloc->start, alloc->stop);
+        page = ((uintptr_t)ptr1) / 4096;
+        _STM_TL2->new_object_ranges = stm_list_append(
+            _STM_TL2->new_object_ranges, (object_t *)page);
+        _STM_TL2->new_object_ranges = stm_list_append(
+            _STM_TL2->new_object_ranges, range);
+    }
+
+    /* reserve a fresh new page */
+    page = _stm_reserve_page();
+
+    result = (localchar_t *)(page * 4096UL);
+    alloc->start = (uintptr_t)result;
+    alloc->stop = alloc->start + (4096 / size) * size;
+    alloc->next = result + size;
+    alloc->flag_partial_page = false;
+    return result;
+}
+
+object_t *stm_allocate(size_t size)
+{
+    assert(size % 8 == 0);
+    size_t i = size / 8;
+    assert(2 <= i && i < LARGE_OBJECT_WORDS);//XXX
+    alloc_for_size_t *alloc = &_STM_TL2->alloc[i];
+
+    localchar_t *p = alloc->next;
+    alloc->next = p + size;
+    if ((uint16_t)(uintptr_t)p == alloc->stop)
+        p = _stm_alloc_next_page(i);
+
+    object_t *result = (object_t *)p;
+    result->write_version = _STM_TL1->transaction_write_version;
+    return result;
+}
+
+
+#define TOTAL_MEMORY          (NB_PAGES * 4096UL * (NB_THREADS + 1))
+#define READMARKER_END        ((NB_PAGES * 4096UL) >> 4)
+#define FIRST_OBJECT_PAGE     ((READMARKER_END + 4095) / 4096UL)
+#define READMARKER_START      ((FIRST_OBJECT_PAGE * 4096UL) >> 4)
+#define FIRST_READMARKER_PAGE (READMARKER_START / 4096UL)
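
/* For illustration, the concrete values implied by the macros above with
   NB_PAGES = 256*256 (derived here, not part of the commit):
       TOTAL_MEMORY          = 768 MB  (256 MB per thread + 256 MB undo log)
       READMARKER_END        = 16 MB   (offset 0x1000000)
       FIRST_OBJECT_PAGE     = 4096
       READMARKER_START      = 1 MB    (offset 0x100000)
       FIRST_READMARKER_PAGE = 256
   So, inside each thread's 256 MB window: page 0 is the NULL guard page,
   page 1 holds the thread-local structure, pages 2..255 stay unused, pages
   256..4095 hold the read markers (one byte per 16 bytes of object space),
   and pages 4096..65535 hold the objects themselves. */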
+
+void stm_setup(void)
+{
+    /* Check that some values are acceptable */
+    assert(4096 <= ((uintptr_t)_STM_TL1));
+    assert(((uintptr_t)_STM_TL1) == ((uintptr_t)_STM_TL2));
+    assert(((uintptr_t)_STM_TL2) + sizeof(*_STM_TL2) <= 8192);
+    assert(2 <= FIRST_READMARKER_PAGE);
+    assert(FIRST_READMARKER_PAGE * 4096UL <= READMARKER_START);
+    assert(READMARKER_START < READMARKER_END);
+    assert(READMARKER_END <= 4096UL * FIRST_OBJECT_PAGE);
+    assert(FIRST_OBJECT_PAGE < NB_PAGES);
+
+    object_pages = mmap(NULL, TOTAL_MEMORY,
+                        PROT_READ | PROT_WRITE,
+                        MAP_PAGES_FLAGS, -1, 0);
+    if (object_pages == MAP_FAILED) {
+        perror("object_pages mmap");
+        abort();
+    }
+
+    long i;
+    for (i = 0; i < NB_THREADS; i++) {
+        char *thread_base = get_thread_base(i);
+
+        /* In each thread's section, the first page is where TLPREFIX'ed
+           NULL accesses land.  We mprotect it so that accesses fail. */
+        mprotect(thread_base, 4096, PROT_NONE);
+
+        /* Fill the TLS page (page 1) with 0xDD */
+        memset(REAL_ADDRESS(thread_base, 4096), 0xDD, 4096);
+        /* Make a "hole" at _STM_TL1 / _STM_TL2 */
+        memset(REAL_ADDRESS(thread_base, _STM_TL2), 0, sizeof(*_STM_TL2));
+
+        /* Pages in range(2, FIRST_READMARKER_PAGE) are never used */
+        if (FIRST_READMARKER_PAGE > 2)
+            mprotect(thread_base + 8192, (FIRST_READMARKER_PAGE - 2) * 4096UL,
+                     PROT_NONE);
+
+        _STM_TL2->thread_num = i;
+        _STM_TL2->thread_base = thread_base;
+
+        if (i > 0) {
+            int res;
+            res = remap_file_pages(thread_base + FIRST_OBJECT_PAGE * 4096UL,
+                                   (NB_PAGES - FIRST_OBJECT_PAGE) * 4096UL,
+                                   0, FIRST_OBJECT_PAGE, 0);
+            if (res != 0) {
+                perror("remap_file_pages");
+                abort();
+            }
+        }
+    }
+
+    undo_log_pages = get_thread_base(NB_THREADS);
+    mprotect(undo_log_pages, 4096, PROT_NONE);
+    mprotect(undo_log_pages + (NB_PAGES - 1) * 4096UL, 4096, PROT_NONE);
+    undo_log_pages += 4096;
+    undo_log_current = undo_log_pages;
+
+    num_threads_started = 0;
+    index_page_never_used = FIRST_OBJECT_PAGE;
+    next_write_version = 1;
+    leader_thread_num = 0;
+    global_history = NULL;
+    gh_write_version_first = 0xffff;
+    gh_write_version_last = 0;
+}
+
+#define INVALID_GS_VALUE  0xDDDDDDDDDDDDDDDDUL
+
+static void set_gs_register(uint64_t value)
+{
+    int result = syscall(SYS_arch_prctl, ARCH_SET_GS, value);
+    assert(result == 0);
+}
+
+void stm_setup_thread(void)
+{
+    int thread_num = __sync_fetch_and_add(&num_threads_started, 1);
+    assert(thread_num < 2);  /* only 2 threads for now */
+
+    char *thread_base = get_thread_base(thread_num);
+    set_gs_register((uintptr_t)thread_base);
+
+    assert(_STM_TL2->thread_num == thread_num);
+    assert(_STM_TL2->thread_base == thread_base);
+
+    _STM_TL2->modified_objects = stm_list_create();
+}
+
+void _stm_teardown_thread(void)
+{
+    stm_list_free(_STM_TL2->modified_objects);
+    _STM_TL2->modified_objects = NULL;
+
+    set_gs_register(INVALID_GS_VALUE);
+}
+
+void _stm_teardown(void)
+{
+    munmap(object_pages, TOTAL_MEMORY);
+    object_pages = NULL;
+    undo_log_pages = NULL;
+    undo_log_current = NULL;
+}
+
+
+static void reset_transaction_read_version(void)
+{
+    /* force-reset all read markers to 0 */
+
+    /* XXX measure the time taken by this madvise() and the following
+       zeroing of pages done lazily by the kernel; compare it with using
+       16-bit read_versions.
+    */
+    /* XXX try to use madvise() on smaller ranges of memory.  In my
+       measures, we could gain a factor 2 --- not really more, even if
+       the range of virtual addresses below is very large, as long as it
+       is already mostly non-reserved pages.  (The following call keeps
+       them non-reserved; apparently the kernel just skips them very
+       quickly.)
+    */
+    int res = madvise(real_address(FIRST_READMARKER_PAGE * 4096UL),
+                      (FIRST_OBJECT_PAGE - FIRST_READMARKER_PAGE) * 4096UL,
+                      MADV_DONTNEED);
+    if (res < 0) {
+        perror("madvise");
+        abort();
+    }
+    _STM_TL1->transaction_read_version = 0;
+}
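
/* The reset above relies on the fact that, for a private anonymous mapping,
   madvise(MADV_DONTNEED) drops the pages and the next access sees zero-filled
   memory again.  A standalone, hypothetical sketch of that property (not used
   by the library): */
__attribute__((unused))
static void example_madvise_resets_to_zero(void)
{
    size_t length = 4096;
    unsigned char *p = mmap(NULL, length, PROT_READ | PROT_WRITE,
                            MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    assert(p != MAP_FAILED);

    p[100] = 42;                          /* dirty the page */
    madvise(p, length, MADV_DONTNEED);    /* hand it back to the kernel */
    assert(p[100] == 0);                  /* reads as fresh zeroed memory */

    munmap(p, length);
}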
+
+void stm_major_collection(void)
+{
+    abort();
+}
+
+void stm_start_transaction(jmp_buf *jmpbufptr)
+{
+    if (_STM_TL1->transaction_read_version == 0xff)
+        reset_transaction_read_version();
+    _STM_TL1->transaction_read_version++;
+    _STM_TL1->jmpbufptr = NULL;
+
+    while (1) {
+        int wv = __sync_fetch_and_add(&next_write_version, 1);
+        if (LIKELY(wv <= 0xffff)) {
+            _STM_TL1->transaction_write_version = wv;
+            break;
+        }
+        /* We run out of 16-bit numbers before we do the next major
+           collection, which resets it.  XXX This case seems unlikely
+           for now, but check if it could become a bottleneck at some
+           point. */
+        stm_major_collection();
+    }
+    assert(stm_list_is_empty(_STM_TL2->modified_objects));
+    assert(stm_list_is_empty(_STM_TL2->new_object_ranges));
+
+    maybe_update(CANNOT_CONFLICT);    /* no read object: cannot conflict */
+
+    _STM_TL1->jmpbufptr = jmpbufptr;
+}
+
+static void update_new_objects_in_other_threads(uintptr_t pagenum,
+                                                uint16_t start, uint16_t stop)
+{
+    size_t size = (uint16_t)(stop - start);
+    assert(size <= 4096 - (start & 4095));
+    assert((start & ~4095) == (uint16_t)(pagenum * 4096));
+
+    int thread_num = _STM_TL2->thread_num;
+    uintptr_t local_src = (pagenum * 4096UL) + (start & 4095);
+    char *dst = REAL_ADDRESS(get_thread_base(1 - thread_num), local_src);
+    char *src = REAL_ADDRESS(_STM_TL2->thread_base,           local_src);
+
+    memcpy(dst, src, size);
+}
+
+void stm_stop_transaction(void)
+{
+    write_fence();   /* see later in this function for why */
+
+    acquire_lock(&undo_lock);
+
+    if (leader_thread_num != _STM_TL2->thread_num) {
+        /* non-leader thread */
+        if (global_history != NULL) {
+            update_to_current_version(CAN_CONFLICT);
+            assert(global_history == NULL);
+        }
+
+        /* steal leadership now */
+        leader_thread_num = _STM_TL2->thread_num;
+    }
+
+    /* now we are the leader thread.  the leader can always commit */
+    _STM_TL1->jmpbufptr = NULL;          /* cannot abort any more */
+    undo_log_current = undo_log_pages;   /* throw away the content */
+
+    /* add these objects to the global_history */
+    _STM_TL2->modified_objects->nextlist = global_history;
+    global_history = _STM_TL2->modified_objects;
+    _STM_TL2->modified_objects = stm_list_create();
+
+    uint16_t wv = _STM_TL1->transaction_write_version;
+    /* keep [gh_write_version_first, gh_write_version_last] equal to the
+       min/max write version present in global_history, matching the range
+       check in _stm_write_slowpath() and the reset values used above */
+    if (wv < gh_write_version_first) gh_write_version_first = wv;
+    if (wv > gh_write_version_last)  gh_write_version_last  = wv;
+
+    /* walk the new_object_ranges and manually copy the new objects
+       to the other thread's pages in the (hopefully rare) case that
+       the page they belong to is already unshared */
+    long i;
+    struct stm_list_s *lst = _STM_TL2->new_object_ranges;
+    for (i = stm_list_count(lst); i > 0; ) {
+        i -= 2;
+        uintptr_t pagenum = (uintptr_t)stm_list_item(lst, i);
+
+        /* NB. the read on the next line should work even against a parallel
+           thread, thanks to the lock acquisition we do earlier (see the
+           beginning of this function).  Indeed, if this read returns
+           SHARED_PAGE, then we know that the real value in memory was
+           actually SHARED_PAGE at least at the time of the
+           acquire_lock().  It may have been modified afterwards by a
+           compare_and_swap() in the other thread, but then we know for
+           sure that the other thread is seeing the last, up-to-date
+           version of our data --- this is the reason of the
+           write_fence() just before the acquire_lock().
+        */
+        if (flag_page_private[pagenum] != SHARED_PAGE) {
+            object_t *range = stm_list_item(lst, i + 1);
+            uint16_t start, stop;
+            FROM_RANGE(start, stop, range);
+            update_new_objects_in_other_threads(pagenum, start, stop);
+        }
+    }
+
+    /* do the same for the partially-allocated pages */
+    long j;
+    for (j = 2; j < LARGE_OBJECT_WORDS; j++) {
+        alloc_for_size_t *alloc = &_STM_TL2->alloc[j];
+        uint16_t start = alloc->start;
+        uint16_t cur = (uintptr_t)alloc->next;
+
+        if (start == cur) {
+            /* nothing to do: this page (or fraction thereof) was left
+               empty by the previous transaction, and starts empty as
+               well in the new transaction.  'flag_partial_page' is
+               unchanged. */
+        }
+        else {
+            uintptr_t pagenum = ((uintptr_t)(alloc->next - 1)) / 4096UL;
+            /* for the new transaction, it will start here: */
+            alloc->start = cur;
+
+            if (alloc->flag_partial_page) {
+                if (flag_page_private[pagenum] != SHARED_PAGE) {
+                    update_new_objects_in_other_threads(pagenum, start, cur);
+                }
+            }
+            else {
+                /* we can skip checking flag_page_private[] in non-debug
+                   builds, because the whole page can only contain
+                   objects made by the just-finished transaction. */
+                assert(flag_page_private[pagenum] == SHARED_PAGE);
+
+                /* the next transaction will start with this page
+                   containing objects that are now committed, so
+                   we need to set this flag now */
+                alloc->flag_partial_page = true;
+            }
+        }
+    }
+
+    release_lock(&undo_lock);
+}
+
+void stm_abort_transaction(void)
+{
+    long j;
+    for (j = 2; j < LARGE_OBJECT_WORDS; j++) {
+        alloc_for_size_t *alloc = &_STM_TL2->alloc[j];
+        uint16_t num_allocated = ((uintptr_t)alloc->next) - alloc->start;
+        alloc->next -= num_allocated;
+    }
+    stm_list_clear(_STM_TL2->new_object_ranges);
+    stm_list_clear(_STM_TL2->modified_objects);
+    assert(_STM_TL1->jmpbufptr != NULL);
+    assert(_STM_TL1->jmpbufptr != (jmp_buf *)-1);   /* for tests only */
+    longjmp(*_STM_TL1->jmpbufptr, 1);
+}
+
diff --git a/c7/core.h b/c7/core.h
new file mode 100644
--- /dev/null
+++ b/c7/core.h
@@ -0,0 +1,78 @@
+#ifndef _STM_CORE_H
+#define _STM_CORE_H
+
+#include <stdint.h>
+#include <stdbool.h>
+#include <setjmp.h>
+
+
+#define TLPREFIX __attribute__((address_space(256)))
+
+typedef TLPREFIX struct _thread_local1_s _thread_local1_t;
+typedef TLPREFIX struct object_s object_t;
+typedef TLPREFIX struct read_marker_s read_marker_t;
+
+
+/* Structure of objects
+   --------------------
+
+   Objects manipulated by the user program, and managed by this library,
+   must start with a "struct object_s" field.  Pointers to any user object
+   must use the "TLPREFIX struct foo *" type --- don't forget TLPREFIX.
+   It is best to use typedefs like the ones above.
+
+   The object_s part contains some fields reserved for the STM library,
+   as well as a 32-bit integer field that can be freely used by the user
+   program.  However, right now this field must be read-only --- i.e. it
+   must never be modified on any object that may already belong to a
+   past transaction; you can only set it on just-allocated objects.  It
+   is best to think of it as a field that is written only once, on
+   newly allocated objects.
+*/
+
+struct object_s {
+    uint16_t write_version;       /* reserved for the STM library */
+    /*uint8_t stm_flags;*/
+    uint32_t header;              /* for the user program -- only write in
+                                     newly allocated objects */
+};
+
+struct read_marker_s {
+    uint8_t rm;
+};
+
+struct _thread_local1_s {
+    jmp_buf *jmpbufptr;
+    uint8_t transaction_read_version;
+    uint16_t transaction_write_version;
+};
+#define _STM_TL1            ((_thread_local1_t *)4352)
+
+
+/* this should use llvm's coldcc calling convention,
+   but it's not exposed to C code so far */
+void _stm_write_slowpath(object_t *);
+
+#define LIKELY(x)   __builtin_expect(x, true)
+#define UNLIKELY(x) __builtin_expect(x, false)
+
+/* invisible read, simply add to read-set */
+static inline void stm_read(object_t *obj)
+{
+    ((read_marker_t *)(((uintptr_t)obj) >> 4))->rm =
+        _STM_TL1->transaction_read_version;
+}
+
+/* open object for writing, eagerly detects write-write conflicts */
+static inline void stm_write(object_t *obj)
+{
+    if (UNLIKELY(obj->write_version != _STM_TL1->transaction_write_version))
+        _stm_write_slowpath(obj);
+}
+
+
+/* must be provided by the user of this library */
+extern size_t stm_object_size_rounded_up(object_t *);
+
+
+#endif
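
For illustration, here is a minimal, hypothetical sketch of user code built on
top of this header.  The node type, its field names and the example function
are assumptions of this sketch, not part of the commit; the three prototypes
are copied from core.c because the header does not declare them yet, and
stm_setup() / stm_setup_thread() are assumed to have been called already.

    #include <setjmp.h>
    #include <stddef.h>
    #include "core.h"

    /* prototypes taken from core.c (not yet declared in core.h) */
    object_t *stm_allocate(size_t size);
    void stm_start_transaction(jmp_buf *jmpbufptr);
    void stm_stop_transaction(void);

    /* a user object: it must start with 'struct object_s', and it is always
       accessed through a TLPREFIX'ed pointer type */
    struct node_s {
        struct object_s hdr;      /* reserved for the STM library */
        long value;
    };
    typedef TLPREFIX struct node_s node_t;

    /* required by the library: the object's size, a multiple of 8 bytes */
    size_t stm_object_size_rounded_up(object_t *obj)
    {
        return sizeof(struct node_s);   /* only one object type here */
    }

    /* one transaction: read a counter, bump it, allocate a fresh node */
    void example_transaction(node_t *counter)
    {
        jmp_buf here;
        setjmp(here);                    /* aborted transactions restart here */
        stm_start_transaction(&here);

        stm_read((object_t *)counter);   /* read barrier before reading */
        long old = counter->value;

        stm_write((object_t *)counter);  /* write barrier before writing */
        counter->value = old + 1;

        node_t *fresh = (node_t *)stm_allocate(sizeof(struct node_s));
        fresh->hdr.header = 0;           /* user field: set once on new objects */
        fresh->value = old;

        stm_stop_transaction();
    }

The setjmp()/longjmp() pair mirrors what stm_abort_transaction() does: on a
conflict the library longjmp()s back to the buffer registered with
stm_start_transaction(), and the code above simply re-runs the transaction.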

