[pypy-commit] pypy vmprof-native: copy over changes made to vmprof-python

Wed Mar 15 08:08:42 EDT 2017

Author: Richard Plangger <planrichi at gmail.com>
Branch: vmprof-native
Changeset: r90697:ac30c079910e
Date: 2017-03-14 15:00 +0100
http://bitbucket.org/pypy/pypy/changeset/ac30c079910e/

Log:	copy over changes made to vmprof-python

diff --git a/rpython/rlib/rvmprof/src/vmprof_main.h b/rpython/rlib/rvmprof/src/vmprof_main.h
--- a/rpython/rlib/rvmprof/src/vmprof_main.h
+++ b/rpython/rlib/rvmprof/src/vmprof_main.h
@@ -1,3 +1,5 @@
+#pragma once
+
 /* VMPROF
  *
  * statistical sampling profiler specifically designed to profile programs
@@ -10,45 +12,49 @@
  *
  * Tested only on gcc, linux, x86_64.
  *
- * Copyright (C) 2014-2015
+ * Copyright (C) 2014-2017
  *   Antonio Cuni - anto.cuni at gmail.com
  *   Maciej Fijalkowski - fijall at gmail.com
  *   Armin Rigo - arigo at tunes.org
+ *   Richard Plangger - planrichi at gmail.com
  *
  */
 
 #define _GNU_SOURCE 1
 
 #include <dlfcn.h>
+#include <pthread.h>
+#include <unistd.h>
 #include <assert.h>
-#include <pthread.h>
+#include <errno.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <time.h>
+#include <sys/types.h>
+#include <sys/stat.h>
 #include <sys/time.h>
-#include <errno.h>
-#include <unistd.h>
-#include <stddef.h>
-#include <stdio.h>
-#include <sys/types.h>
-#include <signal.h>
-#include <sys/stat.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include "vmprof_stack.h"
+
+#include "vmprof.h"
+
+#include "vmp_stack.h"
 #include "vmprof_getpc.h"
 #include "vmprof_mt.h"
-#include "vmprof_get_custom_offset.h"
 #include "vmprof_common.h"
+#include "compat.h"
+
+#if defined(__unix__)
+#include "rss_unix.h"
+#elif defined(__APPLE__)
+#include "rss_darwin.h"
+#endif
+
 
 /************************************************************/
 
-static long prepare_interval_usec;
-static long saved_profile_file;
-static struct profbuf_s *volatile current_codes;
 static void *(*mainloop_get_virtual_ip)(char *) = 0;
-
-static int opened_profile(char *interp_name);
+static int opened_profile(const char *interp_name, int memory, int proflines, int native);
 static void flush_codes(void);
 
-
 /************************************************************/
 
 /* value: last bit is 1 if signals must be ignored; all other bits
@@ -79,24 +85,26 @@
 static char atfork_hook_installed = 0;
 
 
-static intptr_t get_current_thread_id(void)
+/* *************************************************************
+ * functions to dump the stack trace
+ * *************************************************************
+ */
+
+int get_stack_trace(PY_THREAD_STATE_T * current, void** result, int max_depth, intptr_t pc)
 {
-    /* xxx This function is a hack on two fronts:
-
-       - It assumes that pthread_self() is async-signal-safe.  This
-         should be true on Linux.  I hope it is also true elsewhere.
-
-       - It abuses pthread_self() by assuming it just returns an
-         integer.  According to comments in CPython's source code, the
-         platforms where it is not the case are rare nowadays.
-
-       An alternative would be to try to look if the information is
-       available in the ucontext_t in the caller.
-    */
-    return (intptr_t)pthread_self();
+    PY_STACK_FRAME_T * frame;
+#ifdef RPYTHON_VMPROF
+    // do nothing here, 
+    frame = (PY_STACK_FRAME_T*)current;
+#else
+    if (!current) {
+        return 0;
+    }
+    frame = current->frame;
+#endif
+    return vmp_walk_and_record_stack(frame, result, max_depth, 1, pc);
 }
 
-
 /* *************************************************************
  * the signal handler
  * *************************************************************
@@ -112,9 +120,67 @@
     longjmp(restore_point, SIGSEGV);
 }
 
+int _vmprof_sample_stack(struct profbuf_s *p, PY_THREAD_STATE_T * tstate, ucontext_t * uc)
+{
+    int depth;
+    struct prof_stacktrace_s *st = (struct prof_stacktrace_s *)p->data;
+    st->marker = MARKER_STACKTRACE;
+    st->count = 1;
+#ifdef RPYTHON_VMPROF
+    depth = get_stack_trace(get_vmprof_stack(), st->stack, MAX_STACK_DEPTH-1, (intptr_t)GetPC(uc));
+#else
+    depth = get_stack_trace(tstate, st->stack, MAX_STACK_DEPTH-1, (intptr_t)NULL);
+#endif
+    if (depth == 0) {
+        return 0;
+    }
+    st->depth = depth;
+    st->stack[depth++] = tstate;
+    long rss = get_current_proc_rss();
+    if (rss >= 0)
+        st->stack[depth++] = (void*)rss;
+    p->data_offset = offsetof(struct prof_stacktrace_s, marker);
+    p->data_size = (depth * sizeof(void *) +
+                    sizeof(struct prof_stacktrace_s) -
+                    offsetof(struct prof_stacktrace_s, marker));
+    return 1;
+}
+
+#ifndef RPYTHON_VMPROF
+static PY_THREAD_STATE_T * _get_pystate_for_this_thread(void) {
+    // see issue 116 on github.com/vmprof/vmprof-python.
+    // PyGILState_GetThisThreadState(); can hang forever
+    //
+    PyInterpreterState * istate;
+    PyThreadState * state;
+    long mythread_id;
+
+    istate = PyInterpreterState_Head();
+    if (istate == NULL) {
+        return NULL;
+    }
+    mythread_id = PyThread_get_thread_ident();
+    // fish fish fish, it will NOT lock the keymutex in pythread
+    do {
+        state = PyInterpreterState_ThreadHead(istate);
+        do {
+            if (state->thread_id == mythread_id) {
+                return state;
+            }
+        } while ((state = PyThreadState_Next(state)) != NULL);
+    } while ((istate = PyInterpreterState_Next(istate)) != NULL);
+
+    // uh? not found?
+    return NULL;
+}
+#endif
+
 static void sigprof_handler(int sig_nr, siginfo_t* info, void *ucontext)
 {
-#ifdef __APPLE__
+    int commit;
+    PY_THREAD_STATE_T * tstate = NULL;
+    void (*prevhandler)(int);
+#ifndef RPYTHON_VMPROF
     // TERRIBLE HACK AHEAD
     // on OS X, the thread local storage is sometimes uninitialized
     // when the signal handler runs - it means it's impossible to read errno
@@ -122,48 +188,46 @@
     // it seems impossible to read the register gs.
     // here we register segfault handler (all guarded by a spinlock) and call
     // longjmp in case segfault happens while reading a thread local
+    //
+    // We do the same error detection for linux to ensure that
+    // get_current_thread_state returns a sane result
     while (__sync_lock_test_and_set(&spinlock, 1)) {
     }
-    signal(SIGSEGV, &segfault_handler);
+    prevhandler = signal(SIGSEGV, &segfault_handler);
     int fault_code = setjmp(restore_point);
     if (fault_code == 0) {
         pthread_self();
-        get_current_thread_id();
+        tstate = _get_pystate_for_this_thread();
     } else {
-        signal(SIGSEGV, SIG_DFL);
-        __sync_synchronize();
-        spinlock = 0;
-        return;    
+        signal(SIGSEGV, prevhandler);
+        __sync_lock_release(&spinlock);
+        return;
     }
-    signal(SIGSEGV, SIG_DFL);
-    __sync_synchronize();
-    spinlock = 0;
+    signal(SIGSEGV, prevhandler);
+    __sync_lock_release(&spinlock);
 #endif
+
     long val = __sync_fetch_and_add(&signal_handler_value, 2L);
 
     if ((val & 1) == 0) {
         int saved_errno = errno;
-        int fd = profile_file;
+        int fd = vmp_profile_fileno();
         assert(fd >= 0);
 
         struct profbuf_s *p = reserve_buffer(fd);
         if (p == NULL) {
             /* ignore this signal: there are no free buffers right now */
-        }
-        else {
-            int depth;
-            struct prof_stacktrace_s *st = (struct prof_stacktrace_s *)p->data;
-            st->marker = MARKER_STACKTRACE;
-            st->count = 1;
-            depth = get_stack_trace(get_vmprof_stack(), st->stack,
-                MAX_STACK_DEPTH-2, GetPC((ucontext_t*)ucontext));
-            st->depth = depth;
-            st->stack[depth++] = get_current_thread_id();
-            p->data_offset = offsetof(struct prof_stacktrace_s, marker);
-            p->data_size = (depth * sizeof(void *) +
-                            sizeof(struct prof_stacktrace_s) -
-                            offsetof(struct prof_stacktrace_s, marker));
-            commit_buffer(fd, p);
+        } else {
+#ifdef RPYTHON_VMPORF
+            commit = _vmprof_sample_stack(p, NULL, (ucontext_t*)ucontext);
+#else
+            commit = _vmprof_sample_stack(p, tstate, (ucontext_t*)ucontext);
+#endif
+            if (commit) {
+                commit_buffer(fd, p);
+            } else {
+                cancel_buffer(p);
+            }
         }
 
         errno = saved_errno;
@@ -173,6 +237,7 @@
 }
 
 
+
 /* *************************************************************
  * the setup and teardown functions
  * *************************************************************
@@ -197,58 +262,53 @@
     return 0;
 }
 
-static int itimer_which = ITIMER_PROF;
-
 static int install_sigprof_timer(void)
 {
-    struct itimerval timer;
+    static struct itimerval timer;
     timer.it_interval.tv_sec = 0;
     timer.it_interval.tv_usec = profile_interval_usec;
     timer.it_value = timer.it_interval;
-    if (setitimer(itimer_which, &timer, NULL) == 0)
-        return 0;   /* normal path */
-
-    if (errno == EINVAL) {
-        /* on WSL, only ITIMER_REAL is supported */
-        if (setitimer(ITIMER_REAL, &timer, NULL) == 0) {
-            fprintf(stderr, "warning: setitimer(): ITIMER_PROF not "
-                            "available, using ITIMER_REAL instead. "
-                            "Multithreaded programs and programs "
-                            "doing a lot of I/O won't give correct "
-                            "results.\n");
-            itimer_which = ITIMER_REAL;
-            return 0;
-        }
-    }
-    return -1;
+    if (setitimer(ITIMER_PROF, &timer, NULL) != 0)
+        return -1;
+    return 0;
 }
 
 static int remove_sigprof_timer(void) {
-    struct itimerval timer;
+    static struct itimerval timer;
     timer.it_interval.tv_sec = 0;
     timer.it_interval.tv_usec = 0;
     timer.it_value.tv_sec = 0;
     timer.it_value.tv_usec = 0;
-    if (setitimer(itimer_which, &timer, NULL) != 0)
+    if (setitimer(ITIMER_PROF, &timer, NULL) != 0)
         return -1;
     return 0;
 }
 
 static void atfork_disable_timer(void) {
     if (profile_interval_usec > 0) {
-        saved_profile_file = profile_file;
-        profile_file = -1;
         remove_sigprof_timer();
+#ifndef RPYTHON_VMPROF
+        is_enabled = 0;
+#endif
     }
 }
 
 static void atfork_enable_timer(void) {
     if (profile_interval_usec > 0) {
-        profile_file = saved_profile_file;
         install_sigprof_timer();
+#ifndef RPYTHON_VMPROF
+        is_enabled = 1;
+#endif
     }
 }
 
+static void atfork_close_profile_file(void) {
+    int fd = vmp_profile_fileno();
+    if (fd != -1)
+        close(fd);
+    vmp_set_profile_fileno(-1);
+}
+
 static int install_pthread_atfork_hooks(void) {
     /* this is needed to prevent the problems described there:
          - http://code.google.com/p/gperftools/issues/detail?id=278
@@ -262,20 +322,69 @@
     */
     if (atfork_hook_installed)
         return 0;
-    int ret = pthread_atfork(atfork_disable_timer, atfork_enable_timer, NULL);
+    int ret = pthread_atfork(atfork_disable_timer, atfork_enable_timer, atfork_close_profile_file);
     if (ret != 0)
         return -1;
     atfork_hook_installed = 1;
     return 0;
 }
 
+#ifdef VMP_SUPPORTS_NATIVE_PROFILING
+void init_cpyprof(int native)
+{
+    // skip this if native should not be enabled
+    if (!native) {
+        vmp_native_disable();
+        return;
+    }
+#if CPYTHON_HAS_FRAME_EVALUATION
+    PyThreadState *tstate = PyThreadState_GET();
+    tstate->interp->eval_frame = vmprof_eval;
+    _default_eval_loop = _PyEval_EvalFrameDefault;
+#elif defined(RPYTHON_VMPROF)
+    // do nothing here, the stack is maintained by rpython
+    // no need for a trampoline
+#else
+    if (vmp_patch_callee_trampoline(PyEval_EvalFrameEx,
+                vmprof_eval, (void*)&_default_eval_loop) == 0) {
+    } else {
+        fprintf(stderr, "FATAL: could not insert trampline, try with --no-native\n");
+        // TODO dump the first few bytes and tell them to create an issue!
+        exit(-1);
+    }
+#endif
+    vmp_native_enable();
+}
+
+static void disable_cpyprof(void)
+{
+    vmp_native_disable();
+#if CPYTHON_HAS_FRAME_EVALUATION
+    PyThreadState *tstate = PyThreadState_GET();
+    tstate->interp->eval_frame = _PyEval_EvalFrameDefault;
+#elif defined(RPYTHON_VMPROF)
+    // TODO nothing?
+#else
+    if (vmp_unpatch_callee_trampoline(PyEval_EvalFrameEx) > 0) {
+        fprintf(stderr, "FATAL: could not remove trampoline\n");
+        exit(-1);
+    }
+#endif
+    dump_native_symbols(vmp_profile_fileno());
+}
+#endif
+
 RPY_EXTERN
-int vmprof_enable(void)
+int vmprof_enable(int memory, int native)
 {
-    assert(profile_file >= 0);
+#ifdef VMP_SUPPORTS_NATIVE_PROFILING
+    init_cpyprof(native);
+#endif
+    assert(vmp_profile_fileno() >= 0);
     assert(prepare_interval_usec > 0);
     profile_interval_usec = prepare_interval_usec;
-
+    if (memory && setup_rss() == -1)
+        goto error;
     if (install_pthread_atfork_hooks() == -1)
         goto error;
     if (install_sigprof_handler() == -1)
@@ -286,32 +395,19 @@
     return 0;
 
  error:
-    profile_file = -1;
+    vmp_set_profile_fileno(-1);
     profile_interval_usec = 0;
     return -1;
 }
 
-static int _write_all(const char *buf, size_t bufsize)
+
+int close_profile(void)
 {
-    while (bufsize > 0) {
-        ssize_t count = write(profile_file, buf, bufsize);
-        if (count <= 0)
-            return -1;   /* failed */
-        buf += count;
-        bufsize -= count;
-    }
-    return 0;
-}
+    (void)vmp_write_time_now(MARKER_TRAILER);
 
-static int close_profile(void)
-{
-    char marker = MARKER_TRAILER;
-
-    if (_write_all(&marker, 1) < 0)
-        return -1;
-
+    teardown_rss();
     /* don't close() the file descriptor from here */
-    profile_file = -1;
+    vmp_set_profile_fileno(-1);
     return 0;
 }
 
@@ -320,29 +416,29 @@
 {
     vmprof_ignore_signals(1);
     profile_interval_usec = 0;
+#ifdef VMP_SUPPORTS_NATIVE_PROFILING
+    disable_cpyprof();
+#endif
 
     if (remove_sigprof_timer() == -1)
         return -1;
     if (remove_sigprof_handler() == -1)
         return -1;
     flush_codes();
-    if (shutdown_concurrent_bufs(profile_file) < 0)
+    if (shutdown_concurrent_bufs(vmp_profile_fileno()) < 0)
         return -1;
     return close_profile();
 }
 
 RPY_EXTERN
-int vmprof_register_virtual_function(char *code_name, long code_uid,
+int vmprof_register_virtual_function(char *code_name, intptr_t code_uid,
                                      int auto_retry)
 {
     long namelen = strnlen(code_name, 1023);
-    long blocklen = 1 + 2 * sizeof(long) + namelen;
+    long blocklen = 1 + sizeof(intptr_t) + sizeof(long) + namelen;
     struct profbuf_s *p;
     char *t;
 
-    if (profile_file == -1)
-        return 0; // silently don't write it
-
  retry:
     p = current_codes;
     if (p != NULL) {
@@ -352,7 +448,7 @@
             size_t freesize = SINGLE_BUF_SIZE - p->data_size;
             if (freesize < (size_t)blocklen) {
                 /* full: flush it */
-                commit_buffer(profile_file, p);
+                commit_buffer(vmp_profile_fileno(), p);
                 p = NULL;
             }
         }
@@ -363,7 +459,7 @@
     }
 
     if (p == NULL) {
-        p = reserve_buffer(profile_file);
+        p = reserve_buffer(vmp_profile_fileno());
         if (p == NULL) {
             /* can't get a free block; should almost never be the
                case.  Spin loop if allowed, or return a failure code
@@ -381,14 +477,14 @@
     p->data_size += blocklen;
     assert(p->data_size <= SINGLE_BUF_SIZE);
     *t++ = MARKER_VIRTUAL_IP;
-    memcpy(t, &code_uid, sizeof(long)); t += sizeof(long);
+    memcpy(t, &code_uid, sizeof(intptr_t)); t += sizeof(intptr_t);
     memcpy(t, &namelen, sizeof(long)); t += sizeof(long);
     memcpy(t, code_name, namelen);
 
     /* try to reattach 'p' to 'current_codes' */
     if (!__sync_bool_compare_and_swap(&current_codes, NULL, p)) {
         /* failed, flush it */
-        commit_buffer(profile_file, p);
+        commit_buffer(vmp_profile_fileno(), p);
     }
     return 0;
 }
@@ -398,6 +494,6 @@
     struct profbuf_s *p = current_codes;
     if (p != NULL) {
         current_codes = NULL;
-        commit_buffer(profile_file, p);
+        commit_buffer(vmp_profile_fileno(), p);
     }
 }