From 36dc037d654e0e1909fc9bfb8c7665dfcfbfd8ae Mon Sep 17 00:00:00 2001
From: Gabriel Ebner <gebner@gebner.org>
Date: Tue, 25 Jul 2017 08:26:46 +0100
Subject: [PATCH] perf(util/rc): do not use fences

As suggested by David Chisnall.  If I read the spec correctly, it would
be unsafe to use the release ordering for dec_ref_core: then the
following situation could happen:

```c++
// m_rc -> 2

// Thread 1:
unsigned x = atomic_fetch_sub_explicit(&m_rc, 1u, memory_order_release);

// Thread 2:
unsigned y = atomic_fetch_sub_explicit(&m_rc, 1u, memory_order_release);

// x = y = 1u, m_rc -> 2
```

That is, a release store operation is not required to be visible to
another release operation.

Herb Sutter also recommends the acq_rel ordering for reference counters.
---
 src/util/rc.h     | 5 ++---
 src/util/thread.h | 4 ++++
 2 files changed, 6 insertions(+), 3 deletions(-)
diff --git a/src/util/rc.h b/src/util/rc.h
index 574e3a1e32..5f771504f0 100644
--- a/src/util/rc.h
+++ b/src/util/rc.h
@@ -14,12 +14,11 @@ Author: Leonardo de Moura
 private:                                                                \
 atomic<unsigned> m_rc;                                                  \
 public:                                                                 \
-unsigned get_rc() const { return atomic_load(&m_rc); }                  \
+unsigned get_rc() const { return atomic_load_explicit(&m_rc, memory_order_acquire); } \
 void inc_ref() { atomic_fetch_add_explicit(&m_rc, 1u, memory_order_relaxed); } \
 bool dec_ref_core() {                                                   \
     lean_assert(get_rc() > 0);                                          \
-    if (atomic_fetch_sub_explicit(&m_rc, 1u, memory_order_release) == 1u) { \
-        atomic_thread_fence(memory_order_acquire);                      \
+    if (atomic_fetch_sub_explicit(&m_rc, 1u, memory_order_acq_rel) == 1u) { \
         return true;                                                    \
     } else {                                                            \
         return false;                                                   \
diff --git a/src/util/thread.h b/src/util/thread.h
index e8b8206423..51bbaff1cb 100644
--- a/src/util/thread.h
+++ b/src/util/thread.h
@@ -37,11 +37,13 @@ using std::condition_variable;
 using std::lock_guard;
 using std::unique_lock;
 using std::atomic_load;
+using std::atomic_load_explicit;
 using std::atomic_fetch_add_explicit;
 using std::atomic_fetch_sub_explicit;
 using std::memory_order_relaxed;
 using std::memory_order_release;
 using std::memory_order_acquire;
+using std::memory_order_acq_rel;
 using std::memory_order_seq_cst;
 using std::atomic_thread_fence;
 namespace this_thread = std::this_thread;
@@ -70,6 +72,7 @@ namespace lean {
 constexpr int memory_order_relaxed = 0;
 constexpr int memory_order_release = 0;
 constexpr int memory_order_acquire = 0;
+constexpr int memory_order_acq_rel = 0;
 constexpr int memory_order_seq_cst = 0;
 inline void atomic_thread_fence(int ) {}
 template<typename T>
@@ -95,6 +98,7 @@ public:
     atomic & operator--() { --m_value; return *this; }
     atomic   operator--(int ) { atomic tmp(*this); --m_value; return tmp; }
     friend T atomic_load(atomic const * a) { return a->m_value; }
+    friend T atomic_load_explicit(atomic const * a, int) { return a->m_value; }
     friend T atomic_fetch_add_explicit(atomic * a, T const & v, int ) { T r(a->m_value); a->m_value += v; return r; }
     friend T atomic_fetch_sub_explicit(atomic * a, T const & v, int ) { T r(a->m_value); a->m_value -= v; return r; }
     T exchange(T desired) { T old = m_value; m_value = desired; return old; }