From 458f50633616a1015a579aac8395a160825245a3 Mon Sep 17 00:00:00 2001 From: Etienne Renault Date: Tue, 15 Mar 2016 15:41:28 +0100 Subject: [PATCH] bricks: add bricks for concurrent hashmap * Makefile.am, README, bricks/brick-assert.h, bricks/brick-bitlevel.h, bricks/brick-hash.h, bricks/brick-hashset.h, bricks/brick-shmem.h, bricks/brick-types.h, configure.ac, debian/copyright, debian/libspot-dev.install, m4/bricks.m4, tests/Makefile.am, tests/core/.gitignore, tests/core/bricks.cc: here. --- Makefile.am | 4 + README | 1 + bricks/brick-assert.h | 203 +++++ bricks/brick-bitlevel.h | 661 +++++++++++++++ bricks/brick-hash.h | 977 ++++++++++++++++++++++ bricks/brick-hashset.h | 1581 ++++++++++++++++++++++++++++++++++++ bricks/brick-shmem.h | 1142 ++++++++++++++++++++++++++ bricks/brick-types.h | 1206 +++++++++++++++++++++++++++ configure.ac | 3 + debian/copyright | 27 + debian/libspot-dev.install | 1 + m4/bricks.m4 | 4 + tests/Makefile.am | 4 +- tests/core/.gitignore | 1 + tests/core/bricks.cc | 80 ++ 15 files changed, 5894 insertions(+), 1 deletion(-) create mode 100644 bricks/brick-assert.h create mode 100644 bricks/brick-bitlevel.h create mode 100644 bricks/brick-hash.h create mode 100644 bricks/brick-hashset.h create mode 100644 bricks/brick-shmem.h create mode 100644 bricks/brick-types.h create mode 100644 m4/bricks.m4 create mode 100644 tests/core/bricks.cc diff --git a/Makefile.am b/Makefile.am index 186472e94..51e5c37f0 100644 --- a/Makefile.am +++ b/Makefile.am @@ -39,6 +39,10 @@ SUBDIRS = picosat buddy lib ltdl spot bin tests $(PYTHON_SUBDIR) $(DOC_SUBDIR) \ UTF8 = utf8/README.md utf8/utf8.h \ utf8/utf8/checked.h utf8/utf8/core.h utf8/utf8/unchecked.h +nobase_include_HEADERS= bricks/brick-assert.h bricks/brick-bitlevel.h \ + bricks/brick-hash.h bricks/brick-hashset.h bricks/brick-shmem.h \ + bricks/brick-types.h + DEBIAN = \ debian/changelog \ debian/changelog.in \ diff --git a/README b/README index 72615f049..6ce7271a0 100644 --- a/README +++ b/README @@ -323,6 +323,7 @@ Third party software -------------------- buddy/ A customized version of BuDDy 2.3 (a BDD library). +bricks/ A collection of useful C++ code provided by DiVinE 3.3.2 ltdl/ Libtool's portable dlopen() wrapper library. lib/ Gnulib's portability modules. utf8/ Nemanja Trifunovic's utf-8 routines. diff --git a/bricks/brick-assert.h b/bricks/brick-assert.h new file mode 100644 index 000000000..34a9ac543 --- /dev/null +++ b/bricks/brick-assert.h @@ -0,0 +1,203 @@ +// -*- mode: C++; indent-tabs-mode: nil; c-basic-offset: 4 -*- + +/* + * Various assert macros based on C++ exceptions and their support code. + */ + +/* + * (c) 2006-2014 Petr Ročkai + */ + +/* Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE. */
+
+#include <exception>
+#include <string>
+#include <sstream>
+
+#ifdef __divine__
+#include
+#endif
+
+#ifndef TEST
+#define TEST(n) void n()
+#define TEST_FAILING(n) void n()
+#endif
+
+#ifdef __divine__
+#define ASSERT(x) assert( x )
+#define ASSERT_PRED(p, x) assert( p( x ) )
+#define ASSERT_EQ(x, y) assert( (x) == (y) )
+#define ASSERT_LEQ(x, y) assert( (x) <= (y) )
+#define ASSERT_NEQ(x, y) assert( (x) != (y) )
+#define ASSERT_EQ_IDX(i, x, y) assert( (x) == (y) )
+
+#elif !defined NDEBUG
+#define ASSERT(x) ::brick::_assert::assert_fn( BRICK_LOCWRAP( BRICK_LOCATION( #x ) ), x )
+#define ASSERT_PRED(p, x) ::brick::_assert::assert_pred_fn( BRICK_LOCWRAP( BRICK_LOCATION( #p "( " #x " )" ) ), x, p( x ) )
+#define ASSERT_EQ(x, y) ::brick::_assert::assert_eq_fn( BRICK_LOCWRAP( BRICK_LOCATION( #x " == " #y ) ), x, y )
+#define ASSERT_LEQ(x, y) ::brick::_assert::assert_leq_fn( BRICK_LOCWRAP( BRICK_LOCATION( #x " <= " #y ) ), x, y )
+#define ASSERT_NEQ(x, y) ::brick::_assert::assert_neq_fn( BRICK_LOCWRAP( BRICK_LOCATION( #x " != " #y ) ), x, y )
+#define ASSERT_EQ_IDX(i, x, y) ::brick::_assert::assert_eq_fn( BRICK_LOCWRAP( BRICK_LOCATION_I( #x " == " #y, i ) ), x, y )
+
+#else
+
+#define ASSERT(x) ((void)0)
+#define ASSERT_PRED(p, x) ((void)0)
+#define ASSERT_EQ(x, y) ((void)0)
+#define ASSERT_LEQ(x, y) ((void)0)
+#define ASSERT_NEQ(x, y) ((void)0)
+#define ASSERT_EQ_IDX(i, x, y) ((void)0)
+#endif
+
+/* you must #include <brick-string.h> to use ASSERT_UNREACHABLE_F */
+#define ASSERT_UNREACHABLE_F(...) ::brick::_assert::assert_die_fn( BRICK_LOCATION( brick::string::fmtf(__VA_ARGS__) ) )
+#define ASSERT_UNREACHABLE(x) ::brick::_assert::assert_die_fn( BRICK_LOCATION( x ) )
+#define ASSERT_UNIMPLEMENTED() ::brick::_assert::assert_die_fn( BRICK_LOCATION( "not implemented" ) )
+
+#ifdef _MSC_VER
+#define UNUSED
+#define noexcept
+#else
+#define UNUSED __attribute__((unused))
+#endif
+
+#ifndef BRICK_ASSERT_H
+#define BRICK_ASSERT_H
+
+namespace brick {
+namespace _assert {
+
+/* discard any number of parameters, taken as const references */
+template< typename... X >
+void unused( const X&...
) { } + +struct Location { + const char *file; + int line, iteration; + std::string stmt; + Location( const char *f, int l, std::string st, int iter = -1 ) + : file( f ), line( l ), iteration( iter ), stmt( st ) {} +}; + +#define BRICK_LOCATION(stmt) ::brick::_assert::Location( __FILE__, __LINE__, stmt ) +#define BRICK_LOCATION_I(stmt, i) ::brick::_assert::Location( __FILE__, __LINE__, stmt, i ) + +// lazy location construction in C++11 +#if __cplusplus >= 201103L +#define BRICK_LOCWRAP(x) [&]{ return (x); } +#define BRICK_LOCUNWRAP(x) (x)() +#else +#define BRICK_LOCWRAP(x) (x) +#define BRICK_LOCUNWRAP(x) (x) +#endif + + +struct AssertFailed : std::exception { + std::string str; + + template< typename X > + friend inline AssertFailed &operator<<( AssertFailed &f, X x ) + { + std::stringstream str; + str << x; + f.str += str.str(); + return f; + } + + AssertFailed( Location l ) + { + (*this) << l.file << ": " << l.line; + if ( l.iteration != -1 ) + (*this) << " (iteration " << l.iteration << ")"; + (*this) << ": assertion `" << l.stmt << "' failed;"; + } + + const char *what() const noexcept { return str.c_str(); } +}; + +template< typename Location, typename X > +void assert_fn( Location l, X x ) +{ + if ( !x ) { + throw AssertFailed( BRICK_LOCUNWRAP( l ) ); + } +} + +inline void assert_die_fn( Location l ) __attribute__((noreturn)); + +inline void assert_die_fn( Location l ) +{ + throw AssertFailed( l ); +} + +template< typename Location, typename X, typename Y > +void assert_eq_fn( Location l, X x, Y y ) +{ + if ( !( x == y ) ) { + AssertFailed f( BRICK_LOCUNWRAP( l ) ); + f << " got [" + << x << "] != [" << y + << "] instead"; + throw f; + } +} + +template< typename Location, typename X, typename Y > +void assert_leq_fn( Location l, X x, Y y ) +{ + if ( !( x <= y ) ) { + AssertFailed f( BRICK_LOCUNWRAP( l ) ); + f << " got [" + << x << "] > [" << y + << "] instead"; + throw f; + } +} + +template< typename Location, typename X > +void assert_pred_fn( Location l, X x, bool p ) +{ + if ( !p ) { + AssertFailed f( BRICK_LOCUNWRAP( l ) ); + f << " for " << x; + throw f; + } +} + +template< typename Location, typename X, typename Y > +void assert_neq_fn( Location l, X x, Y y ) +{ + if ( x != y ) + return; + AssertFailed f( BRICK_LOCUNWRAP( l ) ); + f << " got [" + << x << "] == [" << y << "] instead"; + throw f; +} + +} +} + +#endif + +// vim: syntax=cpp tabstop=4 shiftwidth=4 expandtab diff --git a/bricks/brick-bitlevel.h b/bricks/brick-bitlevel.h new file mode 100644 index 000000000..dbc4b61ee --- /dev/null +++ b/bricks/brick-bitlevel.h @@ -0,0 +1,661 @@ +// -*- mode: C++; indent-tabs-mode: nil; c-basic-offset: 4 -*- + +/* + * Utilities and data structures for bit-level manipulation and data packing. + */ + +/* + * (c) 2013-2014 Jiří Weiser + * (c) 2013 Petr Ročkai + */ + +/* Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE. */
+
+#include <brick-assert.h>
+
+#include <type_traits>
+
+#ifdef __linux
+#include <endian.h>
+#include <byteswap.h>
+#elif !defined LITTLE_ENDIAN // if defined _WIN32
+#define BYTE_ORDER 1234
+#define LITTLE_ENDIAN 1234
+#endif
+
+#ifndef bswap_64
+#define bswap_64 __builtin_bswap64
+#endif
+
+#include <cstring>
+#include <atomic>
+
+#ifndef BRICK_BITLEVEL_H
+#define BRICK_BITLEVEL_H
+
+namespace brick {
+namespace bitlevel {
+
+template< typename T1, typename T2 >
+constexpr inline T1 align( T1 v, T2 a ) {
+    return (v % T1(a)) ? (v + T1(a) - (v % T1(a))) : v;
+}
+
+template< typename T1, typename T2 >
+constexpr inline T1 downalign( T1 v, T2 a ) {
+    return v - (v % T1(a));
+}
+
+namespace compiletime {
+
+template< typename T >
+constexpr unsigned MSB( T x ) {
+    return x > 1 ? 1 + MSB( x >> 1 ) : 0;
+}
+
+template< typename T >
+constexpr T fill( T x ) {
+    return x ? x | fill( x >> 1 ) : x;
+}
+
+template< typename T >
+constexpr size_t sizeOf() {
+    return std::is_empty< T >::value ? 0 : sizeof( T );
+}
+
+}
+
+/*
+ * Fills `x` with ones from the most significant set bit down.
+ * Complexity is O(log n), where n is sizeof(x)*8.
+ */
+template< typename number >
+static inline number fill( number x ) {
+    static const unsigned m = sizeof( number ) * 8;
+    unsigned r = 1;
+    if ( !x )
+        return 0;
+    while ( m != r ) {
+        x |= x >> r;
+        r <<= 1;
+    }
+    return x;
+}
+
+// get the index of the Most Significant Bit; specialised below for
+// unsigned int, unsigned long and unsigned long long
+template< typename T >
+static inline unsigned MSB( T x ) {
+    unsigned position = 0;
+    while ( x ) {
+        x >>= 1;
+        ++position;
+    }
+    return position - 1;
+}
+
+template<>
+inline unsigned MSB< unsigned int >( unsigned int x ) {
+    static const unsigned long bits = sizeof( unsigned int ) * 8 - 1;
+    return bits - __builtin_clz( x );
+}
+
+template<>
+inline unsigned MSB< unsigned long >( unsigned long x ) {
+    static const unsigned bits = sizeof( unsigned long ) * 8 - 1;
+    return bits - __builtin_clzl( x );
+}
+
+template<>
+inline unsigned MSB< unsigned long long >( unsigned long long x ) {
+    static const unsigned bits = sizeof( unsigned long long ) * 8 - 1;
+    return bits - __builtin_clzll( x );
+}
+
+// gets only the Most Significant Bit
+template< typename number >
+static inline number onlyMSB( number x ) {
+    return number(1) << MSB( x );
+}
+
+// gets the number without its Most Significant Bit
+template< typename number >
+static inline number withoutMSB( number x ) {
+    return x & ~onlyMSB( x );
+}
+
+inline uint64_t bitshift( uint64_t t, int shift ) {
+#if BYTE_ORDER == LITTLE_ENDIAN
+    return bswap_64( shift < 0 ? bswap_64( t << -shift ) : bswap_64( t >> shift ) );
+#else
+    return shift < 0 ?
( t << -shift ) : ( t >> shift ); +#endif +} + +struct BitPointer { + BitPointer() : base( nullptr ), _bitoffset( 0 ) {} + template< typename T > BitPointer( T *t, int offset = 0 ) + : base( static_cast< void * >( t ) ), _bitoffset( offset ) + { + normalize(); + } + uint32_t &word() { ASSERT( valid() ); return *static_cast< uint32_t * >( base ); } + uint64_t &dword() { ASSERT( valid() ); return *static_cast< uint64_t * >( base ); } + void normalize() { + int shift = downalign( _bitoffset, 32 ); + _bitoffset -= shift; + ASSERT_EQ( shift % 8, 0 ); + base = static_cast< uint32_t * >( base ) + shift / 32; + } + void shift( int bits ) { _bitoffset += bits; normalize(); } + void fromReference( BitPointer r ) { *this = r; } + int bitoffset() { return _bitoffset; } + bool valid() { return base; } +private: + void *base; + int _bitoffset; +}; + +inline uint64_t mask( int first, int count ) { + return bitshift(uint64_t(-1), -first) & bitshift(uint64_t(-1), (64 - first - count)); +} + +/* + * NB. This function will alias whatever "to" points to with an uint64_t. With + * aggressive optimisations, this might break code that passes an address of a + * variable of different type. When "to" points to a stack variable, take + * precautions to avoid breaking strict aliasing rules (the violation is not + * detected by GCC as of 4.7.3). + */ + +inline void bitcopy( BitPointer from, BitPointer to, int bitcount ) +{ + while ( bitcount ) { + int w = std::min( 32 - from.bitoffset(), bitcount ); + uint32_t fmask = mask( from.bitoffset(), w ); + uint64_t tmask = mask( to.bitoffset(), w ); + uint64_t bits = bitshift( from.word() & fmask, from.bitoffset() - to.bitoffset() ); + ASSERT_EQ( bits & ~tmask, 0u ); + ASSERT_EQ( bits & tmask, bits ); + if ( to.bitoffset() + bitcount > 32 ) + to.dword() = (to.dword() & ~tmask) | bits; + else + to.word() = (to.word() & ~static_cast< uint32_t >( tmask )) | static_cast< uint32_t >( bits ); + from.shift( w ); to.shift( w ); bitcount -= w; // slide + } +} + +template< typename T, int width = sizeof( T ) * 8 > +struct BitField +{ + static const int bitwidth = width; + struct Virtual : BitPointer { + void set( T t ) { bitcopy( BitPointer( &t ), *this, bitwidth ); } + Virtual operator=( T t ) { + set( t ); + return *this; + } + Virtual operator=( Virtual v ) { + set( v.get() ); + return *this; + } + + operator T() const { return get(); } + T get() const { + union U { + uint64_t x; + T t; + U() : t() { } + } u; + bitcopy( *this, BitPointer( &u.x ), bitwidth ); + return u.t; + } + + Virtual &operator++() { + T value( get() ); + set( ++value ); + return *this; + } + T operator++(int) { + T value( get() ); + T result( value++ ); + set( value ); + return result; + } + + Virtual &operator--() { + T value( get() ); + set( --value ); + return *this; + } + T operator--(int) { + T value( get() ); + T result( value-- ); + set( value ); + return result; + } + template< typename U > + Virtual operator+=( U value ) { + T t( get() ); + t += value; + set( t ); + return *this; + } + template< typename U > + Virtual operator-=( U value ) { + T t( get() ); + t -= value; + set( t ); + return *this; + } + template< typename U > + Virtual operator*=( U value ) { + T t( get() ); + t *= value; + set( t ); + return *this; + } + template< typename U > + Virtual operator/=( U value ) { + T t( get() ); + t /= value; + set( t ); + return *this; + } + template< typename U > + Virtual operator%=( U value ) { + T t( get() ); + t %= value; + set( t ); + return *this; + } + }; +}; + +struct BitLock +{ + static const 
int bitwidth = 1; + struct Virtual : BitPointer { + using Atomic = std::atomic< uint32_t >; + Atomic &atomic() { return *reinterpret_cast< Atomic * >( &word() ); } + uint32_t bit() { + ASSERT_LEQ( bitoffset(), 31 ); + return uint32_t( 1 ) << bitoffset(); + } + void lock() { + uint32_t l = word(); + do { l &= ~bit(); } while ( !atomic().compare_exchange_weak( l, l | bit() ) ); + } + void unlock() { atomic().exchange( word() & ~bit() ); } + bool locked() { return atomic().load() & bit(); } + }; +}; + +template< int O, typename... Args > struct BitAccess; + +template< int O > +struct BitAccess< O > { static const int total = 0; }; + +template< int O, typename T, typename... Args > +struct BitAccess< O, T, Args... > { + static const int offset = O; + static const int width = T::bitwidth; + typedef typename T::Virtual Head; + typedef BitAccess< offset + T::bitwidth, Args... > Tail; + static const int total = width + Tail::total; +}; + +template< typename BA, int I > +struct _AccessAt : _AccessAt< typename BA::Tail, I - 1 > {}; + +template< typename BA > +struct _AccessAt< BA, 0 > { using T = BA; }; + +template< typename... Args > +struct _BitTuple +{ + using Access = BitAccess< 0, Args... >; + static const int bitwidth = Access::total; + template< int I > using AccessAt = _AccessAt< Access, I >; + template< int I > static int offset() { return AccessAt< I >::T::offset; } +}; + +template< typename... Args > struct BitTuple : _BitTuple< Args... > +{ + struct Virtual : BitPointer, _BitTuple< Args... > {}; + char storage[ align( Virtual::bitwidth, 32 ) / 8 ]; + BitTuple() { std::fill( storage, storage + sizeof( storage ), 0 ); } + operator BitPointer() { return BitPointer( storage ); } +}; + +template< int I, typename BT > +typename BT::template AccessAt< I >::T::Head get( BT &bt ) +{ + typename BT::template AccessAt< I >::T::Head t; + t.fromReference( bt ); + t.shift( BT::template offset< I >() ); + return t; +} + +} +} + +namespace brick_test { +namespace bitlevel { + +using namespace ::brick::bitlevel; + +struct BitTupleTest { + using U10 = BitField< unsigned, 10 >; + using T10_10 = BitTuple< U10, U10 >; + + int bitcount( uint32_t word ) { + int i = 0; + while ( word ) { + if ( word & 1 ) + ++i; + word >>= 1; + } + return i; + } + + TEST(mask) { + /* only works on little endian machines ... 
*/ + ASSERT_EQ( 0xFF00u, bitlevel::mask( 8, 8 ) ); + ASSERT_EQ( 0xF000u, bitlevel::mask( 12, 4 ) ); + ASSERT_EQ( 0x0F00u, bitlevel::mask( 8, 4 ) ); + ASSERT_EQ( 60u, bitlevel::mask( 2, 4 ) );// 0b111100 + ASSERT_EQ( 28u, bitlevel::mask( 2, 3 ) );// 0b11100 + } + + TEST(bitcopy) { + uint32_t a = 42, b = 11; + bitlevel::bitcopy( BitPointer( &a ), BitPointer( &b ), 32 ); + ASSERT_EQ( a, b ); + a = 0xFF00; + bitlevel::bitcopy( BitPointer( &a ), BitPointer( &b, 8 ), 24 ); + ASSERT_EQ( b, 0xFF0000u | 42u ); + a = 0; + bitlevel::bitcopy( BitPointer( &b, 8 ), BitPointer( &a ), 24 ); + ASSERT_EQ( a, 0xFF00u ); + bitlevel::bitcopy( BitPointer( &a, 8 ), BitPointer( &b, 8 ), 8 ); + + a = 0x3FF; + b = 0; + bitlevel::bitcopy( BitPointer( &a, 0 ), BitPointer( &b, 0 ), 10 ); + ASSERT_EQ( b, 0x3FFu ); + + unsigned char from[32], to[32]; + std::memset( from, 0, 32 ); + std::memset( to, 0, 32 ); + from[0] = 1 << 7; + bitlevel::bitcopy( BitPointer( from, 7 ), BitPointer( to, 7 ), 1 ); + ASSERT_EQ( int( to[0] ), int( from[ 0 ] ) ); + from[0] = 1; + to[0] = 0; + bitlevel::bitcopy( BitPointer( from, 0 ), BitPointer( to, 7 ), 1 ); + ASSERT_EQ( int( to[0] ), 1 << 7 ); + + from[0] = 13; + from[1] = 63; + bitlevel::bitcopy( BitPointer( from, 0 ), BitPointer( to, 32 ), 16 ); + ASSERT_EQ( int( to[4] ), int( from[0] ) ); + ASSERT_EQ( int( to[5] ), int( from[1] ) ); + + from[0] = 2; + from[1] = 2; + std::memset( to, 0, 32 ); + bitlevel::bitcopy( BitPointer( from, 1 ), BitPointer( to, 32 ), 16 ); + ASSERT_EQ( int( to[4] ), 1 ); + ASSERT_EQ( int( to[5] ), 1 ); + + from[0] = 1; + from[1] = 1; + std::memset( to, 0, 32 ); + bitlevel::bitcopy( BitPointer( from, 0 ), BitPointer( to, 33 ), 16 ); + ASSERT_EQ( int( to[4] ), 2 ); + ASSERT_EQ( int( to[5] ), 2 ); + + from[0] = 1; + from[1] = 1; + std::memset( to, 0, 32 ); + for ( int i = 0; i < 16; ++i ) + bitlevel::bitcopy( BitPointer( from, i ), BitPointer( to, 33 + i ), 1 ); + ASSERT_EQ( int( to[4] ), 2 ); + ASSERT_EQ( int( to[5] ), 2 ); + + for ( int i = 0; i < 16; ++i ) + from[i] = 2; + std::memset( to, 0, 32 ); + bitlevel::bitcopy( BitPointer( from, 1 ), BitPointer( to, 3 ), 128 ); + for ( int i = 0; i < 16; ++i ) + ASSERT_EQ( int( to[i] ), 8 ); + } + + TEST(field) { + int a = 42, b = 0; + typedef BitField< int, 10 > F; + F::Virtual f; + f.fromReference( BitPointer( &b ) ); + f.set( a ); + ASSERT_EQ( a, 42 ); + ASSERT_EQ( a, f ); + } + + TEST(basic) { + T10_10 x; + ASSERT_EQ( T10_10::bitwidth, 20 ); + ASSERT_EQ( T10_10::offset< 0 >(), 0 ); + ASSERT_EQ( T10_10::offset< 1 >(), 10 ); + auto a = get< 0 >( x ); + auto b = get< 1 >( x ); + a.set( 5 ); + b.set( 7 ); + ASSERT_EQ( a, 5u ); + ASSERT_EQ( b, 7u ); + } + + TEST(big) { + bitlevel::BitTuple< BitField< uint64_t, 63 >, BitField< uint64_t, 63 > > x; + ASSERT_EQ( x.bitwidth, 126 ); + ASSERT_EQ( x.offset< 0 >(), 0 ); + ASSERT_EQ( x.offset< 1 >(), 63 ); + get< 0 >( x ).set( (1ull << 62) + 7 ); + ASSERT_EQ( get< 0 >( x ), (1ull << 62) + 7 ); + ASSERT_EQ( get< 1 >( x ), 0u ); + get< 0 >( x ).set( 0 ); + get< 1 >( x ).set( (1ull << 62) + 7 ); + ASSERT_EQ( get< 0 >( x ), 0u ); + ASSERT_EQ( get< 1 >( x ), (1ull << 62) + 7 ); + get< 0 >( x ).set( (1ull << 62) + 11 ); + ASSERT_EQ( get< 0 >( x ), (1ull << 62) + 11 ); + ASSERT_EQ( get< 1 >( x ), (1ull << 62) + 7 ); + } + + TEST(structure) { + bitlevel::BitTuple< BitField< std::pair< uint64_t, uint64_t >, 120 >, BitField< uint64_t, 63 > > x; + auto v = std::make_pair( (uint64_t( 1 ) << 62) + 7, uint64_t( 33 ) ); + ASSERT_EQ( x.bitwidth, 183 ); + ASSERT_EQ( x.offset< 0 >(), 0 ); + ASSERT_EQ( 
x.offset< 1 >(), 120 ); + get< 1 >( x ).set( 333 ); + ASSERT_EQ( get< 1 >( x ), 333u ); + get< 0 >( x ).set( v ); + ASSERT_EQ( get< 1 >( x ), 333u ); + ASSERT( get< 0 >( x ).get() == v ); + } + + TEST(nested) { + typedef bitlevel::BitTuple< T10_10, T10_10, BitField< unsigned, 3 > > X; + X x; + ASSERT_EQ( X::bitwidth, 43 ); + ASSERT_EQ( X::offset< 0 >(), 0 ); + ASSERT_EQ( X::offset< 1 >(), 20 ); + ASSERT_EQ( X::offset< 2 >(), 40 ); + auto a = get< 0 >( x ); + auto b = get< 1 >( x ); + get< 0 >( a ).set( 5 ); + get< 1 >( a ).set( 7 ); + get< 0 >( b ).set( 13 ); + get< 1 >( b ).set( 533 ); + get< 2 >( x ).set( 15 ); /* we expect to lose the MSB */ + ASSERT_EQ( get< 0 >( a ), 5u ); + ASSERT_EQ( get< 1 >( a ), 7u ); + ASSERT_EQ( get< 0 >( b ), 13u ); + ASSERT_EQ( get< 1 >( b ), 533u ); + ASSERT_EQ( get< 2 >( x ), 7u ); + } + + TEST(locked) { + bitlevel::BitTuple< + BitField< int, 15 >, + BitLock, + BitField< int, 16 > + > bt; + + get< 1 >( bt ).lock(); + + ASSERT_EQ( get< 0 >( bt ), 0 ); + ASSERT_EQ( get< 2 >( bt ), 0 ); + ASSERT( get< 1 >( bt ).locked() ); + ASSERT( get< 0 >( bt ).word() ); + + get< 0 >( bt ) = 1; + get< 2 >( bt ) = 1; + + ASSERT_EQ( get< 0 >( bt ), 1 ); + ASSERT_EQ( get< 2 >( bt ), 1 ); + + ASSERT_EQ( bitcount( get< 0 >( bt ).word() ), 3 ); + + get< 1 >( bt ).unlock(); + ASSERT_EQ( get< 0 >( bt ), 1 ); + ASSERT_EQ( get< 2 >( bt ), 1 ); + ASSERT( !get< 1 >( bt ).locked() ); + + ASSERT_EQ( bitcount( get< 0 >( bt ).word() ), 2 ); + + get< 0 >( bt ) = 0; + get< 2 >( bt ) = 0; + ASSERT( !get< 0 >( bt ).word() ); + } + + TEST(assign) { + bitlevel::BitTuple< + BitField< bool, 1 >, + BitField< int, 6 >, + BitField< bool, 1 > + > tuple; + + get< 0 >( tuple ) = true; + get< 2 >( tuple ) = get< 0 >( tuple ); + ASSERT( get< 2 >( tuple ).get() ); + } + + struct OperatorTester { + int value; + int expected; + OperatorTester &operator++() { ASSERT_UNREACHABLE( "fell through" ); return *this; } + OperatorTester operator++( int ) { ASSERT_UNREACHABLE( "fell through" ); return *this; } + OperatorTester &operator--() { ASSERT_UNREACHABLE( "fell through" ); return *this; } + OperatorTester &operator--( int ) { ASSERT_UNREACHABLE( "fell through" ); return *this; } + OperatorTester &operator+=( int ) { ASSERT_UNREACHABLE( "fell through" ); return *this; } + OperatorTester &operator-=( int ) { ASSERT_UNREACHABLE( "fell through" ); return *this; } + OperatorTester &operator*=( int ) { ASSERT_UNREACHABLE( "fell through" ); return *this; } + OperatorTester &operator/=( int ) { ASSERT_UNREACHABLE( "fell through" ); return *this; } + OperatorTester &operator%=( int ) { ASSERT_UNREACHABLE( "fell through" ); return *this; } + void test() { ASSERT_EQ( value, expected ); } + void set( int v, int e ) { value = v; expected = e; } + }; + struct TPrI : OperatorTester { + TPrI &operator++() { ++value; return *this; } + }; + struct TPoI : OperatorTester { + TPoI operator++( int ) { auto r = *this; value++; return r; } + }; + struct TPrD : OperatorTester { + TPrD &operator--() { --value; return *this; } + }; + struct TPoD : OperatorTester { + TPoD operator--( int ) { auto r = *this; value--; return r; } + }; + struct TPlO : OperatorTester { + TPlO &operator+=( int v ) { value += v; return *this; } + }; + struct TMO : OperatorTester { + TMO &operator-=( int v ) { value -= v; return *this; } + }; + struct TPoO : OperatorTester { + TPoO &operator*=( int v ) { value *= v; return *this; } + }; + struct TSO : OperatorTester { + TSO &operator/=( int v ) { value /= v; return *this; } + }; + struct TPrO : OperatorTester { 
+ TPrO &operator%=( int v ) { value %= v; return *this; } + }; + + template< int N, typename BT, typename L > + void checkOperator( BT &bt, int v, int e, L l ) { + auto t = get< N >( bt ).get(); + t.set( v, e ); + get< N >( bt ) = t; + l( get< N >( bt ) ); + get< N >( bt ).get().test(); + } + +#define CHECK( N, bt, v, e, test ) checkOperator< N >( bt, v, e, []( decltype( get< N >( bt ) ) item ) { test; } ) + + TEST(operators) { + bitlevel::BitTuple< + BitField< bool, 4 >, + BitField< TPrI >,// ++v + BitField< TPoI >,// v++ + BitField< TPrD >,// --v + BitField< TPoD >,// v-- + BitField< TPlO >,// v+= + BitField< TMO >,// v-= + BitField< TPoO >,// v*= + BitField< TSO >,// v/= + BitField< TPrO >,// v%= + BitField< bool, 4 > + > bt; + + CHECK( 1, bt, 0, 1, ++item ); + CHECK( 2, bt, 0, 1, item++ ); + CHECK( 3, bt, 0, -1, --item ); + CHECK( 4, bt, 0, -1, item-- ); + CHECK( 5, bt, 0, 5, item += 5 ); + CHECK( 6, bt, 0, -5, item -= 5 ); + CHECK( 7, bt, 2, 14, item *= 7 ); + CHECK( 8, bt, 42, 6, item /= 7 ); + CHECK( 9, bt, 42, 9, item %= 11 ); + } +#undef CHECK +}; + +} +} + +#endif +// vim: syntax=cpp tabstop=4 shiftwidth=4 expandtab diff --git a/bricks/brick-hash.h b/bricks/brick-hash.h new file mode 100644 index 000000000..80f623173 --- /dev/null +++ b/bricks/brick-hash.h @@ -0,0 +1,977 @@ +// -*- mode: C++; indent-tabs-mode: nil; c-basic-offset: 4 -*- + +/* + * 2010-2012 Bob Jenkins (code in public domain) + * (c) 2013 Vladimír Štill + * + * Based on http://burtleburtle.net/bob/c/SpookyV2.cpp + */ + +/* Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
*/
+
+#include <cstddef>
+
+#include <utility> // pair
+#include <tuple>   // tie
+
+#ifdef _MSC_VER
+# define INLINE __forceinline
+  typedef unsigned __int64 uint64;
+  typedef unsigned __int32 uint32;
+  typedef unsigned __int16 uint16;
+  typedef unsigned __int8 uint8;
+#else
+# include <stdint.h>
+# define INLINE inline
+  typedef uint64_t uint64;
+  typedef uint32_t uint32;
+  typedef uint16_t uint16;
+  typedef uint8_t uint8;
+#endif
+#include <cstring>
+
+#define ALLOW_UNALIGNED_READS 1
+
+#ifndef BRICK_HASH_H
+#define BRICK_HASH_H
+
+namespace brick {
+namespace hash {
+
+typedef uint64_t hash64_t;
+typedef std::pair< hash64_t, hash64_t > hash128_t;
+
+namespace jenkins {
+//
+// SpookyHash: a 128-bit noncryptographic hash function
+// By Bob Jenkins, public domain
+//   Oct 31 2010: alpha, framework + SpookyHash::Mix appears right
+//   Oct 31 2011: alpha again, Mix only good to 2^^69 but rest appears right
+//   Dec 31 2011: beta, improved Mix, tested it for 2-bit deltas
+//   Feb  2 2012: production, same bits as beta
+//   Feb  5 2012: adjusted definitions of uint* to be more portable
+//   Mar 30 2012: 3 bytes/cycle, not 4.  Alpha was 4 but wasn't thorough enough.
+//   August 5 2012: SpookyV2 (different results)
+//
+// Up to 3 bytes/cycle for long messages.  Reasonably fast for short messages.
+// All 1 or 2 bit deltas achieve avalanche within 1% bias per output bit.
+//
+// This was developed for and tested on 64-bit x86-compatible processors.
+// It assumes the processor is little-endian.  There is a macro
+// controlling whether unaligned reads are allowed (by default they are).
+// This should be an equally good hash on big-endian machines, but it will
+// compute different results on them than on little-endian machines.
+//
+// Google's CityHash has similar specs to SpookyHash, and CityHash is faster
+// on new Intel boxes.  MD4 and MD5 also have similar specs, but they are orders
+// of magnitude slower.  CRCs are two or more times slower, but unlike
+// SpookyHash, they have nice math for combining the CRCs of pieces to form
+// the CRCs of wholes.  There are also cryptographic hashes, but those are even
+// slower than MD5.
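+//
+// A one-shot use, as a sketch (Hash128 is defined below; unlike upstream
+// SpookyHash, this adaptation returns the two 64-bit halves as a std::pair
+// rather than writing them back through seed pointers; the names `message`
+// and `length` here are placeholders):
+//
+//   uint64 a = 1, b = 2; // seeds in, hash halves out
+//   std::tie( a, b ) = SpookyHash::Hash128( message, length, a, b );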
+// + +// Modifications for brick-hash.h: +// - merged into one file +// - pairs are used instead of output parameters +// - some functions were marked explicitly for inlining with gcc attribete +// as they are considered too long otherwise + +class SpookyHash +{ +public: + // + // SpookyHash: hash a single message in one call, produce 128-bit output + // + static INLINE std::pair< uint64, uint64 > Hash128( + const void *message, // message to hash + size_t length, // length of message in bytes + uint64 seed1, // in/out: in seed 1, out hash value 1 + uint64 seed2) // in/out: in seed 2, out hash value 2 + { + if (length < sc_bufSize) + { + return Short(message, length, seed1, seed2); + } + + uint64 h0,h1,h2,h3,h4,h5,h6,h7,h8,h9,h10,h11; + uint64 buf[sc_numVars]; + uint64 *end; + union + { + const uint8 *p8; + uint64 *p64; + size_t i; + } u; + size_t remainder; + + h0=h3=h6=h9 = seed1; + h1=h4=h7=h10 = seed2; + h2=h5=h8=h11 = sc_const; + + u.p8 = reinterpret_cast< const uint8 * >( message ); + end = u.p64 + (length/sc_blockSize)*sc_numVars; + + // handle all whole sc_blockSize blocks of bytes + if (ALLOW_UNALIGNED_READS || ((u.i & 0x7) == 0)) + { + while (u.p64 < end) + { + Mix(u.p64, h0,h1,h2,h3,h4,h5,h6,h7,h8,h9,h10,h11); + u.p64 += sc_numVars; + } + } + else + { + while (u.p64 < end) + { + memcpy(buf, u.p64, sc_blockSize); + Mix(buf, h0,h1,h2,h3,h4,h5,h6,h7,h8,h9,h10,h11); + u.p64 += sc_numVars; + } + } + + // handle the last partial block of sc_blockSize bytes + remainder = (length - (reinterpret_cast< const uint8 *>(end)-reinterpret_cast< const uint8 * >(message))); + memcpy(buf, end, remainder); + memset( reinterpret_cast< uint8 * >( buf )+remainder, 0, sc_blockSize-remainder); + reinterpret_cast< uint8 * >( buf )[sc_blockSize-1] = remainder; + + // do some final mixing + End(buf, h0,h1,h2,h3,h4,h5,h6,h7,h8,h9,h10,h11); + return std::make_pair( h0, h1 ); + } + + // + // Hash64: hash a single message in one call, return 64-bit output + // + static INLINE uint64 Hash64( + const void *message, // message to hash + size_t length, // length of message in bytes + uint64 seed) // seed + { + return Hash128(message, length, seed, seed).first; + } + + // + // Hash32: hash a single message in one call, produce 32-bit output + // + static INLINE uint32 Hash32( + const void *message, // message to hash + size_t length, // length of message in bytes + uint32 seed) // seed + { + return uint32( Hash128(message, length, seed, seed).first ); + } + + // + // Init: initialize the context of a SpookyHash + // + INLINE void Init( + uint64 seed1, // any 64-bit value will do, including 0 + uint64 seed2) // different seeds produce independent hashes + { + m_length = 0; + m_remainder = 0; + m_state[0] = seed1; + m_state[1] = seed2; + } + + // + // Update: add a piece of a message to a SpookyHash state + // + INLINE void Update( + const void *message, // message fragment + size_t length) // length of message fragment in bytes + { + uint64 h0,h1,h2,h3,h4,h5,h6,h7,h8,h9,h10,h11; + size_t newLength = length + m_remainder; + uint8 remainder; + union + { + const uint8 *p8; + uint64 *p64; + size_t i; + } u; + const uint64 *end; + + // Is this message fragment too short? If it is, stuff it away. 
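+        // (Sketch of the buffering invariant relied on here: m_data holds
+        // up to sc_bufSize = 2 * sc_blockSize bytes of input that has not
+        // been mixed yet, so a sequence of small Update() calls only pays
+        // for a memcpy until a whole block is available.)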
+ if (newLength < sc_bufSize) + { + memcpy(&reinterpret_cast< uint8 * >( m_data )[m_remainder], message, length); + m_length = length + m_length; + m_remainder = uint8( newLength ); + return; + } + + // init the variables + if (m_length < sc_bufSize) + { + h0=h3=h6=h9 = m_state[0]; + h1=h4=h7=h10 = m_state[1]; + h2=h5=h8=h11 = sc_const; + } + else + { + h0 = m_state[0]; + h1 = m_state[1]; + h2 = m_state[2]; + h3 = m_state[3]; + h4 = m_state[4]; + h5 = m_state[5]; + h6 = m_state[6]; + h7 = m_state[7]; + h8 = m_state[8]; + h9 = m_state[9]; + h10 = m_state[10]; + h11 = m_state[11]; + } + m_length = length + m_length; + + // if we've got anything stuffed away, use it now + if (m_remainder) + { + uint8 prefix = sc_bufSize-m_remainder; + memcpy(&(reinterpret_cast< uint8 * >( m_data )[m_remainder]), message, prefix); + u.p64 = m_data; + Mix(u.p64, h0,h1,h2,h3,h4,h5,h6,h7,h8,h9,h10,h11); + Mix(&u.p64[sc_numVars], h0,h1,h2,h3,h4,h5,h6,h7,h8,h9,h10,h11); + u.p8 = reinterpret_cast< const uint8 * >( message ) + prefix; + length -= prefix; + } + else + { + u.p8 = reinterpret_cast< const uint8 * >( message ); + } + + // handle all whole blocks of sc_blockSize bytes + end = u.p64 + (length/sc_blockSize)*sc_numVars; + remainder = uint8(length-(reinterpret_cast< const uint8 * >( end ) - u.p8)); + if (ALLOW_UNALIGNED_READS || (u.i & 0x7) == 0) + { + while (u.p64 < end) + { + Mix(u.p64, h0,h1,h2,h3,h4,h5,h6,h7,h8,h9,h10,h11); + u.p64 += sc_numVars; + } + } + else + { + while (u.p64 < end) + { + memcpy(m_data, u.p8, sc_blockSize); + Mix(m_data, h0,h1,h2,h3,h4,h5,h6,h7,h8,h9,h10,h11); + u.p64 += sc_numVars; + } + } + + // stuff away the last few bytes + m_remainder = remainder; + memcpy(m_data, end, remainder); + + // stuff away the variables + m_state[0] = h0; + m_state[1] = h1; + m_state[2] = h2; + m_state[3] = h3; + m_state[4] = h4; + m_state[5] = h5; + m_state[6] = h6; + m_state[7] = h7; + m_state[8] = h8; + m_state[9] = h9; + m_state[10] = h10; + m_state[11] = h11; + } + + + // + // Final: compute the hash for the current SpookyHash state + // + // This does not modify the state; you can keep updating it afterward + // + // The result is the same as if SpookyHash() had been called with + // all the pieces concatenated into one message. 
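+    // Incremental use, as a sketch (Init and Update are defined above; the
+    // seed and fragment names are placeholders):
+    //
+    //   SpookyHash h;
+    //   h.Init( seed1, seed2 );
+    //   h.Update( part1, len1 );
+    //   h.Update( part2, len2 );
+    //   hash128_t result = h.Final(); // equals Hash128 over the whole message
+    //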
+ // + INLINE std::pair< uint64, uint64 > Final() + { + // init the variables + if (m_length < sc_bufSize) + { + return Short( m_data, m_length, m_state[0], m_state[1]); + } + + uint64 *data = reinterpret_cast< uint64 * >( m_data ); + uint8 remainder = m_remainder; + + uint64 h0 = m_state[0]; + uint64 h1 = m_state[1]; + uint64 h2 = m_state[2]; + uint64 h3 = m_state[3]; + uint64 h4 = m_state[4]; + uint64 h5 = m_state[5]; + uint64 h6 = m_state[6]; + uint64 h7 = m_state[7]; + uint64 h8 = m_state[8]; + uint64 h9 = m_state[9]; + uint64 h10 = m_state[10]; + uint64 h11 = m_state[11]; + + if (remainder >= sc_blockSize) + { + // m_data can contain two blocks; handle any whole first block + Mix(data, h0,h1,h2,h3,h4,h5,h6,h7,h8,h9,h10,h11); + data += sc_numVars; + remainder -= sc_blockSize; + } + + // mix in the last partial block, and the length mod sc_blockSize + memset(&reinterpret_cast< uint8 * >( data )[remainder], 0, (sc_blockSize-remainder)); + + reinterpret_cast< uint8 * >( data )[sc_blockSize-1] = remainder; + + // do some final mixing + End(data, h0,h1,h2,h3,h4,h5,h6,h7,h8,h9,h10,h11); + + return std::make_pair( h0, h1 ); + } + + // + // left rotate a 64-bit value by k bytes + // + static INLINE constexpr uint64 Rot64(uint64 x, int k) __attribute__((always_inline)) + { + return (x << k) | (x >> (64 - k)); + } + + // + // This is used if the input is 96 bytes long or longer. + // + // The internal state is fully overwritten every 96 bytes. + // Every input bit appears to cause at least 128 bits of entropy + // before 96 other bytes are combined, when run forward or backward + // For every input bit, + // Two inputs differing in just that input bit + // Where "differ" means xor or subtraction + // And the base value is random + // When run forward or backwards one Mix + // I tried 3 pairs of each; they all differed by at least 212 bits. + // + static INLINE void Mix( + const uint64 *data, + uint64 &s0, uint64 &s1, uint64 &s2, uint64 &s3, + uint64 &s4, uint64 &s5, uint64 &s6, uint64 &s7, + uint64 &s8, uint64 &s9, uint64 &s10,uint64 &s11) __attribute__((always_inline)) + { + s0 += data[0]; s2 ^= s10; s11 ^= s0; s0 = Rot64(s0,11); s11 += s1; + s1 += data[1]; s3 ^= s11; s0 ^= s1; s1 = Rot64(s1,32); s0 += s2; + s2 += data[2]; s4 ^= s0; s1 ^= s2; s2 = Rot64(s2,43); s1 += s3; + s3 += data[3]; s5 ^= s1; s2 ^= s3; s3 = Rot64(s3,31); s2 += s4; + s4 += data[4]; s6 ^= s2; s3 ^= s4; s4 = Rot64(s4,17); s3 += s5; + s5 += data[5]; s7 ^= s3; s4 ^= s5; s5 = Rot64(s5,28); s4 += s6; + s6 += data[6]; s8 ^= s4; s5 ^= s6; s6 = Rot64(s6,39); s5 += s7; + s7 += data[7]; s9 ^= s5; s6 ^= s7; s7 = Rot64(s7,57); s6 += s8; + s8 += data[8]; s10 ^= s6; s7 ^= s8; s8 = Rot64(s8,55); s7 += s9; + s9 += data[9]; s11 ^= s7; s8 ^= s9; s9 = Rot64(s9,54); s8 += s10; + s10 += data[10]; s0 ^= s8; s9 ^= s10; s10 = Rot64(s10,22); s9 += s11; + s11 += data[11]; s1 ^= s9; s10 ^= s11; s11 = Rot64(s11,46); s10 += s0; + } + + // + // Mix all 12 inputs together so that h0, h1 are a hash of them all. + // + // For two inputs differing in just the input bits + // Where "differ" means xor or subtraction + // And the base value is random, or a counting value starting at that bit + // The final result will have each bit of h0, h1 flip + // For every input bit, + // with probability 50 +- .3% + // For every pair of input bits, + // with probability 50 +- 3% + // + // This does not rely on the last Mix() call having already mixed some. 
+ // Two iterations was almost good enough for a 64-bit result, but a + // 128-bit result is reported, so End() does three iterations. + // + static INLINE void EndPartial( + uint64 &h0, uint64 &h1, uint64 &h2, uint64 &h3, + uint64 &h4, uint64 &h5, uint64 &h6, uint64 &h7, + uint64 &h8, uint64 &h9, uint64 &h10,uint64 &h11) __attribute__((always_inline)) + { + h11+= h1; h2 ^= h11; h1 = Rot64(h1,44); + h0 += h2; h3 ^= h0; h2 = Rot64(h2,15); + h1 += h3; h4 ^= h1; h3 = Rot64(h3,34); + h2 += h4; h5 ^= h2; h4 = Rot64(h4,21); + h3 += h5; h6 ^= h3; h5 = Rot64(h5,38); + h4 += h6; h7 ^= h4; h6 = Rot64(h6,33); + h5 += h7; h8 ^= h5; h7 = Rot64(h7,10); + h6 += h8; h9 ^= h6; h8 = Rot64(h8,13); + h7 += h9; h10^= h7; h9 = Rot64(h9,38); + h8 += h10; h11^= h8; h10= Rot64(h10,53); + h9 += h11; h0 ^= h9; h11= Rot64(h11,42); + h10+= h0; h1 ^= h10; h0 = Rot64(h0,54); + } + + static INLINE void End( + const uint64 *data, + uint64 &h0, uint64 &h1, uint64 &h2, uint64 &h3, + uint64 &h4, uint64 &h5, uint64 &h6, uint64 &h7, + uint64 &h8, uint64 &h9, uint64 &h10,uint64 &h11) __attribute__((always_inline)) + { + h0 += data[0]; h1 += data[1]; h2 += data[2]; h3 += data[3]; + h4 += data[4]; h5 += data[5]; h6 += data[6]; h7 += data[7]; + h8 += data[8]; h9 += data[9]; h10 += data[10]; h11 += data[11]; + EndPartial(h0,h1,h2,h3,h4,h5,h6,h7,h8,h9,h10,h11); + EndPartial(h0,h1,h2,h3,h4,h5,h6,h7,h8,h9,h10,h11); + EndPartial(h0,h1,h2,h3,h4,h5,h6,h7,h8,h9,h10,h11); + } + + // + // The goal is for each bit of the input to expand into 128 bits of + // apparent entropy before it is fully overwritten. + // n trials both set and cleared at least m bits of h0 h1 h2 h3 + // n: 2 m: 29 + // n: 3 m: 46 + // n: 4 m: 57 + // n: 5 m: 107 + // n: 6 m: 146 + // n: 7 m: 152 + // when run forwards or backwards + // for all 1-bit and 2-bit diffs + // with diffs defined by either xor or subtraction + // with a base of all zeros plus a counter, or plus another bit, or random + // + static INLINE void ShortMix(uint64 &h0, uint64 &h1, uint64 &h2, uint64 &h3) __attribute__((always_inline)) + { + h2 = Rot64(h2,50); h2 += h3; h0 ^= h2; + h3 = Rot64(h3,52); h3 += h0; h1 ^= h3; + h0 = Rot64(h0,30); h0 += h1; h2 ^= h0; + h1 = Rot64(h1,41); h1 += h2; h3 ^= h1; + h2 = Rot64(h2,54); h2 += h3; h0 ^= h2; + h3 = Rot64(h3,48); h3 += h0; h1 ^= h3; + h0 = Rot64(h0,38); h0 += h1; h2 ^= h0; + h1 = Rot64(h1,37); h1 += h2; h3 ^= h1; + h2 = Rot64(h2,62); h2 += h3; h0 ^= h2; + h3 = Rot64(h3,34); h3 += h0; h1 ^= h3; + h0 = Rot64(h0,5); h0 += h1; h2 ^= h0; + h1 = Rot64(h1,36); h1 += h2; h3 ^= h1; + } + + // + // Mix all 4 inputs together so that h0, h1 are a hash of them all. 
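+    // (This is the short-message counterpart of End()/EndPartial() above.)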
+ // + // For two inputs differing in just the input bits + // Where "differ" means xor or subtraction + // And the base value is random, or a counting value starting at that bit + // The final result will have each bit of h0, h1 flip + // For every input bit, + // with probability 50 +- .3% (it is probably better than that) + // For every pair of input bits, + // with probability 50 +- .75% (the worst case is approximately that) + // + static INLINE void ShortEnd(uint64 &h0, uint64 &h1, uint64 &h2, uint64 &h3) __attribute__((always_inline)) + { + h3 ^= h2; h2 = Rot64(h2,15); h3 += h2; + h0 ^= h3; h3 = Rot64(h3,52); h0 += h3; + h1 ^= h0; h0 = Rot64(h0,26); h1 += h0; + h2 ^= h1; h1 = Rot64(h1,51); h2 += h1; + h3 ^= h2; h2 = Rot64(h2,28); h3 += h2; + h0 ^= h3; h3 = Rot64(h3,9); h0 += h3; + h1 ^= h0; h0 = Rot64(h0,47); h1 += h0; + h2 ^= h1; h1 = Rot64(h1,54); h2 += h1; + h3 ^= h2; h2 = Rot64(h2,32); h3 += h2; + h0 ^= h3; h3 = Rot64(h3,25); h0 += h3; + h1 ^= h0; h0 = Rot64(h0,63); h1 += h0; + } + +private: + + // + // Short is used for messages under 192 bytes in length + // Short has a low startup cost, the normal mode is good for long + // keys, the cost crossover is at about 192 bytes. The two modes were + // held to the same quality bar. + // + static INLINE std::pair< uint64, uint64 > Short( + const void *message, // message (array of bytes, not necessarily aligned) + size_t length, // length of message (in bytes) + uint64 seed1, // in/out: in the seed, out the hash value + uint64 seed2) // in/out: in the seed, out the hash value + __attribute__((always_inline)) + { + uint64 buf[2*sc_numVars]; + union + { + const uint8 *p8; + uint32 *p32; + uint64 *p64; + size_t i; + } u; + + u.p8 = reinterpret_cast< const uint8 *>( message ); + + if (!ALLOW_UNALIGNED_READS && (u.i & 0x7)) + { + memcpy(buf, message, length); + u.p64 = buf; + } + + size_t remainder = length%32; + uint64 a= seed1; + uint64 b= seed2; + uint64 c=sc_const; + uint64 d=sc_const; + + if (length > 15) + { + const uint64 *end = u.p64 + (length/32)*4; + + // handle all complete sets of 32 bytes + for (; u.p64 < end; u.p64 += 4) + { + c += u.p64[0]; + d += u.p64[1]; + ShortMix(a,b,c,d); + a += u.p64[2]; + b += u.p64[3]; + } + + //Handle the case of 16+ remaining bytes. 
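+            // (At this point remainder = length % 32 may still hold one
+            // whole 16-byte half-block; one extra ShortMix folds it in, so
+            // the switch below only ever sees 0..15 trailing bytes.)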
+ if (remainder >= 16) + { + c += u.p64[0]; + d += u.p64[1]; + ShortMix(a,b,c,d); + u.p64 += 2; + remainder -= 16; + } + } + + // Handle the last 0..15 bytes, and its length + d += uint64( length ) << 56; + switch (remainder) + { + case 15: + d += uint64( u.p8[14] ) << 48; + case 14: + d += uint64( u.p8[13] ) << 40; + case 13: + d += uint64( u.p8[12] ) << 32; + case 12: + d += u.p32[2]; + c += u.p64[0]; + break; + case 11: + d += uint64( u.p8[10] ) << 16; + case 10: + d += uint64( u.p8[9] ) << 8; + case 9: + d += uint64( u.p8[8] ); + case 8: + c += u.p64[0]; + break; + case 7: + c += uint64(u.p8[6] ) << 48; + case 6: + c += uint64( u.p8[5] ) << 40; + case 5: + c += uint64( u.p8[4] ) << 32; + case 4: + c += u.p32[0]; + break; + case 3: + c += uint64( u.p8[2] ) << 16; + case 2: + c += uint64( u.p8[1] ) << 8; + case 1: + c += uint64( u.p8[0] ); + break; + case 0: + c += sc_const; + d += sc_const; + } + ShortEnd(a,b,c,d); + + return std::make_pair( a, b ); + } + + // number of uint64's in internal state + static const size_t sc_numVars = 12; + + // size of the internal state + static const size_t sc_blockSize = sc_numVars*8; + + // size of buffer of unhashed data, in bytes + static const size_t sc_bufSize = 2*sc_blockSize; + + // + // sc_const: a constant which: + // * is not zero + // * is odd + // * is a not-very-regular mix of 1's and 0's + // * does not need any other special mathematical properties + // + static const uint64 sc_const = 0xdeadbeefdeadbeefLL; + + uint64 m_data[2*sc_numVars]; // unhashed data, for partial messages + uint64 m_state[sc_numVars]; // internal state of the hash + size_t m_length; // total length of the input so far + uint8 m_remainder; // length of unhashed data stashed in m_data +}; + +struct SpookyState { + + SpookyState( uint64_t seed1, uint64_t seed2 ) : state() { + state.Init( seed1, seed2 ); + } + SpookyState() = delete; + SpookyState( const SpookyState & ) = delete; + SpookyState &operator=( const SpookyState & ) = delete; + + void update( const void *message, size_t length ) { + state.Update( message, length ); + } + + hash128_t finalize() { + return state.Final(); + } + + private: + SpookyHash state; +}; + +} + +namespace { + +inline hash128_t spooky( const void *message, size_t length, uint64_t seed1, uint64_t seed2 ) { + return jenkins::SpookyHash::Hash128( message, length, seed1, seed2 ); +} + +} + +} +} + +namespace brick_test { +namespace hash { + +using namespace ::brick::hash; + +class Random +{ +public: + inline uint64 Value() + { + uint64 e = m_a - Rot64(m_b, 23); + m_a = m_b ^ Rot64(m_c, 16); + m_b = m_c + Rot64(m_d, 11); + m_c = m_d + e; + m_d = e + m_a; + return m_d; + } + + inline void Init( uint64 seed) + { + m_a = 0xdeadbeef; + m_b = m_c = m_d = seed; + for (int i=0; i<20; ++i) + static_cast< void >( Value() ); + } + +private: + static inline uint64 Rot64(uint64 x, int k) + { + return (x << k) | (x >> (64-(k))); + } + + uint64 m_a; + uint64 m_b; + uint64 m_c; + uint64 m_d; +}; + +#define BUFSIZE (512) + +using brick::hash::jenkins::SpookyHash; + +struct Jenkins { + + TEST(results) { + static const uint64 expected[BUFSIZE] = { + 0x6bf50919,0x70de1d26,0xa2b37298,0x35bc5fbf,0x8223b279,0x5bcb315e,0x53fe88a1,0xf9f1a233, + 0xee193982,0x54f86f29,0xc8772d36,0x9ed60886,0x5f23d1da,0x1ed9f474,0xf2ef0c89,0x83ec01f9, + 0xf274736c,0x7e9ac0df,0xc7aed250,0xb1015811,0xe23470f5,0x48ac20c4,0xe2ab3cd5,0x608f8363, + 0xd0639e68,0xc4e8e7ab,0x863c7c5b,0x4ea63579,0x99ae8622,0x170c658b,0x149ba493,0x027bca7c, + 
0xe5cfc8b6,0xce01d9d7,0x11103330,0x5d1f5ed4,0xca720ecb,0xef408aec,0x733b90ec,0x855737a6, + 0x9856c65f,0x647411f7,0x50777c74,0xf0f1a8b7,0x9d7e55a5,0xc68dd371,0xfc1af2cc,0x75728d0a, + 0x390e5fdc,0xf389b84c,0xfb0ccf23,0xc95bad0e,0x5b1cb85a,0x6bdae14f,0x6deb4626,0x93047034, + 0x6f3266c6,0xf529c3bd,0x396322e7,0x3777d042,0x1cd6a5a2,0x197b402e,0xc28d0d2b,0x09c1afb4, + + 0x069c8bb7,0x6f9d4e1e,0xd2621b5c,0xea68108d,0x8660cb8f,0xd61e6de6,0x7fba15c7,0xaacfaa97, + 0xdb381902,0x4ea22649,0x5d414a1e,0xc3fc5984,0xa0fc9e10,0x347dc51c,0x37545fb6,0x8c84b26b, + 0xf57efa5d,0x56afaf16,0xb6e1eb94,0x9218536a,0xe3cc4967,0xd3275ef4,0xea63536e,0x6086e499, + 0xaccadce7,0xb0290d82,0x4ebfd0d6,0x46ccc185,0x2eeb10d3,0x474e3c8c,0x23c84aee,0x3abae1cb, + 0x1499b81a,0xa2993951,0xeed176ad,0xdfcfe84c,0xde4a961f,0x4af13fe6,0xe0069c42,0xc14de8f5, + 0x6e02ce8f,0x90d19f7f,0xbca4a484,0xd4efdd63,0x780fd504,0xe80310e3,0x03abbc12,0x90023849, + 0xd6f6fb84,0xd6b354c5,0x5b8575f0,0x758f14e4,0x450de862,0x90704afb,0x47209a33,0xf226b726, + 0xf858dab8,0x7c0d6de9,0xb05ce777,0xee5ff2d4,0x7acb6d5c,0x2d663f85,0x41c72a91,0x82356bf2, + + 0x94e948ec,0xd358d448,0xeca7814d,0x78cd7950,0xd6097277,0x97782a5d,0xf43fc6f4,0x105f0a38, + 0x9e170082,0x4bfe566b,0x4371d25f,0xef25a364,0x698eb672,0x74f850e4,0x4678ff99,0x4a290dc6, + 0x3918f07c,0x32c7d9cd,0x9f28e0af,0x0d3c5a86,0x7bfc8a45,0xddf0c7e1,0xdeacb86b,0x970b3c5c, + 0x5e29e199,0xea28346d,0x6b59e71b,0xf8a8a46a,0x862f6ce4,0x3ccb740b,0x08761e9e,0xbfa01e5f, + 0xf17cfa14,0x2dbf99fb,0x7a0be420,0x06137517,0xe020b266,0xd25bfc61,0xff10ed00,0x42e6be8b, + 0x029ef587,0x683b26e0,0xb08afc70,0x7c1fd59e,0xbaae9a70,0x98c8c801,0xb6e35a26,0x57083971, + 0x90a6a680,0x1b44169e,0x1dce237c,0x518e0a59,0xccb11358,0x7b8175fb,0xb8fe701a,0x10d259bb, + 0xe806ce10,0x9212be79,0x4604ae7b,0x7fa22a84,0xe715b13a,0x0394c3b2,0x11efbbae,0xe13d9e19, + + 0x77e012bd,0x2d05114c,0xaecf2ddd,0xb2a2b4aa,0xb9429546,0x55dce815,0xc89138f8,0x46dcae20, + 0x1f6f7162,0x0c557ebc,0x5b996932,0xafbbe7e2,0xd2bd5f62,0xff475b9f,0x9cec7108,0xeaddcffb, + 0x5d751aef,0xf68f7bdf,0xf3f4e246,0x00983fcd,0x00bc82bb,0xbf5fd3e7,0xe80c7e2c,0x187d8b1f, + 0xefafb9a7,0x8f27a148,0x5c9606a9,0xf2d2be3e,0xe992d13a,0xe4bcd152,0xce40b436,0x63d6a1fc, + 0xdc1455c4,0x64641e39,0xd83010c9,0x2d535ae0,0x5b748f3e,0xf9a9146b,0x80f10294,0x2859acd4, + 0x5fc846da,0x56d190e9,0x82167225,0x98e4daba,0xbf7865f3,0x00da7ae4,0x9b7cd126,0x644172f8, + 0xde40c78f,0xe8803efc,0xdd331a2b,0x48485c3c,0x4ed01ddc,0x9c0b2d9e,0xb1c6e9d7,0xd797d43c, + 0x274101ff,0x3bf7e127,0x91ebbc56,0x7ffeb321,0x4d42096f,0xd6e9456a,0x0bade318,0x2f40ee0b, + + 0x38cebf03,0x0cbc2e72,0xbf03e704,0x7b3e7a9a,0x8e985acd,0x90917617,0x413895f8,0xf11dde04, + 0xc66f8244,0xe5648174,0x6c420271,0x2469d463,0x2540b033,0xdc788e7b,0xe4140ded,0x0990630a, + 0xa54abed4,0x6e124829,0xd940155a,0x1c8836f6,0x38fda06c,0x5207ab69,0xf8be9342,0x774882a8, + 0x56fc0d7e,0x53a99d6e,0x8241f634,0x9490954d,0x447130aa,0x8cc4a81f,0x0868ec83,0xc22c642d, + 0x47880140,0xfbff3bec,0x0f531f41,0xf845a667,0x08c15fb7,0x1996cd81,0x86579103,0xe21dd863, + 0x513d7f97,0x3984a1f1,0xdfcdc5f4,0x97766a5e,0x37e2b1da,0x41441f3f,0xabd9ddba,0x23b755a9, + 0xda937945,0x103e650e,0x3eef7c8f,0x2760ff8d,0x2493a4cd,0x1d671225,0x3bf4bd4c,0xed6e1728, + 0xc70e9e30,0x4e05e529,0x928d5aa6,0x164d0220,0xb5184306,0x4bd7efb3,0x63830f11,0xf3a1526c, + + 0xf1545450,0xd41d5df5,0x25a5060d,0x77b368da,0x4fe33c7e,0xeae09021,0xfdb053c4,0x2930f18d, + 0xd37109ff,0x8511a781,0xc7e7cdd7,0x6aeabc45,0xebbeaeaa,0x9a0c4f11,0xda252cbb,0x5b248f41, + 
0x5223b5eb,0xe32ab782,0x8e6a1c97,0x11d3f454,0x3e05bd16,0x0059001d,0xce13ac97,0xf83b2b4c, + 0x71db5c9a,0xdc8655a6,0x9e98597b,0x3fcae0a2,0x75e63ccd,0x076c72df,0x4754c6ad,0x26b5627b, + 0xd818c697,0x998d5f3d,0xe94fc7b2,0x1f49ad1a,0xca7ff4ea,0x9fe72c05,0xfbd0cbbf,0xb0388ceb, + 0xb76031e3,0xd0f53973,0xfb17907c,0xa4c4c10f,0x9f2d8af9,0xca0e56b0,0xb0d9b689,0xfcbf37a3, + 0xfede8f7d,0xf836511c,0x744003fc,0x89eba576,0xcfdcf6a6,0xc2007f52,0xaaaf683f,0x62d2f9ca, + 0xc996f77f,0x77a7b5b3,0x8ba7d0a4,0xef6a0819,0xa0d903c0,0x01b27431,0x58fffd4c,0x4827f45c, + + 0x44eb5634,0xae70edfc,0x591c740b,0x478bf338,0x2f3b513b,0x67bf518e,0x6fef4a0c,0x1e0b6917, + 0x5ac0edc5,0x2e328498,0x077de7d5,0x5726020b,0x2aeda888,0x45b637ca,0xcf60858d,0x3dc91ae2, + 0x3e6d5294,0xe6900d39,0x0f634c71,0x827a5fa4,0xc713994b,0x1c363494,0x3d43b615,0xe5fe7d15, + 0xf6ada4f2,0x472099d5,0x04360d39,0x7f2a71d0,0x88a4f5ff,0x2c28fac5,0x4cd64801,0xfd78dd33, + 0xc9bdd233,0x21e266cc,0x9bbf419d,0xcbf7d81d,0x80f15f96,0x04242657,0x53fb0f66,0xded11e46, + 0xf2fdba97,0x8d45c9f1,0x4eeae802,0x17003659,0xb9db81a7,0xe734b1b2,0x9503c54e,0xb7c77c3e, + 0x271dd0ab,0xd8b906b5,0x0d540ec6,0xf03b86e0,0x0fdb7d18,0x95e261af,0xad9ec04e,0x381f4a64, + 0xfec798d7,0x09ea20be,0x0ef4ca57,0x1e6195bb,0xfd0da78b,0xcea1653b,0x157d9777,0xf04af50f, + + 0xad7baa23,0xd181714a,0x9bbdab78,0x6c7d1577,0x645eb1e7,0xa0648264,0x35839ca6,0x2287ef45, + 0x32a64ca3,0x26111f6f,0x64814946,0xb0cddaf1,0x4351c59e,0x1b30471c,0xb970788a,0x30e9f597, + 0xd7e58df1,0xc6d2b953,0xf5f37cf4,0x3d7c419e,0xf91ecb2d,0x9c87fd5d,0xb22384ce,0x8c7ac51c, + 0x62c96801,0x57e54091,0x964536fe,0x13d3b189,0x4afd1580,0xeba62239,0xb82ea667,0xae18d43a, + 0xbef04402,0x1942534f,0xc54bf260,0x3c8267f5,0xa1020ddd,0x112fcc8a,0xde596266,0xe91d0856, + 0xf300c914,0xed84478e,0x5b65009e,0x4764da16,0xaf8e07a2,0x4088dc2c,0x9a0cad41,0x2c3f179b, + 0xa67b83f7,0xf27eab09,0xdbe10e28,0xf04c911f,0xd1169f87,0x8e1e4976,0x17f57744,0xe4f5a33f, + 0x27c2e04b,0x0b7523bd,0x07305776,0xc6be7503,0x918fa7c9,0xaf2e2cd9,0x82046f8e,0xcc1c8250 + }; + + uint8 buf[BUFSIZE]; + uint32 saw[BUFSIZE]; + for (int i=0; i(buf+j+1), i, 0); + } + for (int j=1; j<8; ++j) + { + if (hash[0] != hash[j]) + { + printf("alignment problems: %d %d\n", i, j); + ASSERT( false ); + } + } + } + } +#undef BUFSIZE + +// test that all deltas of one or two input bits affect all output bits +#define BUFSIZE 256 +#define TRIES 50 +#define MEASURES 6 + + // this takes hours, not doing that in tests... + void deltas(int seed) + { + printf("\nall 1 or 2 bit input deltas get %d tries to flip every output bit ...\n", TRIES); + + Random random; + random.Init(uint64(seed)); + + // for messages 0..BUFSIZE-1 bytes + for (int h=0; h>1); + measure[5][l] = measure[0][l] + measure[1][l]; + measure[5][l] ^= (measure[4][l]>>1); + } + for (int l=0; l<2; ++l) + { + for (int m=0; m maxk) + { + maxk = k; + } + } + } + printf("passed for buffer size %d max %d\n", h, maxk); + } + } +#undef BUFSIZE +#undef TRIES +#undef MEASURES + + +// test that hashing pieces has the same behavior as hashing the whole +#define BUFSIZE 1024 + TEST(pieces) + { + char buf[BUFSIZE]; + for (int i=0; i + * (c) 2012-2014 Jiří Weiser + * (c) 2013-2014 Vladimír Štill + */ + +/* Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE. */
+
+#include <brick-hash.h>
+#include <brick-shmem.h>
+#include <brick-bitlevel.h>
+#include <brick-types.h>
+#include <brick-assert.h>
+
+#include <type_traits>
+#include <atomic>
+
+#ifndef BRICK_HASHSET_H
+#define BRICK_HASHSET_H
+
+namespace brick {
+namespace hashset {
+
+using hash::hash64_t;
+using hash::hash128_t;
+
+/*
+ * Hash table cell implementations (tables are represented as vectors of
+ * cells).
+ */
+
+template< typename T, typename _Hasher >
+struct CellBase
+{
+    using value_type = T;
+    using Hasher = _Hasher;
+};
+
+template< typename T, typename Hasher >
+struct FastCell : CellBase< T, Hasher >
+{
+    T _value;
+    hash64_t _hash;
+
+    template< typename Value >
+    bool is( Value v, hash64_t hash, Hasher &h ) {
+        return _hash == hash && h.equal( _value, v );
+    }
+
+    bool empty() { return !_hash; }
+    void store( T bn, hash64_t hash ) {
+        _hash = hash;
+        _value = bn;
+    }
+
+    T &fetch() { return _value; }
+    T copy() { return _value; }
+    hash64_t hash( Hasher & ) { return _hash; }
+};
+
+template< typename T, typename Hasher >
+struct CompactCell : CellBase< T, Hasher >
+{
+    T _value;
+
+    template< typename Value >
+    bool is( Value v, hash64_t, Hasher &h ) {
+        return h.equal( _value, v );
+    }
+
+    bool empty() { return !_value; } /* meh */
+    void store( T bn, hash64_t ) { _value = bn; }
+
+    T &fetch() { return _value; }
+    T copy() { return _value; }
+    hash64_t hash( Hasher &h ) { return h.hash( _value ).first; }
+};
+
+template< typename T, typename Hasher >
+struct FastAtomicCell : CellBase< T, Hasher >
+{
+    std::atomic< hash64_t > hashLock;
+    T value;
+
+    bool empty() { return hashLock == 0; }
+    bool invalid() { return hashLock == 3; }
+
+    /* returns old cell value */
+    FastAtomicCell invalidate() {
+        // wait for write to end
+        hash64_t prev = 0;
+        while ( !hashLock.compare_exchange_weak( prev, 0x3 ) ) {
+            if ( prev == 3 )
+                return FastAtomicCell( prev, value );
+            prev &= ~(0x3); // clean flags
+        }
+        return FastAtomicCell( prev, value );
+    }
+
+    T &fetch() { return value; }
+    T copy() { return value; }
+
+    // TODO: this loses bits and hence doesn't quite work
+    // hash64_t hash( Hasher & ) { return hashLock >> 2; }
+    hash64_t hash( Hasher &h ) { return h.hash( value ).first; }
+
+    // wait for another write; returns false if cell was invalidated
+    bool wait() {
+        while ( hashLock & 1 )
+            if ( invalid() )
+                return false;
+        return true;
+    }
+
+    bool tryStore( T v, hash64_t hash ) {
+        hash |= 0x1;
+        hash64_t chl = 0;
+        if ( hashLock.compare_exchange_strong( chl, (hash << 2) | 1 ) ) {
+            value = v;
+            hashLock.exchange( hash << 2 );
+            return true;
+        }
+        return false;
+    }
+
+    template<
typename Value > + bool is( Value v, hash64_t hash, Hasher &h ) { + hash |= 0x1; + if ( ( (hash << 2) | 1) != (hashLock | 1) ) + return false; + if ( !wait() ) + return false; + return h.equal( value, v ); + } + + FastAtomicCell() : hashLock( 0 ), value() {} + FastAtomicCell( const FastAtomicCell & ) : hashLock( 0 ), value() {} + FastAtomicCell( hash64_t hash, T value ) : hashLock( hash ), value( value ) { } +}; + +template< typename T, typename = void > +struct Tagged { + T t; + uint32_t _tag; + + static const int tagBits = 16; + void setTag( uint32_t v ) { _tag = v; } + uint32_t tag() { return _tag; } + Tagged() noexcept : t(), _tag( 0 ) {} + Tagged( const T &t ) : t( t ), _tag( 0 ) {} +}; + +template< typename T > +struct Tagged< T, typename std::enable_if< (T::tagBits > 0) >::type > +{ + T t; + + static const int tagBits = T::tagBits; + void setTag( uint32_t value ) { t.setTag( value ); } + uint32_t tag() { return t.tag(); } + Tagged() noexcept : t() {} + Tagged( const T &t ) : t( t ) {} +}; + +template< typename T, typename Hasher > +struct AtomicCell : CellBase< T, Hasher > +{ + std::atomic< Tagged< T > > value; + + static_assert( sizeof( std::atomic< Tagged< T > > ) == sizeof( Tagged< T > ), + "std::atomic< Tagged< T > > must be lock-free" ); + static_assert( Tagged< T >::tagBits > 0, "T has at least a one-bit tagspace" ); + + bool empty() { return !value.load().t; } + bool invalid() { + Tagged< T > v = value.load(); + return (v.tag() == 0 && v.t) || (v.tag() != 0 && !v.t); + } + + static hash64_t hashToTag( hash64_t hash, int bits = Tagged< T >::tagBits ) + { + // use different part of hash than used for storing + return ( hash >> ( sizeof( hash64_t ) * 8 - bits ) ) | 0x1; + } + + /* returns old cell value */ + AtomicCell invalidate() { + Tagged< T > v = value; + v.setTag( v.tag() ? 
0 : 1 ); // set tag to 1 if it was empty -> empty != invalid
+        return AtomicCell( value.exchange( v ) );
+    }
+
+    Tagged< T > &deatomize() {
+        value.load(); // fence
+        return *reinterpret_cast< Tagged< T > * >( &value );
+    }
+
+    T &fetch() { return deatomize().t; }
+    T copy() { Tagged< T > v = value; v.setTag( 0 ); return v.t; }
+    bool wait() { return !invalid(); }
+
+    void store( T bn, hash64_t hash ) {
+        tryStore( bn, hash );
+    }
+
+    bool tryStore( T b, hash64_t hash ) {
+        Tagged< T > zero;
+        Tagged< T > next( b );
+        next.setTag( hashToTag( hash ) );
+        auto rv = value.compare_exchange_strong( zero, next );
+        return rv;
+    }
+
+    template< typename Value >
+    bool is( Value v, hash64_t hash, Hasher &h ) {
+        return value.load().tag() == hashToTag( hash ) &&
+               h.equal( value.load().t, v );
+    }
+
+    hash64_t hash( Hasher &h ) { return h.hash( value.load().t ).first; }
+
+    // AtomicCell &operator=( const AtomicCell &cc ) = delete;
+
+    AtomicCell() : value() {}
+    AtomicCell( const AtomicCell & ) : value() {}
+    AtomicCell( Tagged< T > val ) : value() {
+        value.store( val );
+    }
+};
+
+// default hash implementation
+template< typename T >
+struct default_hasher {};
+
+template< typename T >
+struct Found : types::Wrapper< T >
+{
+    bool _found;
+
+    Found( const T &t, bool found ) : types::Wrapper< T >( t ), _found( found ) {}
+    bool isnew() { return !_found; }
+    bool found() { return _found; }
+};
+
+template< typename S, typename F >
+types::FMap< Found, S, F > fmap( F f, Found< S > n ) {
+    return types::FMap< Found, S, F >( f( n.unwrap() ), n._found );
+}
+
+template< typename T >
+Found< T > isNew( const T &x, bool y ) {
+    return Found< T >( x, !y );
+}
+
+template< typename Cell >
+struct HashSetBase
+{
+    struct ThreadData {};
+
+    using value_type = typename Cell::value_type;
+    using Hasher = typename Cell::Hasher;
+
+    static const unsigned cacheLine = 64; // bytes
+    static const unsigned thresh = cacheLine / sizeof( Cell );
+    static const unsigned threshMSB = bitlevel::compiletime::MSB( thresh );
+    static const unsigned maxcollisions = 1 << 16; // 2^16
+    static const unsigned growthreshold = 75; // percent
+
+    Hasher hasher;
+
+    struct iterator {
+        Cell *_cell;
+        bool _new;
+        iterator( Cell *c = nullptr, bool n = false ) : _cell( c ), _new( n ) {}
+        value_type *operator->() { return &(_cell->fetch()); }
+        value_type &operator*() { return _cell->fetch(); }
+        value_type copy() { return _cell->copy(); }
+        bool valid() { return _cell; }
+        bool isnew() { return _new; }
+    };
+
+    iterator end() { return iterator(); }
+
+    static size_t index( hash64_t h, size_t i, size_t mask ) {
+        h &= ~hash64_t( thresh - 1 );
+        const unsigned Q = 1, R = 1;
+        if ( i < thresh )
+            return ( h + i ) & mask;
+        else {
+            size_t j = i & ( thresh - 1 );
+            i = i >> threshMSB;
+            size_t hop = ( (2 * Q + 1) * i + 2 * R * (i * i) ) << threshMSB;
+            return ( h + j + hop ) & mask;
+        }
+    }
+
+    HashSetBase( const Hasher &h ) : hasher( h ) {}
+};
+
+/**
+ * An implementation of a high-performance hash table used as a set. It is an
+ * open-hashing implementation with a combination of linear and quadratic
+ * probing. It also uses a hash-compacted prefilter to avoid fetching the
+ * stored item when the item at the current lookup position is distinct (a
+ * collision).
+ *
+ * An initial size may be provided to improve performance in cases where it is
+ * known there will be many elements. Table growth is exponential with base 2
+ * and is triggered at 75% load (see maxcollisions).
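+ *
+ * A minimal usage sketch (illustrative only -- IntHasher is a made-up hasher,
+ * not part of this file; it mirrors the test_hasher used in the unit tests):
+ *
+ *     struct IntHasher {
+ *         hash128_t hash( int v ) const { return std::make_pair( v, v ); }
+ *         bool valid( int ) const { return true; }
+ *         bool equal( int a, int b ) const { return a == b; }
+ *     };
+ *
+ *     Fast< int, IntHasher > set;
+ *     if ( set.insert( 42 ).isnew() )
+ *         ASSERT( set.count( 42 ) );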
+ */ +template< typename Cell > +struct _HashSet : HashSetBase< Cell > +{ + using Base = HashSetBase< Cell >; + typedef std::vector< Cell > Table; + _HashSet< Cell > &withTD( typename Base::ThreadData & ) { return *this; } + + using typename Base::iterator; + using typename Base::value_type; + using typename Base::Hasher; + + Table _table; + int _used; + int _bits; + size_t _maxsize; + bool _growing; + + size_t size() const { return _table.size(); } + bool empty() const { return !_used; } + + int count( const value_type &i ) { return find( i ).valid(); } + hash64_t hash( const value_type &i ) { return hash128( i ).first; } + hash128_t hash128( const value_type &i ) { return this->hasher.hash( i ); } + iterator insert( value_type i ) { return insertHinted( i, hash( i ) ); } + + template< typename T > + iterator find( const T &i ) { + return findHinted( i, hash( i ) ); + } + + template< typename T > + iterator findHinted( const T &item, hash64_t hash ) + { + size_t idx; + for ( size_t i = 0; i < this->maxcollisions; ++i ) { + idx = this->index( hash, i, _bits ); + + if ( _table[ idx ].empty() ) + return this->end(); + + if ( _table[ idx ].is( item, hash, this->hasher ) ) + return iterator( &_table[ idx ] ); + } + // we can be sure that the element is not in the table *because*: we + // never create chains longer than "mc", and if we haven't found the + // key in this many steps, it can't be in the table + return this->end(); + } + + iterator insertHinted( const value_type &i, hash64_t h ) { + return insertHinted( i, h, _table, _used ); + } + + iterator insertHinted( const value_type &item, hash64_t h, Table &table, int &used ) + { + if ( !_growing && size_t( _used ) > (size() / 100) * 75 ) + grow(); + + size_t idx; + for ( size_t i = 0; i < this->maxcollisions; ++i ) { + idx = this->index( h, i, _bits ); + + if ( table[ idx ].empty() ) { + ++ used; + table[ idx ].store( item, h ); + return iterator( &table[ idx ], true ); + } + + if ( table[ idx ].is( item, h, this->hasher ) ) + return iterator( &table[ idx ], false ); + } + + grow(); + + return insertHinted( item, h, table, used ); + } + + void grow() { + if ( 2 * size() >= _maxsize ) + ASSERT_UNREACHABLE( "ran out of space in the hash table" ); + + if( _growing ) + ASSERT_UNREACHABLE( "too many collisions during table growth" ); + + _growing = true; + + int used = 0; + + Table table; + + table.resize( 2 * size(), Cell() ); + _bits |= (_bits << 1); // unmask more + + for ( auto cell : _table ) { + if ( cell.empty() ) + continue; + insertHinted( cell.fetch(), cell.hash( this->hasher ), + table, used ); + } + + std::swap( table, _table ); + ASSERT_EQ( used, _used ); + + _growing = false; + } + + void setSize( size_t s ) + { + _bits = 0; + while ((s = s >> 1)) + _bits |= s; + _table.resize( _bits + 1, Cell() ); + } + + void clear() { + _used = 0; + std::fill( _table.begin(), _table.end(), value_type() ); + } + + bool valid( int off ) { + return !_table[ off ].empty(); + } + + value_type &operator[]( int off ) { + return _table[ off ].fetch(); + } + + + _HashSet() : _HashSet( Hasher() ) {} + explicit _HashSet( Hasher h ) : _HashSet( h, 32 ) {} + + _HashSet( Hasher h, int initial ) + : Base( h ), _used( 0 ), _maxsize( -1 ), _growing( false ) + { + setSize( initial ); + } +}; + +template< typename T, typename Hasher = default_hasher< T > > +using Fast = _HashSet< FastCell< T, Hasher > >; + +template< typename T, typename Hasher = default_hasher< T > > +using Compact = _HashSet< CompactCell< T, Hasher > >; + +template< typename Cell > +struct 
_ConcurrentHashSet : HashSetBase< Cell >
+{
+    using Base = HashSetBase< Cell >;
+    using typename Base::Hasher;
+    using typename Base::value_type;
+    using typename Base::iterator;
+
+    enum class Resolution {
+        Success,  // the item has been inserted successfully
+        Failed,   // cannot insert value, table growth has been triggered while
+                  // we were looking for a free cell
+        Found,    // item was already in the table
+        NotFound,
+        NoSpace,  // there is not enough space in the table
+        Growing   // table is growing or was already resized, retry
+    };
+
+    struct _Resolution {
+        Resolution r;
+        Cell *c;
+
+        _Resolution( Resolution r, Cell *c = nullptr ) : r( r ), c( c ) {}
+    };
+
+    using Insert = _Resolution;
+    using Find = _Resolution;
+
+    struct ThreadData {
+        unsigned inserts;
+        unsigned currentRow;
+
+        ThreadData() : inserts( 0 ), currentRow( 0 ) {}
+    };
+
+    struct Row {
+        std::atomic< Cell * > _data;
+        size_t _size;
+
+        size_t size() const { return _size; }
+
+        void size( size_t s ) {
+            ASSERT( empty() );
+            _size = std::max( s, size_t( 1 ) );
+        }
+
+        bool empty() const { return begin() == nullptr; }
+
+        void resize( size_t n ) {
+            Cell *old = _data.exchange( new Cell[ n ] );
+            _size = n;
+            delete[] old;
+        }
+
+        void free() {
+            Cell *old = _data.exchange( nullptr );
+            _size = 0;
+            delete[] old;
+        }
+
+        Cell &operator[]( size_t i ) {
+            return _data.load( std::memory_order_relaxed )[ i ];
+        }
+
+        Cell *begin() {
+            return _data.load( std::memory_order_relaxed );
+        }
+        Cell *begin() const {
+            return _data.load( std::memory_order_relaxed );
+        }
+
+        Cell *end() {
+            return begin() + size();
+        }
+        Cell *end() const {
+            return begin() + size();
+        }
+
+        Row() : _data( nullptr ), _size( 0 ) {}
+        ~Row() { free(); }
+    };
+
+    static const unsigned segmentSize = 1 << 16; // 2^16 = 65536
+    static const unsigned syncPoint = 1 << 10;   // 2^10 = 1024
+
+    struct Data
+    {
+        Hasher hasher;
+        std::vector< Row > table;
+        std::vector< std::atomic< unsigned short > > tableWorkers;
+        std::atomic< unsigned > currentRow;
+        std::atomic< int > availableSegments;
+        std::atomic< unsigned > doneSegments;
+        std::atomic< size_t > used;
+        std::atomic< bool > growing;
+
+        Data( const Hasher &h, unsigned maxGrows )
+            : hasher( h ), table( maxGrows ), tableWorkers( maxGrows ), currentRow( 0 ),
+              availableSegments( 0 ), doneSegments( 0 ), used( 0 ), growing( false )
+        {}
+    };
+
+    Data _d;
+    ThreadData _global; /* for single-thread access */
+
+    static size_t nextSize( size_t s ) {
+        if ( s < 512 * 1024 )
+            return s * 16;
+        if ( s < 16 * 1024 * 1024 )
+            return s * 8;
+        if ( s < 32 * 1024 * 1024 )
+            return s * 4;
+        return s * 2;
+    }
+
+    struct WithTD
+    {
+        using iterator = typename Base::iterator;
+        using value_type = typename Base::value_type;
+
+        Data &_d;
+        ThreadData &_td;
+        WithTD( Data &d, ThreadData &td ) : _d( d ), _td( td ) {}
+
+        size_t size() { return current().size(); }
+        Row &current() { return _d.table[ _d.currentRow ]; }
+        Row &current( unsigned index ) { return _d.table[ index ]; }
+        bool changed( unsigned row ) { return row < _d.currentRow || _d.growing; }
+
+        iterator insert( value_type x ) {
+            return insertHinted( x, _d.hasher.hash( x ).first );
+        }
+
+        template< typename T >
+        iterator find( T x ) {
+            return findHinted( x, _d.hasher.hash( x ).first );
+        }
+
+        int count( value_type x ) {
+            return find( x ).valid() ?
1 : 0; + } + + iterator insertHinted( value_type x, hash64_t h ) + { + while ( true ) { + Insert ir = insertCell< false >( x, h ); + switch ( ir.r ) { + case Resolution::Success: + increaseUsage(); + return iterator( ir.c, true ); + case Resolution::Found: + return iterator( ir.c, false ); + case Resolution::NoSpace: + if ( grow( _td.currentRow + 1 ) ) { + ++_td.currentRow; + break; + } + case Resolution::Growing: + helpWithRehashing(); + updateIndex( _td.currentRow ); + break; + default: + ASSERT_UNREACHABLE("impossible result from insertCell"); + } + } + ASSERT_UNREACHABLE("broken loop"); + } + + template< typename T > + iterator findHinted( T x, hash64_t h ) { + while ( true ) { + Find fr = findCell( x, h, _td.currentRow ); + switch ( fr.r ) { + case Resolution::Found: + return iterator( fr.c ); + case Resolution::NotFound: + return iterator(); + case Resolution::Growing: + helpWithRehashing(); + updateIndex( _td.currentRow ); + break; + default: + ASSERT_UNREACHABLE("impossible result from findCell"); + } + } + ASSERT_UNREACHABLE("broken loop"); + } + + template< typename T > + Find findCell( T v, hash64_t h, unsigned rowIndex ) + { + if ( changed( rowIndex ) ) + return Find( Resolution::Growing ); + + Row &row = current( rowIndex ); + + if ( row.empty() ) + return Find( Resolution::NotFound ); + + const size_t mask = row.size() - 1; + + for ( size_t i = 0; i < Base::maxcollisions; ++i ) { + if ( changed( rowIndex ) ) + return Find( Resolution::Growing ); + + Cell &cell = row[ Base::index( h, i, mask ) ]; + if ( cell.empty() ) + return Find( Resolution::NotFound ); + if ( cell.is( v, h, _d.hasher ) ) + return Find( Resolution::Found, &cell ); + if ( cell.invalid() ) + return Find( Resolution::Growing ); + } + return Find( Resolution::NotFound ); + } + + template< bool force > + Insert insertCell( value_type x, hash64_t h ) + { + Row &row = current( _td.currentRow ); + if ( !force ) { + // read usage first to guarantee usage <= size + size_t u = _d.used.load(); + // usage >= 75% of table size + // usage is never greater than size + if ( row.empty() || double( row.size() ) <= double( 4 * u ) / 3 ) + return Insert( Resolution::NoSpace ); + if ( changed( _td.currentRow ) ) + return Insert( Resolution::Growing ); + } + + ASSERT( !row.empty() ); + const size_t mask = row.size() - 1; + + for ( size_t i = 0; i < Base::maxcollisions; ++i ) + { + Cell &cell = row[ Base::index( h, i, mask ) ]; + + if ( cell.empty() ) { + if ( cell.tryStore( x, h ) ) + return Insert( Resolution::Success, &cell ); + if ( !force && changed( _td.currentRow ) ) + return Insert( Resolution::Growing ); + } + if ( cell.is( x, h, _d.hasher ) ) + return Insert( Resolution::Found, &cell ); + + if ( !force && changed( _td.currentRow ) ) + return Insert( Resolution::Growing ); + } + return Insert( Resolution::NoSpace ); + } + + bool grow( unsigned rowIndex ) + { + ASSERT( rowIndex ); + + if ( rowIndex >= _d.table.size() ) + ASSERT_UNREACHABLE( "out of growth space" ); + + if ( _d.currentRow >= rowIndex ) + return false; + + while ( _d.growing.exchange( true ) ) // acquire growing lock + helpWithRehashing(); + + if ( _d.currentRow >= rowIndex ) { + _d.growing.exchange( false ); // release the lock + return false; + } + + Row &row = current( rowIndex - 1 ); + _d.table[ rowIndex ].resize( nextSize( row.size() ) ); + _d.currentRow.exchange( rowIndex ); + _d.tableWorkers[ rowIndex ] = 1; + _d.doneSegments.exchange( 0 ); + + // current row is fake, so skip the rehashing + if ( row.empty() ) { + rehashingDone(); + return true; + } + 
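+            // split the old row into fixed-size segments; this thread starts
+            // rehashing them, and any thread that observes the growth joins
+            // in through helpWithRehashing() until every segment is done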
+ const unsigned segments = std::max( row.size() / segmentSize, size_t( 1 ) ); + _d.availableSegments.exchange( segments ); + + while ( rehashSegment() ); + + return true; + } + + void helpWithRehashing() { + while ( _d.growing ) + while( rehashSegment() ); + } + + void rehashingDone() { + releaseRow( _d.currentRow - 1 ); + _d.growing.exchange( false ); /* done */ + } + + bool rehashSegment() { + int segment; + if ( _d.availableSegments <= 0 ) + return false; + if ( ( segment = --_d.availableSegments ) < 0 ) + return false; + + Row &row = current( _d.currentRow - 1 ); + size_t segments = std::max( row.size() / segmentSize, size_t( 1 ) ); + auto it = row.begin() + segmentSize * segment; + auto end = it + segmentSize; + if ( end > row.end() ) + end = row.end(); + ASSERT( it < end ); + + ThreadData td; + td.currentRow = _d.currentRow; + + // every cell has to be invalidated + for ( ; it != end; ++it ) { + Cell old = it->invalidate(); + if ( old.empty() || old.invalid() ) + continue; + + value_type value = old.fetch(); + Resolution r = WithTD( _d, td ).insertCell< true >( value, old.hash( _d.hasher ) ).r; + switch( r ) { + case Resolution::Success: + break; + case Resolution::NoSpace: + ASSERT_UNREACHABLE( "ran out of space during growth" ); + default: + ASSERT_UNREACHABLE( "internal error" ); + } + } + + if ( ++_d.doneSegments == segments ) + rehashingDone(); + + return segment > 0; + } + + void updateIndex( unsigned &index ) { + unsigned row = _d.currentRow; + if ( row != index ) { + releaseRow( index ); + acquireRow( row ); + index = row; + } + } + + void releaseRow( unsigned index ) { + // special case - zero index + if ( !_d.tableWorkers[ index ] ) + return; + // only last thread releases memory + if ( !--_d.tableWorkers[ index ] ) + _d.table[ index ].free(); + } + + void acquireRow( unsigned &index ) { + unsigned short refCount = _d.tableWorkers[ index ]; + + do { + if ( !refCount ) { + index = _d.currentRow; + refCount = _d.tableWorkers[ index ]; + continue; + } + + if (_d.tableWorkers[ index ].compare_exchange_weak( refCount, refCount + 1 )) + break; + } while( true ); + } + + void increaseUsage() { + if ( ++_td.inserts == syncPoint ) { + _d.used += syncPoint; + _td.inserts = 0; + } + } + + }; + + WithTD withTD( ThreadData &td ) { return WithTD( _d, td ); } + + explicit _ConcurrentHashSet( Hasher h = Hasher(), unsigned maxGrows = 64 ) + : Base( h ), _d( h, maxGrows ) + { + setSize( 16 ); // by default + } + + /* XXX only usable before the first insert; rename? 
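+
+       A typical call sequence (illustrative sketch only; MyHasher is a
+       made-up hasher type, not defined here):
+
+           brick::hashset::Concurrent< int, MyHasher > set;
+           set.setSize( 1 << 20 );   // must happen before any insert
+           set.insert( 42 );
+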
*/
+    void setSize( size_t s ) {
+        s = bitlevel::fill( s - 1 ) + 1;
+        size_t toSet = 1;
+        while ( nextSize( toSet ) < s )
+            toSet <<= 1;
+        _d.table[ 0 ].size( toSet );
+    }
+
+    hash64_t hash( const value_type &t ) { return hash128( t ).first; }
+    hash128_t hash128( const value_type &t ) { return _d.hasher.hash( t ); }
+    iterator insert( const value_type &t ) { return withTD( _global ).insert( t ); }
+    int count( const value_type &t ) { return withTD( _global ).count( t ); }
+    size_t size() { return withTD( _global ).size(); }
+
+    _ConcurrentHashSet( const _ConcurrentHashSet & ) = delete;
+    _ConcurrentHashSet &operator=( const _ConcurrentHashSet & ) = delete;
+
+    /* multiple threads may use operator[], but not concurrently with insertions */
+    value_type operator[]( size_t index ) { // XXX return a reference
+        return _d.table[ _d.currentRow ][ index ].fetch();
+    }
+
+    bool valid( size_t index ) {
+        return !_d.table[ _d.currentRow ][ index ].empty();
+    }
+};
+
+template< typename T, typename Hasher = default_hasher< T > >
+using FastConcurrent = _ConcurrentHashSet< FastAtomicCell< T, Hasher > >;
+
+template< typename T, typename Hasher = default_hasher< T > >
+using CompactConcurrent = _ConcurrentHashSet< AtomicCell< T, Hasher > >;
+
+#ifdef BRICKS_FORCE_FAST_CONCURRENT_SET
+template< typename T, typename Hasher = default_hasher< T > >
+using Concurrent = FastConcurrent< T, Hasher >;
+
+#elif defined BRICKS_FORCE_COMPACT_CONCURRENT_SET
+template< typename T, typename Hasher = default_hasher< T > >
+using Concurrent = CompactConcurrent< T, Hasher >;
+
+#else
+template< typename T, typename Hasher = default_hasher< T > >
+using Concurrent = _ConcurrentHashSet< typename std::conditional< (
+        sizeof( Tagged< T > ) > 8 // most platforms do not have CAS for data types bigger than 64 bits;
+                                  // for example, a 16B CAS does not link in clang 3.4 on x86_64
+        || sizeof( std::atomic< Tagged< T > > ) > sizeof( Tagged< T > ) // atomic is not lock-free
+        || sizeof( AtomicCell< T, Hasher > ) >= sizeof( FastAtomicCell< T, Hasher > ) ),
+    FastAtomicCell< T, Hasher >, AtomicCell< T, Hasher > >::type >;
+#endif
+
+}
+}
+
+/* unit tests */
+
+namespace brick_test {
+namespace hashset {
+
+using namespace ::brick::hashset;
+
+template< template< typename > class HS >
+struct Sequential
+{
+    TEST(basic) {
+        HS< int > set;
+
+        ASSERT( !set.count( 1 ) );
+        ASSERT( set.insert( 1 ).isnew() );
+        ASSERT( set.count( 1 ) );
+
+        unsigned count = 0;
+        for ( unsigned i = 0; i != set.size(); ++i )
+            if ( set[ i ] )
+                ++count;
+
+        ASSERT_EQ( count, 1u );
+    }
+
+    TEST(stress) {
+        HS< int > set;
+        for ( int i = 1; i < 32*1024; ++i ) {
+            set.insert( i );
+            ASSERT( set.count( i ) );
+        }
+        for ( int i = 1; i < 32*1024; ++i ) {
+            ASSERT( set.count( i ) );
+        }
+    }
+
+    TEST(set) {
+        HS< int > set;
+
+        for ( int i = 1; i < 32*1024; ++i ) {
+            ASSERT( !set.count( i ) );
+        }
+
+        for ( int i = 1; i < 32*1024; ++i ) {
+            set.insert( i );
+            ASSERT( set.count( i ) );
+            ASSERT( !set.count( i + 1 ) );
+        }
+
+        for ( int i = 1; i < 32*1024; ++i ) {
+            ASSERT( set.count( i ) );
+        }
+
+        for ( int i = 32*1024; i < 64 * 1024; ++i ) {
+            ASSERT( !set.count( i ) );
+        }
+    }
+};
+
+template< template< typename > class HS >
+struct Parallel
+{
+    struct Insert : shmem::Thread {
+        HS< int > *_set;
+        typename HS< int >::ThreadData td;
+        int from, to;
+        bool overlap;
+
+        void main() {
+            auto set = _set->withTD( td );
+            for ( int i = from; i < to; ++i ) {
+                set.insert( i );
+                ASSERT( !set.insert( i ).isnew() );
+                if ( !overlap && i < to - 1 )
+                    ASSERT( !set.count( i + 1 ) );
+            }
+ } + }; + + TEST(insert) { + HS< int > set; + set.setSize( 4 * 1024 ); + Insert a; + a._set = &set; + a.from = 1; + a.to = 32 * 1024; + a.overlap = false; + a.main(); + for ( int i = 1; i < 32*1024; ++i ) + ASSERT( set.count( i ) ); + } + + static void _par( HS< int > *set, int f1, int t1, int f2, int t2 ) + { + Insert a, b; + + a.from = f1; + a.to = t1; + b.from = f2; + b.to = t2; + a._set = set; + b._set = set; + a.overlap = b.overlap = (t1 > f2); + + a.start(); + b.start(); + a.join(); + b.join(); + } + + static void _multi( HS< int > *set, std::size_t count, int from, int to ) + { + Insert *arr = new Insert[ count ]; + + for ( std::size_t i = 0; i < count; ++i ) { + arr[ i ].from = from; + arr[ i ].to = to; + arr[ i ]._set = set; + arr[ i ].overlap = true; + } + + for ( std::size_t i = 0; i < count; ++i ) + arr[ i ].start(); + + for ( std::size_t i = 0; i < count; ++i ) + arr[ i ].join(); + + delete[] arr; + } + + TEST(multi) + { + HS< int > set; + set.setSize( 4 * 1024 ); + _multi( &set, 10, 1, 32 * 1024 ); + + for ( int i = 1; i < 32 * 1024; ++i ) + ASSERT( set.count( i ) ); + + int count = 0; + std::set< int > s; + for ( size_t i = 0; i != set.size(); ++i ) { + if ( set[ i ] ) { + if ( s.find( set[ i ] ) == s.end() ) + s.insert( set[ i ] ); + ++count; + } + } + ASSERT_EQ( count, 32 * 1024 - 1 ); + } + + TEST(stress) + { + HS< int > set; + + set.setSize( 4 * 1024 ); + _par( &set, 1, 16*1024, 8*1024, 32*1024 ); + + for ( int i = 1; i < 32*1024; ++i ) + ASSERT( set.count( i ) ); + } + + TEST(set) { + HS< int > set; + set.setSize( 4 * 1024 ); + for ( int i = 1; i < 32*1024; ++i ) + ASSERT( !set.count( i ) ); + + _par( &set, 1, 16*1024, 16*1024, 32*1024 ); + + for ( int i = 1; i < 32*1024; ++i ) + ASSERT_EQ( i, i * set.count( i ) ); + + for ( int i = 32*1024; i < 64 * 1024; ++i ) + ASSERT( !set.count( i ) ); + } +}; + +template< typename T > +struct test_hasher { + template< typename X > + test_hasher( X& ) { } + test_hasher() = default; + hash128_t hash( int t ) const { return std::make_pair( t, t ); } + bool valid( int t ) const { return t != 0; } + bool equal( int a, int b ) const { return a == b; } +}; + +template< typename T > using CS = Compact< T, test_hasher< T > >; +template< typename T > using FS = Fast< T, test_hasher< T > >; +template< typename T > using ConCS = CompactConcurrent< T, test_hasher< T > >; +template< typename T > using ConFS = FastConcurrent< T, test_hasher< T > >; + +/* instantiate the testcases */ +template struct Sequential< CS >; +template struct Sequential< FS >; +template struct Sequential< ConCS >; +template struct Sequential< ConFS >; +template struct Parallel< ConCS >; +template struct Parallel< ConFS >; + +} +} + +#ifdef BRICK_BENCHMARK_REG + +#include +#include +#include + +#ifdef BRICKS_HAVE_TBB +#include +#include +#endif + +namespace brick_test { +namespace hashset { + +template< typename HS > +struct RandomThread : shmem::Thread { + HS *_set; + typename HS::ThreadData td; + int count, id; + std::mt19937 rand; + std::uniform_int_distribution<> dist; + bool insert; + int max; + + RandomThread() : insert( true ) {} + + void main() { + rand.seed( id ); + auto set = _set->withTD( td ); + for ( int i = 0; i < count; ++i ) { + int v = dist( rand ); + if ( max < std::numeric_limits< int >::max() ) { + v = v % max; + v = v * v + v + 41; /* spread out the values */ + } + if ( insert ) + set.insert( v ); + else + set.count( v ); + } + }; +}; + +namespace { + +Axis axis_items( int min = 16, int max = 16 * 1024 ) { + Axis a; + a.type = Axis::Quantitative; + 
a.name = "items"; + a.log = true; + a.step = sqrt(sqrt(2)); + a.normalize = Axis::Div; + a.unit = "k"; + a.unit_div = 1000; + a.min = min * 1000; + a.max = max * 1000; + return a; +} + +Axis axis_threads( int max = 16 ) { + Axis a; + a.type = Axis::Quantitative; + a.name = "threads"; + a.normalize = Axis::Mult; + a.unit = ""; + a.min = 1; + a.max = max; + a.step = 1; + return a; +} + +Axis axis_reserve( int max = 200, int step = 50 ) +{ + Axis a; + a.type = Axis::Quantitative; + a.name = "reserve"; + a.unit = "%"; + a.min = 0; + a.max = max; + a.step = step; + return a; +} + +Axis axis_types( int count ) +{ + Axis a; + a.type = Axis::Qualitative; + a.name = "type"; + a.unit = ""; + a.min = 0; + a.max = count - 1; + a.step = 1; + return a; +} + +} + +template< typename T > struct TN {}; +template< typename > struct _void { typedef void T; }; + +template< typename Ts > +struct Run : BenchmarkGroup +{ + template< typename, int Id > + std::string render( int, hlist::not_preferred ) { return ""; } + + template< typename Tss = Ts, int Id = 0, typename = typename Tss::Head > + std::string render( int id, hlist::preferred = hlist::preferred() ) + { + if ( id == Id ) + return TN< typename Tss::Head >::n(); + return render< typename Tss::Tail, Id + 1 >( id, hlist::preferred() ); + } + + std::string describe() { + std::string s; + for ( int i = 0; i < int( Ts::length ); ++i ) + s += " type:" + render( i ); + return std::string( s, 1, s.size() ); + } + + template< template< typename > class, typename Self, int, typename, typename... Args > + static void run( Self *, hlist::not_preferred, Args... ) { + ASSERT_UNREACHABLE( "brick_test::hashset::Run fell off the cliff" ); + } + + template< template< typename > class RI, typename Self, int id, + typename Tss, typename... Args > + static auto run( Self *self, hlist::preferred, Args... args ) + -> typename _void< typename Tss::Head >::T + { + if ( self->type() == id ) { + RI< typename Tss::Head > x( self, args... ); + self->reset(); // do not count the constructor + x( self ); + } else + run< RI, Self, id + 1, typename Tss::Tail, Args... >( self, hlist::preferred(), args... ); + } + + template< template< typename > class RI, typename Self, typename... Args > + static void run( Self *self, Args... args ) { + run< RI, Self, 0, Ts, Args... >( self, hlist::preferred(), args... 
);
+    }
+
+    int type() { return 0; } // default
+};
+
+template< int _threads, typename T >
+struct ItemsVsReserve : Run< hlist::TypeList< T > >
+{
+    ItemsVsReserve() {
+        this->x = axis_items();
+        this->y = axis_reserve();
+    }
+
+    std::string fixed() {
+        std::stringstream s;
+        s << "threads:" << _threads;
+        return s.str();
+    }
+
+    int threads() { return _threads; }
+    int items() { return this->p; }
+    double reserve() { return this->q / 100.0; }
+    double normal() { return _threads; }
+};
+
+template< int _max_threads, int _reserve, typename T >
+struct ItemsVsThreads : Run< hlist::TypeList< T > >
+{
+    ItemsVsThreads() {
+        this->x = axis_items();
+        this->y = axis_threads( _max_threads );
+    }
+
+    std::string fixed() {
+        std::stringstream s;
+        s << "reserve:" << _reserve;
+        return s.str();
+    }
+
+    int threads() { return this->q; }
+    int items() { return this->p; }
+    double reserve() { return _reserve / 100.0; }
+};
+
+template< int _items, typename T >
+struct ThreadsVsReserve : Run< hlist::TypeList< T > >
+{
+    ThreadsVsReserve() {
+        this->x = axis_threads();
+        this->y = axis_reserve();
+    }
+
+    std::string fixed() {
+        std::stringstream s;
+        s << "items:" << _items << "k";
+        return s.str();
+    }
+
+    int threads() { return this->p; }
+    double reserve() { return this->q / 100.0; }
+    int items() { return _items * 1000; }
+};
+
+template< int _threads, int _reserve, typename... Ts >
+struct ItemsVsTypes : Run< hlist::TypeList< Ts... > >
+{
+    ItemsVsTypes() {
+        this->x = axis_items();
+        this->y = axis_types( sizeof...( Ts ) );
+        this->y._render = [this]( int i ) {
+            return this->render( i );
+        };
+    }
+
+    std::string fixed() {
+        std::stringstream s;
+        s << "threads:" << _threads << " reserve:" << _reserve;
+        return s.str();
+    }
+
+    int threads() { return _threads; }
+    double reserve() { return _reserve / 100.0; }
+    int items() { return this->p; }
+    int type() { return this->q; }
+    double normal() { return _threads; }
+};
+
+template< int _items, int _reserve, int _threads, typename... Ts >
+struct ThreadsVsTypes : Run< hlist::TypeList< Ts...
> > +{ + ThreadsVsTypes() { + this->x = axis_threads( _threads ); + this->y = axis_types( sizeof...( Ts ) ); + this->y._render = [this]( int i ) { + return this->render( i ); + }; + } + + std::string fixed() { + std::stringstream s; + s << "items:" << _items << "k reserve:" << _reserve; + return s.str(); + } + + int threads() { return this->p; } + double reserve() { return _reserve / 100.0; } + int items() { return _items * 1000; } + int type() { return this->q; } + double normal() { return 1.0 / items(); } +}; + +template< typename T > +struct RandomInsert { + bool insert; + int max; + using HS = typename T::template HashTable< int >; + HS t; + + template< typename BG > + RandomInsert( BG *bg, int max = std::numeric_limits< int >::max() ) + : insert( true ), max( max ) + { + if ( bg->reserve() > 0 ) + t.setSize( bg->items() * bg->reserve() ); + } + + template< typename BG > + void operator()( BG *bg ) + { + RandomThread< HS > *ri = new RandomThread< HS >[ bg->threads() ]; + + for ( int i = 0; i < bg->threads(); ++i ) { + ri[i].id = i; + ri[i].insert = insert; + ri[i].max = max; + ri[i].count = bg->items() / bg->threads(); + ri[i]._set = &t; + } + + for ( int i = 0; i < bg->threads(); ++i ) + ri[i].start(); + for ( int i = 0; i < bg->threads(); ++i ) + ri[i].join(); + } +}; + +template< typename T > +struct RandomLookup : RandomInsert< T > { + + template< typename BG > + RandomLookup( BG *bg, int ins_max, int look_max ) + : RandomInsert< T >( bg, ins_max ) + { + (*this)( bg ); + this->max = look_max; + this->insert = false; + } +}; + +template< typename Param > +struct Bench : Param +{ + std::string describe() { + return "category:hashset " + Param::describe() + " " + + Param::fixed() + " " + this->describe_axes(); + } + + BENCHMARK(random_insert_1x) { + this->template run< RandomInsert >( this ); + } + + BENCHMARK(random_insert_2x) { + this->template run< RandomInsert >( this, this->items() / 2 ); + } + + BENCHMARK(random_insert_4x) { + this->template run< RandomInsert >( this, this->items() / 4 ); + } + + BENCHMARK(random_lookup_100) { + this->template run< RandomInsert >( this ); + } + + BENCHMARK(random_lookup_50) { + this->template run< RandomLookup >( + this, this->items() / 2, this->items() ); + } + + BENCHMARK(random_lookup_25) { + this->template run< RandomLookup >( + this, this->items() / 4, this->items() ); + } +}; + +template< template< typename > class C > +struct wrap_hashset { + template< typename T > using HashTable = C< T >; +}; + +template< template< typename > class C > +struct wrap_set { + template< typename T > + struct HashTable { + C< T > *t; + struct ThreadData {}; + HashTable< T > withTD( ThreadData & ) { return *this; } + void setSize( int s ) { t->rehash( s ); } + void insert( T i ) { t->insert( i ); } + int count( T i ) { return t->count( i ); } + HashTable() : t( new C< T > ) {} + }; +}; + +struct empty {}; + +template< template< typename > class C > +struct wrap_map { + template< typename T > + struct HashTable : wrap_set< C >::template HashTable< T > + { + template< typename TD > + HashTable< T > &withTD( TD & ) { return *this; } + void insert( int v ) { + this->t->insert( std::make_pair( v, empty() ) ); + } + }; +}; + +template< typename T > +using unordered_set = std::unordered_set< T >; + +using A = wrap_set< unordered_set >; +using B = wrap_hashset< CS >; +using C = wrap_hashset< FS >; +using D = wrap_hashset< ConCS >; +using E = wrap_hashset< ConFS >; + +template<> struct TN< A > { static const char *n() { return "std"; } }; +template<> struct TN< B > { 
static const char *n() { return "scs"; } }; +template<> struct TN< C > { static const char *n() { return "sfs"; } }; +template<> struct TN< D > { static const char *n() { return "ccs"; } }; +template<> struct TN< E > { static const char *n() { return "cfs"; } }; + +#define FOR_SEQ(M) M(A) M(B) M(C) +#define SEQ A, B, C + +#ifdef BRICKS_HAVE_TBB +#define FOR_PAR(M) M(D) M(E) M(F) M(G) +#define PAR D, E, F, G + +template< typename T > using cus = tbb::concurrent_unordered_set< T >; +template< typename T > using chm = tbb::concurrent_hash_map< T, empty >; + +using F = wrap_set< cus >; +using G = wrap_map< chm >; + +template<> struct TN< F > { static const char *n() { return "cus"; } }; +template<> struct TN< G > { static const char *n() { return "chm"; } }; + +#else +#define FOR_PAR(M) M(D) M(E) +#define PAR D, E +#endif + +#define TvT(N) \ + template struct Bench< ThreadsVsTypes< N, 50, 4, PAR > >; + +TvT(1024) +TvT(16 * 1024) + +#define IvTh_PAR(T) \ + template struct Bench< ItemsVsThreads< 4, 0, T > >; + +template struct Bench< ItemsVsTypes< 1, 0, SEQ, PAR > >; +template struct Bench< ItemsVsTypes< 2, 0, PAR > >; +template struct Bench< ItemsVsTypes< 4, 0, PAR > >; + +#define IvR_SEQ(T) \ + template struct Bench< ItemsVsReserve< 1, T > >; +#define IvR_PAR(T) \ + template struct Bench< ItemsVsReserve< 1, T > >; \ + template struct Bench< ItemsVsReserve< 2, T > >; \ + template struct Bench< ItemsVsReserve< 4, T > >; + +FOR_PAR(IvTh_PAR) + +FOR_SEQ(IvR_SEQ) +FOR_PAR(IvR_PAR) + +#undef FOR_SEQ +#undef FOR_PAR +#undef SEQ +#undef PAR +#undef IvT_PAR +#undef IvR_SEQ +#undef IvR_PAR + + +} +} + +#endif // benchmarks + +#endif + +// vim: syntax=cpp tabstop=4 shiftwidth=4 expandtab diff --git a/bricks/brick-shmem.h b/bricks/brick-shmem.h new file mode 100644 index 000000000..1f4353900 --- /dev/null +++ b/bricks/brick-shmem.h @@ -0,0 +1,1142 @@ +// -*- mode: C++; indent-tabs-mode: nil; c-basic-offset: 4 -*- + +/* + * Utilities and data structures for shared-memory parallelism. Includes: + * - shared memory, lock-free first-in/first-out queue (one reader + one writer) + * - a spinlock + * - approximate counter (share a counter between threads without contention) + * - a weakened atomic type (like std::atomic) + * - a derivable wrapper around std::thread + */ + +/* + * (c) 2008, 2012 Petr Ročkai + * (c) 2011 Tomáš Janoušek + * (c) 2014 Vladimír Štill + */ + +/* Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE. */
+
+#include <brick-assert.h>
+#include <deque>
+#include <iostream>
+#include <typeinfo>
+
+#if __cplusplus >= 201103L
+#include <atomic>
+#include <exception>
+#include <memory>
+#include <mutex>
+#include <stdexcept>
+#include <thread>
+#endif
+
+#ifndef BRICK_SHMEM_H
+#define BRICK_SHMEM_H
+
+#ifndef BRICKS_CACHELINE
+#define BRICKS_CACHELINE 64
+#endif
+
+namespace brick {
+namespace shmem {
+
+#if __cplusplus >= 201103L
+
+struct Thread {
+    std::unique_ptr< std::thread > _thread;
+    std::atomic< bool > _interrupted;
+    virtual void main() = 0;
+    virtual void exception( std::exception_ptr ep ) {
+        try {
+            std::rethrow_exception( ep );
+        } catch ( std::exception &ex ) {
+            std::cerr << "Uncaught exception"
+                      << " of type " << typeid( ex ).name()
+                      << ":" << std::endl;
+            std::cerr << ex.what() << std::endl;
+            std::terminate();
+        }
+    }
+
+    Thread() : _interrupted( false ) {}
+    Thread( const Thread &other ) : _interrupted( false ) {
+        if( other._thread )
+            throw std::logic_error( "cannot copy running thread" );
+    }
+    Thread( Thread &&other ) :
+        _thread( std::move( other._thread ) ),
+        _interrupted( other.interrupted() )
+    {}
+
+    ~Thread() { stop(); }
+
+    Thread &operator=( const Thread &other ) {
+        if ( _thread )
+            throw std::logic_error( "cannot overwrite running thread" );
+        if ( other._thread )
+            throw std::logic_error( "cannot copy running thread" );
+        _interrupted.store( other.interrupted(), std::memory_order_relaxed );
+        return *this;
+    }
+
+    Thread &operator=( Thread &&other ) {
+        if ( _thread )
+            throw std::logic_error( "cannot overwrite running thread" );
+        _thread.swap( other._thread );
+        _interrupted.store( other.interrupted(), std::memory_order_relaxed );
+        return *this;
+    }
+
+#ifdef __divine__
+    void start() __attribute__((noinline)) {
+        __divine_interrupt_mask();
+#else
+    void start() {
+#endif
+        _interrupted.store( false, std::memory_order_relaxed );
+        _thread.reset( new std::thread( [this]() {
+            try {
+                this->main();
+            } catch (...) {
+                this->exception( std::current_exception() );
+            }
+        } ) );
+    }
+
+    // stop must be idempotent
+    void stop() {
+        interrupt();
+        if ( _thread && _thread->joinable() )
+            join();
+    }
+
+    void join() {
+        if ( _thread ) {
+            _thread->join();
+            _thread.reset();
+        }
+    }
+
+    void detach() {
+        if ( _thread ) {
+            _thread->detach();
+            _thread.reset();
+        }
+    }
+
+    bool interrupted() const {
+        return _interrupted.load( std::memory_order_relaxed );
+    }
+
+    void interrupt() {
+        _interrupted.store( true, std::memory_order_relaxed );
+    }
+};
+
+/**
+ * A spinlock implementation.
+ *
+ * One has to wonder why this is missing from the C++0x stdlib.
+ */
+struct SpinLock {
+    std::atomic_flag b;
+
+    SpinLock() : b( ATOMIC_FLAG_INIT ) {}
+
+    void lock() {
+        while( b.test_and_set() );
+    }
+
+    void unlock() {
+        b.clear();
+    }
+
+    SpinLock( const SpinLock & ) = delete;
+    SpinLock &operator=( const SpinLock & ) = delete;
+};
+
+/**
+ * Termination detection implemented as a shared counter of open (not yet
+ * processed) states. This appears to be fast because the shared counter is
+ * modified very rarely -- it's incremented in large steps for thousands of
+ * states in advance and then adjusted down to its actual value only if the
+ * queue gets empty.
+ *
+ * Shared counter - Σ local (over all threads) = actual open count.
+ * local ≥ 0, hence the shared counter is an overapproximation of the actual
+ * open count. This implies partial correctness.
+ *
+ * Termination follows from proper calls to sync().
+ */
+struct ApproximateCounter {
+    enum { step = 100000 };
+
+    struct Shared {
+        std::atomic< intptr_t > counter;
+        Shared() : counter( 0 ) {}
+
+        Shared( const Shared& ) = delete;
+    };
+
+    Shared &shared;
+    intptr_t local;
+
+    ApproximateCounter( Shared &s ) : shared( s ), local( 0 ) {}
+    ~ApproximateCounter() { sync(); }
+
+    void sync() {
+        intptr_t value = shared.counter;
+
+        while ( local > 0 ) {
+            if ( value >= local ) {
+                if ( shared.counter.compare_exchange_weak( value, value - local ) )
+                    local = 0;
+            } else {
+                if ( shared.counter.compare_exchange_weak( value, 0 ) )
+                    local = 0;
+            }
+        }
+    }
+
+    ApproximateCounter& operator++() {
+        if ( local == 0 ) {
+            shared.counter += step;
+            local = step;
+        }
+
+        --local;
+
+        return *this;
+    }
+
+    ApproximateCounter &operator--() {
+        ++local;
+        return *this;
+    }
+
+    // NB. sync() must be called manually, as isZero() is called too often to
+    // do it implicitly
+    bool isZero() {
+        return shared.counter == 0;
+    }
+
+    void reset() { shared.counter = 0; }
+
+    ApproximateCounter( const ApproximateCounter &a )
+        : shared( a.shared ), local( a.local )
+    {}
+    ApproximateCounter operator=( const ApproximateCounter & ) = delete;
+};
+
+struct StartDetector {
+
+    struct Shared {
+        /*
+         * Normally these fields should be placed in different cache lines
+         * to avoid false sharing. This case is a little different: the
+         * class is a parallel reentrant barrier and it is, moreover, used
+         * only at the very beginning of the verification in DIVINE.
+         */
+        std::atomic< unsigned short > counter;
+        std::atomic< unsigned short > leaveGuard;
+
+        Shared() : counter( 0 ), leaveGuard( 0 ) {}
+        Shared( Shared & ) = delete;
+    };
+
+    Shared &shared;
+
+    StartDetector( Shared &s ) : shared( s ) {}
+    StartDetector( const StartDetector &s ) : shared( s.shared ) {}
+
+    void waitForAll( unsigned short peers ) {
+
+        while ( shared.leaveGuard );
+
+        if ( ++shared.counter == peers ) {
+            shared.leaveGuard = peers;
+            shared.counter = 0;
+        }
+
+        while ( shared.counter );
+        --shared.leaveGuard;
+    }
+
+};
+
+/*
+ * A simple wrapper around std::atomic with weakened memory orders.
+ *
+ * WeakAtomic uses the consume memory order for reading and the release memory
+ * order for writing, which should ensure atomicity and consistency of the
+ * given variable; it does not, however, ensure consistency of other variables
+ * written before a given atomic location. Read-modify-write operations use
+ * memory_order_acq_rel.
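+ *
+ * An illustrative sketch of intended use (variable names are made up):
+ *
+ *     WeakAtomic< bool > done;     // flag shared between two threads
+ *
+ *     done = true;                 // producer: release store
+ *     while ( !done ) { }          // consumer: consume loads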
+ */ + +namespace _impl { + +template< typename Self, typename T > +struct WeakAtomicIntegral { + + T operator |=( T val ) { + return self()._data.fetch_or( val, std::memory_order_acq_rel ) | val; + } + + T operator &=( T val ) { + return self()._data.fetch_and( val, std::memory_order_acq_rel ) & val; + } + + Self &self() { return *static_cast< Self * >( this ); } +}; + +struct Empty { }; + +} + +template< typename T > +struct WeakAtomic : std::conditional< std::is_integral< T >::value && !std::is_same< T, bool >::value, + _impl::WeakAtomicIntegral< WeakAtomic< T >, T >, + _impl::Empty >::type +{ + WeakAtomic( T x ) : _data( x ) { } + WeakAtomic() = default; + + operator T() const { return _data.load( std::memory_order_consume ); } + T operator=( T val ) { + _data.store( val, std::memory_order_release ); + return val; + } + + private: + std::atomic< T > _data; + friend struct _impl::WeakAtomicIntegral< WeakAtomic< T >, T >; +}; + +#endif + +#ifndef __divine__ +template< typename T > +constexpr int defaultNodeSize() { + return (32 * 4096 - BRICKS_CACHELINE - sizeof( void* )) / sizeof( T ); +} +#else +template< typename T > +constexpr int defaultNodeSize() { return 3; } +#endif + +/* + * A simple queue (First-In, First-Out). Concurrent access to the ends of the + * queue is supported -- a thread may write to the queue while another is + * reading. Concurrent access to a single end is, however, not supported. + * + * The NodeSize parameter defines a size of single block of objects. By + * default, we make the node a page-sized object -- this seems to work well in + * practice. We rely on the allocator to align the allocated blocks reasonably + * to give good cache usage. + */ + +template< typename T, int NodeSize = defaultNodeSize< T >() > +struct Fifo { +protected: + // the Node layout puts read and write counters far apart to avoid + // them sharing a cache line, since they are always written from + // different threads + struct Node { + T *read __attribute__((__aligned__(BRICKS_CACHELINE))); + T buffer[ NodeSize ] __attribute__((__aligned__(BRICKS_CACHELINE))); + T * volatile write; + Node *next; + Node() { + read = write = buffer; + next = 0; + } + }; + + // pad the fifo object to ensure that head/tail pointers never + // share a cache line with anyone else + Node *head __attribute__((__aligned__(BRICKS_CACHELINE))); + Node * volatile tail __attribute__((__aligned__(BRICKS_CACHELINE))); + +public: + Fifo() { + head = tail = new Node(); + ASSERT( empty() ); + } + + // copying a fifo is not allowed + Fifo( const Fifo & ) { + head = tail = new Node(); + ASSERT( empty() ); + } + + ~Fifo() { + while ( head != tail ) { + Node *next = head->next; + ASSERT( next != 0 ); + delete head; + head = next; + } + delete head; + } + + void push( const T&x ) { + Node *t; + if ( tail->write == tail->buffer + NodeSize ) + t = new Node(); + else + t = tail; + + *t->write = x; + ++ t->write; + __sync_synchronize(); + + if ( tail != t ) { + tail->next = t; + __sync_synchronize(); + tail = t; + } + } + + bool empty() { + return head == tail && head->read >= head->write; + } + + int size() { + int size = 0; + Node *n = head; + do { + size += n->write - n->read; + n = n->next; + } while (n); + return size; + } + + void dropHead() { + Node *old = head; + head = head->next; + ASSERT( head ); + delete old; + } + + void pop() { + ASSERT( !empty() ); + ++ head->read; + if ( head->read == head->buffer + NodeSize ) { + if ( head != tail ) { + dropHead(); + } + } + // the following can happen when head->next is 0 even 
though head->read + // has reached NodeSize, *and* no front() has been called in the meantime + if ( head != tail && head->read > head->buffer + NodeSize ) { + dropHead(); + pop(); + } + } + + T &front( bool wait = false ) { + while ( wait && empty() ) ; + ASSERT( head ); + ASSERT( !empty() ); + // last pop could have left us with empty queue exactly at an + // edge of a block, which leaves head->read == NodeSize + if ( head->read == head->buffer + NodeSize ) { + dropHead(); + } + return *head->read; + } +}; + +/* + * A very simple spinlock-protected queue based on std::deque. + */ + +template < typename T > +struct LockedQueue { + typedef brick::shmem::SpinLock Mutex; + Mutex m; + brick::shmem::WeakAtomic< bool > _empty; + std::deque< T > q; + using element = T; + + LockedQueue( void ) : _empty( true ) {} + + bool empty() const { return _empty; } + + void push( const T &x ) { + std::lock_guard< Mutex > lk( m ); + q.push_back( x ); + _empty = false; + } + + void push( T &&x ) { + std::lock_guard< Mutex > lk( m ); + q.push_back( std::move( x ) ); + _empty = false; + } + + /** + * Pops a whole chunk, to be processed by one thread as a whole. + */ + T pop() { + T ret = T(); + + /* Prevent threads from contending for a lock if the queue is empty. */ + if ( empty() ) + return ret; + + std::lock_guard< Mutex > lk( m ); + + if ( q.empty() ) + return ret; + + ret = std::move( q.front() ); + q.pop_front(); + + if ( q.empty() ) + _empty = true; + + return ret; + } + + void clear() { + std::lock_guard< Mutex > guard{ m }; + q.clear(); + _empty = true; + } + + LockedQueue( const LockedQueue & ) = delete; + LockedQueue &operator=( const LockedQueue & ) = delete; +}; + +} +} + +#if __cplusplus >= 201103L + +#include // alarm +#include + +namespace brick_test { +namespace shmem { + +using namespace ::brick::shmem; + +struct FifoTest { + template< typename T > + struct Checker : Thread + { + Fifo< T > fifo; + int terminate; + int n; + + void main() + { + std::vector< int > x; + x.resize( n ); + for ( int i = 0; i < n; ++i ) + x[ i ] = 0; + + while (true) { + while ( !fifo.empty() ) { + int i = fifo.front(); + ASSERT_EQ( x[i % n], i / n ); + ++ x[ i % n ]; + fifo.pop(); + } + if ( terminate > 1 ) + break; + if ( terminate ) + ++terminate; + } + terminate = 0; + for ( int i = 0; i < n; ++i ) + ASSERT_EQ( x[ i ], 128*1024 ); + } + + Checker( int _n = 1 ) : terminate( 0 ), n( _n ) {} + }; + + TEST(stress) { + Checker< int > c; + for ( int j = 0; j < 5; ++j ) { + c.start(); + for( int i = 0; i < 128 * 1024; ++i ) + c.fifo.push( i ); + c.terminate = true; + c.join(); + } + } +}; + +struct Utils { + static const int peers = 12; + + struct DetectorWorker : Thread { + + StartDetector detector; + int rep; + + DetectorWorker( StartDetector::Shared &sh, int repeat ) : + detector( sh ), + rep( repeat ) + {} + + void main() { + for ( int i = 0; i < rep; ++i ) + detector.waitForAll( peers ); + } + }; + + void processDetector( int repeat ) { + StartDetector::Shared sh; + std::vector< DetectorWorker > threads{ peers, DetectorWorker{ sh, repeat } }; + +#if (defined( __unix ) || defined( POSIX )) && !defined( __divine__ ) // hm + alarm( 5 ); +#endif + + for ( int i = 0; i != 4; ++i ) { + for ( auto &w : threads ) + w.start(); + for ( auto &w : threads ) + w.join(); + ASSERT_EQ( sh.counter.load(), 0 ); + } + } + + TEST(startDetectorSimple) { + processDetector( 1 ); + } + + TEST(startDetectorReentrant) { + processDetector( 4 ); + } + + struct CounterWorker : Thread { + StartDetector detector; + ApproximateCounter counter; + 
int produce; + int consume; + + template< typename D, typename C > + CounterWorker( D &d, C &c ) : + detector( d ), + counter( c ), + produce( 0 ), + consume( 0 ) + {} + + void main() { + detector.waitForAll( peers ); + + while ( produce-- ) + ++counter; + counter.sync(); + + detector.waitForAll( peers ); + + while ( consume-- ) + --counter; + counter.sync(); + + } + }; + + void processCounter() { + StartDetector::Shared detectorShared; + ApproximateCounter::Shared counterShared; + std::vector< CounterWorker > threads{ peers, + CounterWorker{ detectorShared, counterShared } }; + +#if (defined( __unix ) || defined( POSIX )) && !defined( __divine__ ) // hm + alarm( 5 ); +#endif + + // set consume and produce limits to each worker + int i = 1; + for ( auto &w : threads ) { + w.produce = i; + // let last worker consume the rest of produced values + w.consume = peers - i; + if ( w.consume == 0 ) + w.consume = peers;// also initials + ++i; + } + + for ( auto &w : threads ) + w.start(); + + for ( auto &w : threads ) + w.join(); + ASSERT_EQ( counterShared.counter.load(), 0 ); + } + + TEST(approximateCounter) { + processCounter(); + }; +}; + +} +} + +#ifdef BRICK_BENCHMARK_REG + +#ifdef BRICKS_HAVE_TBB +#include +#endif + +#include +#include + +namespace brick_test { +namespace shmem { + +template< typename T > +struct Naive { + std::deque< T > q; + std::mutex m; + + void push( T x ) { + std::lock_guard< std::mutex > __l( m ); + q.push_back( x ); + } + + void pop() { + std::lock_guard< std::mutex > __l( m ); + q.pop_front(); + } + + T &front() { + std::lock_guard< std::mutex > __l( m ); + return q.front(); + } + + bool empty() { + std::lock_guard< std::mutex > __l( m ); + return q.empty(); + } +}; + +template< typename T, int size = 512 > +struct Ring { + volatile int reader; + T q[ size ]; + volatile int writer; + + void push( T x ) { + while ( (writer + 1) % size == reader ); // full; need to wait + q[ writer ] = x; + writer = (writer + 1) % size; + } + + T &front() { + return q[ reader ]; + } + + void pop() { + reader = (reader + 1) % size; + } + + bool empty() { + return reader == writer; + } + + Ring() : reader( 0 ), writer( 0 ) {} +}; + +template< typename T > +struct Student { + static const int width = 64; + static const int size = 8; + + volatile int writer; + T q[width*size]; + int reader; + volatile int free_lines __attribute__((aligned(64))); + + void push(T x) { + q[writer] = x; + writer = (writer+1) % (size*width); + if (writer%size == 0) { + __sync_fetch_and_sub(&free_lines, 1); + // NOTE: (free_lines < 0) can happen! + while (free_lines<=0) ; + } + } + + T &front() { + return q[reader]; + } + + void pop() { + reader = (reader+1)%(width*size); + if (reader%size == 0) { + __sync_fetch_and_add(&free_lines, 1); + } + } + + bool empty() { + // NOTE: (free_lines > width) can happen! 
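+        // (push() advances writer before its fetch-and-sub of free_lines has
+        // landed, so a fast consumer can fetch-and-add first and free_lines
+        // briefly exceeds width)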
+ return free_lines >= width && reader == writer; + } + + Student() : writer(0), reader(0), free_lines(width) {} +}; + +template< typename T > +struct Linked { + using element = T; + struct Node { + T value; + Node *next; + }; + + Node * volatile reader; + char _separation[ 128 ]; + Node * volatile writer; + + void push( T x ) { + Node *n = new Node; + n->value = x; + writer->next = n; // n->next = (Node *) writer; + writer = n; + } + + T &front() { + return reader->value; + } + + void pop() { + Node volatile *n = reader; + ASSERT( reader->next ); + reader = reader->next; + delete n; + } + + bool empty() { + return reader == writer; + } + + Linked() { + reader = writer = new Node(); + reader->next = 0; + } +}; + +#ifdef BRICKS_HAVE_TBB + +template< typename T > +struct LocklessQueue { + tbb::concurrent_queue< T > q; + using element = T; + + void push( T x ) { + q.push( x ); + } + + T pop() { + T res; + q.try_pop( res ); /* does nothing to res on failure */ + return res; + } + + bool empty() { + return q.empty(); + } + + LocklessQueue() {} +}; + +#endif + +template< typename Q > +struct Shared { + using T = typename Q::element; + std::shared_ptr< Q > q; + void push( T t ) { q->push( t ); } + T pop() { return q->pop(); } + bool empty() { return q->empty(); } + void flush() {} + Shared() : q( new Q() ) {} +}; + +template< template< typename > class Q, typename T > +struct Chunked { + using Chunk = std::deque< T >; + using ChQ = Q< Chunk >; + std::shared_ptr< ChQ > q; + unsigned chunkSize; + + Chunk outgoing; + Chunk incoming; + + void push( T t ) { + outgoing.push_back( t ); + // std::cerr << "pushed " << outgoing.back() << std::endl; + if ( outgoing.size() >= chunkSize ) + flush(); + } + + T pop() { + // std::cerr << "pop: empty = " << incoming.empty() << std::endl; + if ( incoming.empty() ) + incoming = q->pop(); + if ( incoming.empty() ) + return T(); + // std::cerr << "pop: found " << incoming.front() << std::endl; + auto x = incoming.front(); + incoming.pop_front(); + return x; + } + + void flush() { + if ( !outgoing.empty() ) { + // std::cerr << "flushing " << outgoing.size() << " items" << std::endl; + Chunk tmp; + std::swap( outgoing, tmp ); + q->push( std::move( tmp ) ); + + /* A quickstart trick -- make first few chunks smaller. 
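+               Each flush doubles chunkSize (2, 4, 8, ..., capped at 64), so
+               the first chunks reach idle threads quickly while steady-state
+               batches stay large.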
*/ + if ( chunkSize < 64 ) + chunkSize = std::min( 2 * chunkSize, 64u ); + } + } + + bool empty() { + if ( incoming.empty() ) { /* try to get a fresh one */ + incoming = q->pop(); + // std::cerr << "pulled in " << incoming.size() << " items" << std::endl; + } + return incoming.empty(); + } + + Chunked() : q( new ChQ() ), chunkSize( 2 ) {} +}; + +template< typename Q > +struct InsertThread : Thread { + Q *q; + int items; + std::mt19937 rand; + std::uniform_int_distribution<> dist; + + InsertThread() {} + + void main() { + ASSERT( q ); + for ( int i = 0; i < items; ++i ) + q->push( rand() ); + }; +}; + +template< typename Q > +struct WorkThread : Thread { + Q q; + std::atomic< bool > *stop; + int items; + int id, threads; + + WorkThread() {} + + void main() { + int step = items / 10; + for ( int i = 1; i <= step; ++i ) + if ( id == i % threads ) + q.push( i ); + while ( !stop->load() ) { + while ( !q.empty() ) { + int i = q.pop(); + if ( !i ) + continue; + if ( i == items ) + stop->store( true ); + if ( i + step <= items ) { + q.push( i + step ); + q.push( i + step + items ); + } + } + q.flush(); + } + } +}; + +template< int size > +struct padded { + int i; + char padding[ size - sizeof( int ) ]; + operator int() { return i; } + padded( int i ) : i( i ) {} + padded() : i( 0 ) {} +}; + +struct ShQueue : BenchmarkGroup +{ + ShQueue() { + x.type = Axis::Quantitative; + x.name = "threads"; + x.min = 1; + x.max = 16; + x.step = 1; + + y.type = Axis::Qualitative; + y.name = "type"; + y.min = 0; + y.step = 1; +#ifdef BRICKS_HAVE_TBB + y.max = 3; +#else + y.max = 1; +#endif + y._render = []( int i ) { + switch (i) { + case 0: return "spinlock"; + case 1: return "lockless"; + case 2: return "chunked"; + case 3: return "hybrid"; + default: abort(); + } + }; + } + + std::string describe() { + return "category:shmem category:shqueue"; + } + + template< typename Q > + void scale() { + Q fifo; + auto *t = new WorkThread< Q >[ p ]; + std::atomic< bool > stop( false ); + + for ( int i = 0; i < p; ++i ) { + t[ i ].q = fifo; + t[ i ].items = 1000; + t[ i ].id = i; + t[ i ].threads = p; + t[ i ].stop = &stop; + } + + for ( int i = 0; i < p; ++i ) + t[ i ].start(); + + for ( int i = 0; i < p; ++i ) + t[ i ].join(); + } + + template< typename T > + void param() { + switch (q) { + case 0: return scale< Shared< LockedQueue< T > > >(); + case 1: return scale< Chunked< LockedQueue, T > >(); +#ifdef BRICKS_HAVE_TBB + case 2: return scale< Shared< LocklessQueue< T > > >(); + case 3: return scale< Chunked< LocklessQueue, T > >(); +#endif + default: ASSERT_UNREACHABLE_F( "bad q = %d", q ); + } + } + + BENCHMARK(p_int) { param< int >(); } + BENCHMARK(p_intptr) { param< intptr_t >(); } + BENCHMARK(p_64b) { param< padded< 64 > >(); } +}; + +struct FIFO : BenchmarkGroup +{ + FIFO() { + x.type = Axis::Disabled; + /* x.name = "p"; + x.unit = "items"; + x.min = 8; + x.max = 4096; + x.log = true; + x.step = 8; */ + + y.type = Axis::Qualitative; + y.name = "type"; + y.min = 0; + y.step = 1; + y.max = 4; + y._render = []( int i ) { + switch (i) { + case 0: return "mutex"; + case 1: return "spin"; + case 2: return "linked"; + case 3: return "ring"; + case 4: return "hybrid"; + case 5: return "student"; + default: ASSERT_UNREACHABLE_F( "bad i = %d", i ); + } + }; + } + + std::string describe() { + return "category:shmem category:fifo"; + } + + template< typename Q > + void length_() { + Q fifo; + InsertThread< Q > t; + t.q = &fifo; + t.items = 1024 * 1024; + + t.start(); + + for ( int i = 0; i < t.items; ++i ) { + while ( 
fifo.empty() ); + fifo.pop(); + } + ASSERT( fifo.empty() ); + } + + template< typename T > + void param() { + switch (q) { + case 0: return length_< Naive< T > >(); + case 1: return length_< LockedQueue< T > >(); + case 2: return length_< Linked< T > >(); + case 3: return length_< Ring< T > >(); + case 4: return length_< Fifo< T > >(); + case 5: return length_< Student< T > >(); + default: ASSERT_UNREACHABLE_F( "bad q = %d", q ); + } + } + + BENCHMARK(p_char) { param< char >(); } + BENCHMARK(p_int) { param< int >(); } + BENCHMARK(p_intptr) { param< intptr_t >(); } + BENCHMARK(p_16b) { param< padded< 16 > >(); } + BENCHMARK(p_64b) { param< padded< 64 > >(); } +}; + +} +} + +#endif +#endif +#endif + +// vim: syntax=cpp tabstop=4 shiftwidth=4 expandtab diff --git a/bricks/brick-types.h b/bricks/brick-types.h new file mode 100644 index 000000000..39dd03139 --- /dev/null +++ b/bricks/brick-types.h @@ -0,0 +1,1206 @@ +// -*- mode: C++; indent-tabs-mode: nil; c-basic-offset: 4 -*- + +/* + * Assorted types, mostly for C++11. + * - Maybe a = Just a | Nothing (w/ a limited variant for C++98) + * - Unit: single-valued type (empty structure) + * - Union: discriminated (tagged) union + * - StrongEnumFlags + */ + +/* + * (c) 2006, 2014 Petr Ročkai + * (c) 2013-2014 Vladimír Štill + */ + +/* Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
 */

+#include <brick-assert.h>
+
+#include <cstring>
+#include <functional>
+#include <string>
+#include <type_traits>
+
+#ifndef BRICK_TYPES_H
+#define BRICK_TYPES_H
+
+#if __cplusplus >= 201103L
+#define CONSTEXPR constexpr
+#else
+#define CONSTEXPR
+#endif
+
+#if __cplusplus > 201103L
+#define CPP1Y_CONSTEXPR constexpr // C++1y
+#else
+#define CPP1Y_CONSTEXPR // C++11
+#endif
+
+namespace brick {
+namespace types {
+
+struct Unit {
+    bool operator<( Unit ) const { return false; }
+    bool operator==( Unit ) const { return true; }
+};
+
+struct Preferred { CONSTEXPR Preferred() { } };
+struct NotPreferred { CONSTEXPR NotPreferred( Preferred ) {} };
+
+struct Comparable {
+    typedef bool IsComparable;
+};
+
+template< typename T >
+typename T::IsComparable operator!=( const T &a, const T &b ) {
+    return not( a == b );
+}
+
+template< typename T >
+typename T::IsComparable operator==( const T &a, const T &b ) {
+    return a <= b && b <= a;
+}
+
+template< typename T >
+typename T::IsComparable operator<( const T &a, const T &b ) {
+    return a <= b && a != b;
+}
+
+template< typename T >
+typename T::IsComparable operator>( const T &a, const T &b ) {
+    return b <= a && a != b;
+}
+
+template< typename T >
+typename T::IsComparable operator>=( const T &a, const T &b ) {
+    return b <= a;
+}
+
+struct Defer {
+    template< typename F >
+    Defer( F fn ) : fn( fn ), _deleted( false ) { }
+
+    void run() {
+        if ( !_deleted ) {
+            fn();
+            _deleted = true;
+        }
+    }
+
+    bool deleted() const { return _deleted; }
+    void pass() { _deleted = true; }
+    ~Defer() { run(); }
+  private:
+    std::function< void() > fn;
+    bool _deleted;
+};
+
+namespace mixin {
+
+#if __cplusplus >= 201103L
+template< typename Self >
+struct LexComparable {
+    const Self &lcSelf() const { return *static_cast< const Self * >( this ); }
+
+    bool operator==( const Self &o ) const {
+        return lcSelf().toTuple() == o.toTuple();
+    }
+
+    bool operator!=( const Self &o ) const {
+        return lcSelf().toTuple() != o.toTuple();
+    }
+
+    bool operator<( const Self &o ) const {
+        return lcSelf().toTuple() < o.toTuple();
+    }
+
+    bool operator<=( const Self &o ) const {
+        return lcSelf().toTuple() <= o.toTuple();
+    }
+
+    bool operator>( const Self &o ) const {
+        return lcSelf().toTuple() > o.toTuple();
+    }
+
+    bool operator>=( const Self &o ) const {
+        return lcSelf().toTuple() >= o.toTuple();
+    }
+};
+#endif
+
+}
+
+#if __cplusplus < 201103L
+
+/*
+  A Maybe type. Values of type Maybe< T > can be either Just T or Nothing.
+
+      Maybe< int > foo;
+      foo = Maybe< int >::Nothing();
+      // or
+      foo = Maybe< int >::Just( 5 );
+      if ( !foo.nothing() ) {
+          int real = foo;
+      } else {
+          // we haven't got anything in foo
+      }
+
+  Maybe takes a default value, which is normally T(). That is what you
+  get if you try to use a Nothing as a T.
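+
+  For example, relying on the default value:
+
+      Maybe< int > bar = Maybe< int >::Nothing( -1 );
+      int fallback = bar;  // bar is Nothing, so this yields the default: -1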
+*/ + +template +struct Maybe : Comparable { + bool nothing() const { return m_nothing; } + bool isNothing() const { return nothing(); } + T &value() { return m_value; } + const T &value() const { return m_value; } + Maybe( bool n, const T &v ) : m_nothing( n ), m_value( v ) {} + Maybe( const T &df = T() ) + : m_nothing( true ), m_value( df ) {} + static Maybe Just( const T &t ) { return Maybe( false, t ); } + static Maybe Nothing( const T &df = T() ) { + return Maybe( true, df ); } + operator T() const { return value(); } + + bool operator <=( const Maybe< T > &o ) const { + if (o.nothing()) + return true; + if (nothing()) + return false; + return value() <= o.value(); + } +protected: + bool m_nothing:1; + T m_value; +}; + +#else + +template< typename T > +struct StorableRef { + T _t; + T &t() { return _t; } + const T &t() const { return _t; } + StorableRef( T t ) : _t( t ) {} +}; + +template< typename T > +struct StorableRef< T & > { + T *_t; + T &t() { return *_t; } + const T &t() const { return *_t; } + StorableRef( T &t ) : _t( &t ) {} +}; + +template< typename _T > +struct Maybe : Comparable +{ + using T = _T; + + bool isNothing() const { return _nothing; } + bool isJust() const { return !_nothing; } + + T &value() { + ASSERT( isJust() ); + return _v.t.t(); + } + + const T &value() const { + ASSERT( isJust() ); + return _v.t.t(); + } + + T fromMaybe( T x ) const { return isJust() ? value() : x; } + + explicit operator bool() const { return isJust(); } + + static Maybe Just( const T &t ) { return Maybe( t ); } + static Maybe Nothing() { return Maybe(); } + + Maybe( const Maybe &m ) { + _nothing = m.isNothing(); + if ( !_nothing ) + _v.t = m._v.t; + } + + ~Maybe() { + if ( !_nothing ) + _v.t.~StorableRef< T >(); + } + + bool operator <=( const Maybe< T > &o ) const { + if (o.isNothing()) + return true; + if (isNothing()) + return false; + return value() <= o.value(); + } + +protected: + + Maybe( const T &v ) : _v( v ), _nothing( false ) {} + Maybe() : _nothing( true ) {} + struct Empty { + char x[ sizeof( T ) ]; + }; + + union V { + StorableRef< T > t; + Empty empty; + V() : empty() {} + V( const T &t ) : t( t ) {} + ~V() { } // see dtor of Maybe + }; + V _v; + bool _nothing; +}; + +#endif + +template<> +struct Maybe< void > { + typedef void T; + static Maybe Just() { return Maybe( false ); } + static Maybe Nothing() { return Maybe( true ); } + bool isNothing() { return _nothing; } + bool isJust() { return !_nothing; } +private: + Maybe( bool nothing ) : _nothing( nothing ) {} + bool _nothing; +}; + +#if __cplusplus >= 201103L + +template< typename E > +using is_enum_class = std::integral_constant< bool, + std::is_enum< E >::value && !std::is_convertible< E, int >::value >; + +template< typename Self > +struct StrongEnumFlags { + static_assert( is_enum_class< Self >::value, "Not an enum class." 
); + using This = StrongEnumFlags< Self >; + using UnderlyingType = typename std::underlying_type< Self >::type; + + constexpr StrongEnumFlags() noexcept : store( 0 ) { } + constexpr StrongEnumFlags( Self flag ) noexcept : + store( static_cast< UnderlyingType >( flag ) ) + { } + explicit constexpr StrongEnumFlags( UnderlyingType st ) noexcept : store( st ) { } + + constexpr explicit operator UnderlyingType() const noexcept { + return store; + } + + This &operator|=( This o ) noexcept { + store |= o.store; + return *this; + } + + This &operator&=( This o ) noexcept { + store &= o.store; + return *this; + } + + This &operator^=( This o ) noexcept { + store ^= o.store; + return *this; + } + + friend constexpr This operator|( This a, This b ) noexcept { + return This( a.store | b.store ); + } + + friend constexpr This operator&( This a, This b ) noexcept { + return This( a.store & b.store ); + } + + friend constexpr This operator^( This a, This b ) noexcept { + return This( a.store ^ b.store ); + } + + friend constexpr bool operator==( This a, This b ) noexcept { + return a.store == b.store; + } + + friend constexpr bool operator!=( This a, This b ) noexcept { + return a.store != b.store; + } + + constexpr bool has( Self x ) const noexcept { + return ((*this) & x) == x; + } + + This clear( Self x ) noexcept { + store &= ~UnderlyingType( x ); + return *this; + } + + explicit constexpr operator bool() const noexcept { + return store; + } + + private: + UnderlyingType store; +}; + +// don't catch integral types and classical enum! +template< typename Self, typename = typename + std::enable_if< is_enum_class< Self >::value >::type > +constexpr StrongEnumFlags< Self > operator|( Self a, Self b ) noexcept { + using Ret = StrongEnumFlags< Self >; + return Ret( a ) | Ret( b ); +} + +template< typename Self, typename = typename + std::enable_if< is_enum_class< Self >::value >::type > +constexpr StrongEnumFlags< Self > operator&( Self a, Self b ) noexcept { + using Ret = StrongEnumFlags< Self >; + return Ret( a ) & Ret( b ); +} + +/* implementation of Union */ + +namespace _impl { + template< size_t val, typename... > + struct MaxSizeof : std::integral_constant< size_t, val > { }; + + template< size_t val, typename T, typename... Ts > + struct MaxSizeof< val, T, Ts... > : + MaxSizeof< ( val > sizeof( T ) ) ? val : sizeof( T ), Ts... > + { }; + + template< size_t val, typename... > + struct MaxAlign : std::integral_constant< size_t, val > { }; + + template< size_t val, typename T, typename... Ts > + struct MaxAlign< val, T, Ts... > : + MaxAlign< ( val > std::alignment_of< T >::value ) + ? val : std::alignment_of< T >::value, Ts... > + { }; + + template< typename... > + struct AllDistinct : std::true_type { }; + + template< typename, typename... > + struct In : std::false_type { }; + + template< typename Needle, typename T, typename... Ts > + struct In< Needle, T, Ts... > : std::integral_constant< bool, + std::is_same< Needle, T >::value || In< Needle, Ts... >::value > + { }; + + template< typename _T > + struct Witness { using T = _T; }; + + template< typename, typename... > + struct _OneConversion { }; + + template< typename From, typename To, typename... > + struct NoneConvertible { using T = To; }; + + template< typename From, typename To, typename T, typename... Ts > + struct NoneConvertible< From, To, T, Ts... > : std::conditional< + std::is_convertible< From, T >::value, + Unit, + NoneConvertible< From, To, Ts... 
> >::type { }; + + static_assert( std::is_convertible< Witness< int >, Witness< int > >::value, "is_convertible" ); + + template< typename Needle, typename T, typename... Ts > + struct _OneConversion< Needle, T, Ts... > : std::conditional< + std::is_convertible< Needle, T >::value, + NoneConvertible< Needle, T, Ts... >, + _OneConversion< Needle, Ts... > >::type { }; + + template< typename Needle, typename... Ts > + struct OneConversion : std::conditional< + In< Needle, Ts... >::value, + Witness< Needle >, + _OneConversion< Needle, Ts... > >::type { }; + + static_assert( std::is_same< OneConversion< int, int >::T, int >::value, "OneConversion" ); + static_assert( std::is_same< OneConversion< long, int >::T, int >::value, "OneConversion" ); + static_assert( std::is_same< OneConversion< long, std::string, int >::T, int >::value, "OneConversion" ); + static_assert( std::is_same< OneConversion< long, int, long, int >::T, long >::value, "OneConversion" ); + + template< typename T, typename... Ts > + struct AllDistinct< T, Ts... > : std::integral_constant< bool, + !In< T, Ts... >::value && AllDistinct< Ts... >::value > + { }; + +template< typename F, typename T, typename Fallback, typename Check = bool > +struct _ApplyResult : Fallback {}; + +template< typename F, typename T, typename Fallback > +struct _ApplyResult< F, T, Fallback, decltype( std::declval< F >()( std::declval< T >() ), true ) > +{ + using Parameter = T; + using Result = decltype( std::declval< F >()( std::declval< T >() ) ); +}; + +template< typename F, typename... Ts > struct ApplyResult; + +template< typename F, typename T, typename... Ts > +struct ApplyResult< F, T, Ts... > : _ApplyResult< F, T, ApplyResult< F, Ts... > > {}; + +template< typename F > struct ApplyResult< F > {}; + +} + +struct UnionException : std::exception { + UnionException( std::string msg ) : msg( msg ) { } + + virtual const char *what() const noexcept override { return msg.c_str(); } + + std::string msg; +}; + +template< typename T > +struct InPlace { }; + +struct NullUnion { }; + +template< typename... Types > +struct Union : Comparable { + static_assert( sizeof...( Types ) < 0xff, "Too much unioned types, sorry" ); + static_assert( _impl::AllDistinct< Types... >::value, + "All types in union must be distinct" ); + + constexpr Union() : _discriminator( 0 ) { } + constexpr Union( NullUnion ) : _discriminator( 0 ) { } + + Union( const Union &other ) { + ASSERT_LEQ( size_t( other._discriminator ), sizeof...( Types ) ); + if ( other._discriminator > 0 ) + _copyConstruct< 1, Types... >( other._discriminator, other ); + _discriminator = other._discriminator; + } + + Union( Union &&other ) { + ASSERT_LEQ( size_t( other._discriminator ), sizeof...( Types ) ); + auto target = other._discriminator; + other._discriminator = 0; + if ( target > 0 ) + _moveConstruct< 1, Types... >( target, std::move( other ) ); + _discriminator = target; + } + + template< typename T, typename U = typename _impl::OneConversion< T, Types... >::T > + CPP1Y_CONSTEXPR Union( T val ) { + new ( &storage ) U( std::move( val ) ); + _discriminator = discriminator< U >(); + } + + template< typename T, typename... Args > + Union( InPlace< T >, Args &&... args ) : _discriminator( discriminator< T >() ) { + new ( &storage ) T( std::forward< Args >( args )... 
); + } + + // use copy and swap + Union &operator=( Union other ) { + swap( other ); + return *this; + } + + template< typename T > + auto operator=( const T &other ) -> typename + std::enable_if< std::is_lvalue_reference< T & >::value, Union & >::type + { + if ( is< T >() ) + unsafeGet< T >() = other; + else + _copyAssignDifferent( Union( other ) ); + return *this; + } + + template< typename T > + auto operator=( T &&other ) -> typename + std::enable_if< std::is_rvalue_reference< T && >::value, Union & >::type + { + if ( is< T >() ) + unsafeGet< T >() = std::move( other ); + else + _moveAssignDifferent( std::move( other ) ); + return *this; + } + + void swap( Union other ) { + typename std::aligned_storage< size, algignment >::type tmpStor; + unsigned char tmpDis; + + std::memcpy( &tmpStor, &other.storage, size ); + tmpDis = other._discriminator; + other._discriminator = 0; + std::memcpy( &other.storage, &storage, size ); + other._discriminator = _discriminator; + _discriminator = 0; + std::memcpy( &storage, &tmpStor, size ); + _discriminator = tmpDis; + } + + bool empty() { + return _discriminator == 0; + } + + explicit operator bool() { + return !empty(); + } + + template< typename T > + bool is() const { + return discriminator< T >() == _discriminator; + } + + template< typename T > + explicit operator T() const { + return convert< T >(); + } + + template< typename T > + T &get() { + ASSERT( is< T >() ); + return unsafeGet< T >(); + } + + template< typename T > + const T &get() const { + return cget< T >(); + } + + template< typename T > + const T &cget() const { + ASSERT( is< T >() ); + return unsafeGet< T >(); + } + + template< typename T > + const T &getOr( const T &val ) const { + if ( is< T >() ) + return unsafeGet< T >(); + return val; + } + + template< typename T > + T convert() const { return _convert< T >(); } + + template< typename T > + T &unsafeGet() { + return *reinterpret_cast< T * >( &storage ); + } + + template< typename T > + const T &unsafeGet() const { + return *reinterpret_cast< const T * >( &storage ); + } + + template< typename T > + T &&moveOut() { + ASSERT( is< T >() ); + return unsafeMoveOut< T >(); + } + + template< typename T > + T &&unsafeMoveOut() { + return std::move( *reinterpret_cast< T * >( &storage ) ); + } + + template< typename F > + using Applied = Maybe< typename _impl::ApplyResult< F, Types... >::Result >; + + // invoke `f` on the stored value if the type currently stored in the union + // can be legally passed to that function as an argument + template< typename F > + auto apply( F f ) -> Applied< F > { + return _apply< F, Types... >( Preferred(), f ); + } + + template< typename R > + R _match() { return R::Nothing(); } + + // invoke the first function that can handle the currently stored value + // (type-based pattern matching) + template< typename R, typename F, typename... Args > + R _match( F f, Args... args ) { + auto x = apply( f ); + if ( x.isNothing() ) + return _match< R >( args... ); + else + return x; + } + + template< typename F, typename... Args > + Applied< F > match( F f, Args... args ) { + return _match< Applied< F > >( f, args... 
); + } + + bool operator==( const Union &other ) const { + return _discriminator == other._discriminator + && _compare< std::equal_to >( other ); + } + + bool operator!=( const Union &other ) const { + return _discriminator != other._discriminator + || _compare< std::not_equal_to >( other ); + } + + bool operator<( const Union &other ) const { + return _discriminator < other._discriminator + || (_discriminator == other._discriminator + && _compare< std::less >( other ) ); + } + + unsigned char discriminator() const { return _discriminator; } + + template< typename T > + unsigned char discriminator() const { + static_assert( _impl::In< T, Types... >::value, + "Trying to construct Union from value of type not allowed for it." ); + return _discriminatorF< 1, T, Types... >(); + } + + private: + static constexpr size_t size = _impl::MaxSizeof< 1, Types... >::value; + static constexpr size_t algignment = _impl::MaxAlign< 1, Types... >::value; + typename std::aligned_storage< size, algignment >::type storage; + unsigned char _discriminator; + + + template< unsigned char i, typename Needle, typename T, typename... Ts > + constexpr unsigned char _discriminatorF() const { + return std::is_same< Needle, T >::value + ? i : _discriminatorF< i + 1, Needle, Ts... >(); + } + + template< unsigned char, typename > + constexpr unsigned char _discriminatorF() const { return 0; /* cannot happen */ } + + template< unsigned char i, typename T, typename... Ts > + void _copyConstruct( unsigned char d, const Union &other ) { + if ( i == d ) + new ( &storage ) T( other.unsafeGet< T >() ); + else + _copyConstruct< i + 1, Ts... >( d, other ); + } + + template< unsigned char > + unsigned char _copyConstruct( unsigned char, const Union & ) + { ASSERT_UNREACHABLE( "invalid _copyConstruct" ); } + + template< unsigned char i, typename T, typename... Ts > + void _moveConstruct( unsigned char d, Union &&other ) { + if ( i == d ) + new ( &storage ) T( other.unsafeMoveOut< T >() ); + else + _moveConstruct< i + 1, Ts... >( d, std::move( other ) ); + } + + template< unsigned char > + unsigned char _moveConstruct( unsigned char, Union && ) + { ASSERT_UNREACHABLE( "invalid _moveConstruct" ); } + + void _copyAssignDifferent( const Union &other ) { + auto tmp = _discriminator; + _discriminator = 0; + if ( tmp ) + _destruct< 1, Types... >( tmp ); + if ( other._discriminator ) + _copyConstruct< 1, Types... >( other._discriminator, other ); + _discriminator = other._discriminator; + } + + void _copyAssignSame( const Union &other ) { + ASSERT_EQ( _discriminator, other._discriminator ); + if ( _discriminator == 0 ) + return; + _copyAssignSame< 1, Types... >( other ); + } + + template< unsigned char i, typename T, typename... Ts > + void _copyAssignSame( const Union &other ) { + if ( i == _discriminator ) + unsafeGet< T >() = other.unsafeGet< T >(); + else + _copyAssignSame< i + 1, Ts... >( other ); + } + + template< unsigned char > + void _copyAssignSame( const Union & ) { ASSERT_UNREACHABLE( "invalid _copyAssignSame" ); } + + template< unsigned char i, typename T, typename... Ts > + void _destruct( unsigned char d ) { + if ( i == d ) + unsafeGet< T >().~T(); + else + _destruct< i + 1, Ts... >( d ); + } + + template< unsigned char > + void _destruct( unsigned char ) { ASSERT_UNREACHABLE( "invalid _destruct" ); } + + void _moveAssignSame( Union &&other ) { + ASSERT_EQ( _discriminator, other._discriminator ); + if ( _discriminator == 0 ) + return; + _moveAssignSame< 1, Types... 
>( std::move( other ) ); + } + + template< unsigned char i, typename T, typename... Ts > + void _moveAssignSame( Union &&other ) { + if ( i == _discriminator ) + unsafeGet< T >() = other.unsafeMoveOut< T >(); + else + _moveAssignSame< i + 1, Ts... >( std::move( other ) ); + } + + template< unsigned char > + void _moveAssignSame( Union && ) { ASSERT_UNREACHABLE( "invalid _moveAssignSame" ); } + + void _moveAssignDifferent( Union &&other ) { + auto tmp = _discriminator; + auto target = other._discriminator; + _discriminator = 0; + if ( tmp ) + _destruct< 1, Types... >( tmp ); + if ( target ) + _moveConstruct< 1, Types... >( target, std::move( other ) ); + _discriminator = target; + } + + template< typename F > Applied< F > _apply( Preferred, F ) { return Applied< F >::Nothing(); } + + template< typename F, typename T > + auto fixvoid( F f ) -> + typename std::enable_if< std::is_void< typename Applied< F >::T >::value, Applied< F > >::type + { + f( get< T >() ); + return Maybe< void >::Just(); + } + + template< typename F, typename T > + auto fixvoid( F f ) -> + typename std::enable_if< !std::is_void< typename Applied< F >::T >::value, Applied< F > >::type + { + return Applied< F >::Just( f( get< T >() ) ); + } + + template< typename F, typename T, typename... Args > + auto _apply( Preferred, F f ) -> Maybe< typename _impl::_ApplyResult< F, T, Unit >::Result > + { + if ( !is< T >() ) + return _apply< F, Args... >( Preferred(), f ); + + return fixvoid< F, T >( f ); + } + + template< typename F, typename T, typename... Args > + auto _apply( NotPreferred, F f ) -> Applied< F > + { + return _apply< F, Args... >( Preferred(), f ); + } + + template< template< typename > class Compare, int d > + bool _compare2( const Union & ) const { ASSERT_UNREACHABLE( "invalid discriminator" ); } + + template< template< typename > class Compare, int d, typename T, typename... Ts > + bool _compare2( const Union &other ) const { + return d == _discriminator + ? Compare< T >()( get< T >(), other.template get< T >() ) + : _compare2< Compare, d + 1, Ts... >( other ); + } + + template< template< typename > class Compare > + bool _compare( const Union &other ) const { + return _compare2< Compare, 1, Types... >( other ); + } + + template< typename Target, bool anyCastPossible, int > + Target _convert2( Preferred ) const { + static_assert( anyCastPossible, "Cast of Union can never succeed" ); + ASSERT_UNREACHABLE( "wrong _convert2 in Union" ); + } + + template< typename Target, bool any, int d, typename, typename... Ts > + Target _convert2( NotPreferred ) const { + return _convert2< Target, any, d + 1, Ts... >( Preferred() ); + } + + template< typename Target, bool any, int d, typename T, typename... Ts > + auto _convert2( Preferred ) const -> decltype( static_cast< Target >( this->unsafeGet< T >() ) ) + { + if ( _discriminator == d ) + return static_cast< Target >( unsafeGet< T >() ); + return _convert2< Target, true, d + 1, Ts... >( Preferred() ); + } + + template< typename Target > + Target _convert() const { + return _convert2< Target, false, 1, Types... 
>( Preferred() ); + } + +}; + +template< template< typename > class C, typename T, typename F > +using FMap = C< typename std::result_of< F( T ) >::type >; + +template< typename T > +struct NewType +{ + T _value; + + template< typename X > using FMap = NewType< X >; + NewType() noexcept {} + NewType( const T &t ) noexcept : _value( t ) {} + + T &unwrap() { return _value; } + const T &unwrap() const { return _value; } +}; + +template< typename T > +struct Wrapper : NewType< T > +{ + Wrapper() = default; + Wrapper( const T &t ) : NewType< T >( t ) {} + operator T() { return this->unwrap(); } + T &value() { return this->unwrap(); } + T &operator*() { return this->unwrap(); } + T *operator->() { return &this->unwrap(); } +}; + +template< template< typename > class C, typename S, typename F > +auto fmap( F, C< S > n ) -> decltype( FMap< C, S, F >( n.unwrap() ) ) { + return FMap< C, S, F >( n.unwrap() ); +} + +template< typename T > +struct IsUnion : std::false_type { }; + +template< typename... Ts > +struct IsUnion< Union< Ts... > > : std::true_type { }; + +template< typename A, typename B > +struct _OneUnion : std::enable_if< + ( IsUnion< A >::value || IsUnion< B >::value ) + && !(IsUnion< A >::value && IsUnion< B >::value ), + bool > { }; + +template< typename A, typename B > +auto operator==( const A &a, const B &b ) -> + typename std::enable_if< IsUnion< A >::value && !IsUnion< B >::value, bool >::type +{ + return a.template is< B >() && a.template get< B >() == b; +} + +template< typename A, typename B > +auto operator==( const A &a, const B &b ) -> + typename std::enable_if< !IsUnion< A >::value && IsUnion< B >::value, bool >::type +{ return b == a; } + + +template< typename A, typename B > +auto operator<( const A &a, const B &b ) -> + typename std::enable_if< IsUnion< A >::value && !IsUnion< B >::value, bool >::type +{ + return a.discriminator() < a.template discriminator< B >() + || (a.template is< B >() && a.template get< B >() < b); +} + +template< typename A, typename B > +auto operator<( const A &a, const B &b ) -> + typename std::enable_if< !IsUnion< A >::value && IsUnion< B >::value, bool >::type +{ + return b.template discriminator< A >() < b.discriminator() + || (b.template is< A >() && a < b.template get< A >()); +} + +template< typename A, typename B > +auto operator!=( const A &a, const B &b ) -> typename _OneUnion< A, B >::type +{ return !(a == b); } + +template< typename A, typename B > +auto operator<=( const A &a, const B &b ) -> typename _OneUnion< A, B >::type +{ return a < b || a == b; } + +template< typename A, typename B > +auto operator>( const A &a, const B &b ) -> typename _OneUnion< A, B >::type +{ return b < a; } + +template< typename A, typename B > +auto operator>=( const A &a, const B &b ) -> typename _OneUnion< A, B >::type +{ return b <= a; } + +#endif // C++11 + +} +} + +namespace brick_test { +namespace types { + +using namespace ::brick::types; + +struct Integer : Comparable +{ + int val; +public: + Integer(int val) : val(val) {} + bool operator<=( const Integer& o ) const { return val <= o.val; } +}; + +struct Mixins { + + TEST(comparable) { + Integer i10(10); + Integer i10a(10); + Integer i20(20); + + ASSERT(i10 <= i10a); + ASSERT(i10a <= i10); + ASSERT(i10 <= i20); + ASSERT(! 
(i20 <= i10)); + + ASSERT(i10 != i20); + ASSERT(!(i10 != i10a)); + + ASSERT(i10 == i10a); + ASSERT(!(i10 == i20)); + + ASSERT(i10 < i20); + ASSERT(!(i20 < i10)); + ASSERT(!(i10 < i10a)); + + ASSERT(i20 > i10); + ASSERT(!(i10 > i20)); + ASSERT(!(i10 > i10a)); + + ASSERT(i10 >= i10a); + ASSERT(i10a >= i10); + ASSERT(i20 >= i10); + ASSERT(! (i10 >= i20)); + } + +}; + +#if __cplusplus >= 201103L + +struct A { }; +struct B { B() { }; ~B() { } }; +struct C { int x; C( int x ) : x( x ) {} C() : x( 0 ) {} }; + +static_assert( _impl::In< int, int >::value, "" ); +static_assert( _impl::In< A, A, B >::value, "" ); +static_assert( _impl::In< A, B, A >::value, "" ); + +// test instances +struct UnionInstances { + Union<> a; + Union< int, long > b; + Union< int, long, A > c; + Union< int, long, A, B > d; + Union< int, long, A, B, std::string > e; +}; + +struct UnionTest { + TEST(basic) { + Union< int > u( 1 ); + ASSERT( !!u ); + ASSERT( !u.empty() ); + ASSERT( u.is< int >() ); + ASSERT_EQ( u.get< int >(), 1 ); + u = 2; // move + ASSERT( !!u ); + ASSERT_EQ( u.get< int >(), 2 ); + int i = 3; + u = i; // copy + ASSERT( !!u ); + ASSERT_EQ( u.get< int >(), 3 ); + u = types::Union< int >( 4 ); + ASSERT( u.is< int >() ); + ASSERT_EQ( u.get< int >(), 4 ); + u = types::Union< int >(); + ASSERT( !u ); + ASSERT( !u.is< int >() ); + u = 5; + ASSERT( u ); + ASSERT( u.is< int >() ); + ASSERT_EQ( u.get< int >(), 5 ); + } + + TEST(moveNoCopy) { + // if one of contained structures does not define copy ctor+assignment + // move should still be available + struct Move { + Move() = default; + Move( const Move & ) = delete; + Move( Move && ) = default; + + Move &operator=( Move ) { return *this; } + }; + Union< long, Move > wierd; + ASSERT( wierd.empty() ); + + wierd = 2L; + ASSERT( !!wierd ); + ASSERT( wierd.is< long >() ); + ASSERT_EQ( wierd.get< long >(), 2L ); + + wierd = Move(); + ASSERT( !!wierd ); + ASSERT( wierd.is< Move >() ); + } + + TEST(ctorCast) { + ASSERT( ( Union< int, long >{ int( 1 ) }.is< int >() ) ); + ASSERT( ( Union< int, long >{ long( 1 ) }.is< long >() ) ); + + ASSERT( ( Union< long, std::string >{ int( 1 ) }.is< long >() ) ); + + struct A { operator int(){ return 1; } }; + ASSERT( ( Union< int, A >{ A() }.is< A >() ) ); + ASSERT( ( Union< int, std::string >{ A() }.is< int >() ) ); + + struct B { B( int ) { } B() = default; }; + ASSERT( ( Union< int, B >{ B() }.is< B >() ) ); + ASSERT( ( Union< int, B >{ 1 }.is< int >() ) ); + ASSERT( ( Union< B, std::string >{ 1 }.is< B >() ) ); + } + + static C idC( C c ) { return c; }; + static C constC( B ) { return C( 32 ); }; + + TEST(apply) { + Union< B, C > u; + u = B(); + + Maybe< C > result = u.match( idC, constC ); + ASSERT( !result.isNothing() ); + ASSERT_EQ( result.value().x, 32 ); + + u = C( 12 ); + result = u.match( idC, constC ); + ASSERT( !result.isNothing() ); + ASSERT_EQ( result.value().x, 12 ); + + result = u.match( constC ); + ASSERT( result.isNothing() ); + } + + TEST(eq) { + Union< int, long > u{ 1 }; + Union< int, long > v{ 2 }; + Union< int, long > w{ 2l }; + + ASSERT( u == u ); + ASSERT( u != v ); + ASSERT( v != w ); + ASSERT( u != w ); + + ASSERT( u == 1 ); + ASSERT( v == 2 ); + ASSERT( w == 2l ); + + ASSERT( u != 1l ); + ASSERT( v != 2l ); + ASSERT( w != 2 ); + } + + TEST(ord) { + Union< int, long > u{ 1 }; + Union< int, long > v{ 2 }; + Union< int, long > w{ 2l }; + + ASSERT( u < v ); + ASSERT( !(v < u) ); + ASSERT( u < w ); + ASSERT( !(w < u) ); + ASSERT( v < w ); + ASSERT( !(w < v) ); + + ASSERT( u <= 1 ); + ASSERT( v > 1 ); + ASSERT( w 
> 1 ); + + ASSERT( u < 1l ); + ASSERT( v < 1l ); + ASSERT( w > 1l ); + + ASSERT( u < 2 ); + ASSERT( v <= 2 ); + ASSERT( w > 2 ); + + ASSERT( u < 2l ); + ASSERT( v < 2l ); + ASSERT( w <= 2l ); + } +}; + +enum class FA : unsigned char { X = 1, Y = 2, Z = 4 }; +enum class FB : unsigned short { X = 1, Y = 2, Z = 4 }; +enum class FC : unsigned { X = 1, Y = 2, Z = 4 }; +enum class FD : unsigned long { X = 1, Y = 2, Z = 4 }; + +struct StrongEnumFlagsTest { + template< typename Enum > + void testEnum() { + StrongEnumFlags< Enum > e1; + StrongEnumFlags< Enum > e2( Enum::X ); + + ASSERT( !e1 ); + ASSERT( e2 ); + + ASSERT( e1 | e2 ); + ASSERT( Enum::X | Enum::Y ); + ASSERT( e2 | Enum::Z ); + ASSERT( e2.has( Enum::X ) ); + + ASSERT( e2 & Enum::X ); + ASSERT( !( Enum::X & Enum::Y ) ); + + ASSERT( Enum::X | Enum::Y | Enum::Z ); + ASSERT( !( Enum::X & Enum::Y & Enum::Z ) ); + ASSERT( ( Enum::X | Enum::Y | Enum::Z ) & Enum::X ); + } + + // we don't want to break classical enums and ints by out operators + TEST(regression) { + enum Classic { C_X = 1, C_Y = 2, C_Z = 4 }; + + ASSERT( C_X | C_Y | C_Z ); + ASSERT( 1 | 2 | 4 ); + ASSERT( C_X & 1 ); + } + + TEST(enum_uchar) { testEnum< FA >(); } + TEST(enum_ushort) { testEnum< FB >(); } + TEST(enum_uint) { testEnum< FC >(); } + TEST(enum_ulong) { testEnum< FD >(); } +}; + +#endif + +} +} + +#endif +// vim: syntax=cpp tabstop=4 shiftwidth=4 expandtab diff --git a/configure.ac b/configure.ac index 5039b5adb..5ad155170 100644 --- a/configure.ac +++ b/configure.ac @@ -77,6 +77,8 @@ fi # Activate C11 for gnulib tests AX_CHECK_COMPILE_FLAG([-std=c11], [CFLAGS="$CFLAGS -std=c11"]) +AX_CHECK_COMPILE_FLAG([-lpthread], [CFLAGS="$CFLAGS -lpthread"]) + gl_INIT # Use -Werror since using -fvisibility under MinGW is only a warning. @@ -142,6 +144,7 @@ fi AX_CHECK_BUDDY AC_CHECK_HEADERS([sys/times.h valgrind/memcheck.h spawn.h]) +AX_CHECK_DIVINE AC_CHECK_FUNCS([times kill alarm sigaction]) LT_CONFIG_LTDL_DIR([ltdl]) diff --git a/debian/copyright b/debian/copyright index 669d5abd2..efae81d0d 100644 --- a/debian/copyright +++ b/debian/copyright @@ -49,6 +49,33 @@ License: BSD-2-Clause OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. +Files: bricks/* +Copyright: 2010-2014 Petr Ročkai, Jiří Weiser, Vladimír Štill +License: BSD-2-Clause + Permission is hereby granted, without written agreement and without + license or royalty fees, to use, reproduce, prepare derivative + works, distribute, and display this software and its documentation + for any purpose, provided that (1) the above copyright notice and + the following two paragraphs appear in all copies of the source code + and (2) redistributions, including without limitation binaries, + reproduce these notices in the supporting documentation. Substantial + modifications to this software may be copyrighted by their authors + and need not follow the licensing terms described here, provided + that the new terms are clearly indicated in all files where they apply. + . + IN NO EVENT SHALL JORN LIND-NIELSEN, OR DISTRIBUTORS OF THIS + SOFTWARE BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, + INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OF THIS + SOFTWARE AND ITS DOCUMENTATION, EVEN IF THE AUTHORS OR ANY OF THE + ABOVE PARTIES HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + . + JORN LIND-NIELSEN SPECIFICALLY DISCLAIM ANY WARRANTIES, INCLUDING, + BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + FITNESS FOR A PARTICULAR PURPOSE. 
THE SOFTWARE PROVIDED HEREUNDER IS
+ ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE NO
+ OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
+ MODIFICATIONS.
+
 Files: utf8/*
 Copyright: 2006 Nemanja Trifunovic
 License: BSL-1.0
diff --git a/debian/libspot-dev.install b/debian/libspot-dev.install
index 68289ba57..beae1938a 100644
--- a/debian/libspot-dev.install
+++ b/debian/libspot-dev.install
@@ -1,4 +1,5 @@
 usr/include/spot
+usr/include/bricks
 usr/lib/*-*/libspot.so
 usr/lib/*-*/libspot.a
 usr/lib/*-*/pkgconfig/libspot.pc
diff --git a/m4/bricks.m4 b/m4/bricks.m4
new file mode 100644
index 000000000..a26b55fc4
--- /dev/null
+++ b/m4/bricks.m4
@@ -0,0 +1,4 @@
+AC_DEFUN([AX_CHECK_BRICKS], [
+  AC_SUBST([BRICKS_CPPFLAGS], ['-I$(top_srcdir)/bricks'])
+  AC_CONFIG_SUBDIRS([bricks])
+])
diff --git a/tests/Makefile.am b/tests/Makefile.am
index 4c305f393..467649218 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -22,7 +22,7 @@ AUTOMAKE_OPTIONS = subdir-objects
 AM_CPPFLAGS = -I$(top_builddir) -I$(top_srcdir) $(BUDDY_CPPFLAGS) \
-  -I$(top_builddir)/lib -I$(top_srcdir)/lib
+  $(BRICKS_CPPFLAGS) -I$(top_builddir)/lib -I$(top_srcdir)/lib
 AM_CXXFLAGS = $(WARNING_CXXFLAGS)
 LDADD = $(top_builddir)/spot/libspot.la $(top_builddir)/buddy/src/libbddx.la
@@ -68,6 +68,7 @@ check_PROGRAMS = \
   core/checkta \
   core/consterm \
   core/cube \
+  core/bricks \
   core/emptchk \
   core/equals \
   core/graph \
@@ -121,6 +122,7 @@ core_taatgba_SOURCES = core/taatgba.cc
 core_tgbagraph_SOURCES = core/twagraph.cc
 core_consterm_SOURCES = core/consterm.cc
 core_cube_SOURCES = core/cube.cc
+core_bricks_SOURCES = core/bricks.cc
 core_equals_SOURCES = core/equalsf.cc
 core_kind_SOURCES = core/kind.cc
 core_length_SOURCES = core/length.cc
diff --git a/tests/core/.gitignore b/tests/core/.gitignore
index 6c7c4a1dc..ecd5f0560 100644
--- a/tests/core/.gitignore
+++ b/tests/core/.gitignore
@@ -12,6 +12,7 @@ cube
 defs
 .deps
 *.dot
+bricks
 eltl2tgba
 emptchk
 defs
diff --git a/tests/core/bricks.cc b/tests/core/bricks.cc
new file mode 100644
index 000000000..089015ac1
--- /dev/null
+++ b/tests/core/bricks.cc
@@ -0,0 +1,80 @@
+// -*- coding: utf-8 -*-
+// Copyright (C) 2016 Laboratoire de Recherche et Développement
+// de l'Epita.
+//
+// This file is part of Spot, a model checking library.
+//
+// Spot is free software; you can redistribute it and/or modify it
+// under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 3 of the License, or
+// (at your option) any later version.
+//
+// Spot is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+// or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+// License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+#include <iostream>
+#include <brick-hashset.h>
+
+struct both
+{
+  int x;
+  int y;
+};
+
+template<typename T>
+struct mytest_hasher_both
+{
+  template<typename X>
+  mytest_hasher_both(X&)
+  { }
+
+  mytest_hasher_both() = default;
+
+  brick::hash::hash128_t hash(both t) const
+  {
+    return std::make_pair(t.x*10, t.x);
+  }
+  bool valid(both t) const
+  {
+    return t.x != 0;
+  }
+  bool equal(both a, both b) const
+  {
+    return a.x == b.x;
+  }
+};
+
+int main()
+{
+  // Declare a concurrent hash table.
+  brick::hashset::FastConcurrent<both, mytest_hasher_both<both>> ht2;
+
+  // Set its initial size.
+  ht2.setSize(1024);
+
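+  // Note that mytest_hasher_both derives both halves of the 128-bit
+  // hash from t.x alone, equal() compares only the x fields, and
+  // valid() treats x == 0 as an unset cell.  So when several threads
+  // insert {i, tid} with the same i below, they race for a single
+  // slot and at most one pair per i survives, with whichever tid won.
+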
+  // Declare workers and give them some jobs.
+  std::vector<std::thread> workers;
+  for (int i = 0; i < 6; i++)
+    workers.push_back(std::thread([&ht2](int tid)
+                                  {
+                                    for (int i = 0; i < 2000; ++i)
+                                      ht2.insert({i, tid});
+                                  }, i));
+
+  // Wait for all the threads to finish.
+  for (auto& t: workers)
+    t.join();
+
+  // Display the whole table.
+  for (unsigned i = 0; i < ht2.size(); ++i)
+    if (ht2.valid(i))
+      std::cout << i << ": {"
+                << ht2[i].x << ',' << ht2[i].y << "}\n";
+  return 0;
+}