#include <assert.h>
#include <errno.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h>
#include <stdint.h>

#define RANK_0_LOG  5
#define RANK_0_SIZE (1ul << RANK_0_LOG)
#define RANKS       20

// NPER is an architecture-dependent number.
// NPER is nodes per element of `nodes`
#define NPERBITS    4
#define NPER        (1ul << NPERBITS)
// BPER is bits per element of `nodes`
#define BPERBITS    1
#define BPER        (1ul << BPERBITS)

#define SI static inline

// TODO need to guarantee alignment of the pool
static char pool[RANK_0_SIZE << (RANKS - 1)];
static const size_t pool_size = sizeof pool;
static unsigned counts[RANKS] = { [RANKS - 1] = 1 };
// implicit binary tree of nodes
// node at n has children at (2n + 1) and (2n + 2)
// bit0 : 0=leaf, 1=internal
// bit1 : full
static uint32_t nodes[(1ul << RANKS) / NPER - (1 / NPER)];

typedef uint32_t np;
typedef size_t sz;
#define TN 0 /* top node */

// -----------------------------------------------------------------------------
static inline unsigned ilog2(unsigned long x)
{
    unsigned result = 0;
    while (x >>= 1)
        result++;
    return result;
}

SI sz IRANK     (sz r) { return RANKS - 1 - r; }
SI np LLINK     (np n) { return (n << 1) + 1; }
SI np RLINK     (np n) { return (n << 1) + 2; }
SI sz ISLEFT    (np n) { return n & 1; }
SI sz ISRIGHT   (np n) { return !ISLEFT(n); }
SI np PARENT    (np n) { return (n - 1) >> 1; }
SI np SIBLING   (np n) { return n + ISLEFT(n) - ISRIGHT(n); }
SI sz NODE2RANK (np n) { return (n ? NODE2RANK(PARENT(n)) : RANKS) - 1; }
SI sz SIZE2BLKS (sz x) { return (x + RANK_0_SIZE - 1) >> RANK_0_LOG; }
SI sz SIZE2RANK (sz s) { return s ? ilog2(SIZE2BLKS(s) - 1) + 1 : 0; }
SI sz RANK2BYTES(sz r) { return 1ul << (r + RANK_0_LOG); }
SI sz NODE2BASE (np n) { return n >> NPERBITS; }
SI sz NODE2SH   (np n) { return n & (NPER - 1); }

SI sz   GET_COUNT(sz r)       { return counts[r]; }
SI void SET_COUNT(sz r, sz v) { counts[r] = v; }

SI sz SHAMT(np n, sz b) { return NODE2SH(n) * BPER + b; }
SI uint32_t* NODE2PTR(np n) { return &nodes[NODE2BASE(n)]; }

SI sz   GET_BIT  (np n, sz b) { return (*NODE2PTR(n) >> SHAMT(n, b)) & 1; }
SI void CLR_BIT  (np n, sz b) { *NODE2PTR(n) &= ~(1ul << SHAMT(n,b)); }
SI void SET_BIT  (np n, sz b) { *NODE2PTR(n) |=  (1ul << SHAMT(n,b)); }

SI sz   GET_LEAF (np n      ) { return !GET_BIT(n, 0); }
SI sz   GET_FULL (np n      ) { return  GET_BIT(n, 1); }
SI sz   GET_VALID(np n      ) { return n == TN || !GET_LEAF(PARENT(n)); }
SI void SET_LEAF (np n, sz v) { !v ? SET_BIT(n, 0) : CLR_BIT(n, 0); }
SI void SET_FULL (np n, sz v) {  v ? SET_BIT(n, 1) : CLR_BIT(n, 1); }

static char *NODE2ADDR(np n)
{
    size_t base = 0;
    int i = NODE2RANK(n) + RANK_0_LOG;
    while (n != TN) {
        base |= ISRIGHT(n) << i;
        n = PARENT(n);
        i++;
    }

    return pool + base;
}

static np ADDR2NODE(char *key)
{
    // We start at the second-highest rank because we are always checking the
    // key against the midpoint of the current-rank node, which midpoint is
    // calculated using the size of the next-smallest-rank node.
    size_t rank = RANKS - 2;
    np n = TN;
    char *base = pool;

    while (!GET_LEAF(n)) {
        if (key - base >= (signed)RANK2BYTES(rank)) {
            n = RLINK(n);
            base += RANK2BYTES(rank);
        } else {
            n = LLINK(n);
        }

        rank--;
    }

    // Now we are at a leaf. Either it matches our key, or we bail.
    if (key != base)
        abort();

    return n;
}

// -----------------------------------------------------------------------------
static void buddy_splitnode(np n, sz rank)
{
    assert(("node is legally splittable", GET_LEAF(n)));

    // remove one node of our rank and create two of a smaller rank
    if (!GET_FULL(n))
        SET_COUNT(rank, GET_COUNT(rank) - 1);
    SET_COUNT(rank - 1, GET_COUNT(rank - 1) + 2);

    SET_LEAF(n,0);

    // Both children are already empty leaf nodes, either because they have
    // never been written, or because buddy_{free,realloc} left them that way.
}

static void *buddy_alloc(size_t rank)
{
    for (size_t r = rank; r < RANKS; r++) {
        if (!GET_COUNT(r))
            continue;

        size_t rp = IRANK(r);
        size_t first = (1ul << rp) - 1;
        for (np n = first; n < first + (1ul << rp); n++) {
            if (!GET_VALID(n) || !GET_LEAF(n) || GET_FULL(n))
                continue;

            while (r > rank) {
                buddy_splitnode(n, r--);
                n = LLINK(n);
            }

            SET_FULL(n,1);
            SET_COUNT(r, GET_COUNT(r) - 1);
            return NODE2ADDR(n);
        }
    }

    // if we make it here, there were no nodes of appropriate size or larger
    errno = ENOMEM;
    return NULL;
}

// -----------------------------------------------------------------------------
// External API
void *buddy_malloc(size_t size)
{
    return buddy_alloc(SIZE2RANK(size));
}

void *buddy_calloc(size_t nelem, size_t width)
{
    return memset(buddy_malloc(nelem * width), 0, nelem * width);
}

void buddy_free(void *orig)
{
    if (!orig)
        return;

    np n = ADDR2NODE(orig);
    assert(("freeing node is a leaf", GET_LEAF(n) == 1));
    SET_FULL(n,0);
    size_t rank = NODE2RANK(n);
    assert(("rank is valid", rank < RANKS && rank >= 0));
    SET_COUNT(rank, GET_COUNT(rank) + 1); // mark self free
    assert(("count is legal", GET_COUNT(rank) < (1ul << (IRANK(rank) + 1))));
    if (n == TN)
        return;

    np p = PARENT(n);
    np b = SIBLING(n);

    while (GET_LEAF(b) && !GET_FULL(b)) {
        // TODO write this recursively too ?
        assert(("count will not underflow", GET_COUNT(rank) >= 2));
        SET_COUNT(rank, GET_COUNT(rank) - 2); // b and n are disappearing
        assert(("count legal", GET_COUNT(rank + 1) < (2u << IRANK(rank + 1))));
        SET_COUNT(rank + 1, GET_COUNT(rank + 1) + 1); // parent appears as leaf
        rank++;

        SET_LEAF(p,1);
        SET_FULL(p,0);

        n = p;

        if (n == TN)
            break;

        p = PARENT(n);
        assert(("parent is valid", GET_VALID(p)));
        b = SIBLING(n);
        assert(("sibling is valid", GET_VALID(b)));
    }
}

void *buddy_realloc(void *orig, size_t size)
{
    if (!size) {
        buddy_free(orig);
        return NULL;
    }
    if (!orig)
        return buddy_malloc(size);

    void *what = orig;
    np n = ADDR2NODE(what);
    size_t rank = NODE2RANK(n);
    size_t max = RANK2BYTES(rank);
    if (size > max) {
        if (n == TN || size >= pool_size) {
            errno = ENOMEM;
            return NULL;
        }

        #if !NO_PROMOTE
        // Without the ability to promote and demote existing allocation sizes,
        // we drop from 70% utilisation to 50%. Demotion seems more important
        // than promotion, but that may be an effect of the allocation pattern.
        size_t get = max;
        np p = PARENT(n), b = SIBLING(n);
        while (GET_LEAF(b) && !GET_FULL(b) && (get <<= 1) < size && p != TN) {
            b = SIBLING(p);
            p = PARENT(p);
        }

        if (get >= pool_size) {
            errno = ENOMEM;
            return NULL;
        }

        if (get >= size) {
            // can promote existing allocation
            do {
                np q = PARENT(n);
                SET_LEAF(q,1);
                SET_FULL(q,1);
                // Set node empty even though invalid, so future allocations
                // can depend on it without setting it explicitly.
                SET_FULL(n,0);
                n = q;
                // coalescing a full and an empty
                SET_COUNT(rank, GET_COUNT(rank) - 1);
                rank++;
            } while (n != p);
            what = NODE2ADDR(n);
            memmove(what, orig, max);
        } else
        #endif
        {
            what = buddy_alloc(SIZE2RANK(size));
            if (!what)
                return NULL; // ENOMEM is already set

            memcpy(what, orig, max);
            buddy_free(orig);
        }
    }
    #if !NO_DEMOTE
    else while (rank > 0 && size <= (max >>= 1)) {
        // demote node
        buddy_splitnode(n, rank--);
        // claim the left child of the newly split node
        n = LLINK(n);
        SET_FULL(n,1);

        SET_COUNT(rank, GET_COUNT(rank) - 1);
    }
    #endif

    return what;
}