Follow @Openwall on Twitter for new release announcements and other news
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-ID: <20240610123624.305051-2-stefanjumarea02@gmail.com>
Date: Mon, 10 Jun 2024 15:36:25 +0300
From: Stefan Jumarea <stefanjumarea02@...il.com>
To: musl@...ts.openwall.com
Cc: dalias@...c.org,
	Stefan Jumarea <stefanjumarea02@...il.com>
Subject: [PATCH] mallocng: Add MTE support for Aarch64

Add support for the Arm Memory Tagging Extension (MTE).

All the memory tagging code is placed within preprocessor guards
(conditional on `MEMTAG`) and is enabled with the `--enable-mte`
configure option. The option can only be used when compiling for AArch64.

All the primitives for generating, storing and loading the memory tags
are placed in a new header under `arch/aarch64/`.

For now, only the actual user data is tagged. All metadata is untagged.

Signed-off-by: Stefan Jumarea <stefanjumarea02@...il.com>
---
 arch/aarch64/mte.h                  | 44 ++++++++++++++++++++++++++++
 configure                           | 17 +++++++++++
 src/malloc/mallocng/aligned_alloc.c | 24 +++++++++------
 src/malloc/mallocng/free.c          | 28 ++++++++++++++----
 src/malloc/mallocng/malloc.c        | 18 ++++++++++++
 src/malloc/mallocng/meta.h          | 45 ++++++++++++++++++++---------
 src/malloc/mallocng/realloc.c       | 34 ++++++++++++++++++----
 7 files changed, 178 insertions(+), 32 deletions(-)
 create mode 100644 arch/aarch64/mte.h

diff --git a/arch/aarch64/mte.h b/arch/aarch64/mte.h
new file mode 100644
index 00000000..35148381
--- /dev/null
+++ b/arch/aarch64/mte.h
@@ -0,0 +1,44 @@
+#include <stdint.h>
+
+#define MTE_TAG_GRANULE		16	/* bytes of memory covered by one tag */
+#define MTE_TAG_MASK		(0xFULL << 56)	/* pointer bits 59:56, which hold the tag */
+
+/* Load the allocation tag stored for addr's granule and return it in
+ * bits 59:56 (all other bits zero).  LDG only rewrites the tag field of
+ * its destination, so the register is zero-seeded and bound with "+r". */
+static inline uint64_t mte_load_tag(uint64_t addr)
+{
+	uint64_t tag = 0;
+	__asm__ __volatile__ ("ldg	%x0, [%x1]" : "+r"(tag) : "r"(addr) : "memory");
+	return tag;
+}
+
+/* Store the tag held in addr (bits 59:56) as the tag of addr's granule. */
+static inline void mte_store_tag(uint64_t addr)
+{
+	__asm__ __volatile__ ("stg	%x0, [%x0]" : : "r"(addr) : "memory");
+}
+
+/* STZG: store addr's tag for its granule AND zero the granule's data. */
+static inline void mte_store_zero_tag(uint64_t addr)
+{
+	__asm__ __volatile__ ("stzg	%x0, [%x0]" : : "r"(addr) : "memory");
+}
+
+/* GMI with xzr as the input mask: the result has only the bit indexed by
+ * addr's tag (bits 59:56) set, i.e. "exclude the current tag" for IRG. */
+static inline uint64_t mte_get_exclude_mask(uint64_t addr)
+{
+	uint64_t excl;
+	__asm__ __volatile__("gmi	%x0, %x1, xzr" : "=r"(excl) : "r" (addr));
+	return excl;
+}
+
+/* Return addr with a random tag inserted in bits 59:56 (IRG); tags whose
+ * bit is set in mask are excluded from the choice. */
+static inline uint64_t mte_insert_random_tag(uint64_t addr, uint64_t mask)
+{
+	uint64_t tagged;
+	__asm__ __volatile__("irg	%x0, %x1, %x2" : "=r"(tagged) : "r"(addr), "r"(mask));
+	return tagged;
+}
diff --git a/configure b/configure
index bc9fbe48..edcd4911 100755
--- a/configure
+++ b/configure
@@ -34,6 +34,8 @@ Optional features:
   --enable-wrapper=...    build given musl toolchain wrapper [auto]
   --disable-shared        inhibit building shared library [enabled]
   --disable-static        inhibit building static library [enabled]
+  --enable-mte            build with MTE support [disabled]
+                          only available for aarch64 and mallocng
 
 Optional packages:
   --with-malloc=...       choose malloc implementation [mallocng]
@@ -139,6 +141,7 @@ debug=no
 warnings=yes
 shared=auto
 static=yes
+mte=no
 wrapper=auto
 gcc_wrapper=no
 clang_wrapper=no
@@ -158,6 +161,8 @@ case "$arg" in
 --disable-shared|--enable-shared=no) shared=no ;;
 --enable-static|--enable-static=yes) static=yes ;;
 --disable-static|--enable-static=no) static=no ;;
+--enable-mte|--enable-mte=yes) mte=yes ;;
+--disable-mte|--enable-mte=no) mte=no ;;
 --enable-optimize) optimize=yes ;;
 --enable-optimize=*) optimize=${arg#*=} ;;
 --disable-optimize) optimize=no ;;
@@ -790,6 +795,18 @@ if trycppif "__FAST_MATH__" \
 fail "$0: error: compiler has broken floating point; check CFLAGS"
 fi
 
+if test "$mte" = "yes" ; then
+	printf "Checking whether target architecture supports MTE... "
+	if test "$ARCH" != "aarch64"; then
+		printf "no\n"
+		fail "$0: error: mte only supported with aarch64"
+	fi
+
+	printf "yes\n"
+	CFLAGS_AUTO="$CFLAGS_AUTO -DMEMTAG -march=armv8.5-a+memtag"
+	SUBARCH=${SUBARCH}+memtag
+fi
+
 printf "creating config.mak... "
 
 cmdline=$(quote "$0")
diff --git a/src/malloc/mallocng/aligned_alloc.c b/src/malloc/mallocng/aligned_alloc.c
index e0862a83..2205f6bb 100644
--- a/src/malloc/mallocng/aligned_alloc.c
+++ b/src/malloc/mallocng/aligned_alloc.c
@@ -25,31 +25,37 @@ void *aligned_alloc(size_t align, size_t len)
 	if (!p)
 		return 0;
 
+#ifdef MEMTAG
+	unsigned char *untagged = (unsigned char *)((uint64_t)p & ~MTE_TAG_MASK);
+#else
+	unsigned char *untagged = p;
+#endif
 	struct meta *g = get_meta(p);
-	int idx = get_slot_index(p);
+	int idx = get_slot_index(untagged);
 	size_t stride = get_stride(g);
 	unsigned char *start = g->mem->storage + stride*idx;
 	unsigned char *end = g->mem->storage + stride*(idx+1) - IB;
 	size_t adj = -(uintptr_t)p & (align-1);
 
 	if (!adj) {
-		set_size(p, end, len);
+		set_size(untagged, end, len);
 		return p;
 	}
 	p += adj;
+	untagged += adj;
 	uint32_t offset = (size_t)(p-g->mem->storage)/UNIT;
 	if (offset <= 0xffff) {
-		*(uint16_t *)(p-2) = offset;
-		p[-4] = 0;
+		*(uint16_t *)(untagged-2) = offset;
+		untagged[-4] = 0;
 	} else {
 		// use a 32-bit offset if 16-bit doesn't fit. for this,
 		// 16-bit field must be zero, [-4] byte nonzero.
-		*(uint16_t *)(p-2) = 0;
-		*(uint32_t *)(p-8) = offset;
-		p[-4] = 1;
+		*(uint16_t *)(untagged-2) = 0;
+		*(uint32_t *)(untagged-8) = offset;
+		untagged[-4] = 1;
 	}
-	p[-3] = idx;
-	set_size(p, end, len);
+	untagged[-3] = idx;
+	set_size(untagged, end, len);
 	// store offset to aligned enframing. this facilitates cycling
 	// offset and also iteration of heap for debugging/measurement.
 	// for extreme overalignment it won't fit but these are classless
diff --git a/src/malloc/mallocng/free.c b/src/malloc/mallocng/free.c
index 43f32aad..1a86c8eb 100644
--- a/src/malloc/mallocng/free.c
+++ b/src/malloc/mallocng/free.c
@@ -25,8 +25,13 @@ static struct mapinfo free_group(struct meta *g)
 		mi.len = g->maplen*4096UL;
 	} else {
 		void *p = g->mem;
+#ifdef MEMTAG
+		unsigned char *untagged = (unsigned char *)((uint64_t)p & ~MTE_TAG_MASK);
+#else
+		unsigned char *untagged = p;
+#endif
 		struct meta *m = get_meta(p);
-		int idx = get_slot_index(p);
+		int idx = get_slot_index(untagged);
 		g->mem->meta = 0;
 		// not checking size/reserved here; it's intentionally invalid
 		mi = nontrivial_free(m, idx);
@@ -102,17 +107,30 @@ void free(void *p)
 {
 	if (!p) return;
 
+#ifdef MEMTAG
+	void *untagged = (void *)((uint64_t)p & ~MTE_TAG_MASK);
+#else
+	void *untagged = p;
+#endif
+
 	struct meta *g = get_meta(p);
-	int idx = get_slot_index(p);
+	int idx = get_slot_index(untagged);
 	size_t stride = get_stride(g);
 	unsigned char *start = g->mem->storage + stride*idx;
 	unsigned char *end = start + stride - IB;
-	get_nominal_size(p, end);
+#ifdef MEMTAG
+	size_t nom_size = get_nominal_size(untagged, end);
+#endif
 	uint32_t self = 1u<<idx, all = (2u<<g->last_idx)-1;
-	((unsigned char *)p)[-3] = 255;
+	((unsigned char *)untagged)[-3] = 255;
 	// invalidate offset to group header, and cycle offset of
 	// used region within slot if current offset is zero.
-	*(uint16_t *)((char *)p-2) = 0;
+	*(uint16_t *)((char *)untagged-2) = 0;
+
+#ifdef MEMTAG
+	for (size_t i = 0; i < nom_size; i += 16)
+		mte_store_tag((uint64_t)((unsigned char *)untagged + i));
+#endif
 
 	// release any whole pages contained in the slot to be freed
 	// unless it's a single-slot group that will be unmapped.
diff --git a/src/malloc/mallocng/malloc.c b/src/malloc/mallocng/malloc.c
index d695ab8e..89294526 100644
--- a/src/malloc/mallocng/malloc.c
+++ b/src/malloc/mallocng/malloc.c
@@ -298,6 +298,8 @@ static int alloc_slot(int sc, size_t req)
 
 void *malloc(size_t n)
 {
+	n = ALIGN_UP(n, 16);
+
 	if (size_overflows(n)) return 0;
 	struct meta *g;
 	uint32_t mask, first;
@@ -310,6 +312,9 @@ void *malloc(size_t n)
 		void *p = mmap(0, needed, PROT_READ|PROT_WRITE,
 			MAP_PRIVATE|MAP_ANON, -1, 0);
 		if (p==MAP_FAILED) return 0;
+
+
+
 		wrlock();
 		step_seq();
 		g = alloc_meta();
@@ -376,7 +381,20 @@ void *malloc(size_t n)
 success:
 	ctr = ctx.mmap_counter;
 	unlock();
+
+#if MEMTAG
+	void *ptr = enframe(g, idx, n, ctr);
+
+	uint64_t mask_mte = mte_get_exclude_mask((uint64_t)ptr);
+	uint64_t addr = mte_insert_random_tag((uint64_t)ptr, mask_mte);
+
+	for (size_t i = 0; i < n; i += 16)
+		mte_store_tag(addr + i);
+
+	return (void *)addr;
+#else
 	return enframe(g, idx, n, ctr);
+#endif
 }
 
 int is_allzero(void *p)
diff --git a/src/malloc/mallocng/meta.h b/src/malloc/mallocng/meta.h
index 61ec53f9..f5896fe4 100644
--- a/src/malloc/mallocng/meta.h
+++ b/src/malloc/mallocng/meta.h
@@ -4,6 +4,9 @@
 #include <stdint.h>
 #include <errno.h>
 #include <limits.h>
+#ifdef MEMTAG
+#include <mte.h>
+#endif
 #include "glue.h"
 
 __attribute__((__visibility__("hidden")))
@@ -14,6 +17,10 @@ extern const uint16_t size_classes[];
 #define UNIT 16
 #define IB 4
 
+#ifndef ALIGN_UP
+#define ALIGN_UP(p, size) (__typeof__(p))(((uintptr_t)(p) + ((size) - 1)) & ~((size) - 1))
+#endif
+
 struct group {
 	struct meta *meta;
 	unsigned char active_idx:5;
@@ -129,14 +136,19 @@ static inline int get_slot_index(const unsigned char *p)
 static inline struct meta *get_meta(const unsigned char *p)
 {
 	assert(!((uintptr_t)p & 15));
-	int offset = *(const uint16_t *)(p - 2);
-	int index = get_slot_index(p);
-	if (p[-4]) {
+#ifdef MEMTAG
+	const unsigned char *untagged = (const unsigned char *)((uint64_t)p & ~MTE_TAG_MASK);
+#else
+	const unsigned char *untagged = p;
+#endif
+	int offset = *(const uint16_t *)(untagged - 2);
+	int index = get_slot_index(untagged);
+	if (untagged[-4]) {
 		assert(!offset);
-		offset = *(uint32_t *)(p - 8);
+		offset = *(uint32_t *)(untagged - 8);
 		assert(offset > 0xffff);
 	}
-	const struct group *base = (const void *)(p - UNIT*offset - UNIT);
+	const struct group *base = (const void *)(untagged - UNIT*offset - UNIT);
 	const struct meta *meta = base->meta;
 	assert(meta->mem == base);
 	assert(index <= meta->last_idx);
@@ -199,10 +211,15 @@ static inline void *enframe(struct meta *g, int idx, size_t n, int ctr)
 	size_t slack = (stride-IB-n)/UNIT;
 	unsigned char *p = g->mem->storage + stride*idx;
 	unsigned char *end = p+stride-IB;
+#ifdef MEMTAG
+	unsigned char *untagged = (unsigned char *)((uint64_t)p & ~MTE_TAG_MASK);
+#else
+	unsigned char *untagged = p;
+#endif
 	// cycle offset within slot to increase interval to address
 	// reuse, facilitate trapping double-free.
-	int off = (p[-3] ? *(uint16_t *)(p-2) + 1 : ctr) & 255;
-	assert(!p[-4]);
+	int off = (untagged[-3] ? *(uint16_t *)(untagged-2) + 1 : ctr) & 255;
+	assert(!untagged[-4]);
 	if (off > slack) {
 		size_t m = slack;
 		m |= m>>1; m |= m>>2; m |= m>>4;
@@ -213,16 +230,18 @@ static inline void *enframe(struct meta *g, int idx, size_t n, int ctr)
 	if (off) {
 		// store offset in unused header at offset zero
 		// if enframing at non-zero offset.
-		*(uint16_t *)(p-2) = off;
-		p[-3] = 7<<5;
+		*(uint16_t *)(untagged-2) = off;
+		untagged[-3] = 7<<5;
 		p += UNIT*off;
+		untagged += UNIT*off;
 		// for nonzero offset there is no permanent check
 		// byte, so make one.
-		p[-4] = 0;
+		untagged[-4] = 0;
 	}
-	*(uint16_t *)(p-2) = (size_t)(p-g->mem->storage)/UNIT;
-	p[-3] = idx;
-	set_size(p, end, n);
+	*(uint16_t *)(untagged-2) = (size_t)(untagged-g->mem->storage)/UNIT;
+	untagged[-3] = idx;
+	set_size(untagged, end, n);
+
 	return p;
 }
 
diff --git a/src/malloc/mallocng/realloc.c b/src/malloc/mallocng/realloc.c
index 18769f42..0fab0df7 100644
--- a/src/malloc/mallocng/realloc.c
+++ b/src/malloc/mallocng/realloc.c
@@ -1,4 +1,5 @@
 #define _GNU_SOURCE
+#include <stdint.h>
 #include <stdlib.h>
 #include <sys/mman.h>
 #include <string.h>
@@ -6,23 +7,46 @@
 
 void *realloc(void *p, size_t n)
 {
+	n = ALIGN_UP(n, 16);
 	if (!p) return malloc(n);
 	if (size_overflows(n)) return 0;
 
+#ifdef MEMTAG
+	unsigned char *untagged = (unsigned char *)((uint64_t)p & ~MTE_TAG_MASK);
+#else
+	unsigned char *untagged = p;
+#endif
 	struct meta *g = get_meta(p);
-	int idx = get_slot_index(p);
+	int idx = get_slot_index(untagged);
 	size_t stride = get_stride(g);
 	unsigned char *start = g->mem->storage + stride*idx;
 	unsigned char *end = start + stride - IB;
-	size_t old_size = get_nominal_size(p, end);
-	size_t avail_size = end-(unsigned char *)p;
+	size_t old_size = get_nominal_size(untagged, end);
+	size_t avail_size = end-(unsigned char *)untagged;
 	void *new;
 
 	// only resize in-place if size class matches
 	if (n <= avail_size && n<MMAP_THRESHOLD
 	    && size_to_class(n)+1 >= g->sizeclass) {
-		set_size(p, end, n);
-		return p;
+
+		uint64_t addr;
+
+#ifdef MEMTAG
+		for (size_t i = 0; i < old_size; i += 16)
+			mte_store_tag((uint64_t)(untagged + i));
+
+		uint64_t mask_mte = mte_get_exclude_mask((uint64_t)p);
+		addr = mte_insert_random_tag((uint64_t)p, mask_mte);
+
+		for (size_t i = 0; i < n; i += 16)
+			mte_store_tag(addr + i);
+#else
+		addr = (uint64_t)p;
+#endif
+
+		set_size(untagged, end, n);
+
+		return (void *)addr;
 	}
 
 	// use mremap if old and new size are both mmap-worthy
-- 
2.43.0

Powered by blists - more mailing lists

Confused about mailing lists and their use? Read about mailing lists on Wikipedia and check out these guidelines on proper formatting of your messages.