Follow @Openwall on Twitter for new release announcements and other news
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <Q1ct8sn6_u6gCrS1HCaTb64PhUWy8yNboi9hPS7NtweBGewdedupT35AYRqxd_WZX0ZkyAYd9hAqjrsdVORHN7pgnb1ossy08rrUzQOFuWg=@proton.me>
Date: Wed, 15 Apr 2026 01:05:45 +0000
From: David Sparks <sparks05@...ton.me>
To: "dalias@...ifal.cx" <dalias@...ifal.cx>
Cc: "musl@...ts.openwall.com" <musl@...ts.openwall.com>, "mailto.luca.kellermann@...il.com" <mailto.luca.kellermann@...il.com>, David Sparks <sparks05@...ton.me>
Subject: Re: Some additional qsort patches

On Tuesday, April 14th, 2026 at 16:10, dalias@...ifal.cx <dalias@...ifal.cx> wrote:
> On Tue, Apr 14, 2026 at 03:24:32AM +0000, David Sparks wrote:
>> The recent qsort patches caused conflicts with some local changes
>> I've had sitting around for a long time.  Since the code is fresh
>
> This is exactly why we don't introduce gratuitous churn in the form of
> whitespace changes, minor refactorings, etc. If we'd applied these
> patches, the CVE fixes would not have applied cleanly to previous
> versions, and we would have had to backport them manually. On top of
> that, churn puts a further load on anyone who is actually reviewing
> all changes between versions to make sure they trust them, for anyone
> maintaining their own out-of-tree functional patchsets, etc.

Thank you!  This would be a wonderful addition to the coding-style
wiki page, which I read very carefully before sending anything.

(The reason this happened is a result of how I came to hack
on the code: I wanted to understand smoothsort, and musl has
the best-known implementation, so I set out to read it.  I started
with some "safe" legibility tweaks for my own reading convenience
before I felt confident enough to imagine I could improve it.)

> I know the compare has come up before, and just never been acted on.
> We should probably go ahead and do it.
>
> I don't see how the refactoring of pntz/shr is an improvement. It
> makes some call points "prettier", others "uglier", but it's entirely
> a cosmetic change.

In addition to being prettier, it gives the compiler more scope for
optimization.

Because of the 2-word size, pntz() branches on p[0] == 1, and then
shr() branches on trail >= 64.  This double if() seems pointless when
they're both actually the same condition.  (I just don't know what to
call the operation.  If it were a left shift, it'd be "normalize".
"Strip trailing zeros"?  "pop_root"?  But naming is a bikeshed issue,
not an efficiency one.)

Now, a modern optimizing compiler can in fact see the equivalence and
merge the two branches, but it's not as good.  Here's a standalone
test program:

#include <stdlib.h>
#define WORDBITS (int)(8 * sizeof(size_t))

static int ctz(size_t x)
{
	return __builtin_ctzg(x);
}

/* Trailing-zero count of the two-word value p, ignoring bit 0 of the
 * low word (presumably a sentinel bit in the caller's bitmap -- confirm
 * against the qsort code this models).  Requires some bit above bit 0
 * to be set, or ctz() is called on zero. */
static int pctz(size_t p[2])
{
	size_t low = p[0] & ~(size_t)1;
	if (!low)
		return WORDBITS + ctz(p[1]);
	return ctz(low);
}

/* Right-shift the two-word value p in place by sh bits.
 * Callers must pass 1 <= sh < 2*WORDBITS: sh == 0 would make the
 * WORDBITS - sh shift below undefined. */
static void shr(size_t p[2], int sh)
{
	if (sh >= WORDBITS) {
		/* Full-word step: move the high word down, then shift the
		 * remainder (if any) below. */
		sh -= WORDBITS;
		p[0] = p[1];
		p[1] = 0;
		if (sh == 0)
			return;
	}
	size_t lo = p[0] >> sh | p[1] << (WORDBITS - sh);
	p[1] >>= sh;
	p[0] = lo;
}

/* Strip trailing zeros from p (bit 0 of p[0] excluded) via the
 * separate count-then-shift pair; returns the shift amount. */
int
normalize1(size_t p[2])
{
	int n = pctz(p);
	shr(p, n);
	return n;
}

/* Same contract as normalize1, but with the count and shift merged so
 * the "low word empty" test is taken only once. */
int
normalize2(size_t p[2])
{
	size_t lo = p[0] & ~(size_t)1;
	if (!lo) {
		/* Everything useful lives in the high word. */
		int n = ctz(p[1]);
		p[0] = p[1] >> n;
		p[1] = 0;
		return n + WORDBITS;
	}
	int n = ctz(lo);
	p[0] = lo >> n | p[1] << (WORDBITS - n);
	p[1] >>= n;
	return n;
}

/* Same contract as normalize1/normalize2, expressed through a single
 * 128-bit value so the compiler can emit a branch-free sequence.
 * Only valid when 2*WORDBITS == 128 (i.e. 64-bit size_t). */
int
normalize3(size_t p[2])
{
	unsigned __int128 x = (unsigned __int128)p[1] << WORDBITS | (p[0] & -(size_t)2);
	int sh = __builtin_ctzg(x);
	x >>= sh;
	p[0] = x;
	/* High word of the shifted value is x >> WORDBITS.  The previous
	 * "x >> sh" was only right when sh happened to equal WORDBITS;
	 * e.g. for p = {2,1} it left 1<<62 in p[1] instead of 0. */
	p[1] = x >> WORDBITS;
	return sh;
}

Which compiles (gcc 15.2 -O2 -m64 generic) to

	.file	"shr.c"
	.text
	.p2align 4
	.globl	normalize1
normalize1:
	pushq	%rbx
	movq	(%rcx), %r8
	movq	8(%rcx), %r9
	movq	%r8, %r10
	movq	%rcx, %rdx
	andq	$-2, %r10
	je	.L2
	xorl	%eax, %eax
	movq	%r9, %rbx
	rep bsfq	%r10, %rax
	movq	%r9, %r10
	movl	%eax, %ecx
	movl	%eax, %r11d
	negl	%ecx
	salq	%cl, %r10
	movl	%eax, %ecx
	shrq	%cl, %rbx
.L3:
	movl	%r11d, %ecx
	movq	%rbx, 8(%rdx)
	shrq	%cl, %r8
	orq	%r10, %r8
	movq	%r8, (%rdx)
.L1:
	popq	%rbx
	ret
	.p2align 4,,10
	.p2align 3
.L2:
	xorl	%ecx, %ecx
	movq	%r9, (%rdx)
	movl	$64, %eax
	rep bsfq	%r9, %rcx
	movq	$0, 8(%rdx)
	movl	%ecx, %r11d
	testl	%ecx, %ecx
	je	.L1
	leal	64(%rcx), %eax
	xorl	%ebx, %ebx
	movq	%r9, %r8
	jmp	.L3

	.p2align 4
	.globl	normalize2
normalize2:
	movq	(%rcx), %rdx
	movq	8(%rcx), %r10
	movq	%rcx, %r8
	andq	$-2, %rdx
	je	.L8
	xorl	%r11d, %r11d
	movq	%r10, %r9
	rep bsfq	%rdx, %r11
	movl	%r11d, %ecx
	movl	%r11d, %eax
	negl	%ecx
	salq	%cl, %r9
	movl	%r11d, %ecx
	shrq	%cl, %rdx
	orq	%rdx, %r9
	movq	%r10, %rdx
	shrq	%cl, %rdx
	movq	%r9, (%r8)
	movq	%rdx, 8(%r8)
	ret
	.p2align 4,,10
	.p2align 3
.L8:
	xorl	%ecx, %ecx
	movq	%rdx, 8(%r8)
	rep bsfq	%r10, %rcx
	shrq	%cl, %r10
	leal	64(%rcx), %eax
	movq	%r10, %r9
	movq	%r9, (%r8)
	ret

	.p2align 4
	.globl	normalize3
normalize3:
	xorl	%r10d, %r10d
	movq	8(%rcx), %rdx
	movq	(%rcx), %rax
	andq	$-2, %rax
	rep bsfq	%rax, %r10
	movq	%rcx, %r8
	xorl	%ecx, %ecx
	rep bsfq	%rdx, %rcx
	addl	$64, %ecx
	testq	%rax, %rax
	cmovne	%r10d, %ecx
	xorl	%r9d, %r9d
	shrdq	%cl, %rdx, %rax
	shrq	%cl, %rdx
	testb	$64, %cl
	cmovne	%rdx, %rax
	cmovne	%r9, %rdx
	movq	%rax, (%r8)
	shrdq	%cl, %rdx, %rax
	shrq	%cl, %rdx
	testb	$64, %cl
	cmovne	%rdx, %rax
	movq	%rax, 8(%r8)
	movl	%ecx, %eax
	ret

normalize1 is 0x70 bytes (including 6 bytes of padding after the ret)
with two internal branches, normalize2 is 0x58 bytes (3 bytes internal
padding) with one internal branch, and normalize3 is 0x54 bytes with
no internal branches.

>> TODO: Clean out redundant header files.  Neither stdint.h nor stdlib.h
>> are actually needed.  (Noticed as I was composing this mail.)
>
> stdlib.h is needed because it's the public declaration of qsort. We
> always include the public declaration of the interface being
> implemented. This ensures the compiler checks that the signatures
> match.

Doh!  An excellent reason and I apologize for not realizing it.
Which is also the reason for #define _BSD_SOURCE.

I will prepare a smaller revised patch set.

There are two additional changes I could prepare if you're
interested in seeing them, but I held off for fear of the
magnitude of the surgery:

1) Using __uint128_t (or uint64_t on 32-bit) for the tree-size
   bitmap.  This would require a chunk of ugly preprocessor
   code to find the appropriate data type, plus the current
   code as a fallback.

2) Saving a compare in trinkle().  There seem to be two ways
   to go about this:
   2a) Move the top level of sift() into trinkle(), and have
       the call to sift() take the appropriate subtree, or
   2b) Implement a do_sift() helper which takes the root compare
       as an argument and can be used by both sift() and
       trinkle().  (This has the potential to take an ar[] array
       as well and merge the cycle() operations.)
 

Powered by blists - more mailing lists

Confused about mailing lists and their use? Read about mailing lists on Wikipedia and check out these guidelines on proper formatting of your messages.