musl - Re: Re: a bug in bindtextdomain() and strip '.UTF-8'

Follow @Openwall on Twitter for new release announcements and other news
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20170212023422.GE1520@brightrain.aerifal.cx>
Date: Sat, 11 Feb 2017 21:34:22 -0500
From: Rich Felker <dalias@...c.org>
To: musl@...ts.openwall.com
Subject: Re: Re: a bug in bindtextdomain() and strip '.UTF-8'

On Sat, Feb 11, 2017 at 02:00:56PM +0800, He X wrote:
> --- a/src/locale/dcngettext.c	2017-02-06 14:39:17.860482624 +0000 
> +++ b/src/locale/dcngettext.c	2017-02-06 14:39:17.860482624 +0000
> @@ -100,7 +100,8 @@
>  	size_t map_size;
>  	void *volatile plural_rule;
>  	volatile int nplurals;
> -	char name[];
> +	struct binding *binding;
> +	struct __locale_map *lm;
>  };

As stated in the reply to message body, I think you need the category
in the keying too, because there can be different .mo files loaded
depending on which category was requested.

>  static char *dummy_gettextdomain()
> @@ -120,58 +122,87 @@
>  	struct msgcat *p;
>  	struct __locale_struct *loc = CURRENT_LOCALE;
>  	const struct __locale_map *lm;
> -	const char *dirname, *locname, *catname;
> -	size_t dirlen, loclen, catlen, domlen;
> +	size_t domlen;
> +	struct binding *q;
>  
>  	if ((unsigned)category >= LC_ALL) goto notrans;
>  
>  	if (!domainname) domainname = __gettextdomain();
>  
>  	domlen = strnlen(domainname, NAME_MAX+1);
>  	if (domlen > NAME_MAX) goto notrans;
>  
> -	dirname = gettextdir(domainname, &dirlen);
> -	if (!dirname) goto notrans;
> +	for (q=bindings; q; q=q->next)
> +		if (!strcmp(q->domainname, domainname) && q->active)
> +			break;
> +	if (!q) goto notrans;

Looks ok. I had said this should be a function but it really doesn't
need to be; it's plenty simple inline.

>  	lm = loc->cat[category];
>  	if (!lm) {
>  notrans:
>  		return (char *) ((n == 1) ? msgid1 : msgid2);
>  	}
> -	locname = lm->name;
> -
> -	catname = catnames[category];
> -	catlen = catlens[category];
> -	loclen = strlen(locname);
> -
> -	size_t namelen = dirlen+1 + loclen+1 + catlen+1 + domlen+3;
> -	char name[namelen+1], *s = name;
> -
> -	memcpy(s, dirname, dirlen);
> -	s[dirlen] = '/';
> -	s += dirlen + 1;
> -	memcpy(s, locname, loclen);
> -	s[loclen] = '/';
> -	s += loclen + 1;
> -	memcpy(s, catname, catlen);
> -	s[catlen] = '/';
> -	s += catlen + 1;
> -	memcpy(s, domainname, domlen);
> -	s[domlen] = '.';
> -	s[domlen+1] = 'm';
> -	s[domlen+2] = 'o';
> -	s[domlen+3] = 0;
>  
>  	for (p=cats; p; p=p->next)
> -		if (!strcmp(p->name, name))
> +		if (p->binding == q && p->lm == lm)
>  			break;

&& p->cat == category

>  	if (!p) {
> +		const char *dirname, *locname, *catname;
> +		size_t dirlen, loclen, catlen;
>  		void *old_cats;
>  		size_t map_size;
> +
> +		dirname = q->dirname;
> +		locname = lm->name;
> +		catname = catnames[category];
> +
> +		dirlen = q->dirlen;
> +		loclen = strlen(locname);
> +		catlen = catlens[category];

Now that these are only computed once rather than per-call, optimizing
out strlen is probably not worthwhile anymore, but it doesn't really
hurt either. Not something you need to change, just a comment.

> +
> +		size_t namelen = dirlen+1 + loclen+1 + catlen+1 + domlen+3;
> +		char name[namelen+1], *s = name;
> +		char *str = name;
> +
> +		memcpy(s, dirname, dirlen);
> +		s[dirlen] = '/';
> +		s += dirlen + 1;
> +		memcpy(s, locname, loclen);
> +		s[loclen] = '/';
> +		s += loclen + 1;
> +skip_loc:
> +		memcpy(s, catname, catlen);
> +		s[catlen] = '/';
> +		s += catlen + 1;
> +		memcpy(s, domainname, domlen);
> +		s[domlen] = '.';
> +		s[domlen+1] = 'm';
> +		s[domlen+2] = 'o';
> +		s[domlen+3] = 0;

Actually, now that this code is not a hot path, it should just be
using snprintf to construct the pathname, I think. It would be a lot
simpler and easier to ensure correctness.

> +
>  		const void *map = __map_file(name, &map_size);
> -		if (!map) goto notrans;
> +		if (!map) {
> +			if (s = strchr(name+dirlen+1, '@')) {
> + 				*s++ = '/';
> + 				goto skip_loc;;
> + 			}
> + 			if ( str && (s = strchr(name+dirlen+1, '_')) && (s < strchr(name+dirlen+1, '/')) ) {
> + 				if (str = strchr(locname, '@')) {
> + 					loclen += locname - str;
> +					memcpy(s, str, loclen);
> +					s[loclen] = '/';
> +					s += loclen + 1;
> +					str = 0;
> + 					goto skip_loc;
> + 				} else {
> +					*s++ = '/';
> + 					goto skip_loc;
> + 				}
> + 			}
> +			goto notrans;
> +		}

Using snprintf should also make it easy to get rid of the goto/retry
logic here, perhaps even with a 4-iteration loop and array of which
format modifications happen on each iteration.

>  		p = calloc(sizeof *p + namelen + 1, 1);
>  		if (!p) {
>  			__munmap((void *)map, map_size);
>  			goto notrans;
> @@ -209,7 +209,6 @@
>  		}
>  		p->map = map;
>  		p->map_size = map_size;
> -		memcpy(p->name, name, namelen+1);
>  		do {
>  			old_cats = cats;
>  			p->next = old_cats;
> --- a/src/locale/locale_map.c	2017-02-06 14:39:17.797148750 +0000
> +++ b/src/locale/locale_map.c	2017-02-06 14:39:17.797148750 +0000
> @@ -49,8 +49,8 @@
>  	}
>  
>  	/* Limit name length and forbid leading dot or any slashes. */
> -	for (n=0; n<LOCALE_NAME_MAX && val[n] && val[n]!='/'; n++);
> -	if (val[0]=='.' || val[n]) val = "C.UTF-8";
> +	for (n=0; n<LOCALE_NAME_MAX && val[n] && val[n]!='/' && val[n]!='.'; n++);
> +	if (val[0]=='.' || (val[n] && val[n]!='.')) val = "C.UTF-8";
>  	int builtin = (val[0]=='C' && !val[1])
>  		|| !strcmp(val, "C.UTF-8")
>  		|| !strcmp(val, "POSIX");

This looks ok but might still need some tweaks. Should an input like
"zh_CN.GBK" get treated as "zh_CN" (thus outputting UTF-8 that might
appear as junk on the user's terminal) or as "C" (no localization)
with only ASCII characters (safe for the user's terminal), or even
cause setlocale to fail and return an error so that the application
can decide what to do? These are not technical comments on your patch
but policy matters the community should weigh in on.

Rich
Confused about mailing lists and their use? Read about mailing lists on Wikipedia and check out these guidelines on proper formatting of your messages.