musl - Re: Re: a bug in bindtextdomain() and strip '.UTF-8'

Follow @Openwall on Twitter for new release announcements and other news
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAPG2z0-_RPsjSRGMQPKYcMXnfSt_Q=hNwoe9OEP0PRZC8XAXuQ@mail.gmail.com>
Date: Mon, 13 Feb 2017 16:01:31 +0800
From: He X <xw897002528@...il.com>
To: musl@...ts.openwall.com
Subject: Re: Re: a bug in bindtextdomain() and strip '.UTF-8'

New find, as you can see, zh_CN is different from zh_CN.UTF-8, it's GBK
codeset, we can't strip .UTF-8 easily, or we will get a lot of junk:
```
[xhe@...-PC ~]$ ls /share/vim/lang/zh_
zh_CN/       zh_CN.UTF-8/ zh_CN.cp936/ zh_TW/       zh_TW.UTF-8/
```
I add this to the loop, and delete strip in setlocale:
+ if (locp = strchr(locbuf, '.')) {
+ *locp = 0;
+ } else if (locp = strchr(locbuf, '@')) {
Now i think we should just fail non-UTF8 codeset in setlocale for safe, and
check the codeset before map that file.

2017-02-12 10:34 GMT+08:00 Rich Felker <dalias@...c.org>:

> On Sat, Feb 11, 2017 at 02:00:56PM +0800, He X wrote:
> > --- a/src/locale/dcngettext.c 2017-02-06 14:39:17.860482624 +0000
> > +++ b/src/locale/dcngettext.c 2017-02-06 14:39:17.860482624 +0000
> > @@ -100,7 +100,8 @@
> >       size_t map_size;
> >       void *volatile plural_rule;
> >       volatile int nplurals;
> > -     char name[];
> > +     struct binding *binding;
> > +     struct __locale_map *lm;
> >  };
>
> As stated in the reply to message body, I think you need the category
> in the keying too, because there can be different .mo files loaded
> depending on which category was requested.
>
> >  static char *dummy_gettextdomain()
> > @@ -120,58 +122,87 @@
> >       struct msgcat *p;
> >       struct __locale_struct *loc = CURRENT_LOCALE;
> >       const struct __locale_map *lm;
> > -     const char *dirname, *locname, *catname;
> > -     size_t dirlen, loclen, catlen, domlen;
> > +     size_t domlen;
> > +     struct binding *q;
> >
> >       if ((unsigned)category >= LC_ALL) goto notrans;
> >
> >       if (!domainname) domainname = __gettextdomain();
> >
> >       domlen = strnlen(domainname, NAME_MAX+1);
> >       if (domlen > NAME_MAX) goto notrans;
> >
> > -     dirname = gettextdir(domainname, &dirlen);
> > -     if (!dirname) goto notrans;
> > +     for (q=bindings; q; q=q->next)
> > +             if (!strcmp(q->domainname, domainname) && q->active)
> > +                     break;
> > +     if (!q) goto notrans;
>
> Looks ok. I had said this should be a function but it really doesn't
> need to be; it's plenty simple inline.
>
> >       lm = loc->cat[category];
> >       if (!lm) {
> >  notrans:
> >               return (char *) ((n == 1) ? msgid1 : msgid2);
> >       }
> > -     locname = lm->name;
> > -
> > -     catname = catnames[category];
> > -     catlen = catlens[category];
> > -     loclen = strlen(locname);
> > -
> > -     size_t namelen = dirlen+1 + loclen+1 + catlen+1 + domlen+3;
> > -     char name[namelen+1], *s = name;
> > -
> > -     memcpy(s, dirname, dirlen);
> > -     s[dirlen] = '/';
> > -     s += dirlen + 1;
> > -     memcpy(s, locname, loclen);
> > -     s[loclen] = '/';
> > -     s += loclen + 1;
> > -     memcpy(s, catname, catlen);
> > -     s[catlen] = '/';
> > -     s += catlen + 1;
> > -     memcpy(s, domainname, domlen);
> > -     s[domlen] = '.';
> > -     s[domlen+1] = 'm';
> > -     s[domlen+2] = 'o';
> > -     s[domlen+3] = 0;
> >
> >       for (p=cats; p; p=p->next)
> > -             if (!strcmp(p->name, name))
> > +             if (p->binding == q && p->lm == lm)
> >                       break;
>
> && p->cat == category
>
> >       if (!p) {
> > +             const char *dirname, *locname, *catname;
> > +             size_t dirlen, loclen, catlen;
> >               void *old_cats;
> >               size_t map_size;
> > +
> > +             dirname = q->dirname;
> > +             locname = lm->name;
> > +             catname = catnames[category];
> > +
> > +             dirlen = q->dirlen;
> > +             loclen = strlen(locname);
> > +             catlen = catlens[category];
>
> Now that these are only computed once rather than per-call, optimizing
> out strlen is probably not worthwhile anymore, but it doesn't really
> hurt either. Not something you need to change, just a comment.
>
> > +
> > +             size_t namelen = dirlen+1 + loclen+1 + catlen+1 + domlen+3;
> > +             char name[namelen+1], *s = name;
> > +             char *str = name;
> > +
> > +             memcpy(s, dirname, dirlen);
> > +             s[dirlen] = '/';
> > +             s += dirlen + 1;
> > +             memcpy(s, locname, loclen);
> > +             s[loclen] = '/';
> > +             s += loclen + 1;
> > +skip_loc:
> > +             memcpy(s, catname, catlen);
> > +             s[catlen] = '/';
> > +             s += catlen + 1;
> > +             memcpy(s, domainname, domlen);
> > +             s[domlen] = '.';
> > +             s[domlen+1] = 'm';
> > +             s[domlen+2] = 'o';
> > +             s[domlen+3] = 0;
>
> Actually, now that this code is not a hot path, it should just be
> using snprintf to construct the pathname, I think. It would be a lot
> simpler and easier to ensure correctness.
>
> > +
> >               const void *map = __map_file(name, &map_size);
> > -             if (!map) goto notrans;
> > +             if (!map) {
> > +                     if (s = strchr(name+dirlen+1, '@')) {
> > +                             *s++ = '/';
> > +                             goto skip_loc;;
> > +                     }
> > +                     if ( str && (s = strchr(name+dirlen+1, '_')) && (s
> < strchr(name+dirlen+1, '/')) ) {
> > +                             if (str = strchr(locname, '@')) {
> > +                                     loclen += locname - str;
> > +                                     memcpy(s, str, loclen);
> > +                                     s[loclen] = '/';
> > +                                     s += loclen + 1;
> > +                                     str = 0;
> > +                                     goto skip_loc;
> > +                             } else {
> > +                                     *s++ = '/';
> > +                                     goto skip_loc;
> > +                             }
> > +                     }
> > +                     goto notrans;
> > +             }
>
> Using snprintf should also make it easy to get rid of the goto/retry
> logic here, perhaps even with a 4-iteration loop and array of which
> format modifications happen on each iteration.
>
> >               p = calloc(sizeof *p + namelen + 1, 1);
> >               if (!p) {
> >                       __munmap((void *)map, map_size);
> >                       goto notrans;
> > @@ -209,7 +209,6 @@
> >               }
> >               p->map = map;
> >               p->map_size = map_size;
> > -             memcpy(p->name, name, namelen+1);
> >               do {
> >                       old_cats = cats;
> >                       p->next = old_cats;
> > --- a/src/locale/locale_map.c 2017-02-06 14:39:17.797148750 +0000
> > +++ b/src/locale/locale_map.c 2017-02-06 14:39:17.797148750 +0000
> > @@ -49,8 +49,8 @@
> >       }
> >
> >       /* Limit name length and forbid leading dot or any slashes. */
> > -     for (n=0; n<LOCALE_NAME_MAX && val[n] && val[n]!='/'; n++);
> > -     if (val[0]=='.' || val[n]) val = "C.UTF-8";
> > +     for (n=0; n<LOCALE_NAME_MAX && val[n] && val[n]!='/' &&
> val[n]!='.'; n++);
> > +     if (val[0]=='.' || (val[n] && val[n]!='.')) val = "C.UTF-8";
> >       int builtin = (val[0]=='C' && !val[1])
> >               || !strcmp(val, "C.UTF-8")
> >               || !strcmp(val, "POSIX");
>
> This looks ok but might still need some tweaks. Should an input like
> "zh_CN.GBK" get treated as "zh_CN" (thus outputting UTF-8 that might
> appear as junk on the user's terminal) or as "C" (no localization)
> with only ASCII characters (safe for the user's terminal), or even
> cause setlocale to fail and return an error so that the application
> can decide what to do? These are not technical comments on your patch
> but policy matters the community should weigh in on.
>
> Rich
>

Content of type "text/html" skipped
Confused about mailing lists and their use? Read about mailing lists on Wikipedia and check out these guidelines on proper formatting of your messages.