From: "Andy Bradford" Subject: Re: Implement -E for rs(1) To: tech@openbsd.org Date: 14 Mar 2024 22:31:51 -0600 Thus said Stefan Sperling on Thu, 14 Mar 2024 11:27:59 +0100: > Please test the UTF-8 code path as well. Now replying with better testing and updated code. I'll demonstrate also your multibyte-character tests with some additional observations of my own. > $ export LC_CTYPE=en_US.UTF-8 > $ printf '\xc3\xa4bcd\xc3\xa9fgh\xc3\xafjklmnop\n' | ./rs -E 4 > > The expected output would be: > > ä b c d > é f g h > ï j k l > m n o p Here it is: $ export LC_CTYPE=en_US.UTF-8 $ printf '\xc3\xa4bcd\xc3\xa9fgh\xc3\xafjklmnop\n' | ./rs -E 4 ä b c d é f g h ï j k l m n o p > Using display width to advance a pointer is wrong. You will want to use > the amount of input bytes consumed by mbtowc(), which the mbsavis() > interface does not yet expose. You will need to extend mbsavis() Good catch, I did not notice that mbsavis() was returning character widths and not number of bytes. I've updated the code with your suggestion and found in testing, however, that additional places needed to handle the bytes_used as well, to avoid infinite loops or out of memory conditions (please review in patch at the end). > $ export LC_CTYPE=en_US.UTF-8 > $ printf '\xe6\xbc\xa2\xe5\xad\x97\xe6\xbc\xa2\xe5\xad\x97\n' | ./rs -E 4 > > I do not speak Japanese but I suspect we should see the symbols > arranged like this instead: > > 漢 字 > 漢 字 With the changes that I made to the code I was able to obtain the output that you expect, but not with the arguments you suggested. Here are 3 iterations: $ export LC_CTYPE=en_US.UTF-8 $ printf '\xe6\xbc\xa2\xe5\xad\x97\xe6\xbc\xa2\xe5\xad\x97\n' | ./rs -E 4 漢 字 漢 字 $ printf '\xe6\xbc\xa2\xe5\xad\x97\xe6\xbc\xa2\xe5\xad\x97\n' | ./rs -E 0 4 漢 字 漢 字 $ printf '\xe6\xbc\xa2\xe5\xad\x97\xe6\xbc\xa2\xe5\xad\x97\n' | ./rs -E 0 2 漢 字 漢 字 So you can see that only when I specify 2 columns of output do I get what you expect. Is this correct? > Getting the array tabulated correctly might be more difficult if a string > contains a mix of characters using 1 column and characters using > 1 column. Is this what you mean? $ export LC_CTYPE=en_US.UTF-8 $ printf '\xcf\x80\xe6\xbc\xa2\xe5\xad\x97\xe6\xbc\xa2\xe5\xad\x97\xcf\x80\n' | ./rs -E 0 2 π 漢 字 漢 字 π > $ export LC_CTYPE=en_US.UTF-8 > $ echo ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz | ./rs -E 4 > ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz > $ echo ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz | ./rs -E 4 > AB CD EF GH IJ KL MN > OP QR ST UV WX YZ ab > cd ef gh ij kl mn op > qr st uv wx yz And this too appears to work correctly every time now both with and without UTF-8: $ export LC_CTYPE=en_US.UTF-8 $ echo ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz | ./rs -E 4 A B C D E F G H I J K L M N O P Q R S T U V W X Y Z a b c d e f g h i j k l m n o p q r s t u v w x y z I did some experimentation with other characters and found some oddities that I don't believe are introduced with my changes but exist somewhere else in the library I believe. Here it is with my changes: $ printf '\xc2\xab\xd2\x88\xd2\x89\xd1\xae25\xc2\xba67\xc2\xba35\xc2\xba\xcf\x86\xcf\x80\xc2\xbb\n' | ./rs -E 0 2 « ҈ ҉ Ѯ 2 5 º 6 7 º 3 5 º φ π » And here it is with the rs included with 7.4: $ printf '\xc2\xab \xd2\x88 \xd2\x89 \xd1\xae 2 5 \xc2\xba 6 7 \xc2\xba 3 5 \xc2\xba \xcf\x86 \xcf\x80 \xc2\xbb\n' | /usr/bin/rs -E 0 2 « ҈ ҉ Ѯ 2 5 º 6 7 º 3 5 º φ π » Notice that in both cases, the gutter width on line 2 (row 2) is actually 3 spaces instead of two as the rest of the columns appear to be. I'm not sure why this is (perhaps the character somehow got a space prepended). I also notice some strange things if I just output that text line without filtering through rs (the characters seem to smash each other at the beginning of the string). The only part of my change that I wasn't certain how to test was the code found in this condition block: } else if ((width = wcwidth(wc)) == -1) { So if wcwidth() returns -1 that indicates that the character is not printable; perhaps characters 0x01--0x1f? I think that even in this case the code still wants to advance bytes_used by the length of the bytes, right? Updated patch: Index: rs.c =================================================================== RCS file: /cvs/src/usr.bin/rs/rs.c,v retrieving revision 1.30 diff -u -p -r1.30 rs.c --- rs.c 3 Dec 2015 12:23:15 -0000 1.30 +++ rs.c 15 Mar 2024 03:26:52 -0000 @@ -81,7 +81,7 @@ int propgutter; char isep = ' ', osep = ' '; int owidth = 80, gutter = 2; -int mbsavis(char **, const char *); +int mbsavis(char **, const char *, int *, int); void usage(void); void getargs(int, char *[]); @@ -120,10 +120,12 @@ void getfile(void) { const char delim[2] = { isep, '\0' }; - char *p; + char *p, *f; struct entry *ep; int multisep = (flags & ONEISEPONLY ? 0 : 1); int nullpad = flags & NULLPAD; + int oneperchar = flags & ONEPERCHAR; + int byte_len = 0; struct entry *padto; curline = NULL; @@ -139,6 +141,8 @@ getfile(void) flags |= ONEPERLINE; if (flags & ONEPERLINE) icols = 1; + else if (oneperchar) + icols = mbsavis(&f, curline, NULL, 0); else /* count cols on first line */ for (p = curline; *p != '\0'; p++) { if (*p == isep && multisep) @@ -151,7 +155,7 @@ getfile(void) p = curline; do { if (flags & ONEPERLINE) { - ep->w = mbsavis(&ep->s, curline); + ep->w = mbsavis(&ep->s, curline, NULL, 0); if (maxwidth < ep->w) maxwidth = ep->w; INCR(ep); /* prepare for next entry */ @@ -160,14 +164,18 @@ getfile(void) } p = curline; while (p != NULL && *p != '\0') { - if (*p == isep) { + if (oneperchar) { + ep->w = mbsavis(&ep->s, p, &byte_len, 1); + p += byte_len; + } else if (*p == isep) { p++; if (multisep) continue; ep->s = ""; /* empty column */ ep->w = 0; } else - ep->w = mbsavis(&ep->s, strsep(&p, delim)); + ep->w = mbsavis(&ep->s, strsep(&p, delim), + NULL, 0); if (maxwidth < ep->w) maxwidth = ep->w; INCR(ep); /* prepare for next entry */ Index: utf8.c =================================================================== RCS file: /cvs/src/usr.bin/rs/utf8.c,v retrieving revision 1.1 diff -u -p -r1.1 utf8.c --- utf8.c 3 Dec 2015 12:23:15 -0000 1.1 +++ utf8.c 15 Mar 2024 03:26:52 -0000 @@ -20,7 +20,7 @@ #include int -mbsavis(char** outp, const char *mbs) +mbsavis(char** outp, const char *mbs, int *bytes_used, int nchars) { const char *src; /* Iterate mbs. */ char *dst; /* Iterate *outp. */ @@ -28,33 +28,49 @@ mbsavis(char** outp, const char *mbs) int total_width; /* Display width of the whole string. */ int width; /* Display width of a single Unicode char. */ int len; /* Length in bytes of UTF-8 encoded string. */ + int count; /* Count of chars read. */ + int nbytes; /* Number of bytes to increment in loop */ len = strlen(mbs); if ((*outp = malloc(len + 1)) == NULL) err(1, NULL); if (MB_CUR_MAX == 1) { + len = (nchars && nchars < len) ? nchars : len; memcpy(*outp, mbs, len + 1); + (*outp)[len] = '\0'; + if (bytes_used != NULL) + *bytes_used = len; return len; } src = mbs; dst = *outp; total_width = 0; + count = 0; + nbytes = 0; + if (bytes_used != NULL) + *bytes_used = 0; while (*src != '\0') { if ((len = mbtowc(&wc, src, MB_CUR_MAX)) == -1) { total_width++; *dst++ = '?'; src++; + nbytes = 1; } else if ((width = wcwidth(wc)) == -1) { total_width++; *dst++ = '?'; src += len; + nbytes = len; } else { total_width += width; + nbytes = len; while (len-- > 0) *dst++ = *src++; } + if (bytes_used != NULL) + *bytes_used += nbytes; + if (nchars > 0 && ++count >= nchars) break; } *dst = '\0'; return total_width; -- Thanks, Andy