Download raw body.
Implement -E for rs(1)
Thus said Stefan Sperling on Thu, 14 Mar 2024 11:27:59 +0100:
> Please test the UTF-8 code path as well.
Now replying with better testing and updated code. I'll demonstrate also
your multibyte-character tests with some additional observations of my
own.
> $ export LC_CTYPE=en_US.UTF-8
> $ printf '\xc3\xa4bcd\xc3\xa9fgh\xc3\xafjklmnop\n' | ./rs -E 4
>
> The expected output would be:
>
> ä b c d
> é f g h
> ï j k l
> m n o p
Here it is:
$ export LC_CTYPE=en_US.UTF-8
$ printf '\xc3\xa4bcd\xc3\xa9fgh\xc3\xafjklmnop\n' | ./rs -E 4
ä b c d
é f g h
ï j k l
m n o p
> Using display width to advance a pointer is wrong. You will want to use
> the amount of input bytes consumed by mbtowc(), which the mbsavis()
> interface does not yet expose. You will need to extend mbsavis()
Good catch, I did not notice that mbsavis() was returning character
widths and not number of bytes. I've updated the code with your
suggestion and found in testing, however, that additional places needed
to handle the bytes_used as well, to avoid infinite loops or out of
memory conditions (please review in patch at the end).
> $ export LC_CTYPE=en_US.UTF-8
> $ printf '\xe6\xbc\xa2\xe5\xad\x97\xe6\xbc\xa2\xe5\xad\x97\n' | ./rs -E 4
>
> I do not speak Japanese but I suspect we should see the symbols
> arranged like this instead:
>
> 漢 字
> 漢 字
With the changes that I made to the code I was able to obtain the output
that you expect, but not with the arguments you suggested. Here are 3
iterations:
$ export LC_CTYPE=en_US.UTF-8
$ printf '\xe6\xbc\xa2\xe5\xad\x97\xe6\xbc\xa2\xe5\xad\x97\n' | ./rs -E 4
漢
字
漢
字
$ printf '\xe6\xbc\xa2\xe5\xad\x97\xe6\xbc\xa2\xe5\xad\x97\n' | ./rs -E 0 4
漢 字 漢 字
$ printf '\xe6\xbc\xa2\xe5\xad\x97\xe6\xbc\xa2\xe5\xad\x97\n' | ./rs -E 0 2
漢 字
漢 字
So you can see that only when I specify 2 columns of output do I get
what you expect. Is this correct?
> Getting the array tabulated correctly might be more difficult if a string
> contains a mix of characters using 1 column and characters using > 1 column.
Is this what you mean?
$ export LC_CTYPE=en_US.UTF-8
$ printf '\xcf\x80\xe6\xbc\xa2\xe5\xad\x97\xe6\xbc\xa2\xe5\xad\x97\xcf\x80\n' | ./rs -E 0 2
π 漢
字 漢
字 π
> $ export LC_CTYPE=en_US.UTF-8
> $ echo ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz | ./rs -E 4
> ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
> $ echo ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz | ./rs -E 4
> AB CD EF GH IJ KL MN
> OP QR ST UV WX YZ ab
> cd ef gh ij kl mn op
> qr st uv wx yz
And this too appears to work correctly every time now both with and
without UTF-8:
$ export LC_CTYPE=en_US.UTF-8
$ echo ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz | ./rs -E 4
A B C D E F G H I J K L M
N O P Q R S T U V W X Y Z
a b c d e f g h i j k l m
n o p q r s t u v w x y z
I did some experimentation with other characters and found some oddities
that I don't believe are introduced with my changes but exist somewhere
else in the library I believe.
Here it is with my changes:
$ printf '\xc2\xab\xd2\x88\xd2\x89\xd1\xae25\xc2\xba67\xc2\xba35\xc2\xba\xcf\x86\xcf\x80\xc2\xbb\n' | ./rs -E 0 2
« ҈
҉ Ѯ
2 5
º 6
7 º
3 5
º φ
π »
And here it is with the rs included with 7.4:
$ printf '\xc2\xab \xd2\x88 \xd2\x89 \xd1\xae 2 5 \xc2\xba 6 7 \xc2\xba 3 5 \xc2\xba \xcf\x86 \xcf\x80 \xc2\xbb\n' | /usr/bin/rs -E 0 2
« ҈
҉ Ѯ
2 5
º 6
7 º
3 5
º φ
π »
Notice that in both cases, the gutter width on line 2 (row 2) is
actually 3 spaces instead of two as the rest of the columns appear to
be. I'm not sure why this is (perhaps the character somehow got a space
prepended). I also notice some strange things if I just output that text
line without filtering through rs (the characters seem to smash each
other at the beginning of the string).
The only part of my change that I wasn't certain how to test was the
code found in this condition block:
} else if ((width = wcwidth(wc)) == -1) {
So if wcwidth() returns -1 that indicates that the character is not
printable; perhaps characters 0x01--0x1f? I think that even in this case
the code still wants to advance bytes_used by the length of the bytes,
right?
Updated patch:
Index: rs.c
===================================================================
RCS file: /cvs/src/usr.bin/rs/rs.c,v
retrieving revision 1.30
diff -u -p -r1.30 rs.c
--- rs.c 3 Dec 2015 12:23:15 -0000 1.30
+++ rs.c 15 Mar 2024 03:26:52 -0000
@@ -81,7 +81,7 @@ int propgutter;
char isep = ' ', osep = ' ';
int owidth = 80, gutter = 2;
-int mbsavis(char **, const char *);
+int mbsavis(char **, const char *, int *, int);
void usage(void);
void getargs(int, char *[]);
@@ -120,10 +120,12 @@ void
getfile(void)
{
const char delim[2] = { isep, '\0' };
- char *p;
+ char *p, *f;
struct entry *ep;
int multisep = (flags & ONEISEPONLY ? 0 : 1);
int nullpad = flags & NULLPAD;
+ int oneperchar = flags & ONEPERCHAR;
+ int byte_len = 0;
struct entry *padto;
curline = NULL;
@@ -139,6 +141,8 @@ getfile(void)
flags |= ONEPERLINE;
if (flags & ONEPERLINE)
icols = 1;
+ else if (oneperchar)
+ icols = mbsavis(&f, curline, NULL, 0);
else /* count cols on first line */
for (p = curline; *p != '\0'; p++) {
if (*p == isep && multisep)
@@ -151,7 +155,7 @@ getfile(void)
p = curline;
do {
if (flags & ONEPERLINE) {
- ep->w = mbsavis(&ep->s, curline);
+ ep->w = mbsavis(&ep->s, curline, NULL, 0);
if (maxwidth < ep->w)
maxwidth = ep->w;
INCR(ep); /* prepare for next entry */
@@ -160,14 +164,18 @@ getfile(void)
}
p = curline;
while (p != NULL && *p != '\0') {
- if (*p == isep) {
+ if (oneperchar) {
+ ep->w = mbsavis(&ep->s, p, &byte_len, 1);
+ p += byte_len;
+ } else if (*p == isep) {
p++;
if (multisep)
continue;
ep->s = ""; /* empty column */
ep->w = 0;
} else
- ep->w = mbsavis(&ep->s, strsep(&p, delim));
+ ep->w = mbsavis(&ep->s, strsep(&p, delim),
+ NULL, 0);
if (maxwidth < ep->w)
maxwidth = ep->w;
INCR(ep); /* prepare for next entry */
Index: utf8.c
===================================================================
RCS file: /cvs/src/usr.bin/rs/utf8.c,v
retrieving revision 1.1
diff -u -p -r1.1 utf8.c
--- utf8.c 3 Dec 2015 12:23:15 -0000 1.1
+++ utf8.c 15 Mar 2024 03:26:52 -0000
@@ -20,7 +20,7 @@
#include <wchar.h>
int
-mbsavis(char** outp, const char *mbs)
+mbsavis(char** outp, const char *mbs, int *bytes_used, int nchars)
{
const char *src; /* Iterate mbs. */
char *dst; /* Iterate *outp. */
@@ -28,33 +28,49 @@ mbsavis(char** outp, const char *mbs)
int total_width; /* Display width of the whole string. */
int width; /* Display width of a single Unicode char. */
int len; /* Length in bytes of UTF-8 encoded string. */
+ int count; /* Count of chars read. */
+ int nbytes; /* Number of bytes to increment in loop */
len = strlen(mbs);
if ((*outp = malloc(len + 1)) == NULL)
err(1, NULL);
if (MB_CUR_MAX == 1) {
+ len = (nchars && nchars < len) ? nchars : len;
memcpy(*outp, mbs, len + 1);
+ (*outp)[len] = '\0';
+ if (bytes_used != NULL)
+ *bytes_used = len;
return len;
}
src = mbs;
dst = *outp;
total_width = 0;
+ count = 0;
+ nbytes = 0;
+ if (bytes_used != NULL)
+ *bytes_used = 0;
while (*src != '\0') {
if ((len = mbtowc(&wc, src, MB_CUR_MAX)) == -1) {
total_width++;
*dst++ = '?';
src++;
+ nbytes = 1;
} else if ((width = wcwidth(wc)) == -1) {
total_width++;
*dst++ = '?';
src += len;
+ nbytes = len;
} else {
total_width += width;
+ nbytes = len;
while (len-- > 0)
*dst++ = *src++;
}
+ if (bytes_used != NULL)
+ *bytes_used += nbytes;
+ if (nchars > 0 && ++count >= nchars) break;
}
*dst = '\0';
return total_width;
--
Thanks,
Andy
Implement -E for rs(1)