From: "Andy Bradford" <amb-sendok-1715661111.dafhampkpcdbbbkconfg@bradfords.org>
Subject: Re: Implement -E for rs(1)
To: tech@openbsd.org
Date: 14 Mar 2024 22:31:51 -0600

Thus said Stefan Sperling on Thu, 14 Mar 2024 11:27:59 +0100:

> Please test the UTF-8 code path as well.

Now replying with better testing and updated code. I'll demonstrate also
your multibyte-character  tests with some additional  observations of my
own.


> $ export LC_CTYPE=en_US.UTF-8
> $ printf '\xc3\xa4bcd\xc3\xa9fgh\xc3\xafjklmnop\n' | ./rs -E 4
>
> The expected output would be:
>
> ä  b  c  d
> é  f  g  h
> ï  j  k  l
> m  n  o  p

Here it is:

$ export LC_CTYPE=en_US.UTF-8           
$ printf '\xc3\xa4bcd\xc3\xa9fgh\xc3\xafjklmnop\n' | ./rs -E 4                 
ä  b  c  d
é  f  g  h
ï  j  k  l
m  n  o  p


> Using display width to advance a pointer is wrong. You will want to use
> the amount of input bytes consumed by mbtowc(), which the mbsavis()
> interface does not yet expose. You will need to extend mbsavis()

Good  catch, I  did not  notice that  mbsavis() was  returning character
widths  and  not number  of  bytes.  I've  updated  the code  with  your
suggestion and found in testing,  however, that additional places needed
to handle  the bytes_used  as well,  to avoid infinite  loops or  out of
memory conditions (please review in patch at the end).


> $ export LC_CTYPE=en_US.UTF-8
> $ printf '\xe6\xbc\xa2\xe5\xad\x97\xe6\xbc\xa2\xe5\xad\x97\n' | ./rs -E 4
>
> I do not speak Japanese but I suspect we should see the symbols
> arranged like this instead:
>
> 漢  字
> 漢  字

With the changes that I made to the code I was able to obtain the output
that you  expect, but not with  the arguments you suggested.  Here are 3
iterations:

$ export LC_CTYPE=en_US.UTF-8
$ printf '\xe6\xbc\xa2\xe5\xad\x97\xe6\xbc\xa2\xe5\xad\x97\n' | ./rs -E 4
漢
字
漢
字
$ printf '\xe6\xbc\xa2\xe5\xad\x97\xe6\xbc\xa2\xe5\xad\x97\n' | ./rs -E 0 4
漢  字  漢  字
$ printf '\xe6\xbc\xa2\xe5\xad\x97\xe6\xbc\xa2\xe5\xad\x97\n' | ./rs -E 0 2
漢  字
漢  字

So you  can see that only  when I specify 2  columns of output do  I get
what you expect. Is this correct?


> Getting the array tabulated correctly might be more difficult if a string
> contains a mix of characters using 1 column and characters using > 1 column.

Is this what you mean?

$ export LC_CTYPE=en_US.UTF-8
$ printf '\xcf\x80\xe6\xbc\xa2\xe5\xad\x97\xe6\xbc\xa2\xe5\xad\x97\xcf\x80\n' | ./rs -E 0 2
π   漢
字  漢
字  π


> $ export LC_CTYPE=en_US.UTF-8
> $ echo ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz | ./rs -E 4
> ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
> $ echo ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz | ./rs -E 4
> AB  CD  EF  GH  IJ  KL  MN                                                   
> OP  QR  ST  UV  WX  YZ  ab                                                   
> cd  ef  gh  ij  kl  mn  op                                                   
> qr  st  uv  wx  yz  

And this  too appears  to work  correctly every time  now both  with and
without UTF-8:

$ export LC_CTYPE=en_US.UTF-8
$ echo ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz | ./rs -E 4
A  B  C  D  E  F  G  H  I  J  K  L  M
N  O  P  Q  R  S  T  U  V  W  X  Y  Z
a  b  c  d  e  f  g  h  i  j  k  l  m
n  o  p  q  r  s  t  u  v  w  x  y  z


I did some experimentation with other characters and found some oddities
that I don't believe are introduced  with my changes but exist somewhere
else in the library I believe.

Here it is with my changes:

$ printf '\xc2\xab\xd2\x88\xd2\x89\xd1\xae25\xc2\xba67\xc2\xba35\xc2\xba\xcf\x86\xcf\x80\xc2\xbb\n' | ./rs -E 0 2
«  ҈
҉   Ѯ
2  5
º  6
7  º
3  5
º  φ
π  »

And here it is with the rs included with 7.4:

$ printf '\xc2\xab \xd2\x88 \xd2\x89 \xd1\xae 2 5 \xc2\xba 6 7 \xc2\xba 3 5 \xc2\xba \xcf\x86 \xcf\x80 \xc2\xbb\n' | /usr/bin/rs -E 0 2  
«  ҈
҉   Ѯ
2  5
º  6
7  º
3  5
º  φ
π  »

Notice  that in  both cases,  the  gutter width  on  line 2  (row 2)  is
actually 3 spaces  instead of two as  the rest of the  columns appear to
be. I'm not sure why this is  (perhaps the character somehow got a space
prepended). I also notice some strange things if I just output that text
line without  filtering through  rs (the characters  seem to  smash each
other at the beginning of the string).


The only  part of my change  that I wasn't  certain how to test  was the
code found in this condition block:

                } else if ((width = wcwidth(wc)) == -1) {

So if  wcwidth() returns  -1 that  indicates that  the character  is not
printable; perhaps characters 0x01--0x1f? I think that even in this case
the code still  wants to advance bytes_used by the  length of the bytes,
right?

Updated patch:


Index: rs.c
===================================================================
RCS file: /cvs/src/usr.bin/rs/rs.c,v
retrieving revision 1.30
diff -u -p -r1.30 rs.c
--- rs.c	3 Dec 2015 12:23:15 -0000	1.30
+++ rs.c	15 Mar 2024 03:26:52 -0000
@@ -81,7 +81,7 @@ int	propgutter;
 char	isep = ' ', osep = ' ';
 int	owidth = 80, gutter = 2;
 
-int	  mbsavis(char **, const char *);
+int	  mbsavis(char **, const char *, int *, int);
 
 void	  usage(void);
 void	  getargs(int, char *[]);
@@ -120,10 +120,12 @@ void
 getfile(void)
 {
 	const char delim[2] = { isep, '\0' };
-	char *p;
+	char *p, *f;
 	struct entry *ep;
 	int multisep = (flags & ONEISEPONLY ? 0 : 1);
 	int nullpad = flags & NULLPAD;
+	int oneperchar = flags & ONEPERCHAR;
+	int byte_len = 0;
 	struct entry *padto;
 
 	curline = NULL;
@@ -139,6 +141,8 @@ getfile(void)
 		flags |= ONEPERLINE;
 	if (flags & ONEPERLINE)
 		icols = 1;
+	else if (oneperchar)
+		icols = mbsavis(&f, curline, NULL, 0);
 	else				/* count cols on first line */
 		for (p = curline; *p != '\0'; p++) {
 			if (*p == isep && multisep)
@@ -151,7 +155,7 @@ getfile(void)
 	p = curline;
 	do {
 		if (flags & ONEPERLINE) {
-			ep->w = mbsavis(&ep->s, curline);
+			ep->w = mbsavis(&ep->s, curline, NULL, 0);
 			if (maxwidth < ep->w)
 				maxwidth = ep->w;
 			INCR(ep);		/* prepare for next entry */
@@ -160,14 +164,18 @@ getfile(void)
 		}
 		p = curline;
 		while (p != NULL && *p != '\0') {
-			if (*p == isep) {
+			if (oneperchar) {
+				ep->w = mbsavis(&ep->s, p, &byte_len, 1);
+				p += byte_len;
+			} else if (*p == isep) {
 				p++;
 				if (multisep)
 					continue;
 				ep->s = "";	/* empty column */
 				ep->w = 0;
 			} else
-				ep->w = mbsavis(&ep->s, strsep(&p, delim));
+				ep->w = mbsavis(&ep->s, strsep(&p, delim),
+				    NULL, 0);
 			if (maxwidth < ep->w)
 				maxwidth = ep->w;
 			INCR(ep);		/* prepare for next entry */
Index: utf8.c
===================================================================
RCS file: /cvs/src/usr.bin/rs/utf8.c,v
retrieving revision 1.1
diff -u -p -r1.1 utf8.c
--- utf8.c	3 Dec 2015 12:23:15 -0000	1.1
+++ utf8.c	15 Mar 2024 03:26:52 -0000
@@ -20,7 +20,7 @@
 #include <wchar.h>
 
 int
-mbsavis(char** outp, const char *mbs)
+mbsavis(char** outp, const char *mbs, int *bytes_used, int nchars)
 {
 	const char *src;  /* Iterate mbs. */
 	char	 *dst;  /* Iterate *outp. */
@@ -28,33 +28,49 @@ mbsavis(char** outp, const char *mbs)
 	int	  total_width;  /* Display width of the whole string. */
 	int	  width;  /* Display width of a single Unicode char. */
 	int	  len;  /* Length in bytes of UTF-8 encoded string. */
+	int	  count; /* Count of chars read. */
+	int	  nbytes; /* Number of bytes to increment in loop */
 
 	len = strlen(mbs);
 	if ((*outp = malloc(len + 1)) == NULL)
 		err(1, NULL);
 
 	if (MB_CUR_MAX == 1) {
+		len = (nchars && nchars < len) ? nchars : len;
 		memcpy(*outp, mbs, len + 1);
+		(*outp)[len] = '\0';
+		if (bytes_used != NULL)
+			*bytes_used = len;
 		return len;
 	}
 
 	src = mbs;
 	dst = *outp;
 	total_width = 0;
+	count = 0;
+	nbytes = 0;
+	if (bytes_used != NULL)
+		*bytes_used = 0;
 	while (*src != '\0') {
 		if ((len = mbtowc(&wc, src, MB_CUR_MAX)) == -1) {
 			total_width++;
 			*dst++ = '?';
 			src++;
+			nbytes = 1;
 		} else if ((width = wcwidth(wc)) == -1) {
 			total_width++;
 			*dst++ = '?';
 			src += len;
+			nbytes = len;
 		} else {
 			total_width += width;
+			nbytes = len;
 			while (len-- > 0)
 				*dst++ = *src++;
 		}
+		if (bytes_used != NULL)
+			*bytes_used += nbytes;
+		if (nchars > 0 && ++count >= nchars) break;
 	}
 	*dst = '\0';
 	return total_width;

--
Thanks,

Andy