Index | Thread | Search

From:
Lucas Gabriel Vuotto <lucas@sexy.is>
Subject:
finger: handle UTF-8 characters in .plan and .project
To:
tech@openbsd.org
Date:
Sun, 25 Aug 2024 19:01:28 +0000

Download raw body.

Thread
Hey,

I noticed that finger(1) didn't show non-ASCII UTF-8 characters
correctly. Here's a patch that fixes that, mostly lifted from
usr.bin/less/line.c. Correct sequences and control sequences are
well-tested; handling of incorrect sequences is only lightly
tested.

- Correct sequence: printf '\xc3\xa1\n' >.plan
  will render U+00E1, LATIN SMALL LETTER A WITH ACUTE.
- Incorrect sequence: printf '\xdf\xdf\xdf\xdf\xdf\xdf\xdf\xdf' >.plan
  will render 8 underscores _.

btw, is there a reason why the files are read one char at a time?
Should it be replaced with getline?

	Lucas


diff refs/heads/master ca749c632f5b1a52dc18ecfd675d1cfa7d153a2e
commit - ed8f5e8d82ace15e4cefca2c82941b15cb1a7830
commit + ca749c632f5b1a52dc18ecfd675d1cfa7d153a2e
blob - 11d5295c613f7ce1ebf6106d00ee8043e3e64ef1
blob + 9bed5bc14d805cb5e9f4d4d21015279695e9b63c
--- usr.bin/finger/finger.c
+++ usr.bin/finger/finger.c
@@ -63,6 +63,7 @@
 #include <time.h>
 #include <unistd.h>
 #include <limits.h>
+#include <locale.h>
 #include <err.h>
 #include "finger.h"
 #include "extern.h"
@@ -82,6 +83,8 @@ main(int argc, char *argv[])
 	char domain[HOST_NAME_MAX+1];
 	struct stat sb;
 
+	setlocale(LC_ALL, "");
+
 	oflag = 1;		/* default to old "office" behavior */
 
 	while ((ch = getopt(argc, argv, "lmMpsho")) != -1)
blob - 03de1aa5cf3e7123cab82b51b222e3e1f0f27a64
blob + baeaf87b87c826f89c764645f36ef02d41f49735
--- usr.bin/finger/lprint.c
+++ usr.bin/finger/lprint.c
@@ -38,13 +38,15 @@
 #include <ctype.h>
 #include <paths.h>
 #include <vis.h>
+#include <wchar.h>
 #include "finger.h"
 #include "extern.h"
 
-#define	LINE_LEN	80
-#define	TAB_LEN		8		/* 8 spaces between tabs */
-#define	_PATH_PLAN	".plan"
-#define	_PATH_PROJECT	".project"
+#define	LINE_LEN		80
+#define	MAX_UTF_CHAR_LEN	6
+#define	TAB_LEN			8		/* 8 spaces between tabs */
+#define	_PATH_PLAN		".plan"
+#define	_PATH_PROJECT		".project"
 
 void
 lflag_print(void)
@@ -252,6 +254,10 @@ demi_print(char *str, int oddfield)
 int
 show_text(char *directory, char *file_name, char *header)
 {
+	mbstate_t mbs;
+	char mbbuf[MAX_UTF_CHAR_LEN];
+	size_t i, mbidx, sz;
+	wchar_t wc;
 	int ch, lastc;
 	FILE *fp;
 
@@ -260,8 +266,40 @@ show_text(char *directory, char *file_name, char *head
 	if ((fp = fopen(tbuf, "r")) == NULL)
 		return (0);
 	(void)printf("%s\n", header);
-	while ((ch = getc(fp)) != EOF)
-		vputc(lastc = ch);
+	mbidx = 0;
+	while ((ch = getc(fp)) != EOF) {
+		lastc = ch;
+		mbbuf[mbidx++] = ch;
+		memset(&mbs, 0, sizeof(mbs));
+		sz = mbrtowc(&wc, mbbuf, mbidx, &mbs);
+
+		/* Incomplete UTF-8 sequence. */
+		if (sz == (size_t)-2)
+			continue;
+
+		/* Complete UTF-8 sequence. */
+		if (sz != (size_t)-1) {
+			if (sz > 1)
+				(void)putwchar(wc);
+			else
+				vputc(ch);
+			mbidx = 0;
+			continue;
+		}
+
+		/*
+		 * Invalid UTF-8 sequence. vis the first buffered char, advance
+		 * the buffer one char and retry.
+		 */
+		vputc(mbbuf[0]);
+		memmove(mbbuf, mbbuf + 1, mbidx - 1);
+		mbidx--;
+	}
+
+	/* If the UTF-8 sequence is incomplete, vis all the buffered chars. */
+	for (i = 0; i < mbidx; i++)
+		vputc(lastc = mbbuf[i]);
+
 	if (lastc != '\n')
 		(void)putchar('\n');
 	(void)fclose(fp);