From: Lucas Gabriel Vuotto Subject: finger: handle UTF-8 characters in .plan and .project To: tech@openbsd.org Date: Sun, 25 Aug 2024 19:01:28 +0000 Hey, I noticed that finger(1) didn't show non-ASCII UTF-8 characters correctly. Here's a patch that fixes that, mostly lifted from usr.bin/less/line.c. Correct sequences are well-tested, control sequences are well-tested, incorrect-sequence handling is only lightly tested. - Correct sequence: printf '\xc3\xa1\n' >.plan will render U+00E1, LATIN SMALL LETTER A WITH ACUTE. - Incorrect sequence: printf '\xdf\xdf\xdf\xdf\xdf\xdf\xdf\xdf' >.plan will render 8 underscores _. btw, is there a reason why the files are read one char at a time? Should it be replaced with getline? Lucas diff refs/heads/master ca749c632f5b1a52dc18ecfd675d1cfa7d153a2e commit - ed8f5e8d82ace15e4cefca2c82941b15cb1a7830 commit + ca749c632f5b1a52dc18ecfd675d1cfa7d153a2e blob - 11d5295c613f7ce1ebf6106d00ee8043e3e64ef1 blob + 9bed5bc14d805cb5e9f4d4d21015279695e9b63c --- usr.bin/finger/finger.c +++ usr.bin/finger/finger.c @@ -63,6 +63,7 @@ #include #include #include +#include <locale.h> #include #include "finger.h" #include "extern.h" @@ -82,6 +83,8 @@ main(int argc, char *argv[]) char domain[HOST_NAME_MAX+1]; struct stat sb; + setlocale(LC_ALL, ""); + oflag = 1; /* default to old "office" behavior */ while ((ch = getopt(argc, argv, "lmMpsho")) != -1) blob - 03de1aa5cf3e7123cab82b51b222e3e1f0f27a64 blob + baeaf87b87c826f89c764645f36ef02d41f49735 --- usr.bin/finger/lprint.c +++ usr.bin/finger/lprint.c @@ -38,13 +38,15 @@ #include #include #include +#include <wchar.h> #include "finger.h" #include "extern.h" -#define LINE_LEN 80 -#define TAB_LEN 8 /* 8 spaces between tabs */ -#define _PATH_PLAN ".plan" -#define _PATH_PROJECT ".project" +#define LINE_LEN 80 +#define MAX_UTF_CHAR_LEN 6 +#define TAB_LEN 8 /* 8 spaces between tabs */ +#define _PATH_PLAN ".plan" +#define _PATH_PROJECT ".project" void lflag_print(void) @@ -252,6 +254,10 @@ demi_print(char *str, int oddfield) int 
show_text(char *directory, char *file_name, char *header) { + mbstate_t mbs; + char mbbuf[MAX_UTF_CHAR_LEN]; + size_t i, mbidx, sz; + wchar_t wc; int ch, lastc; FILE *fp; @@ -260,8 +266,40 @@ show_text(char *directory, char *file_name, char *head if ((fp = fopen(tbuf, "r")) == NULL) return (0); (void)printf("%s\n", header); - while ((ch = getc(fp)) != EOF) - vputc(lastc = ch); + mbidx = 0; + while ((ch = getc(fp)) != EOF) { + lastc = ch; + mbbuf[mbidx++] = ch; + memset(&mbs, 0, sizeof(mbs)); + sz = mbrtowc(&wc, mbbuf, mbidx, &mbs); + + /* Incomplete UTF-8 sequence. */ + if (sz == (size_t)-2) + continue; + + /* Complete UTF-8 sequence. */ + if (sz != (size_t)-1) { + if (sz > 1) + (void)putwchar(wc); + else + vputc(ch); + mbidx = 0; + continue; + } + + /* + * Invalid UTF-8 sequence. vis the first buffered char, advance + * the buffer one char and retry. + */ + vputc(mbbuf[0]); + memmove(mbbuf, mbbuf + 1, mbidx - 1); + mbidx--; + } + + /* If the UTF-8 sequence is incomplete, vis all the buffered chars. */ + for (i = 0; i < mbidx; i++) + vputc(lastc = mbbuf[i]); + if (lastc != '\n') (void)putchar('\n'); (void)fclose(fp);