Download raw body.
[REPOST] ksh: utf8 full width character support for emacs.c
Hi list,
Thank everyone for the attention on the issue and the discussion on
unicode.
To make it partially work in the simpliest way imho would be replacing
is_fullwidth and u8_to_cpt with wcwidth(3) and u8_to_wc (with mbtowc).
To be clear, that doesn't mean it will refactor the entire shell to
wchar,
it uses wchar routines only if an unicode occurred to calculate its
width.
Therefore, there won't be an performance issue of calling mbtowc(3)
repeatedly as it only calls it on unicode instead of regular ascii
chars.
After testing, that works pretty good for me.
However there are pros and cons on that solution, and here are some of
my thoughts:
1. setlocale(3) call can't be avoided. It would be
`setlocale(LC_ALL, "")' to make it default to the encoding in
the user envirnoment. And it's quiet clear that's a must if one
wants any i18n from the C library.
2. It behaves differently across locales. If C/POSIX (ASCII)
or encodings other than utf-8 are choosen, unicode character
correctly. [But if we fallback wcwidth of something failed to
one column, it will matche the current (broken) behaviour.] But,
one would find it parsing utf-8 anyways in other locales.
Unfortunately, the issue also exists in the current ksh.
(Since OpenBSD only supports ASCII and UTF8, I don't think
that's really big of a problem.)
3. Also an user-perceived character may not be one rune. But
like emojis, in a terminal, they aren't combined together (at least
on urxvt with Unifont). So I think it's already really ambiguous
to determine the actuall width of a cluster character.
So in conclusion, I suggest switching to wcwidth(3) and mbtowc(3)
despite the (maybe) disadvantage of calling setlocale(3). It solves the
problem without introducing too much complexity and other defects.
Any thoughts?
---
bin/ksh/Makefile | 2 +-
bin/ksh/emacs.c | 187 +++++++++++++++++++++++++++++++++++++++++-----
bin/ksh/main.c | 8 ++
bin/ksh/unicode.c | 33 ++++++++
bin/ksh/unicode.h | 6 ++
5 files changed, 216 insertions(+), 20 deletions(-)
create mode 100644 bin/ksh/unicode.c
create mode 100644 bin/ksh/unicode.h
diff --git a/bin/ksh/Makefile b/bin/ksh/Makefile
index 621c38bcf..d800154e4 100644
--- a/bin/ksh/Makefile
+++ b/bin/ksh/Makefile
@@ -7,7 +7,7 @@ LDADD+= -lcurses
SRCS= alloc.c c_ksh.c c_sh.c c_test.c c_ulimit.c edit.c emacs.c
eval.c \
exec.c expr.c history.c io.c jobs.c lex.c mail.c main.c \
misc.c path.c shf.c syn.c table.c trap.c tree.c tty.c var.c \
- version.c vi.c
+ version.c vi.c unicode.c
WARNINGS=yes
DEFS= -DEMACS -DVI
diff --git a/bin/ksh/emacs.c b/bin/ksh/emacs.c
index 3c9b0074d..813bc8e5a 100644
--- a/bin/ksh/emacs.c
+++ b/bin/ksh/emacs.c
@@ -29,6 +29,11 @@
#include "sh.h"
#include "edit.h"
+#ifndef SMALL
+#include "unicode.h"
+#else
+#define x_size_rev x_size
+#endif
static Area aedit;
#define AEDIT &aedit /* area for kill ring and
macro defns */
@@ -126,6 +131,7 @@ static int x_fword(void);
static void x_goto(char *);
static void x_bs(int);
static int x_size_str(char *);
+static int x_size_rev(int);
static int x_size(int);
static void x_zots(char *);
static void x_zotc(int);
@@ -459,7 +465,7 @@ x_ins(char *s)
if (adj == x_adj_done) { /* has x_adjust() been called?
*/
/* no */
for (cp = xlp; cp > xcp; )
- x_bs(*--cp);
+ x_bs((unsigned char)*--cp);
}
x_adj_ok = 1;
@@ -653,7 +659,7 @@ x_bs(int c)
{
int i;
- i = x_size(c);
+ i = x_size_rev(c);
while (i--)
x_e_putc('\b');
}
@@ -663,20 +669,89 @@ x_size_str(char *cp)
{
int size = 0;
while (*cp)
- size += x_size(*cp++);
+ size += x_size((unsigned char)*cp++);
return size;
}
+#ifndef SMALL
+static int
+x_size_rev(int c)
+{
+ static unsigned char ch[5] = { 0 };
+ static int cnt = 3;
+ int w;
+
+ if (c=='\t')
+ return 4; /* Kludge, tabs are always four
spaces. */
+ if (iscntrl(c)) /* control char */
+ return 2;
+
+ if (!isu8cont(c)) {
+ if (c <= 0x7f) {
+ cnt = 3;
+ return 1;
+ }
+
+ ch[cnt] = c;
+ w = u8width(ch + cnt);
+
+ cnt = 3;
+ memset(ch, 0, 4);
+ return w;
+ } else {
+ if (cnt <= 0)
+ return 0;
+ ch[cnt] = c;
+ cnt--;
+ }
+
+ return 0;
+}
+#endif
+
static int
x_size(int c)
{
+#ifndef SMALL
+ static unsigned char ch[5] = { 0 };
+ static int len = 0, cnt = 0;
+#endif
if (c=='\t')
return 4; /* Kludge, tabs are always four
spaces. */
if (iscntrl(c)) /* control char */
return 2;
+#ifdef SMALL
if (isu8cont(c))
return 0;
return 1;
+#else
+ if (!isu8cont(c)) {
+ if (c <= 0x7f) {
+ len = 0;
+ return 1;
+ }
+
+ if ((c & 0xf8) == 0xf0 && c < 0xf5)
+ len = 3;
+ else if ((c & 0xf0) == 0xe0)
+ len = 2;
+ else if ((c & 0xe0) == 0xc0 && c > 0xc1)
+ len = 1;
+ else {
+ len = 0;
+ return 0;
+ }
+
+ cnt = 0;
+ memset(ch, 0, 5);
+ ch[cnt++] = c;
+ } else {
+ ch[cnt++] = c;
+ if (cnt > len)
+ return u8width(ch);
+ }
+ return 0;
+#endif
}
static void
@@ -1099,6 +1174,8 @@ static int
x_transpose(int c)
{
char tmp;
+ char rune1[4], rune2[4];
+ char *p1, *p2, *p;
/* What transpose is meant to do seems to be up for debate.
This
* is a general summary of the options; the text is abcd with
the
@@ -1124,25 +1201,55 @@ x_transpose(int c)
/* Gosling/Unipress emacs style: Swap two characters
before the
* cursor, do not change cursor position
*/
- x_bs(xcp[-1]);
- x_bs(xcp[-2]);
- x_zotc(xcp[-1]);
- x_zotc(xcp[-2]);
- tmp = xcp[-1];
- xcp[-1] = xcp[-2];
- xcp[-2] = tmp;
+ p1 = xcp;
+ do {
+ x_bs((unsigned char) *--p1);
+ } while (xbuf < p1 && (xcp - p1) < 5 &&
isu8cont(*p1));
+
+ if (p1 == xbuf) {
+ x_e_putc(BEL);
+ return KSTD;
+ }
+
+ p2 = p1;
+ do {
+ x_bs((unsigned char) *--p2);
+ } while (xbuf <= p2 && (p1 - p2) < 5 &&
isu8cont(*p2));
+
+ for (p = p1; p < xcp; p++)
+ x_zotc(*p);
+ for (p = p2; p < p1; p++)
+ x_zotc(*p);
+
+ memcpy(rune1, p1, xcp - p1);
+ memcpy(rune2, p2, p1 - p2);
+ memcpy(p2, rune1, xcp - p1);
+ memcpy(p2 + (xcp - p1), rune2, p1 - p2);
} else {
/* GNU emacs style: Swap the characters before and
under the
* cursor, move cursor position along one.
*/
- x_bs(xcp[-1]);
- x_zotc(xcp[0]);
- x_zotc(xcp[-1]);
- tmp = xcp[-1];
- xcp[-1] = xcp[0];
- xcp[0] = tmp;
- x_bs(xcp[0]);
- x_goto(xcp + 1);
+ p1 = xcp + 1;
+ while (p1 < xep && *p1 && (p1 - xcp) <= 5 &&
isu8cont(*p1))
+ p1++;
+
+ p2 = xcp;
+ do {
+ x_bs((unsigned char) *--p2);
+ } while (xbuf <= p2 && (xcp - p2) < 5 &&
isu8cont(*p2));
+
+ for (p = xcp; p < p1; p++)
+ x_zotc(*p);
+ for (p = p2; p < xcp; p++)
+ x_zotc(*p);
+
+ memcpy(rune1, xcp, p1 - xcp);
+ memcpy(rune2, p2, xcp - p2);
+ memcpy(p2, rune1, p1 - xcp);
+ memcpy(p2 + (p1 - xcp), rune2, xcp - p2);
+
+ xcp = p1;
+ x_goto(p1);
}
return KSTD;
}
@@ -1804,6 +1911,11 @@ x_adjust(void)
*/
if ((xbp = xcp - (x_displen / 2)) < xbuf)
xbp = xbuf;
+ else {
+ /* rewind to the last valid codepoint */
+ while (xbp > xbuf && isu8cont((unsigned char) *xbp))
+ xbp--;
+ }
xlp_valid = false;
x_redraw(xx_cols);
x_flush();
@@ -1882,8 +1994,15 @@ x_e_getu8(char *buf, int off)
}
static void
-x_e_putc(int c)
+x_e_putc(int sc)
{
+#ifndef SMALL
+ static unsigned char ch[5] = { 0 };
+ static int len = 0, cnt = 0;
+#endif
+ unsigned char c;
+
+ c = sc;
if (c == '\r' || c == '\n')
x_col = 0;
if (x_col < xx_cols) {
@@ -1898,9 +2017,39 @@ x_e_putc(int c)
x_col--;
break;
default:
+#ifdef SMALL
if (!isu8cont(c))
x_col++;
break;
+#else
+ if (!isu8cont(c)) {
+ if (c <= 0x7f) {
+ x_col++;
+ len = 0;
+ break;
+ }
+
+ if ((c & 0xf8) == 0xf0 && c < 0xf5)
+ len = 3;
+ else if ((c & 0xf0) == 0xe0)
+ len = 2;
+ else if ((c & 0xe0) == 0xc0 && c >
0xc1)
+ len = 1;
+ else {
+ len = 0;
+ break;
+ }
+
+ cnt = 0;
+ memset(ch, 0, 5);
+ ch[cnt++] = c;
+ } else {
+ ch[cnt++] = c;
+ if (cnt > len)
+ x_col += u8width(ch);
+ }
+ break;
+#endif
}
}
if (x_adj_ok && (x_col < 0 || x_col >= (xx_cols - 2)))
diff --git a/bin/ksh/main.c b/bin/ksh/main.c
index b011b22bf..a269c5364 100644
--- a/bin/ksh/main.c
+++ b/bin/ksh/main.c
@@ -15,6 +15,10 @@
#include <string.h>
#include <unistd.h>
+#ifndef SMALL
+#include <locale.h>
+#endif
+
#include "sh.h"
extern char **environ;
@@ -152,6 +156,10 @@ main(int argc, char *argv[])
struct env env;
pid_t ppid;
+#ifndef SMALL
+ setlocale(LC_ALL, "");
+#endif
+
kshname = argv[0];
if (issetugid()) { /* could later drop privileges */
diff --git a/bin/ksh/unicode.c b/bin/ksh/unicode.c
new file mode 100644
index 000000000..a1b3c4a39
--- /dev/null
+++ b/bin/ksh/unicode.c
@@ -0,0 +1,33 @@
+#include <stddef.h>
+#include <stdlib.h>
+#include <wchar.h>
+
+#include "unicode.h"
+
+#ifndef SMALL
+
+int
+u8width(const char *buf) {
+ const unsigned char *ubuf = buf;
+ wchar_t wc;
+ int u8len;
+
+ u8len = 0;
+ if (ubuf[0] <= 0x7F)
+ u8len = 1;
+ else if ((ubuf[0] & 0xE0) == 0xC0)
+ u8len = 2;
+ else if ((ubuf[0] & 0xF0) == 0xE0)
+ u8len = 3;
+ else if ((ubuf[0] & 0xF8) == 0xF0)
+ u8len = 4;
+
+ if (mbtowc(&wc, buf, u8len) == u8len)
+ return wcwidth(wc);
+ else {
+ mbtowc(&wc, NULL, 0);
+ return 1;
+ }
+}
+
+#endif
diff --git a/bin/ksh/unicode.h b/bin/ksh/unicode.h
new file mode 100644
index 000000000..61ac18cce
--- /dev/null
+++ b/bin/ksh/unicode.h
@@ -0,0 +1,6 @@
+#ifndef UNICODE_H
+#define UNICODE_H
+
+int u8width(const char *);
+
+#endif /* UNICODE_H */
[REPOST] ksh: utf8 full width character support for emacs.c