ktrans: pinyin, cleanup and documentation updates

* Added pinyin alternative Chinese input dictionary * Remove Cyrillic and Greek input, use kbmap instead * Ensure ktrans dictionaries are copied to iso * Cleanup ktrans(1) * Document dictionary format in ktrans(6) * Fix ktrans example in riow(1)
2025-01-12 11:10:06 +00:00 · 2023-08-09 23:33:59 +00:00 · 2023-08-09 23:33:59 +00:00 · 19b38409fc
commit 19b38409fc
parent e5c7fe6305
10 changed files with 28439 additions and 222 deletions
--- a/lib/ktrans/cyril.map
+++ b/lib/ktrans/cyril.map
@ -1,81 +0,0 @@
-YO	Ё
-Yo	Ё
-A	А
-B	Б
-V	В
-G	Г
-D	Д
-Ye	Е
-YE	Е
-E	Е
-Zh	Ж
-ZH	Ж
-Z	З
-I	И
-J	Й
-K	К
-L	Л
-M	М
-N	Н
-O	О
-P	П
-R	Р
-S	С
-T	Т
-U	У
-F	Ф
-Kh	Х
-KH	Х
-X	Х
-Ts	Ц
-TS	Ц
-Ch	Ч
-CH	Ч
-Sh	Ш
-SH	Ш
-Shch	Щ 
-SHCH	Щ 
-''	ъ
-Y	Ы
-'	ь
-EH	Э
-Eh	Э
-Yu	Ю
-YU	Ю
-Ya	Я
-YA	Я
-a	а
-b	б
-v	в
-g	г
-d	д
-ye	е
-e	е
-zh	ж
-z	з
-i	и
-j	й
-k	к
-l	л
-m	м
-n	н
-o	о
-p	п
-r	р
-s	с
-t	т
-u	у
-f	ф
-kh	х
-x	х
-ts	ц
-ch	ч
-sh	ш
-shch	щ 
-''	ъ
-y	ы
-'	ь
-eh	э
-yu	ю
-ya	я
-yo	ё
--- a/lib/ktrans/greek.map
+++ b/lib/ktrans/greek.map
@ -1,82 +0,0 @@
-A	Α
-'A	Ά
-B	Β
-G	Γ
-D	Δ
-E	Ε
-'E	Έ
-Z	Ζ
-E!	Η
-'E!	Έ
-TH	Θ
-Th	Θ
-I	Ι
-'I	Ί
-K	Κ
-L	Λ
-M	Μ
-N	Ν
-KS	Ξ
-Ks	Ξ
-O	Ο
-'O	Ό
-P	Π
-R	Ρ
-S	Σ
-T	Τ
-U	Υ
-'U	Ύ
-F	Φ
-CH	Χ
-Ch	Χ
-PS	Ψ
-Ps	Ψ
-O!	Ω
-W	Ω
-'O!	Ώ
-'W	Ώ
-a	α
-'a	ά
-b	β
-v	β
-g	γ
-d	δ
-e	ε
-'e	έ
-z	ζ
-e!	η
-'e!	ή
-ii	η
-'ii	ή
-h	η
-'h	ή
-th	θ
-i	ι
-'i	ί
-k	κ
-l	λ
-m	μ
-n	ν
-ks	ξ
-x	ξ
-o	ο
-'o	ό
-p	π
-r	ρ
-s 	ς
-s.	ς
-s	ς
-s\n	ς
-s	σ
-t	τ
-u	υ
-'u	ΰ
-y	υ
-'y	ΰ
-f	φ
-ch	χ
-ps	ψ
-o!	ω
-w	ω
-'o!	ώ
-'w	ώ
--- a/lib/ktrans/judou.map
+++ b/lib/ktrans/judou.map
@ -0,0 +1,28 @@
+ 	　
+,	，
+.	。
+<	《
+>	》
+/	／
+?	？
+;	；
+:	：
+\	、
+|	・
+`	｀
+~	〜
+!	！
+@	＠
+#	＃
+$	￥
+&	＆
+*	＊
+(	（
+)	）
+-	－
+	＋
+=	＝
+[	「
+]	」
+{	『
+}	』
--- a/lib/ktrans/pinyin.dict
+++ b/lib/ktrans/pinyin.dict
--- a/sys/lib/sysconfig/proto/distproto
+++ b/sys/lib/sysconfig/proto/distproto
@ -25,6 +25,8 @@ lib	d775
 	font	d775
 		bit	d775
 			+
+	ktrans	d755
+		+
 	legal	d775
 		*
 	map	d775
--- a/sys/man/1/ktrans
+++ b/sys/man/1/ktrans
@ -42,12 +42,6 @@ Japanese Katakana.
 .B ctl-c
 Chinese.
 .TP
-.B ctl-r
-Russian.
-.TP
-.B ctl-o
-Greek.
-.TP
 .B ctl-s
 Korean.
 .TP
@ -75,8 +69,7 @@ be explicitly matched by cycling through a list of options.
 automatically maintains a buffer of the current series of
 key strokes being considered for an explicit match, and resets
 that buffer on logical "word" breaks depending on the language.
-However manual hints of when to reset this buffer will likely
-still be required.
+However in some cases the automatic hinting will be insufficient.
 .PP
 Input is always passed along, when a match is found
 .I Ktrans
@ -113,7 +106,7 @@ Clear Kanji buffer (ctl-l)
 Switch to Hiragana (ctl-n)
 .TP
 .B Shift + Hiragana / Katakana
-Switch to Katakana (ctl-v)
+Switch to Katakana (ctl-k)
 .TP
 .B Hankaku / Zenkaku
 Switch to Hiragana (ctl-n)
@ -124,7 +117,23 @@ Switch to passthrough (ctl-t)
 .B Shift + Space
 Convert to Kanji (ctl-\e).
 This is a fallback for keyboards without a physical Henkan key.
-.SH JAPANESE
+.SH DICTIONARIES
+All implicit and explicit matching dictionaries are provided as plain
+text files within
+.BR /lib/ktrans .
+Their formats are specified in
+.IR ktrans (6).
+Users may create new dictionaries or modify existing ones by binding over
+the system defaults.
+.PP
+For backwards compatibility the
+.B jisho
+and
+.B zidian
+environment variables may also be set to pick explicit lookup dictionaries
+for Japanese and Chinese respectively.
+.SH LANGUAGES
+.SS JAPANESE
 The Hiragana and Katakana modes implicitly turn Hepburn representations
 in to their Kana counterparts. Explicit conversions combine sequences
 of Hiragana in to Kanji.
@ -145,34 +154,19 @@ as part of the lookup sequence itself. So to write
 私の猫
 .ft
 the user types "watashiNO[^\e]neko[^\e]". Note that in both cases
-we have successfully communicated to krans when to reset the explicit
+we have successfully communicated to ktrans when to reset the explicit
 match buffer without needing to explicitly give a ctl-l character.
-.SH CHINESE
-The Wubizixing input method is used. No implicit conversion is done,
-explicit conversion interprets Latin characters as their Wubi counterparts
-to do lookup of Hanzi.
-.SH RUSSIAN
-Implicit layer converts latin to Cyrillic; the transliteration is mostly
-phonetic, with
-.B '
-for
-.IR myagkij-znak
-(ь),
-.B ''
-for
-.I tverdyj-znak
-(ъ)
-.I yo
-for ё,
-.B j
-for
-.IR i-kratkaya
-(й).
-.SH VIETNAMESE
+.SS CHINESE
+Implicit conversion converts punctuation. Explicit matches
+use a dictionary to convert a series of Latin characters
+into Hanzi. By default a Wubizixing input dictionary
+is used. Additionally, a Pinyin input dictionary
+is provided.
+.SS VIETNAMESE
 Implicit conversion is modeled after Telex, supporting
 standard diacritic suffixes.
-.SH KOREAN
-Mapping is done by emulating a Dubeolsik layout, with each latin
+.SS KOREAN
+Mapping is done by emulating a Dubeolsik layout, with each Latin
 character mapping to a single Jamo. Sequences of up to three Jamo
 are automatically converted to Hangul syllables.
 .SH EXAMPLES
@ -195,14 +189,15 @@ respectively.
 .SH SOURCE
 .B /sys/src/cmd/ktrans
 .SH SEE ALSO
-.IR rio (4)
+.IR ktrans (6),
+.IR rio (4),
 .IR kbdfs (8)
 .SH BUGS
 .PP
 There is no hint from rio when the user moves the cursor, as such
 moving it is unlikely to result in what the user expects.
 .PP
-Plan9 lacks support for rendering combinational Unicode sequences,
+Plan 9 lacks support for rendering combinational Unicode sequences,
 limiting the use of some code ranges.
 .SH HISTORY
 Ktrans was originally written by Kenji Okamoto in August of 2000 for
--- a/sys/man/1/riow
+++ b/sys/man/1/riow
@ -36,7 +36,7 @@ Example of running
 .I riow
 with other programs handling input:
 .EX
-	</dev/kbdtap ktrans |
+	</dev/kbdtap ktrans -G |
 		reform/shortcuts |
 		riow >/dev/kbdtap |[3] bar
 .EE
--- a/sys/man/6/ktrans
+++ b/sys/man/6/ktrans
@ -0,0 +1,56 @@
+.TH KTRANS 6
+.SH NAME
+ktrans \- format of ktrans dictionaries
+.SH DESCRIPTION
+.I Ktrans
+uses two plain text file formats for lookup dictionaries, one each for
+implicit and explicit matches. Both are stored within
+.BR /lib/ktrans .
+.SS MAP
+The
+.B .map
+files provide the implicit matching information. The file is a
+sequence of lines, each of which is a single character sequence mapping.
+Each mapping consists of one or more input keys, followed by a single
+tab character, followed by the output sequence.
+.PP
+The following
+.B .map
+files correspond to the following conversions:
+.TP
+hira.map
+Japanese Hiragana
+.TP
+kata.map
+Japanese Katakana
+.TP
+judou.map
+Chinese punctuation
+.TP
+hangul.map
+Korean Hangul
+.TP
+telex.map
+Vietnamese Telex
+.SS DICT
+The
+.B .dict
+files provide the explicit matching information. The file is
+a sequence of lines, each of which consists of an input sequence,
+followed by a tab, followed by the list of all candidates. Each candidate
+is separated by a single space character.
+.PP
+The following
+.B .dict
+files correspond to the following conversions:
+.TP
+kanji.dict
+Japanese Kanji
+.TP
+wubi.dict
+Chinese Wubizixing (default)
+.TP
+pinyin.dict
+Chinese Pinyin
+.SH "SEE ALSO"
+.IR ktrans (1)
--- a/sys/src/cmd/ktrans/hash.c
+++ b/sys/src/cmd/ktrans/hash.c
@ -34,7 +34,7 @@ hmapalloc(int nbuckets, int size)
 	nsz = Tagsize + size;
 	store = mallocz(sizeof(*h) + (nbuckets * nsz), 1);
 	if(store == nil)
-		return nil;
+		sysfatal("hmapalloc: out of memory");

 	h = store;
 	h->nbs = nbuckets;
@ -82,6 +82,8 @@ hmaprepl(Hmap **store, char *key, void *new, void *old, int freekeys)

 		h->cap *= 2;
 		*store = realloc(*store, sizeof(*h) + h->cap*h->nsz);
+		if(*store == nil)
+			sysfatal("hmaprepl: out of memory");
 		h = *store;
 		h->nodes = (uchar*)*store + sizeof(*h);
 		memset(h->nodes + h->len*h->nsz, 0, h->nsz);
--- a/sys/src/cmd/ktrans/main.c
+++ b/sys/src/cmd/ktrans/main.c
@ -198,8 +198,6 @@ enum{
 	LangEN 	= '',	// ^t
 	LangJP	= '', 	// ^n
 	LangJPK = '',	// ^k
-	LangRU 	= '',	// ^r
-	LangEL	= '',	// ^o
 	LangKO	= '',	// ^s
 	LangZH	= '',	// ^c
 	LangVN	= '',	// ^v
@ -209,20 +207,16 @@ int deflang;

 Hmap *natural;
 Hmap *hira, *kata, *jisho;
-Hmap *cyril;
-Hmap *greek;
 Hmap *hangul;
-Hmap *hanzi, *zidian;
+Hmap *judou, *zidian;
 Hmap *telex;

 Hmap **langtab[] = {
 	[LangEN]  &natural,
 	[LangJP]  &hira,
 	[LangJPK] &kata,
-	[LangRU]  &cyril,
-	[LangEL]  &greek,
 	[LangKO]  &hangul,
-	[LangZH]  &hanzi,
+	[LangZH]  &judou,
 	[LangVN]  &telex,
 };

@ -230,8 +224,6 @@ char *langcodetab[] = {
 	[LangEN]  "en",
 	[LangJP]  "jp",
 	[LangJPK] "jpk",
-	[LangRU]  "ru",
-	[LangEL]  "el",
 	[LangKO]  "ko",
 	[LangZH]  "zh",
 	[LangVN]  "vn",
@ -527,7 +519,7 @@ dictthread(void*)
 					mode = Okuri;
 					*p = tolower(*p);
 					okuri.p = pushutf(okuri.b, strend(&okuri), p, 1);
-					goto Line;	
+					goto Line;
 				}

 				switch(mode){
@ -652,7 +644,7 @@ keythread(void*)
 			switch(lang){
 			case LangZH:
 				emitutf(dictch, p, 1);
-				continue;
+				break;
 			case LangJP:
 				emitutf(dictch, p, 1);
 				if(isupper(*p))
@ -792,12 +784,24 @@ usage(void)
 	threadexits("usage");
 }

+struct {
+	char *s;
+	Hmap **m;
+} inittab[] = {
+	"judou", &judou,
+	"hira", &hira,
+	"kata", &kata,
+	"hangul", &hangul,
+	"telex", &telex,
+};
+
 mainstacksize = 8192*2;

 void
 threadmain(int argc, char *argv[])
 {
-	int nogui;
+	int nogui, i;
+	char buf[128];
 	char *jishoname, *zidianname;

 	deflang = LangEN;
@ -849,13 +853,12 @@ threadmain(int argc, char *argv[])
 		zidianname = "/lib/ktrans/wubi.dict";
 	zidian = opendict(nil, zidianname);

-	natural = hanzi = nil;
-	hira 	= openmap("/lib/ktrans/hira.map");
-	kata 	= openmap("/lib/ktrans/kata.map");
-	greek 	= openmap("/lib/ktrans/greek.map");
-	cyril 	= openmap("/lib/ktrans/cyril.map");
-	hangul 	= openmap("/lib/ktrans/hangul.map");
-	telex	= openmap("/lib/ktrans/telex.map");
+	natural = nil;
+	for(i = 0; i < nelem(inittab); i++){
+		snprint(buf, sizeof buf, "/lib/ktrans/%s.map", inittab[i].s);
+		if((*inittab[i].m = openmap(buf)) == nil)
+			sysfatal("failed to open map: %r");
+	}

 	dictch 	= chancreate(Msgsize, 0);
 	input 	= chancreate(Msgsize, 0);