aboutsummaryrefslogtreecommitdiff
path: root/ht/gen-rune-width.sh
diff options
context:
space:
mode:
Diffstat (limited to 'ht/gen-rune-width.sh')
-rwxr-xr-xht/gen-rune-width.sh84
1 files changed, 84 insertions, 0 deletions
diff --git a/ht/gen-rune-width.sh b/ht/gen-rune-width.sh
new file mode 100755
index 0000000..c4889fb
--- /dev/null
+++ b/ht/gen-rune-width.sh
@@ -0,0 +1,84 @@
+#!/bin/sh
+u=https://www.unicode.org/Public/
+
+# Download and filter Unicode data files with the given category expression,
+# producing a list of possibly duplicate codepoints in decimal format
+retrieve() {
+	# $1: alternation of property values spliced into the Perl regex below
+	# $2: URL of the Unicode data file to fetch
+	curl -sSL "$2" | perl -lne '
+		s/#.*//;
+		s/ //g;
+		/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?;('"$1"')$/ or next;
+		($lo, $hi) = (hex($1), hex($2 // $1));
+		print for $lo .. $hi;'
+}
+
+togo() {
+	# Reads decimal codepoints on stdin, sorts and deduplicates them, merges
+	# consecutive values into ranges and prints the ranges as Go composite
+	# literal elements, laid out in columns within 72 characters.
+	sort -nu | perl -lne '
+		sub emit { printf "{0x%04x, 0x%04x},\n", $lo, $hi }
+		# The very first value opens the first range.
+		if (!defined $lo) { $lo = $hi = $_; next }
+		# A gap in the sequence closes the current range and opens a new one.
+		unless ($_ == $hi + 1) { emit; $lo = $_ }
+		$hi = $_;
+		END { emit if defined $lo }' | column -xc 72
+}
+
+gofmt <<EOF
+// Code generated by running "go generate" in janouch.name/haven. DO NOT EDIT.
+
+package $GOPACKAGE
+
+// RuneWidth returns the column width of Go runes, using an algorithm and tables
+// derived from https://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c:
+// - The null character (U+0000) has a column width of 0.
+// - Other C0/C1 control characters and DEL will lead to a return value of -1.
+// - Non-spacing and enclosing combining characters (general category code
+// Mn or Me in the Unicode database) have a column width of 0.
+// - SOFT HYPHEN (U+00AD) has a column width of 1.
+// - Other format characters (general category code Cf in the Unicode database)
+// and ZERO WIDTH SPACE (U+200B) have a column width of 0.
+// - Hangul Jamo medial vowels and final consonants (U+1160-U+11FF) have
+// a column width of 0.
+// - Spacing characters in the East Asian Wide (W) or East Asian Full-width (F)
+// category as defined in Unicode UAX #11 have a column width of 2.
+// - All remaining characters (including all printable ISO 8859-1 and WGL4
+// characters, Unicode control characters, etc.) have a column width of 1.
+//
+// Local changes:
+// - Tables are generated from the latest available version of Unicode.
+func RuneWidth(r rune) int {
+ switch {
+ case r == 0:
+ return 0
+ case r < 32 || r >= 0x7f && r < 0xa0:
+ return -1
+ case zeroWidthRunes.contains(r):
+ return 0
+ case fullWidthRunes.contains(r):
+ return 2
+ }
+ return 1
+}
+
+type runeRange struct{ first, last rune }
+type runeRangeTable []runeRange
+
+func (table runeRangeTable) contains(r rune) bool {
+ min, max := 0, len(table)-1
+ for max >= min {
+ mid := (min + max) / 2
+ if table[mid].last < r {
+ min = mid + 1
+ } else if table[mid].first > r {
+ max = mid - 1
+ } else {
+ return true
+ }
+ }
+ return false
+}
+
+var zeroWidthRunes = runeRangeTable{
+$({ retrieve 'Me|Mn|Cf' $u/UCD/latest/ucd/extracted/DerivedGeneralCategory.txt;
+ seq 0x1160 0x11ff; echo $((0x200B)); } | grep -xv $((0x00AD)) | togo)
+}
+
+var fullWidthRunes = runeRangeTable{
+$(retrieve 'W|F' $u/UCD/latest/ucd/EastAsianWidth.txt | togo)
+}
+EOF