diff options
author | Přemysl Janouch <p@janouch.name> | 2018-09-14 16:55:01 +0200 |
---|---|---|
committer | Přemysl Janouch <p@janouch.name> | 2018-10-07 18:09:09 +0200 |
commit | d04c140a6923607ec35a1a4e71cef9be253c0a49 (patch) | |
tree | c0c5fc6c0cfa138b7485d4c2d8c5ab6d1cf5cb95 /ht/gen-rune-width.sh | |
parent | cd6c9e4d8c6a49eb91091c00b55e0f879c9220bf (diff) | |
download | haven-d04c140a6923607ec35a1a4e71cef9be253c0a49.tar.gz haven-d04c140a6923607ec35a1a4e71cef9be253c0a49.tar.xz haven-d04c140a6923607ec35a1a4e71cef9be253c0a49.zip |
ht: add generated wcwidth tables and algorithm
Diffstat (limited to 'ht/gen-rune-width.sh')
-rwxr-xr-x | ht/gen-rune-width.sh | 84 |
1 files changed, 84 insertions, 0 deletions
#!/bin/sh
# Generate Go source code with the RuneWidth function and its lookup tables,
# built from the latest Unicode Character Database files, and print it
# gofmt-formatted on standard output.
#
# GOPACKAGE must be present in the environment; it names the Go package of
# the generated file.  The script invokes curl, perl, sort, seq, grep, column
# and gofmt.

# Fail before any download takes place, rather than with a gofmt syntax error
: "${GOPACKAGE:?must be set, run this script through go generate}"

u=https://www.unicode.org/Public

# retrieve EXPR URL
# Download the Unicode data file at URL and filter it with the given category
# expression (a Perl regular expression alternation such as 'W|F'), producing
# a list of possibly duplicate codepoints in decimal format, one per line.
# Returns non-zero when the download fails.
retrieve() {
	# Download everything up front: within a pipeline, the exit status of curl
	# would be lost, and an HTTP error page would be filtered down to nothing.
	retrieve_data=$(curl --fail --silent --show-error --location "$2") \
		|| return 1

	# Strip comments and spaces, then expand "FIRST..LAST;CATEGORY" as well as
	# "CODEPOINT;CATEGORY" lines into individual decimal codepoints
	printf '%s\n' "$retrieve_data" | perl -lne 's/#.*//; s/ //g;
		next unless /^([0-9A-F]+)(?:\.\.([0-9A-F]+))?;('"$1"')$/;
		print for hex $1 .. hex ($2 // $1);'
}

# togo
# Read decimal codepoints from standard input, sort and deduplicate them,
# merge runs of consecutive values into "{0xFIRST, 0xLAST}," entries,
# and lay these out in columns within 72 characters.
togo() {
	# BEGIN consumes the first input line to seed the current range,
	# the implicit -n loop then only sees the lines following it
	sort -nu | perl -lne '
		sub flush { printf "{0x%04x, 0x%04x},\n", $first, $last }
		BEGIN { $first = $last = <> }
		if ($_ != $last + 1) { flush; $first = $_; }
		$last = $_;
		END { flush if defined $first }' | column -xc 72
}

# Collect all inputs before emitting anything, so that a failed download or
# a data file that no longer matches the filter cannot result in a Go file
# that compiles fine but carries empty or incomplete tables
zero_width_data=$(retrieve 'Me|Mn|Cf' \
	"$u/UCD/latest/ucd/extracted/DerivedGeneralCategory.txt") || exit 1
full_width_data=$(retrieve 'W|F' \
	"$u/UCD/latest/ucd/EastAsianWidth.txt") || exit 1

if [ -z "$zero_width_data" ] || [ -z "$full_width_data" ]; then
	echo "$0: no codepoints found in the downloaded Unicode data" >&2
	exit 1
fi

# Mn, Me and Cf characters, plus Hangul Jamo medial vowels and final
# consonants and ZERO WIDTH SPACE, minus SOFT HYPHEN.  Hexadecimal constants
# go through arithmetic expansion, as seq and grep are given decimal numbers.
zero_width_table=$({ printf '%s\n' "$zero_width_data";
	seq $((0x1160)) $((0x11ff)); echo $((0x200B)); } \
	| grep -xv $((0x00AD)) | togo)
full_width_table=$(printf '%s\n' "$full_width_data" | togo)

gofmt <<EOF
// Code generated by running "go generate" in janouch.name/haven. DO NOT EDIT.

package $GOPACKAGE

// RuneWidth returns the column width of Go runes, using an algorithm and tables
// derived from https://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c:
//  - The null character (U+0000) has a column width of 0.
//  - Other C0/C1 control characters and DEL will lead to a return value of -1.
//  - Non-spacing and enclosing combining characters (general category code
//    Mn or Me in the Unicode database) have a column width of 0.
//  - SOFT HYPHEN (U+00AD) has a column width of 1.
//  - Other format characters (general category code Cf in the Unicode database)
//    and ZERO WIDTH SPACE (U+200B) have a column width of 0.
//  - Hangul Jamo medial vowels and final consonants (U+1160-U+11FF) have
//    a column width of 0.
//  - Spacing characters in the East Asian Wide (W) or East Asian Full-width (F)
//    category as defined in Unicode UAX #11 have a column width of 2.
//  - All remaining characters (including all printable ISO 8859-1 and WGL4
//    characters, Unicode control characters, etc.) have a column width of 1.
//
// Local changes:
//  - Tables are generated from the latest available version of Unicode.
func RuneWidth(r rune) int {
	switch {
	case r == 0:
		return 0
	case r < 32 || r >= 0x7f && r < 0xa0:
		return -1
	case zeroWidthRunes.contains(r):
		return 0
	case fullWidthRunes.contains(r):
		return 2
	}
	return 1
}

type runeRange struct{ first, last rune }
type runeRangeTable []runeRange

func (table runeRangeTable) contains(r rune) bool {
	min, max := 0, len(table)-1
	for max >= min {
		mid := (min + max) / 2
		if table[mid].last < r {
			min = mid + 1
		} else if table[mid].first > r {
			max = mid - 1
		} else {
			return true
		}
	}
	return false
}

var zeroWidthRunes = runeRangeTable{
$zero_width_table
}

var fullWidthRunes = runeRangeTable{
$full_width_table
}
EOF