aboutsummaryrefslogtreecommitdiff
path: root/ht/gen-rune-width.sh
blob: c4889fb151a63e7eb7d478b8d895cc863710b9ec (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#!/bin/sh
u=https://www.unicode.org/Public/

# Download a Unicode data file and filter it by property value, producing
# a list of codepoints in decimal format, one per line, possibly with duplicates.
#   $1: alternation of accepted property values, spliced into a Perl regular
#       expression, e.g. 'Me|Mn|Cf'
#   $2: URL of the data file
# After comments and spaces are stripped, matching lines look like
# "XXXX;Value" or "XXXX..YYYY;Value"; ranges are expanded to single codepoints.
retrieve() {
	# --fail makes curl produce no output on HTTP errors (the error is still
	# reported on stderr thanks to --show-error), rather than feeding
	# an error page to the filter below.
	curl --fail --silent --show-error --location "$2" | perl -lne 's/#.*//; s/ //g;
		next unless /^([0-9A-F]+)(?:\.\.([0-9A-F]+))?;('"$1"')$/;
		print for hex $1 .. hex ($2 // $1);'
}

togo() {
	sort -nu | perl -lne '
		sub flush { printf "{0x%04x, 0x%04x},\n", $first, $last }
		BEGIN { $first = $last = <> }
		if ($_ != $last + 1) { flush; $first = $_; }
		$last = $_;
		END { flush if defined $first }' | column -xc 72
}

# The package clause of the generated file comes from the environment.
: "${GOPACKAGE:?this script is meant to be run through go generate}"

# Build the tables before emitting anything, so that a failed download aborts
# the generation instead of silently producing (nearly) empty tables.
zero_width=$(retrieve 'Me|Mn|Cf' "$u/UCD/latest/ucd/extracted/DerivedGeneralCategory.txt")
full_width=$(retrieve 'W|F' "$u/UCD/latest/ucd/EastAsianWidth.txt" | togo)
if [ -z "$zero_width" ] || [ -z "$full_width" ]; then
	echo "$0: failed to retrieve Unicode data files" >&2
	exit 1
fi

# Add Hangul Jamo medial vowels and final consonants and ZERO WIDTH SPACE,
# exclude SOFT HYPHEN, as described in the RuneWidth documentation below.
zero_width=$({ printf '%s\n' "$zero_width";
    seq 0x1160 0x11ff; echo $((0x200B)); } | grep -xv $((0x00AD)) | togo)

# The here-document is unquoted: \$GOPACKAGE and the tables are expanded
# by the shell, and the result is normalized by gofmt.
gofmt <<EOF
// Code generated by running "go generate" in janouch.name/haven. DO NOT EDIT.

package $GOPACKAGE

// RuneWidth returns the column width of Go runes, using an algorithm and tables
// derived from https://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c:
//  - The null character (U+0000) has a column width of 0.
//  - Other C0/C1 control characters and DEL will lead to a return value of -1.
//  - Non-spacing and enclosing combining characters (general category code
//    Mn or Me in the Unicode database) have a column width of 0.
//  - SOFT HYPHEN (U+00AD) has a column width of 1.
//  - Other format characters (general category code Cf in the Unicode database)
//    and ZERO WIDTH SPACE (U+200B) have a column width of 0.
//  - Hangul Jamo medial vowels and final consonants (U+1160-U+11FF) have
//    a column width of 0.
//  - Spacing characters in the East Asian Wide (W) or East Asian Full-width (F)
//    category as defined in Unicode UAX #11 have a column width of 2.
//  - All remaining characters (including all printable ISO 8859-1 and WGL4
//    characters, Unicode control characters, etc.) have a column width of 1.
//
// Local changes:
//  - Tables are generated from the latest available version of Unicode.
func RuneWidth(r rune) int {
	switch {
	case r == 0:
		return 0
	case r < 32 || r >= 0x7f && r < 0xa0:
		return -1
	case zeroWidthRunes.contains(r):
		return 0
	case fullWidthRunes.contains(r):
		return 2
	}
	return 1
}

type runeRange struct{ first, last rune }
type runeRangeTable []runeRange

func (table runeRangeTable) contains(r rune) bool {
	min, max := 0, len(table)-1
	for max >= min {
		mid := (min + max) / 2
		if table[mid].last < r {
			min = mid + 1
		} else if table[mid].first > r {
			max = mid - 1
		} else {
			return true
		}
	}
	return false
}

var zeroWidthRunes = runeRangeTable{
$zero_width
}

var fullWidthRunes = runeRangeTable{
$full_width
}
EOF