#!/bin/sh
# Generates Go source with rune-width lookup tables built from the Unicode
# Character Database, and writes the gofmt-formatted result to standard output.
# Requires curl, perl, sort, seq, grep, column and gofmt, plus network access.

# Base URL of the Unicode public data files.
u=https://www.unicode.org/Public/

# retrieve CATEGORIES URL
#
# Download the Unicode data file at URL and filter it with the given category
# expression (a regular expression alternation such as 'Me|Mn|Cf'), producing
# a list of possibly duplicated codepoints in decimal format, one per line.
retrieve() {
	# --fail keeps an HTTP error page from being silently filtered down to
	# an empty table; together with --show-error the failure is reported
	# on stderr.
	curl --silent --show-error --fail --location "$2" | perl -lne '
		s/#.*//;    # drop trailing comments
		s/ //g;     # drop all spaces
		# Accept "XXXX;CAT" and "XXXX..YYYY;CAT" lines for wanted categories.
		next unless /^([0-9A-F]+)(?:\.\.([0-9A-F]+))?;('"$1"')$/;
		# Expand the (possibly single-element) range into decimal numbers.
		print for hex $1 .. hex ($2 // $1);'
}

# togo
#
# Read decimal codepoints from standard input, sort and deduplicate them,
# merge runs of consecutive values into "{0xFIRST, 0xLAST}," entries,
# and lay the entries out in columns within 72 characters.
togo() {
	sort -nu | perl -lne '
		sub flush { printf "{0x%04x, 0x%04x},\n", $first, $last }
		# The first line seeds the current range; the implicit loop
		# then continues from the second line.
		BEGIN { $first = $last = <> }
		# A gap ends the current range and starts a new one.
		if ($_ != $last + 1) { flush; $first = $_; }
		$last = $_;
		# Emit the final range, unless there was no input at all.
		END { flush if defined $first }' | column -xc 72
}

# Emit the Go source through gofmt.  The here-document delimiter is unquoted
# on purpose, so that the $(...) command substitutions below expand into the
# range tables.
#
# NOTE(review): in the copy under review, the text between "gofmt <" and
# "= 0x7f && r < 0xa0" was missing (it looks like a tag stripper swallowed
# everything from "<<EOF" up to a ">").  The here-document operator, the
# package clause, the function header, "switch {" and the "case r >" prefix
# below are a minimal reconstruction.  The package name, the function name,
# and any further conditions or cases that preceded this one (for example
# other control character ranges) must be confirmed against the upstream
# script.
gofmt <<EOF
package main

func runeWidth(r rune) int {
	switch {
	case r >= 0x7f && r < 0xa0:
		return -1
	case zeroWidthRunes.contains(r):
		return 0
	case fullWidthRunes.contains(r):
		return 2
	}
	return 1
}

type runeRange struct{ first, last rune }
type runeRangeTable []runeRange

func (table runeRangeTable) contains(r rune) bool {
	min, max := 0, len(table)-1
	for max >= min {
		mid := (min + max) / 2
		if table[mid].last < r {
			min = mid + 1
		} else if table[mid].first > r {
			max = mid - 1
		} else {
			return true
		}
	}
	return false
}

var zeroWidthRunes = runeRangeTable{
$({ retrieve 'Me|Mn|Cf' $u/UCD/latest/ucd/extracted/DerivedGeneralCategory.txt;
	seq 0x1160 0x11ff; echo $((0x200B)); } | grep -xv $((0x00AD)) | togo)
}

var fullWidthRunes = runeRangeTable{
$(retrieve 'W|F' $u/UCD/latest/ucd/EastAsianWidth.txt | togo)
}
EOF