Add the Czech WordNet snapshot to dicts

author: Přemysl Eric Janouch <p@janouch.name> 2021-10-10 00:36:12 +0200
committer: Přemysl Eric Janouch <p@janouch.name> 2021-10-10 00:45:38 +0200
commit: 8b9c5e04609169adc45a0d30f9ac094d82efae2a (patch)
tree: d238a23689b83d370a8ee5769638ef6af90a790c /dicts/czech-wordnet.pl
parent: 973d1d27ea776a902eb1bf85e645181eb389d7a9 (diff)
download: tdv-8b9c5e04609169adc45a0d30f9ac094d82efae2a.tar.gz
tdv-8b9c5e04609169adc45a0d30f9ac094d82efae2a.tar.xz
tdv-8b9c5e04609169adc45a0d30f9ac094d82efae2a.zip
1 files changed, 82 insertions, 0 deletions
diff --git a/dicts/czech-wordnet.pl b/dicts/czech-wordnet.pl
new file mode 100755
index 0000000..d626e5a
--- /dev/null
+++ b/dicts/czech-wordnet.pl
@@ -0,0 +1,82 @@
+#!/usr/bin/env perl
+# Czech WordNet 1.9 PDT, CC BY-NC-SA 3.0, newer versions available commercially;
+# this one's IDs cannot be linked with any release of the Princeton WordNet
+use warnings;
+use strict;
+
+my $base = 'https://lindat.cz/repository/xmlui';
+my $path = 'handle/11858/00-097C-0000-0001-4880-3';
+open(my $doc, '-|',
+	"curl -Lo- '$base/bitstream/$path/Czech_WordNet_1.9_PDT.zip'"
+		. ' | zcat | iconv -f latin2 -t UTF-8') or die $!;
+
+# https://nlp.fi.muni.cz/trac/deb2/wiki/WordNetFormat but not quite;
+# for terminology see https://wordnet.princeton.edu/documentation/wngloss7wn
+my %synsets;
+while (<$doc>) {
+	my $id = m|<ID>(.+?)</ID>| && $1; next unless defined $id;
+	my $pos = m|<POS>(.+?)</POS>| && $1; next if $pos eq 'e';
+	$synsets{$id} = {
+		literals => [map {s| \^\d+||gr} m|<LITERAL>(.+?)<|g],
+		rels => {
+			anto => [m^<ILR>(.+?)<TYPE>near_antonym<^g],
+			hyper => [m^<ILR>(.+?)<TYPE>hypernym<^g],
+			hypo => [m^<ILR>(.+?)<TYPE>hyponym<^g,
+				m^<SUBEVENT>(.+?)</SUBEVENT>^g],
+			super => [m^<ILR>(.+?)<TYPE>holo_part<^g],
+			sub => [m^<ILR>(.+?)<TYPE>(?:holo_member|mero_part|partonym)<^g],
+		},
+	};
+}
+
+# Resolve all synset links to hash references, filtering out what can't be found
+while (my ($id, $synset) = each %synsets) {
+	while (my ($name, $links) = each %{$synset->{rels}}) {
+		@$links = map {$synsets{$_} || ()} @$links;
+	}
+}
+
+# Ensure symmetry in relationships, duplicates will be taken care of later
+my %antitags = qw(anto anto hyper hypo hypo hyper super sub sub super);
+while (my ($id, $synset) = each %synsets) {
+	while (my ($name, $links) = each %{$synset->{rels}}) {
+		push @{$_->{rels}->{$antitags{$name}}}, $synset for @$links;
+	}
+}
+
+# Create an inverse index from literals/keywords to their synsets
+my %literals;
+while (my ($id, $synset) = each %synsets) {
+	push @{$literals{$_}}, $synset for @{$synset->{literals}};
+}
+
+# Output synsets exploded to individual words, with expanded relationships
+close($doc) or die $?;
+open(my $tabfile, '|-', 'tabfile', 'czech-wordnet',
+	'--book-name=Czech WordNet 1.9 PDT', "--website=$base/$path",
+	'--date=2011-01-24', '--collation=cs_CZ') or die $!;
+
+sub expand {
+	my %seen;
+	return grep {!$seen{$_}++} (map {@{$_->{literals}}} @_);
+}
+
+for my $keyword (sort {lc $a cmp lc $b} keys %literals) {
+	my @lines;
+	for my $synset (@{$literals{$keyword}}) {
+		my $rels = $synset->{rels};
+		push @lines,
+			(grep {$_ ne $keyword} @{$synset->{literals}}),
+			(map {"$_ ↑"} expand(@{$rels->{hyper}})),
+			(map {"$_ ↓"} expand(@{$rels->{hypo}})),
+			(map {"$_ ⊃"} expand(@{$rels->{super}})),
+			(map {"$_ ⊂"} expand(@{$rels->{sub}})),
+			(map {"$_ ≠"} expand(@{$rels->{anto}}));
+	}
+	if (@lines) {
+		print $tabfile "$keyword\t" . join('\n',
+			map { s/&lt;/</gr =~ s/&gt;/>/gr =~ s/&amp;/&/gr
+				=~ s/\\/\\\\/gr =~ s/\n/\\n/gr =~ s/\t/\\t/gr} @lines) . "\n";
+	}
+}
+close($tabfile) or die $?;
author	Přemysl Eric Janouch <p@janouch.name>	2021-10-10 00:36:12 +0200
committer	Přemysl Eric Janouch <p@janouch.name>	2021-10-10 00:45:38 +0200
commit	8b9c5e04609169adc45a0d30f9ac094d82efae2a (patch)
tree	d238a23689b83d370a8ee5769638ef6af90a790c /dicts/czech-wordnet.pl
parent	973d1d27ea776a902eb1bf85e645181eb389d7a9 (diff)
download	tdv-8b9c5e04609169adc45a0d30f9ac094d82efae2a.tar.gz tdv-8b9c5e04609169adc45a0d30f9ac094d82efae2a.tar.xz tdv-8b9c5e04609169adc45a0d30f9ac094d82efae2a.zip