diff options
| -rw-r--r-- | CMakeLists.txt | 2 | ||||
| -rwxr-xr-x | dicts/czech-wordnet.pl | 82 | 
2 files changed, 83 insertions, 1 deletions
| diff --git a/CMakeLists.txt b/CMakeLists.txt index 9f07bfb..1a8e100 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -173,7 +173,7 @@ endforeach ()  add_custom_target (tools DEPENDS ${tools})  # Example dictionaries -file (GLOB dicts_scripts "${PROJECT_SOURCE_DIR}/dicts/*.sh") +file (GLOB dicts_scripts "${PROJECT_SOURCE_DIR}/dicts/*.*")  set (dicts_targets)  foreach (dict_script ${dicts_scripts})  	get_filename_component (dict_name "${dict_script}" NAME_WE) diff --git a/dicts/czech-wordnet.pl b/dicts/czech-wordnet.pl new file mode 100755 index 0000000..d626e5a --- /dev/null +++ b/dicts/czech-wordnet.pl @@ -0,0 +1,82 @@ +#!/usr/bin/env perl +# Czech WordNet 1.9 PDT, CC BY-NC-SA 3.0, newer versions available commercially; +# this one's IDs cannot be linked with any release of the Princeton WordNet +use warnings; +use strict; + +my $base = 'https://lindat.cz/repository/xmlui'; +my $path = 'handle/11858/00-097C-0000-0001-4880-3'; +open(my $doc, '-|', +	"curl -Lo- '$base/bitstream/$path/Czech_WordNet_1.9_PDT.zip'" +		. ' | zcat | iconv -f latin2 -t UTF-8') or die $!; + +# https://nlp.fi.muni.cz/trac/deb2/wiki/WordNetFormat but not quite; +# for terminology see https://wordnet.princeton.edu/documentation/wngloss7wn +my %synsets; +while (<$doc>) { +	my $id = m|<ID>(.+?)</ID>| && $1; next unless defined $id; +	my $pos = m|<POS>(.+?)</POS>| && $1; next if $pos eq 'e'; +	$synsets{$id} = { +		literals => [map {s| \^\d+||gr} m|<LITERAL>(.+?)<|g], +		rels => { +			anto => [m^<ILR>(.+?)<TYPE>near_antonym<^g], +			hyper => [m^<ILR>(.+?)<TYPE>hypernym<^g], +			hypo => [m^<ILR>(.+?)<TYPE>hyponym<^g, +				m^<SUBEVENT>(.+?)</SUBEVENT>^g], +			super => [m^<ILR>(.+?)<TYPE>holo_part<^g], +			sub => [m^<ILR>(.+?)<TYPE>(?:holo_member|mero_part|partonym)<^g], +		}, +	}; +} + +# Resolve all synset links to hash references, filtering out what can't be found +while (my ($id, $synset) = each %synsets) { +	while (my ($name, $links) = each %{$synset->{rels}}) { +		@$links = map {$synsets{$_} || ()} @$links; +	} +} + +# Ensure symmetry in relationships, duplicates will be taken care of later +my %antitags = qw(anto anto hyper hypo hypo hyper super sub sub super); +while (my ($id, $synset) = each %synsets) { +	while (my ($name, $links) = each %{$synset->{rels}}) { +		push @{$_->{rels}->{$antitags{$name}}}, $synset for @$links; +	} +} + +# Create an inverse index from literals/keywords to their synsets +my %literals; +while (my ($id, $synset) = each %synsets) { +	push @{$literals{$_}}, $synset for @{$synset->{literals}}; +} + +# Output synsets exploded to individual words, with expanded relationships +close($doc) or die $?; +open(my $tabfile, '|-', 'tabfile', 'czech-wordnet', +	'--book-name=Czech WordNet 1.9 PDT', "--website=$base/$path", +	'--date=2011-01-24', '--collation=cs_CZ') or die $!; + +sub expand { +	my %seen; +	return grep {!$seen{$_}++} (map {@{$_->{literals}}} @_); +} + +for my $keyword (sort {lc $a cmp lc $b} keys %literals) { +	my @lines; +	for my $synset (@{$literals{$keyword}}) { +		my $rels = $synset->{rels}; +		push @lines, +			(grep {$_ ne $keyword} @{$synset->{literals}}), +			(map {"$_ ↑"} expand(@{$rels->{hyper}})), +			(map {"$_ ↓"} expand(@{$rels->{hypo}})), +			(map {"$_ ⊃"} expand(@{$rels->{super}})), +			(map {"$_ ⊂"} expand(@{$rels->{sub}})), +			(map {"$_ ≠"} expand(@{$rels->{anto}})); +	} +	if (@lines) { +		print $tabfile "$keyword\t" . join('\n', +			map { s/</</gr =~ s/>/>/gr =~ s/&/&/gr +				=~ s/\\/\\\\/gr =~ s/\n/\\n/gr =~ s/\t/\\t/gr} @lines) . "\n"; +	} +} +close($tabfile) or die $?; | 
