Add sample dictionary downloaders/builders

author: Přemysl Eric Janouch <p@janouch.name> 2021-10-07 03:37:12 +0200
committer: Přemysl Eric Janouch <p@janouch.name> 2021-10-07 14:06:57 +0200
commit: ed8b1bcdad7c430af1eef5fbe78b6ec4eb3eb60e (patch)
tree: 4d864fe34d2102eb1ef4812d4613c2eca7006336
parent: 3881725904473cd9fdbd3e60cd1de2010f14d767 (diff)
download: tdv-ed8b1bcdad7c430af1eef5fbe78b6ec4eb3eb60e.tar.gz
tdv-ed8b1bcdad7c430af1eef5fbe78b6ec4eb3eb60e.tar.xz
tdv-ed8b1bcdad7c430af1eef5fbe78b6ec4eb3eb60e.zip
4 files changed, 47 insertions, 3 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index bd42553..9f07bfb 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -172,6 +172,21 @@ endforeach ()
 
 add_custom_target (tools DEPENDS ${tools})
 
+# Example dictionaries: every dicts/*.sh script gets its own dicts-<name> target
+file (GLOB dicts_scripts "${PROJECT_SOURCE_DIR}/dicts/*.sh")
+set (dicts_targets)
+foreach (dict_script ${dicts_scripts})
+	get_filename_component (dict_name "${dict_script}" NAME_WE)
+	list (APPEND dicts_targets "dicts-${dict_name}")
+	add_custom_target (dicts-${dict_name}  # sh sees the script as $0 and the tabfile directory as $1
+		COMMAND sh -c "PATH=\"$1:$PATH\" \"$0\"" "${dict_script}" "$<TARGET_FILE_DIR:tabfile>"
+		DEPENDS tabfile
+		COMMENT "Generating sample dictionary ${dict_name}"
+		VERBATIM)
+endforeach ()
+
+add_custom_target (dicts DEPENDS ${dicts_targets})  # umbrella target behind `make dicts`
+
 # The files to be installed
 include (GNUInstallDirs)
 install (TARGETS ${PROJECT_NAME} DESTINATION ${CMAKE_INSTALL_BINDIR})
@@ -212,4 +227,3 @@ set (CPACK_SOURCE_IGNORE_FILES "/\\\\.git;/build;/CMakeLists.txt.user")
 set (CPACK_SOURCE_PACKAGE_FILE_NAME "${PROJECT_NAME}-${PROJECT_VERSION}")
 
 include (CPack)
-
diff --git a/README.adoc b/README.adoc
index 4f98061..724c7e8 100644
--- a/README.adoc
+++ b/README.adoc
@@ -101,13 +101,14 @@ Dictionaries
 Unfortunately this application only really works with specific dictionaries.
 Word definitions have to be in plain text, separated by newlines.
 
+The `make dicts` command will download and build a few example dictionaries from freely available sources.
+
 You may use the included transform tool to transform existing dictionaries that
 are almost useful as they are, e.g. after stripping XML tags.  You might want to
 fix up the `sametypesequence` of the resulting '.ifo' file afterwards, and run
 dictzip on the resulting '.dict' file.
 
-https://mega.co.nz/#!axtD0QRK!sbtBgizksyfkPqKvKEgr8GQ11rsWhtqyRgUUV0B7pwg[
-CZ <--> { EN, DE, PL, RU } dictionaries]
+https://mega.co.nz/#!axtD0QRK!sbtBgizksyfkPqKvKEgr8GQ11rsWhtqyRgUUV0B7pwg[CZ <--> EN/DE/PL/RU dictionaries]
 
 Contributing and Support
 ------------------------
diff --git a/dicts/gnu-fdl-en-cz.sh b/dicts/gnu-fdl-en-cz.sh
new file mode 100755
index 0000000..8feeea9
--- /dev/null
+++ b/dicts/gnu-fdl-en-cz.sh
@@ -0,0 +1,21 @@
+#!/bin/sh -e
+# GNU/FDL English-Czech dictionary, see https://www.svobodneslovniky.cz/
+curl -Lo- https://www.svobodneslovniky.cz/data/en-cs.txt.gz | \
+zcat | grep -v ^# | sed 's/\\//g' | perl -CSD -F\\t -le '
+	sub e { shift =~ s/\\/\\\\/gr =~ s/\n/\\n/gr =~ s/\t/\\t/gr }  # escape backslash, newline, tab
+	sub w {  # w(name, \%entries): pipe "key<TAB>definitions" lines into tabfile gnu-fdl-<name>
+		open(my $f, "|-", "tabfile gnu-fdl-$_[0]") or die $!;
+		print $f e($k) . "\t" . e(join("\n", @$v))
+			while ($k, $v) = each %{$_[1]};
+		close($f) or die "tabfile gnu-fdl-$_[0]: " . ($! || "wait status $?") . "\n";  # fail if tabfile failed
+	}
+	my ($en, $cz, $notes, $special, $translator) = @F;
+	if ($cz) {
+		$notes =~ s/\w+:\s?//g;          # remove word classes
+		$notes =~ s/(\w+\.)(?!])/($1)/;  # quote "pl."
+		push(@{$encz{$en}}, $notes ? "$cz " . $notes : $cz);
+		push(@{$czen{$cz}}, $notes ? "$en " . $notes : $en);
+	} END {
+		w("en-cz", \%encz);
+		w("cz-en", \%czen);
+	}'
diff --git a/dicts/slovnik-cizich-slov.sh b/dicts/slovnik-cizich-slov.sh
new file mode 100755
index 0000000..cb99ea1
--- /dev/null
+++ b/dicts/slovnik-cizich-slov.sh
@@ -0,0 +1,8 @@
+#!/bin/sh -e
+# Slovník cizích slov ("dictionary of foreign words"), see https://slovnik-cizich-slov.abz.cz/web.php/o-slovniku
+# TODO: Skipping the optional pronunciation field, tabfile can't handle it yet,
+# but could be made to accept a lowercase sametypesequence
+curl -Lo- https://slovnik-cizich-slov.abz.cz/export.php | \
+iconv -f latin2 -t UTF-8 | perl -CSD -F\\\| -le '
+	print "$_\t" . $F[2] =~ s/\\/\\\\/gr =~ s/; /\\n/gr for split(", ", $F[0])
+' | sort -u | tabfile slovnik-cizich-slov  # drop duplicate lines, then pipe them into tabfile
author	Přemysl Eric Janouch <p@janouch.name>	2021-10-07 03:37:12 +0200
committer	Přemysl Eric Janouch <p@janouch.name>	2021-10-07 14:06:57 +0200
commit	ed8b1bcdad7c430af1eef5fbe78b6ec4eb3eb60e (patch)
tree	4d864fe34d2102eb1ef4812d4613c2eca7006336
parent	3881725904473cd9fdbd3e60cd1de2010f14d767 (diff)
download	tdv-ed8b1bcdad7c430af1eef5fbe78b6ec4eb3eb60e.tar.gz tdv-ed8b1bcdad7c430af1eef5fbe78b6ec4eb3eb60e.tar.xz tdv-ed8b1bcdad7c430af1eef5fbe78b6ec4eb3eb60e.zip