aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPřemysl Eric Janouch <p@janouch.name>2021-10-07 03:37:12 +0200
committerPřemysl Eric Janouch <p@janouch.name>2021-10-07 14:06:57 +0200
commited8b1bcdad7c430af1eef5fbe78b6ec4eb3eb60e (patch)
tree4d864fe34d2102eb1ef4812d4613c2eca7006336
parent3881725904473cd9fdbd3e60cd1de2010f14d767 (diff)
downloadtdv-ed8b1bcdad7c430af1eef5fbe78b6ec4eb3eb60e.tar.gz
tdv-ed8b1bcdad7c430af1eef5fbe78b6ec4eb3eb60e.tar.xz
tdv-ed8b1bcdad7c430af1eef5fbe78b6ec4eb3eb60e.zip
Add sample dictionary downloaders/builders
-rw-r--r--CMakeLists.txt16
-rw-r--r--README.adoc5
-rwxr-xr-xdicts/gnu-fdl-en-cz.sh21
-rwxr-xr-xdicts/slovnik-cizich-slov.sh8
4 files changed, 47 insertions, 3 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index bd42553..9f07bfb 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -172,6 +172,21 @@ endforeach ()
add_custom_target (tools DEPENDS ${tools})
+# Example dictionaries: each dicts/*.sh script gets a "dicts-<name>" target that runs it
+file (GLOB dicts_scripts "${PROJECT_SOURCE_DIR}/dicts/*.sh")
+set (dicts_targets)
+foreach (dict_script ${dicts_scripts})
+ get_filename_component (dict_name "${dict_script}" NAME_WE) # script name without directory and extension
+ list (APPEND dicts_targets "dicts-${dict_name}")
+ add_custom_target (dicts-${dict_name}
+ COMMAND sh -c "PATH=.:$PATH \"$0\"" "${dict_script}" # the script path becomes $0; "." is prepended to PATH, presumably so a tabfile in the working directory is found
+ DEPENDS tabfile
+ COMMENT "Generating sample dictionary ${dict_name}"
+ VERBATIM)
+endforeach ()
+
+add_custom_target (dicts DEPENDS ${dicts_targets}) # aggregate target over all per-script targets
+
# The files to be installed
include (GNUInstallDirs)
install (TARGETS ${PROJECT_NAME} DESTINATION ${CMAKE_INSTALL_BINDIR})
@@ -212,4 +227,3 @@ set (CPACK_SOURCE_IGNORE_FILES "/\\\\.git;/build;/CMakeLists.txt.user")
set (CPACK_SOURCE_PACKAGE_FILE_NAME "${PROJECT_NAME}-${PROJECT_VERSION}")
include (CPack)
-
diff --git a/README.adoc b/README.adoc
index 4f98061..724c7e8 100644
--- a/README.adoc
+++ b/README.adoc
@@ -101,13 +101,14 @@ Dictionaries
Unfortunately this application only really works with specific dictionaries.
Word definitions have to be in plain text, separated by newlines.
+The `make dicts` command will build some examples from freely available sources.
+
You may use the included transform tool to transform existing dictionaries that
are almost useful as they are, e.g. after stripping XML tags. You might want to
fix up the `sametypesequence` of the resulting '.ifo' file afterwards, and run
dictzip on the resulting '.dict' file.
-https://mega.co.nz/#!axtD0QRK!sbtBgizksyfkPqKvKEgr8GQ11rsWhtqyRgUUV0B7pwg[
-CZ <--> { EN, DE, PL, RU } dictionaries]
+https://mega.co.nz/#!axtD0QRK!sbtBgizksyfkPqKvKEgr8GQ11rsWhtqyRgUUV0B7pwg[CZ <--> EN/DE/PL/RU dictionaries]
Contributing and Support
------------------------
diff --git a/dicts/gnu-fdl-en-cz.sh b/dicts/gnu-fdl-en-cz.sh
new file mode 100755
index 0000000..8feeea9
--- /dev/null
+++ b/dicts/gnu-fdl-en-cz.sh
@@ -0,0 +1,21 @@
+#!/bin/sh -e
+# GNU/FDL English-Czech dictionary, see https://www.svobodneslovniky.cz/
+curl -Lo- https://www.svobodneslovniky.cz/data/en-cs.txt.gz | \
+zcat | grep -v ^# | sed 's/\\//g' | perl -CSD -F\\t -le '
+ sub e { shift =~ s/\\/\\\\/gr =~ s/\n/\\n/gr =~ s/\t/\\t/gr } # escape backslash, LF, TAB
+ sub w { # w(NAME, HASHREF): pipe "key<TAB>values" lines into "tabfile gnu-fdl-NAME"
+ open(my $f, "|-", "tabfile gnu-fdl-$_[0]") or die $!;
+ print $f e($k) . "\t" . e(join("\n", @$v))
+ while ($k, $v) = each %{$_[1]};
+ close($f) or die "tabfile gnu-fdl-$_[0]: " . ($! || "wait status " . $?) . "\n"; # closing the pipe reports a failed tabfile
+ }
+ my ($en, $cz, $notes, $special, $translator) = @F; # tab-separated input fields
+ if ($cz) {
+ $notes =~ s/\w+:\s?//g; # remove word classes
+ $notes =~ s/(\w+\.)(?!])/($1)/; # quote "pl."
+ push(@{$encz{$en}}, $notes ? "$cz " . $notes : $cz);
+ push(@{$czen{$cz}}, $notes ? "$en " . $notes : $en);
+ } END { # after the last input line, write out both directions
+ w("en-cz", \%encz);
+ w("cz-en", \%czen);
+ }'
diff --git a/dicts/slovnik-cizich-slov.sh b/dicts/slovnik-cizich-slov.sh
new file mode 100755
index 0000000..cb99ea1
--- /dev/null
+++ b/dicts/slovnik-cizich-slov.sh
@@ -0,0 +1,8 @@
+#!/bin/sh -e
+# Slovník cizích slov, see https://slovnik-cizich-slov.abz.cz/web.php/o-slovniku
+# Download, convert latin2 to UTF-8, print one "headword<TAB>definition" line per headword, sort -u, feed tabfile.
+# TODO: Skipping the optional pronunciation field, tabfile can't handle it yet, but could be made to accept a lowercase sametypesequence
+curl -Lo- https://slovnik-cizich-slov.abz.cz/export.php | \
+iconv -f latin2 -t UTF-8 | perl -CSD -F\\\| -le '
+ print "$_\t" . $F[2] =~ s/\\/\\\\/gr =~ s/; /\\n/gr for split(", ", $F[0])
+' | sort -u | tabfile slovnik-cizich-slov