From af2756ee01fa6b1921c6bcb581817e64c30beb48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C5=99emysl=20Eric=20Janouch?= Date: Tue, 27 Sep 2022 17:13:45 +0200 Subject: Add a rudimentary CMake script parser --- CMakeLists.txt | 4 + tools/cmake-dump.awk | 24 +++++ tools/cmake-parser.awk | 250 +++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 278 insertions(+) create mode 100644 tools/cmake-dump.awk create mode 100644 tools/cmake-parser.awk diff --git a/CMakeLists.txt b/CMakeLists.txt index fa996bf..af9c910 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -52,3 +52,7 @@ foreach (name ${tests}) target_link_libraries (test-${name} ${common_libraries}) add_test (NAME test-${name} COMMAND test-${name}) endforeach () + +add_test (test-cmake-parser + env LC_ALL=C awk -f ${PROJECT_SOURCE_DIR}/tools/cmake-parser.awk + -f ${PROJECT_SOURCE_DIR}/tools/cmake-dump.awk ${CMAKE_CURRENT_LIST_FILE}) diff --git a/tools/cmake-dump.awk b/tools/cmake-dump.awk new file mode 100644 index 0000000..d0b68b9 --- /dev/null +++ b/tools/cmake-dump.awk @@ -0,0 +1,24 @@ +# cmake-dump.awk: dump parsed CMake scripts as tables +# +# Copyright (c) 2022, Přemysl Eric Janouch +# SPDX-License-Identifier: 0BSD +# +# Parsed scripts are output in a table, with commands separated using ASCII +# Record Separators, and arguments using Unit Separators. +# +# Example usage: awk -f cmake-parser.awk -f cmake-dump.awk CMakeLists.txt \ +# | sed 'y/\x1F\x1E\t\n/\t\n /' \ +# | sed -n '/^project\t\([^\t]*\).*\tVERSION\t\([^\t]*\).*/{s//\1 \2/p;q;}' + +function sanitize(s) { + if (s ~ /[\x1E\x1F]/) + fatal("conflicting ASCII control characters found in source") + return s +} + +Command { + out = sanitize(Command) + for (i in Args) + out = out "\x1F" sanitize(Args[i]) + printf "%s\x1E", out +} diff --git a/tools/cmake-parser.awk b/tools/cmake-parser.awk new file mode 100644 index 0000000..7651cd1 --- /dev/null +++ b/tools/cmake-parser.awk @@ -0,0 +1,250 @@ +# cmake-parser.awk: rudimentary CMake script parser +# +# Copyright (c) 2022, Přemysl Eric Janouch +# SPDX-License-Identifier: 0BSD +# +# Implemented roughly according to the grammar described in cmake-language(7), +# which is self-conflicting, and not an accurate description. +# +# The result of parsing is stored in the case-normalized Command variable, +# and the Args array. These can be used by subsequent scripts. + +function warning(message) { + print FILENAME ":" FNR ": warning: " message > "/dev/stderr" +} + +function fatal(message) { + print FILENAME ":" FNR ": fatal error: " message > "/dev/stderr" + exit 1 +} + +function expect(v) { + if (!v && v == 0) + fatal("broken expectations at `" $0 "'") + return v +} + +function literal(v) { + if (substr($0, 1, length(v)) != v) + return 0 + $0 = substr($0, length(v) + 1) + return 1 +} + +function regexp(re) { + if (!match($0, "^" re)) + return 0 + $0 = substr($0, RLENGTH + 1) + return 1 +} + +function space() { + return regexp("[ \t]+") +} + +function unbracket(len, v) { + do { + if (match($0, "]={" len "}]")) { + v = v substr($0, 1, RSTART - 1) + $0 = substr($0, RSTART + RLENGTH) + return v + } + v = v $0 RS + } while (getline > 0) + fatal("unterminated bracket") +} + +function bracket_comment() { + if (!match($0, /^#\[=*\[/)) + return 0 + $0 = substr($0, RSTART + RLENGTH) + unbracket(RLENGTH - 3) + return 1 +} + +function line_ending() { + while (space() || bracket_comment()) {} + if (/^#/) + $0 = "" + return !$0 +} + +# ------------------------------------------------------------------------------ + +# While elementary expansion of previously set variables is implementable, +# it doesn't seem to be worth the effort. +function expand(s, v) { + v = s + while (match(v, /\\*[$](|ENV|CACHE)[{]/)) { + if (index(substr(v, RSTART), "$") % 2 != 0) { + warning("variable expansion is not supported: " s) + return s + } + v = substr(v, RSTART + RLENGTH) + } + return s +} + +function escape_sequence( v) { + if (!literal("\\")) + return 0 + + if (literal("t")) return "\t" + if (literal("r")) return "\r" + if (literal("n")) return "\n" + + # escape_semicolon isn't treated any specially here. + if (regexp("[A-Za-z0-9]")) + fatal("unsupported escape sequence") + + if ($0) { + v = substr($0, 1, 1) + $0 = substr($0, 2) + return v + } + if (getline > 0) + return "" + fatal("premature end of file") +} + +function quoted_argument( v, unescaped) { + if (!literal("\"")) + return 0 + + v = "" + while (!literal("\"")) { + if (!$0) { + if (getline <= 0) + fatal("premature end of file") + v = v RS + } else if ((unescaped = escape_sequence())) { + if (unescaped == "\\" || unescaped == "$") + v = v "\\" + else if (unescaped == ";") + v = v "\\\\" + v = v unescaped + } else if (unescaped == "") { + # quoted_continuation + } else { + v = v substr($0, 1, 1) + $0 = substr($0, 2) + } + } + return v +} + +function unquoted_argument( v, unescaped) { + while (1) { + if (match($0, /^[^[:space:]()#"\\]+/)) { + v = v substr($0, RSTART, RLENGTH) + $0 = substr($0, RSTART + RLENGTH) + } else if ((unescaped = escape_sequence())) { + if (unescaped == "\\" || unescaped == "$" || unescaped == ";") + v = v "\\" + v = v unescaped + } else if (unescaped == "") { + fatal("unexpected backslash in an unquoted argument") + } else { + # unquoted_legacy is not supported. + return v + } + } +} + +# Note that we keep and reprocess some escape sequences in here. +function argument( arg, expanded, v) { + if (regexp("\\[=*\\[")) { + Args[++N] = unbracket(RLENGTH - 2) + return 1 + } + if ((arg = quoted_argument()) || arg == "") { + expanded = expand(arg) + while (match(expanded, /\\./)) { + v = v substr(expanded, 1, RSTART - 1) \ + substr(expanded, RSTART + 1, 1) + expanded = substr(expanded, RSTART + RLENGTH) + } + Args[++N] = v expanded + return 1 + } + if ((arg = unquoted_argument())) { + expanded = expand(arg) + while (expanded) { + if (expanded ~ /^;/) { + if (v) + Args[++N] = v + v = "" + expanded = substr(expanded, 2) + } else if (expanded ~ /^\\./) { + v = v substr(expanded, 2, 1) + expanded = substr(expanded, 3) + } else { + v = v substr(expanded, 1, 1) + expanded = substr(expanded, 2) + } + } + if (v) + Args[++N] = v + return 1 + } + return 0 +} + +# ------------------------------------------------------------------------------ + +function identifier( v) { + if (!match($0, /^[A-Za-z_][A-Za-z0-9_]*/)) + return 0 + v = substr($0, 1, RLENGTH) + $0 = substr($0, RLENGTH + 1) + return v +} + +function separation() { + if (space() || bracket_comment()) + return 1 + + if (!line_ending()) + return 0 + if (getline > 0) + return 1 + fatal("premature end of file") +} + +function command_invocation( level) { + while (space()) {} + Command = identifier() + if (!Command) + return 0 + while (space()) {} + + Command = tolower(Command) + for (N in Args) + delete Args[N] + + N = 0 + expect(literal("(")) + while (1) { + while (separation()) {} + if (literal(")")) { + if (!level--) + break + Args[++N] = ")" + continue + } + if (literal("(")) { + level++ + Args[++N] = "(" + continue + } + expect(argument()) + if (!/^[()]/) + expect(separation()) + } + return 1 +} + +{ + command_invocation() + expect(line_ending()) +} -- cgit v1.2.3