-rw-r--r--  .clang-format               |    8
-rw-r--r--  .gitignore                  |    8
-rw-r--r--  LICENSE                     |   12
-rw-r--r--  NEWS                        |   29
-rw-r--r--  README.adoc                 |   70
-rw-r--r--  cmd/extfs-pdf/main.go       |  141
-rw-r--r--  cmd/pdf-simple-sign/main.go |   76
-rw-r--r--  go.mod                      |    8
-rw-r--r--  go.sum                      |   13
-rw-r--r--  meson.build                 |   23
-rw-r--r--  pdf-simple-sign.adoc        |   80
-rw-r--r--  pdf-simple-sign.cpp         | 1017
-rw-r--r--  pdf/pdf.go                  | 1663
-rwxr-xr-x  test.sh                     |   86
14 files changed, 3234 insertions(+), 0 deletions(-)
diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000..13cbee9 --- /dev/null +++ b/.clang-format @@ -0,0 +1,8 @@ +BasedOnStyle: Chromium +ColumnLimit: 100 +IndentCaseLabels: false +AccessModifierOffset: -2 +ContinuationIndentWidth: 2 +SpaceAfterTemplateKeyword: false +SpaceAfterCStyleCast: true +SpacesBeforeTrailingComments: 2
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d046c48 --- /dev/null +++ b/.gitignore @@ -0,0 +1,8 @@ +/builddir +/pdf-simple-sign.cflags +/pdf-simple-sign.config +/pdf-simple-sign.creator +/pdf-simple-sign.creator.user +/pdf-simple-sign.cxxflags +/pdf-simple-sign.files +/pdf-simple-sign.includes
diff --git a/LICENSE b/LICENSE new file mode 100644 --- /dev/null +++ b/LICENSE @@ -0,0 +1,12 @@ +Copyright (c) 2017 - 2024, Přemysl Eric Janouch <p@janouch.name> + +Permission to use, copy, modify, and/or distribute this software for any +purpose with or without fee is hereby granted. + +THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
diff --git a/NEWS b/NEWS new file mode 100644 --- /dev/null +++ b/NEWS @@ -0,0 +1,29 @@ +1.1.1 (2020-09-06) + + * Fix a dysfunctional example in the manual + + * Go: write the xref table in a deterministic order + + * Add a trivial test suite, based on pdfsig from poppler-utils + + +1.1 (2020-09-05) + + * Make it possible to change the signature reservation with an option + + * Return errors rather than mangle documents in some cases, + notably with pre-existing PDF forms + + * Avoid downgrading the document's PDF version to 1.6 + + * A few fixes for PDF parsing and serialisation + + * Add an instructive man page + + * Add a native Go port of the utility, also usable as a library + + +1.0 (2018-08-03) + + * Initial release +
diff --git a/README.adoc b/README.adoc new file mode 100644 index 0000000..10e581f --- /dev/null +++ b/README.adoc @@ -0,0 +1,70 @@ +pdf-simple-sign +=============== + +'pdf-simple-sign' is a simple PDF signer intended for documents produced by +the Cairo library (≤ 1.17.4 or using PDF 1.4), GNU troff, ImageMagick, +or similar. + +I don't aim to extend the functionality any further. The project is fairly +self-contained and it should be easy to grasp and change to suit your needs. + +Packages +-------- +Regular releases are sporadic. git master should be stable enough. +You can get a package with the latest development version using Arch Linux's +https://aur.archlinux.org/packages/pdf-simple-sign-git[AUR], +or as a https://git.janouch.name/p/nixexprs[Nix derivation]. + +Documentation +------------- +See the link:pdf-simple-sign.adoc[man page] for information about usage. +The rest of this README will concern itself with externalities.
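+
+As a quick illustration of the synopsis (file names here are hypothetical;
+the man page further below walks through a complete example):
+
+----
+$ pdf-simple-sign input.pdf output.pdf key-pair.p12 ""
+----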
+ +image:https://pkg.go.dev/badge/janouch.name/pdf-simple-sign@master/pdf["PkgGoDev", link="https://pkg.go.dev/janouch.name/pdf-simple-sign@master/pdf"] + +Building +-------- +Build dependencies: Meson, Asciidoctor, a C++11 compiler, pkg-config + +Runtime dependencies: libcrypto (OpenSSL 1.1 API) + + $ git clone https://git.janouch.name/p/pdf-simple-sign.git + $ cd pdf-simple-sign + $ meson builddir + $ cd builddir + $ ninja + +In addition to the C++ version, also included is a native Go port, +which has enhanced PDF 1.5 support: + +---- +$ go install janouch.name/pdf-simple-sign/cmd/pdf-simple-sign@master +---- + +and a crude external VFS for Midnight Commander, which may be used to extract +all streams from a given PDF file: + +---- +$ GOBIN=$HOME/.local/share/mc/extfs.d \ + go install janouch.name/pdf-simple-sign/cmd/extfs-pdf@master +---- + +To enable the VFS, edit your _~/.config/mc/mc.ext.ini_ to contain: + +---- +[pdf] +Type=^PDF +Open=%cd %p/extfs-pdf:// +---- + +Contributing and Support +------------------------ +Use https://git.janouch.name/p/pdf-simple-sign to report bugs, request features, +or submit pull requests. `git send-email` is tolerated. If you want to discuss +the project, feel free to join me at ircs://irc.janouch.name, channel #dev. + +Bitcoin donations are accepted at: 12r5uEWEgcHC46xd64tt3hHt9EUvYYDHe9 + +License +------- +This software is released under the terms of the 0BSD license, the text of which +is included within the package along with the list of authors.
diff --git a/cmd/extfs-pdf/main.go b/cmd/extfs-pdf/main.go new file mode 100644 index 0000000..eab3e2b --- /dev/null +++ b/cmd/extfs-pdf/main.go @@ -0,0 +1,141 @@ +// +// Copyright (c) 2021 - 2024, Přemysl Eric Janouch <p@janouch.name> +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +// + +// extfs-pdf is an external VFS plugin for Midnight Commander. +// More serious image extractors should rewrite this to use pdfimages(1). +package main + +import ( + "flag" + "fmt" + "os" + "time" + + "janouch.name/pdf-simple-sign/pdf" +) + +func die(status int, format string, args ...interface{}) { + os.Stderr.WriteString(fmt.Sprintf(format+"\n", args...)) + os.Exit(status) +} + +func usage() { + die(1, "Usage: %s [-h] COMMAND DOCUMENT [ARG...]", os.Args[0]) +} + +func streamSuffix(o *pdf.Object) string { + if filter, _ := o.Dict["Filter"]; filter.Kind == pdf.Name { + switch filter.String { + case "JBIG2Decode": + // This is the file extension used by pdfimages(1). + // This is not a complete JBIG2 standalone file.
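+			// (As an aside, and to the best of my understanding: PDF embeds
+			// the header-less embedded JBIG2 format, and any shared segments
+			// would live in a separate /JBIG2Globals stream.)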
+ return "jb2e" + case "JPXDecode": + return "jp2" + case "DCTDecode": + return "jpg" + case "FlateDecode": + return "zz" + default: + return filter.String + } + } + return "stream" +} + +func list(mtime time.Time, updater *pdf.Updater) { + stamp := mtime.Local().Format("01-02-2006 15:04:05") + for _, o := range updater.ListIndirect() { + object, err := updater.Get(o.N, o.Generation) + size := 0 + if err != nil { + fmt.Fprintf(os.Stderr, "%s\n", err) + } else { + // Accidental transformation, retrieving original data is more work. + size = len(object.Serialize()) + } + fmt.Printf("-r--r--r-- 1 0 0 %d %s n%dg%d\n", + size, stamp, o.N, o.Generation) + if object.Kind == pdf.Stream { + fmt.Printf("-r--r--r-- 1 0 0 %d %s n%dg%d.%s\n", len(object.Stream), + stamp, o.N, o.Generation, streamSuffix(&object)) + } + } +} + +func copyout(updater *pdf.Updater, storedFilename, extractTo string) { + var ( + n, generation uint + suffix string + ) + m, err := fmt.Sscanf(storedFilename, "n%dg%d%s", &n, &generation, &suffix) + if m < 2 { + die(3, "%s: %s", storedFilename, err) + } + + object, err := updater.Get(n, generation) + if err != nil { + die(3, "%s: %s", storedFilename, err) + } + + content := []byte(object.Serialize()) + if suffix != "" { + content = object.Stream + } + if err = os.WriteFile(extractTo, content, 0666); err != nil { + die(3, "%s", err) + } +} + +func main() { + flag.Usage = usage + flag.Parse() + if flag.NArg() < 2 { + usage() + } + + command, documentPath := flag.Arg(0), flag.Arg(1) + doc, err := os.ReadFile(documentPath) + if err != nil { + die(1, "%s", err) + } + + mtime := time.UnixMilli(0) + if info, err := os.Stat(documentPath); err == nil { + mtime = info.ModTime() + } + + updater, err := pdf.NewUpdater(doc) + if err != nil { + die(2, "%s", err) + } + + switch command { + default: + die(1, "unsupported command: %s", command) + case "list": + if flag.NArg() != 2 { + usage() + } else { + list(mtime, updater) + } + case "copyout": + if flag.NArg() != 4 { + usage() + } else { + copyout(updater, flag.Arg(2), flag.Arg(3)) + } + } +} diff --git a/cmd/pdf-simple-sign/main.go b/cmd/pdf-simple-sign/main.go new file mode 100644 index 0000000..5141a12 --- /dev/null +++ b/cmd/pdf-simple-sign/main.go @@ -0,0 +1,76 @@ +// +// Copyright (c) 2018 - 2020, Přemysl Eric Janouch <p@janouch.name> +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +// + +// pdf-simple-sign is a simple PDF signer. +package main + +import ( + "flag" + "fmt" + "io/ioutil" + "os" + + "janouch.name/pdf-simple-sign/pdf" +) + +// #include <unistd.h> +import "C" + +func isatty(fd uintptr) bool { return C.isatty(C.int(fd)) != 0 } + +func die(status int, format string, args ...interface{}) { + msg := fmt.Sprintf(format+"\n", args...) 
+ if isatty(os.Stderr.Fd()) { + msg = "\x1b[0;31m" + msg + "\x1b[m" + } + os.Stderr.WriteString(msg) + os.Exit(status) +} + +func usage() { + die(1, "Usage: %s [-h] [-r RESERVATION] INPUT-FILENAME OUTPUT-FILENAME "+ + "PKCS12-PATH PKCS12-PASS", os.Args[0]) +} + +var reservation = flag.Int( + "r", 4096, "signature reservation as a number of bytes") + +func main() { + flag.Usage = usage + flag.Parse() + if flag.NArg() != 4 { + usage() + } + + inputPath, outputPath := flag.Arg(0), flag.Arg(1) + doc, err := ioutil.ReadFile(inputPath) + if err != nil { + die(1, "%s", err) + } + p12, err := ioutil.ReadFile(flag.Arg(2)) + if err != nil { + die(2, "%s", err) + } + key, certs, err := pdf.PKCS12Parse(p12, flag.Arg(3)) + if err != nil { + die(3, "%s", err) + } + if doc, err = pdf.Sign(doc, key, certs, *reservation); err != nil { + die(4, "error: %s", err) + } + if err = ioutil.WriteFile(outputPath, doc, 0666); err != nil { + die(5, "%s", err) + } +}
diff --git a/go.mod b/go.mod new file mode 100644 --- /dev/null +++ b/go.mod @@ -0,0 +1,8 @@ +module janouch.name/pdf-simple-sign + +go 1.17 + +require ( + go.mozilla.org/pkcs7 v0.0.0-20210826202110-33d05740a352 + golang.org/x/crypto v0.10.0 +)
diff --git a/go.sum b/go.sum new file mode 100644 --- /dev/null +++ b/go.sum @@ -0,0 +1,13 @@ +go.mozilla.org/pkcs7 v0.0.0-20200128120323-432b2356ecb1 h1:A/5uWzF44DlIgdm/PQFwfMkW0JX+cIcQi/SwLAmZP5M= +go.mozilla.org/pkcs7 v0.0.0-20200128120323-432b2356ecb1/go.mod h1:SNgMg+EgDFwmvSmLRTNKC5fegJjB7v23qTQ0XLGUNHk= +go.mozilla.org/pkcs7 v0.0.0-20210826202110-33d05740a352 h1:CCriYyAfq1Br1aIYettdHZTy8mBTIPo7We18TuO/bak= +go.mozilla.org/pkcs7 v0.0.0-20210826202110-33d05740a352/go.mod h1:SNgMg+EgDFwmvSmLRTNKC5fegJjB7v23qTQ0XLGUNHk= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20200728195943-123391ffb6de h1:ikNHVSjEfnvz6sxdSPCaPt572qowuyMDMJLLm3Db3ig= +golang.org/x/crypto v0.0.0-20200728195943-123391ffb6de/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/crypto v0.10.0 h1:LKqV2xt9+kDzSTfOhx4FrkEBcMrAgHSYgzywV9zcGmM= +golang.org/x/crypto v0.10.0/go.mod h1:o4eNf7Ede1fv+hwOwZsTHl9EsPFO6q6ZvYR8vYfY45I= +golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
diff --git a/meson.build b/meson.build new file mode 100644 index 0000000..d68a99d --- /dev/null +++ b/meson.build @@ -0,0 +1,23 @@ +project('pdf-simple-sign', 'cpp', default_options : ['cpp_std=c++11'], + version : '1.1.1') + +conf = configuration_data() +conf.set_quoted('PROJECT_NAME', meson.project_name()) +conf.set_quoted('PROJECT_VERSION', meson.project_version()) +configure_file(output : 'config.h', configuration : conf) + +cryptodep = dependency('libcrypto') +executable('pdf-simple-sign', 'pdf-simple-sign.cpp', + install : true, + dependencies : cryptodep) + +asciidoctor = find_program('asciidoctor') +foreach page : ['pdf-simple-sign'] + custom_target('manpage for ' + page, + input: page + '.adoc', output: page + '.1', + command: [asciidoctor, '-b', 'manpage', + '-a', 'release-version=' + meson.project_version(), + '@INPUT@', '-o', '@OUTPUT@'], + install: true, + install_dir: join_paths(get_option('mandir'), 'man1')) +endforeach
diff --git a/pdf-simple-sign.adoc b/pdf-simple-sign.adoc new file mode 100644 index 0000000..4ab1bc5 --- /dev/null +++
b/pdf-simple-sign.adoc @@ -0,0 +1,80 @@ +pdf-simple-sign(1) +================== +:doctype: manpage +:manmanual: pdf-simple-sign Manual +:mansource: pdf-simple-sign {release-version} + +Name +---- +pdf-simple-sign - a simple PDF signer + +Synopsis +-------- +*pdf-simple-sign* [_OPTION_]... _INPUT.pdf_ _OUTPUT.pdf_ _KEY-PAIR.p12_ _PASSWORD_ + +Description +----------- +*pdf-simple-sign* is a simple PDF signer intended for documents produced by +the Cairo library, GNU troff, ImageMagick, or similar. As such, it currently +comes with some restrictions: + + * the document may not have any forms or signatures already, as they would be + overwritten, + * the document may not employ cross-reference streams, or must constitute + a hybrid-reference file at least. + +The key and certificate pair is accepted in the PKCS#12 format. The _PASSWORD_ +must be supplied on the command line, and may be empty if it is not needed. + +The signature is attached to the first page and has no appearance. + +If the signature data don't fit within the default reservation of 4 kibibytes, +you might need to adjust it using the *-r* option, or throw out any unnecessary +intermediate certificates. + +Options +------- +*-r* _RESERVATION_, *--reservation*=_RESERVATION_:: + Set aside _RESERVATION_ bytes for the resulting signature. + Feel free to try a few values in a loop. The program itself has no + conception of the data, so it can't make accurate predictions. + +*-h*, *--help*:: + Display a help message and exit. + +*-V*, *--version*:: + Output version information and exit. + +Examples +-------- +Create a self-signed certificate, make a document containing the current date, +sign it and verify the attached signature: + + $ openssl req -newkey rsa:2048 -subj /CN=Test -nodes \ + -keyout key.pem -x509 -addext keyUsage=digitalSignature \ + -out cert.pem 2>/dev/null + $ openssl pkcs12 -inkey key.pem -in cert.pem \ + -export -passout pass: -out key-pair.p12 + $ date | groff -T pdf > test.pdf + $ pdf-simple-sign test.pdf test.signed.pdf key-pair.p12 "" + $ pdfsig test.signed.pdf + Digital Signature Info of: test.signed.pdf + Signature #1: + - Signer Certificate Common Name: Test + - Signer full Distinguished Name: CN=Test + - Signing Time: Sep 05 2020 19:41:22 + - Signing Hash Algorithm: SHA-256 + - Signature Type: adbe.pkcs7.detached + - Signed Ranges: [0 - 6522], [14716 - 15243] + - Total document signed + - Signature Validation: Signature is Valid. + - Certificate Validation: Certificate issuer isn't Trusted. + +Reporting bugs +-------------- +Use https://git.janouch.name/p/pdf-simple-sign to report bugs, request features, +or submit pull requests. + +See also +-------- +*openssl*(1), *pdfsig*(1)
diff --git a/pdf-simple-sign.cpp b/pdf-simple-sign.cpp new file mode 100644 index 0000000..8b9d1fe --- /dev/null +++ b/pdf-simple-sign.cpp @@ -0,0 +1,1017 @@ +// vim: set sw=2 ts=2 sts=2 et tw=100: +// +// pdf-simple-sign: simple PDF signer +// +// Copyright (c) 2017 - 2020, Přemysl Eric Janouch <p@janouch.name> +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS.
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +// + +#include <cerrno> +#include <climits> +#include <cmath> +#include <cstdarg> +#include <cstdio> +#include <cstdlib> +#include <cstring> +#include <ctime> +#undef NDEBUG +#include <cassert> + +#include <functional> +#include <map> +#include <memory> +#include <regex> +#include <set> +#include <string> +#include <vector> + +#if defined __GLIBCXX__ && __GLIBCXX__ < 20140422 +#error Need libstdc++ >= 4.9 for <regex> +#endif + +#include <getopt.h> +#include <openssl/err.h> +#include <openssl/pkcs12.h> +#include <openssl/x509v3.h> +#include <unistd.h> + +#include "config.h" + +// ------------------------------------------------------------------------------------------------- + +using uint = unsigned int; +using ushort = unsigned short; + +static std::string concatenate(const std::vector<std::string>& v, const std::string& delim) { + std::string res; + if (v.empty()) + return res; + for (const auto& s : v) + res += s + delim; + return res.substr(0, res.length() - delim.length()); +} + +template<typename... Args> +std::string ssprintf(const std::string& format, Args... args) { + size_t size = std::snprintf(nullptr, 0, format.c_str(), args...) + 1; + std::unique_ptr<char[]> buf(new char[size]); + std::snprintf(buf.get(), size, format.c_str(), args...); + return std::string(buf.get(), buf.get() + size - 1); +} + +// ------------------------------------------------------------------------------------------------- + +/// PDF token/object thingy. Objects may be composed either from one or a sequence of tokens. +/// The PDF Reference doesn't actually speak of tokens, though ISO 32000-1:2008 does.
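+/// For instance, the fragment "[ 1 0 R ]" lexes into the token sequence
+/// B_ARRAY, NUMERIC, NUMERIC, KEYWORD "R", E_ARRAY, which the parser below
+/// folds into one ARRAY object holding a single REFERENCE object.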
+struct pdf_object { + enum type { + END, NL, COMMENT, NIL, BOOL, NUMERIC, KEYWORD, NAME, STRING, + // Simple tokens + B_ARRAY, E_ARRAY, B_DICT, E_DICT, + // Higher-level objects + ARRAY, DICT, OBJECT, REFERENCE, + } type = END; + + std::string string; ///< END (error message), COMMENT/KEYWORD/NAME/STRING + double number = 0.; ///< BOOL, NUMERIC + std::vector<pdf_object> array; ///< ARRAY, OBJECT + std::map<std::string, pdf_object> dict; ///< DICT, in the future also STREAM + uint n = 0, generation = 0; ///< OBJECT, REFERENCE + + pdf_object(enum type type = END) : type(type) {} + pdf_object(enum type type, double v) : type(type), number(v) {} + pdf_object(enum type type, const std::string& v) : type(type), string(v) {} + pdf_object(enum type type, uint n, uint g) : type(type), n(n), generation(g) {} + pdf_object(const std::vector<pdf_object>& array) : type(ARRAY), array(array) {} + pdf_object(const std::map<std::string, pdf_object>& dict) : type(DICT), dict(dict) {} + + pdf_object(const pdf_object&) = default; + pdf_object(pdf_object&&) = default; + pdf_object& operator=(const pdf_object&) = default; + pdf_object& operator=(pdf_object&&) = default; + + /// Return whether this is a number without a fractional part + bool is_integer() const { + double tmp; + return type == NUMERIC && std::modf(number, &tmp) == 0.; + } +}; + +/// Basic lexical analyser for the Portable Document Format, giving limited error information +struct pdf_lexer { + const unsigned char* p; + pdf_lexer(const char* s) : p(reinterpret_cast<const unsigned char*>(s)) {} + + static constexpr const char* oct_alphabet = "01234567"; + static constexpr const char* dec_alphabet = "0123456789"; + static constexpr const char* hex_alphabet = "0123456789abcdefABCDEF"; + static constexpr const char* whitespace = "\t\n\f\r "; + static constexpr const char* delimiters = "()<>[]{}/%"; + + bool eat_newline(int ch) { + if (ch == '\r') { + if (*p == '\n') p++; + return true; + } + return ch == '\n'; + } + + pdf_object string() { + std::string value; + int parens = 1; + while (1) { + if (!*p) return {pdf_object::END, "unexpected end of string"}; + auto ch = *p++; + if (eat_newline(ch)) ch = '\n'; + else if (ch == '(') { parens++; } + else if (ch == ')') { if (!--parens) break; } + else if (ch == '\\') { + if (!*p) return {pdf_object::END, "unexpected end of string"}; + switch ((ch = *p++)) { + case 'n': ch = '\n'; break; + case 'r': ch = '\r'; break; + case 't': ch = '\t'; break; + case 'b': ch = '\b'; break; + case 'f': ch = '\f'; break; + default: + if (eat_newline(ch)) + continue; + std::string octal; + if (ch && strchr(oct_alphabet, ch)) { + octal += ch; + if (*p && strchr(oct_alphabet, *p)) octal += *p++; + if (*p && strchr(oct_alphabet, *p)) octal += *p++; + ch = std::stoi(octal, nullptr, 8); + } + } + } + value += ch; + } + return {pdf_object::STRING, value}; + } + + pdf_object string_hex() { + std::string value, buf; + while (*p != '>') { + if (!*p) return {pdf_object::END, "unexpected end of hex string"}; + if (!strchr(hex_alphabet, *p)) + return {pdf_object::END, "invalid hex string"}; + buf += *p++; + if (buf.size() == 2) { + value += char(std::stoi(buf, nullptr, 16)); + buf.clear(); + } + } + p++; + if (!buf.empty()) value += char(std::stoi(buf + '0', nullptr, 16)); + return {pdf_object::STRING, value}; + } + + pdf_object name() { + std::string value; + while (!strchr(whitespace, *p) && !strchr(delimiters, *p)) { + auto ch = *p++; + if (ch == '#') { + std::string hexa; + if (*p && strchr(hex_alphabet, *p)) hexa += *p++; + if (*p 
&& strchr(hex_alphabet, *p)) hexa += *p++; + if (hexa.size() != 2) + return {pdf_object::END, "invalid name hexa escape"}; + ch = char(std::stoi(hexa, nullptr, 16)); + } + value += ch; + } + if (value.empty()) return {pdf_object::END, "unexpected end of name"}; + return {pdf_object::NAME, value}; + } + + pdf_object comment() { + std::string value; + while (*p && *p != '\r' && *p != '\n') + value += *p++; + return {pdf_object::COMMENT, value}; + } + + // XXX maybe invalid numbers should rather be interpreted as keywords + pdf_object number() { + std::string value; + if (*p == '-') + value += *p++; + bool real = false, digits = false; + while (*p) { + if (strchr(dec_alphabet, *p)) + digits = true; + else if (*p == '.' && !real) + real = true; + else + break; + value += *p++; + } + if (!digits) return {pdf_object::END, "invalid number"}; + return {pdf_object::NUMERIC, std::stod(value, nullptr)}; + } + + pdf_object next() { + if (!*p) + return {pdf_object::END}; + if (strchr("-0123456789.", *p)) + return number(); + + // {} end up being keywords, we might want to error out on those + std::string value; + while (!strchr(whitespace, *p) && !strchr(delimiters, *p)) + value += *p++; + if (!value.empty()) { + if (value == "null") return {pdf_object::NIL}; + if (value == "true") return {pdf_object::BOOL, 1}; + if (value == "false") return {pdf_object::BOOL, 0}; + return {pdf_object::KEYWORD, value}; + } + + switch (char ch = *p++) { + case '/': return name(); + case '%': return comment(); + case '(': return string(); + case '[': return {pdf_object::B_ARRAY}; + case ']': return {pdf_object::E_ARRAY}; + case '<': + if (*p++ == '<') + return {pdf_object::B_DICT}; + p--; + return string_hex(); + case '>': + if (*p++ == '>') + return {pdf_object::E_DICT}; + p--; + return {pdf_object::END, "unexpected '>'"}; + default: + if (eat_newline(ch)) + return {pdf_object::NL}; + if (strchr(whitespace, ch)) + return next(); + return {pdf_object::END, "unexpected input"}; + } + } +}; + +// FIXME lines /should not/ be longer than 255 characters, some wrapping is in order +static std::string pdf_serialize(const pdf_object& o) { + switch (o.type) { + case pdf_object::NL: return "\n"; + case pdf_object::NIL: return "null"; + case pdf_object::BOOL: return o.number ? 
"true" : "false"; + case pdf_object::NUMERIC: { + if (o.is_integer()) return std::to_string((long long) o.number); + return std::to_string(o.number); + } + case pdf_object::KEYWORD: return o.string; + case pdf_object::NAME: { + std::string escaped = "/"; + for (char c : o.string) { + if (c == '#' || strchr(pdf_lexer::delimiters, c) || strchr(pdf_lexer::whitespace, c)) + escaped += ssprintf("#%02x", c); + else + escaped += c; + } + return escaped; + } + case pdf_object::STRING: { + std::string escaped; + for (char c : o.string) { + if (c == '\\' || c == '(' || c == ')') + escaped += '\\'; + escaped += c; + } + return "(" + escaped + ")"; + } + case pdf_object::B_ARRAY: return "["; + case pdf_object::E_ARRAY: return "]"; + case pdf_object::B_DICT: return "<<"; + case pdf_object::E_DICT: return ">>"; + case pdf_object::ARRAY: { + std::vector<std::string> v; + for (const auto& i : o.array) + v.push_back(pdf_serialize(i)); + return "[ " + concatenate(v, " ") + " ]"; + } + case pdf_object::DICT: { + std::string s; + for (const auto i : o.dict) + // FIXME the key is also supposed to be escaped by pdf_serialize() + s += " /" + i.first + " " + pdf_serialize(i.second); + return "<<" + s + " >>"; + } + case pdf_object::OBJECT: + return ssprintf("%u %u obj\n", o.n, o.generation) + pdf_serialize(o.array.at(0)) + "\nendobj"; + case pdf_object::REFERENCE: + return ssprintf("%u %u R", o.n, o.generation); + default: + assert(!"unsupported token for serialization"); + } +} + +// ------------------------------------------------------------------------------------------------- + +/// Utility class to help read and possibly incrementally update PDF files +class pdf_updater { + struct ref { + size_t offset = 0; ///< File offset or N of the next free entry + uint generation = 0; ///< Object generation + bool free = true; ///< Whether this N has been deleted + }; + + std::vector<ref> xref; ///< Cross-reference table + size_t xref_size = 0; ///< Current cross-reference table size, correlated to xref.size() + std::set<uint> updated; ///< List of updated objects + + pdf_object parse_obj(pdf_lexer& lex, std::vector<pdf_object>& stack) const; + pdf_object parse_R(std::vector<pdf_object>& stack) const; + pdf_object parse(pdf_lexer& lex, std::vector<pdf_object>& stack) const; + std::string load_xref(pdf_lexer& lex, std::set<uint>& loaded_entries); + +public: + /// The new trailer dictionary to be written, initialized with the old one + std::map<std::string, pdf_object> trailer; + + std::string& document; + pdf_updater(std::string& document) : document(document) {} + + /// Build the cross-reference table and prepare a new trailer dictionary + std::string initialize(); + /// Try to extract the claimed PDF version as a positive decimal number, e.g. 17 for PDF 1.7. + /// Returns zero on failure. + int version(const pdf_object& root) const; + /// Retrieve an object by its number and generation -- may return NIL or END with an error + pdf_object get(uint n, uint generation) const; + /// Allocate a new object number + uint allocate(); + /// Append an updated object to the end of the document + void update(uint n, std::function<void()> fill); + /// Write an updated cross-reference table and trailer + void flush_updates(); +}; + +// ------------------------------------------------------------------------------------------------- + +/// If the object is an error, forward its message, otherwise return err. 
+static std::string pdf_error(const pdf_object& o, const char* err) { + if (o.type != pdf_object::END || o.string.empty()) return err; + return o.string; +} + +pdf_object pdf_updater::parse_obj(pdf_lexer& lex, std::vector<pdf_object>& stack) const { + if (stack.size() < 2) + return {pdf_object::END, "missing object ID pair"}; + + auto g = stack.back(); stack.pop_back(); + auto n = stack.back(); stack.pop_back(); + if (!g.is_integer() || g.number < 0 || g.number > UINT_MAX || + !n.is_integer() || n.number < 0 || n.number > UINT_MAX) + return {pdf_object::END, "invalid object ID pair"}; + + pdf_object obj{pdf_object::OBJECT}; + obj.n = n.number; + obj.generation = g.number; + + while (1) { + auto object = parse(lex, obj.array); + if (object.type == pdf_object::END) + return {pdf_object::END, pdf_error(object, "object doesn't end")}; + if (object.type == pdf_object::KEYWORD && object.string == "endobj") + break; + obj.array.push_back(std::move(object)); + } + return obj; +} + +pdf_object pdf_updater::parse_R(std::vector<pdf_object>& stack) const { + if (stack.size() < 2) + return {pdf_object::END, "missing reference ID pair"}; + + auto g = stack.back(); stack.pop_back(); + auto n = stack.back(); stack.pop_back(); + if (!g.is_integer() || g.number < 0 || g.number > UINT_MAX || + !n.is_integer() || n.number < 0 || n.number > UINT_MAX) + return {pdf_object::END, "invalid reference ID pair"}; + + pdf_object ref{pdf_object::REFERENCE}; + ref.n = n.number; + ref.generation = g.number; + return ref; +} + +/// Read an object at the lexer's position. Not a strict parser. +pdf_object pdf_updater::parse(pdf_lexer& lex, std::vector<pdf_object>& stack) const { + auto token = lex.next(); + switch (token.type) { + case pdf_object::NL: + case pdf_object::COMMENT: + // These are not important to parsing, not even for this procedure's needs + return parse(lex, stack); + case pdf_object::B_ARRAY: { + std::vector<pdf_object> array; + while (1) { + auto object = parse(lex, array); + if (object.type == pdf_object::END) + return {pdf_object::END, pdf_error(object, "array doesn't end")}; + if (object.type == pdf_object::E_ARRAY) + break; + array.push_back(std::move(object)); + } + return array; + } + case pdf_object::B_DICT: { + std::vector<pdf_object> array; + while (1) { + auto object = parse(lex, array); + if (object.type == pdf_object::END) + return {pdf_object::END, pdf_error(object, "dictionary doesn't end")}; + if (object.type == pdf_object::E_DICT) + break; + array.push_back(std::move(object)); + } + if (array.size() % 2) + return {pdf_object::END, "unbalanced dictionary"}; + std::map<std::string, pdf_object> dict; + for (size_t i = 0; i < array.size(); i += 2) { + if (array[i].type != pdf_object::NAME) + return {pdf_object::END, "invalid dictionary key type"}; + dict.insert({array[i].string, std::move(array[i + 1])}); + } + return dict; + } + case pdf_object::KEYWORD: + // Appears in the document body, typically needs to access the cross-reference table + // TODO use the xref to read /Length etc. 
once we actually need to read such objects; + // presumably streams can use the pdf_object::string member + if (token.string == "stream") return {pdf_object::END, "streams are not supported yet"}; + if (token.string == "obj") return parse_obj(lex, stack); + if (token.string == "R") return parse_R(stack); + return token; + default: + return token; + } +} + +std::string pdf_updater::load_xref(pdf_lexer& lex, std::set<uint>& loaded_entries) { + std::vector<pdf_object> throwaway_stack; + { + auto keyword = parse(lex, throwaway_stack); + if (keyword.type != pdf_object::KEYWORD || keyword.string != "xref") + return "invalid xref table"; + } + while (1) { + auto object = parse(lex, throwaway_stack); + if (object.type == pdf_object::END) + return pdf_error(object, "unexpected EOF while looking for the trailer"); + if (object.type == pdf_object::KEYWORD && object.string == "trailer") + break; + + auto second = parse(lex, throwaway_stack); + if (!object.is_integer() || object.number < 0 || object.number > UINT_MAX || + !second.is_integer() || second.number < 0 || second.number > UINT_MAX) + return "invalid xref section header"; + + const size_t start = object.number; + const size_t count = second.number; + for (size_t i = 0; i < count; i++) { + auto off = parse(lex, throwaway_stack); + auto gen = parse(lex, throwaway_stack); + auto key = parse(lex, throwaway_stack); + if (!off.is_integer() || off.number < 0 || off.number > document.length() || + !gen.is_integer() || gen.number < 0 || gen.number > 65535 || + key.type != pdf_object::KEYWORD) + return "invalid xref entry"; + + bool free = true; + if (key.string == "n") + free = false; + else if (key.string != "f") + return "invalid xref entry"; + + auto n = start + i; + if (loaded_entries.count(n)) + continue; + if (n >= xref.size()) + xref.resize(n + 1); + loaded_entries.insert(n); + + auto& ref = xref[n]; + ref.generation = gen.number; + ref.offset = off.number; + ref.free = free; + } + } + return ""; +} + +// ------------------------------------------------------------------------------------------------- + +std::string pdf_updater::initialize() { + // We only need to look for startxref roughly within the last kibibyte of the document + static std::regex haystack_re(R"([\s\S]*\sstartxref\s+(\d+)\s+%%EOF)"); + std::string haystack = document.substr(document.length() < 1024 ? 
0 : document.length() - 1024); + + std::smatch m; + if (!std::regex_search(haystack, m, haystack_re, std::regex_constants::match_continuous)) + return "cannot find startxref"; + + size_t xref_offset = std::stoul(m.str(1)), last_xref_offset = xref_offset; + std::set<size_t> loaded_xrefs; + std::set<uint> loaded_entries; + + std::vector<pdf_object> throwaway_stack; + while (1) { + if (loaded_xrefs.count(xref_offset)) + return "circular xref offsets"; + if (xref_offset >= document.length()) + return "invalid xref offset"; + + pdf_lexer lex(document.c_str() + xref_offset); + auto err = load_xref(lex, loaded_entries); + if (!err.empty()) return err; + + auto trailer = parse(lex, throwaway_stack); + if (trailer.type != pdf_object::DICT) + return pdf_error(trailer, "invalid trailer dictionary"); + if (loaded_xrefs.empty()) + this->trailer = trailer.dict; + loaded_xrefs.insert(xref_offset); + + const auto prev_offset = trailer.dict.find("Prev"); + if (prev_offset == trailer.dict.end()) + break; + // FIXME do not read offsets and sizes as floating point numbers + if (!prev_offset->second.is_integer() || prev_offset->second.number < 0) + return "invalid Prev offset"; + xref_offset = prev_offset->second.number; + } + + trailer["Prev"] = {pdf_object::NUMERIC, double(last_xref_offset)}; + const auto last_size = trailer.find("Size"); + if (last_size == trailer.end() || !last_size->second.is_integer() || + last_size->second.number <= 0) + return "invalid or missing cross-reference table Size"; + + xref_size = last_size->second.number; + return ""; +} + +int pdf_updater::version(const pdf_object& root) const { + auto version = root.dict.find("Version"); + if (version != root.dict.end() && version->second.type == pdf_object::NAME) { + const auto& v = version->second.string; + if (isdigit(v[0]) && v[1] == '.' 
&& isdigit(v[2]) && !v[3]) + return (v[0] - '0') * 10 + (v[2] - '0'); + } + + // We only need to look for the comment roughly within the first kibibyte of the document + static std::regex version_re(R"((?:^|[\r\n])%(?:!PS-Adobe-\d\.\d )?PDF-(\d)\.(\d)[\r\n])"); + std::string haystack = document.substr(0, 1024); + + std::smatch m; + if (std::regex_search(haystack, m, version_re, std::regex_constants::match_default)) + return std::stoul(m.str(1)) * 10 + std::stoul(m.str(2)); + + return 0; +} + +pdf_object pdf_updater::get(uint n, uint generation) const { + if (n >= xref_size) + return {pdf_object::NIL}; + + const auto& ref = xref[n]; + if (ref.free || ref.generation != generation || ref.offset >= document.length()) + return {pdf_object::NIL}; + + pdf_lexer lex(document.c_str() + ref.offset); + std::vector<pdf_object> stack; + while (1) { + auto object = parse(lex, stack); + if (object.type == pdf_object::END) + return object; + if (object.type != pdf_object::OBJECT) + stack.push_back(std::move(object)); + else if (object.n != n || object.generation != generation) + return {pdf_object::END, "object mismatch"}; + else + return std::move(object.array.at(0)); + } +} + +uint pdf_updater::allocate() { + assert(xref_size < UINT_MAX); + + auto n = xref_size++; + if (xref.size() < xref_size) + xref.resize(xref_size); + + // We don't make sure it gets a subsection in the update yet because we + // make no attempts at fixing the linked list of free items either + return n; +} + +void pdf_updater::update(uint n, std::function<void()> fill) { + auto& ref = xref.at(n); + ref.offset = document.length() + 1; + ref.free = false; + updated.insert(n); + + document += ssprintf("\n%u %u obj\n", n, ref.generation); + // Separately so that the callback can use document.length() to get the current offset + fill(); + document += "\nendobj"; +} + +void pdf_updater::flush_updates() { + std::map<uint, size_t> groups; + for (auto i = updated.cbegin(); i != updated.cend(); ) { + size_t start = *i, count = 1; + while (++i != updated.cend() && *i == start + count) + count++; + groups[start] = count; + } + + // Taking literally "Each cross-reference section begins with a line containing the keyword xref. + // Following this line are one or more cross-reference subsections." 
from 3.4.3 in PDF Reference + if (groups.empty()) + groups[0] = 0; + + auto startxref = document.length() + 1; + document += "\nxref\n"; + for (const auto& g : groups) { + document += ssprintf("%u %zu\n", g.first, g.second); + for (size_t i = 0; i < g.second; i++) { + auto& ref = xref[g.first + i]; + document += ssprintf("%010zu %05u %c \n", ref.offset, ref.generation, "nf"[!!ref.free]); + } + } + + trailer["Size"] = {pdf_object::NUMERIC, double(xref_size)}; + document += + "trailer\n" + pdf_serialize(trailer) + ssprintf("\nstartxref\n%zu\n%%%%EOF\n", startxref); +} + +// ------------------------------------------------------------------------------------------------- + +/// Make a PDF object representing the given point in time +static pdf_object pdf_date(time_t timestamp) { + struct tm parts; + assert(localtime_r(&timestamp, &parts)); + + char buf[64]; + assert(strftime(buf, sizeof buf, "D:%Y%m%d%H%M%S", &parts)); + + std::string offset = "Z"; + auto offset_min = parts.tm_gmtoff / 60; + if (parts.tm_gmtoff < 0) + offset = ssprintf("-%02ld'%02ld'", -offset_min / 60, -offset_min % 60); + if (parts.tm_gmtoff > 0) + offset = ssprintf("+%02ld'%02ld'", +offset_min / 60, +offset_min % 60); + return {pdf_object::STRING, buf + offset}; +} + +static pdf_object pdf_get_first_page(pdf_updater& pdf, uint node_n, uint node_generation) { + auto obj = pdf.get(node_n, node_generation); + if (obj.type != pdf_object::DICT) + return {pdf_object::NIL}; + + // Out of convenience; these aren't filled normally + obj.n = node_n; + obj.generation = node_generation; + + auto type = obj.dict.find("Type"); + if (type == obj.dict.end() || type->second.type != pdf_object::NAME) + return {pdf_object::NIL}; + if (type->second.string == "Page") + return obj; + if (type->second.string != "Pages") + return {pdf_object::NIL}; + + // XXX technically speaking, this may be an indirect reference. The correct way to solve this + // seems to be having "pdf_updater" include a wrapper around "obj.dict.find" + auto kids = obj.dict.find("Kids"); + if (kids == obj.dict.end() || kids->second.type != pdf_object::ARRAY || + kids->second.array.empty() || + kids->second.array.at(0).type != pdf_object::REFERENCE) + return {pdf_object::NIL}; + + // XXX nothing prevents us from recursing in an evil circular graph + return pdf_get_first_page(pdf, kids->second.array.at(0).n, kids->second.array.at(0).generation); +} + +// ------------------------------------------------------------------------------------------------- + +static std::string pkcs12_path, pkcs12_pass; + +// /All/ bytes are checked, except for the signature hexstring itself +static std::string pdf_fill_in_signature(std::string& document, size_t sign_off, size_t sign_len) { + size_t tail_off = sign_off + sign_len, tail_len = document.size() - tail_off; + if (pkcs12_path.empty()) + return "undefined path to the signing key"; + + auto pkcs12_fp = fopen(pkcs12_path.c_str(), "r"); + if (!pkcs12_fp) + return pkcs12_path + ": " + strerror(errno); + + // Abandon hope, all ye who enter OpenSSL! Half of it is undocumented.
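+  // In overview: parse the PKCS#12 bundle, build a detached PKCS#7 signature
+  // covering every byte outside the reserved hexstring, and DER-encode the
+  // result into that reservation as hexadecimal.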
+ OpenSSL_add_all_algorithms(); + ERR_load_crypto_strings(); + ERR_clear_error(); + + PKCS12* p12 = nullptr; + EVP_PKEY* private_key = nullptr; + X509* certificate = nullptr; + STACK_OF(X509)* chain = nullptr; + PKCS7* p7 = nullptr; + int len = 0, sign_flags = PKCS7_DETACHED | PKCS7_BINARY | PKCS7_NOSMIMECAP | PKCS7_PARTIAL; + BIO* p7bio = nullptr; + unsigned char* buf = nullptr; + + // OpenSSL error reasons will usually be of more value than any distinction I can come up with + std::string err = "OpenSSL failure"; + + if (!(p12 = d2i_PKCS12_fp(pkcs12_fp, nullptr)) || + !PKCS12_parse(p12, pkcs12_pass.c_str(), &private_key, &certificate, &chain)) { + err = pkcs12_path + ": parse failure"; + goto error; + } + if (!private_key || !certificate) { + err = pkcs12_path + ": must contain a private key and a valid certificate chain"; + goto error; + } + // Prevent useless signatures -- makes pdfsig from poppler happy at least (and NSS by extension) + if (!(X509_get_key_usage(certificate) & (KU_DIGITAL_SIGNATURE | KU_NON_REPUDIATION))) { + err = "the certificate's key usage must include digital signatures or non-repudiation"; + goto error; + } + if (!(X509_get_extended_key_usage(certificate) & (XKU_SMIME | XKU_ANYEKU))) { + err = "the certificate's extended key usage must include S/MIME"; + goto error; + } +#if 0 // This happily ignores XKU_ANYEKU and I want my tiny world to make a tiny bit more sense + if (X509_check_purpose(certificate, X509_PURPOSE_SMIME_SIGN, false /* not a CA certificate */)) { + err = "the certificate can't be used for S/MIME digital signatures"; + goto error; + } +#endif + + // The default digest is SHA1, which is mildly insecure now -- hence using PKCS7_sign_add_signer + if (!(p7 = PKCS7_sign(nullptr, nullptr, nullptr, nullptr, sign_flags)) || + !PKCS7_sign_add_signer(p7, certificate, private_key, EVP_sha256(), sign_flags)) + goto error; + // For RFC 3161, this is roughly how a timestamp token would be attached (see Appendix A): + // PKCS7_add_attribute(signer_info, NID_id_smime_aa_timeStampToken, V_ASN1_SEQUENCE, value) + for (int i = 0; i < sk_X509_num(chain); i++) + if (!PKCS7_add_certificate(p7, sk_X509_value(chain, i))) + goto error; + + // Adaptation of the innards of the undocumented PKCS7_final() -- I didn't feel like making + // a copy of the whole document. Hopefully this writes directly into a digest BIO. + if (!(p7bio = PKCS7_dataInit(p7, nullptr)) || + (ssize_t) sign_off != BIO_write(p7bio, document.data(), sign_off) || + (ssize_t) tail_len != BIO_write(p7bio, document.data() + tail_off, tail_len) || + BIO_flush(p7bio) != 1 || !PKCS7_dataFinal(p7, p7bio)) + goto error; + +#if 0 + { + // Debugging: openssl cms -inform PEM -in pdf_signature.pem -noout -cmsout -print + // Context: https://stackoverflow.com/a/29253469 + auto fp = fopen("pdf_signature.pem", "wb"); + assert(PEM_write_PKCS7(fp, p7) && !fclose(fp)); + } +#endif + + if ((len = i2d_PKCS7(p7, &buf)) < 0) + goto error; + if (size_t(len) * 2 > sign_len - 2 /* hexstring quotes */) { + // The obvious solution is to increase the allocation... 
or spend a week reading specifications + // while losing all faith in humanity as a species, and skip the PKCS7 API entirely + err = ssprintf("not enough space reserved for the signature (%zu nibbles vs %zu nibbles)", + sign_len - 2, size_t(len) * 2); + goto error; + } + for (int i = 0; i < len; i++) { + document[sign_off + 2 * i + 1] = "0123456789abcdef"[buf[i] / 16]; + document[sign_off + 2 * i + 2] = "0123456789abcdef"[buf[i] % 16]; + } + err.clear(); + +error: + OPENSSL_free(buf); + BIO_free_all(p7bio); + PKCS7_free(p7); + sk_X509_pop_free(chain, X509_free); + X509_free(certificate); + EVP_PKEY_free(private_key); + PKCS12_free(p12); + + // In any case, clear the error stack (it's a queue, really) to avoid confusion elsewhere + while (auto code = ERR_get_error()) + if (auto reason = ERR_reason_error_string(code)) + err = err + "; " + reason; + + fclose(pkcs12_fp); + return err; +} + +// ------------------------------------------------------------------------------------------------- + +/// The presumption here is that the document is valid and that it doesn't employ cross-reference +/// streams from PDF 1.5, or at least constitutes a hybrid-reference file. The results with +/// PDF 2.0 (2017) are currently unknown as the standard costs money. +/// +/// https://www.adobe.com/devnet-docs/acrobatetk/tools/DigSig/Acrobat_DigitalSignatures_in_PDF.pdf +/// https://www.adobe.com/content/dam/acom/en/devnet/acrobat/pdfs/pdf_reference_1-7.pdf +/// https://www.adobe.com/content/dam/acom/en/devnet/acrobat/pdfs/PPKAppearances.pdf +static std::string pdf_sign(std::string& document, ushort reservation) { + pdf_updater pdf(document); + auto err = pdf.initialize(); + if (!err.empty()) + return err; + + auto root_ref = pdf.trailer.find("Root"); + if (root_ref == pdf.trailer.end() || root_ref->second.type != pdf_object::REFERENCE) + return "trailer does not contain a reference to Root"; + auto root = pdf.get(root_ref->second.n, root_ref->second.generation); + if (root.type != pdf_object::DICT) + return "invalid Root dictionary reference"; + + // 8.7 Digital Signatures - /signature dictionary/ + auto sigdict_n = pdf.allocate(); + size_t byterange_off = 0, byterange_len = 0, sign_off = 0, sign_len = 0; + pdf.update(sigdict_n, [&] { + // The timestamp is important for Adobe Acrobat Reader DC. The ideal would be to use RFC 3161. 
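+    // Illustratively, the dictionary appended below comes out roughly as
+    // (offsets and timestamp vary; the numbers follow the manual's example):
+    //   << /Type/Sig /Filter/Adobe.PPKLite /SubFilter/adbe.pkcs7.detached
+    //      /M(D:20200905194122+02'00') /ByteRange [0 6522 14716 527]
+    //      /Contents <3082...00> >>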
+ pdf.document.append("<< /Type/Sig /Filter/Adobe.PPKLite /SubFilter/adbe.pkcs7.detached\n" + " /M" + pdf_serialize(pdf_date(time(nullptr))) + " /ByteRange "); + byterange_off = pdf.document.size(); + pdf.document.append((byterange_len = 32 /* fine for a gigabyte */), ' '); + pdf.document.append("\n /Contents <"); + sign_off = pdf.document.size(); + pdf.document.append((sign_len = reservation * 2), '0'); + pdf.document.append("> >>"); + + // We actually need to exclude the hexstring quotes from signing + sign_off -= 1; + sign_len += 2; + }); + + // 8.6.3 Field Types - Signature Fields + pdf_object sigfield{pdf_object::DICT}; + sigfield.dict.insert({"FT", {pdf_object::NAME, "Sig"}}); + sigfield.dict.insert({"V", {pdf_object::REFERENCE, sigdict_n, 0}}); + // 8.4.5 Annotations Types - Widget Annotations + // We can merge the Signature Annotation and omit Kids here + sigfield.dict.insert({"Subtype", {pdf_object::NAME, "Widget"}}); + sigfield.dict.insert({"F", {pdf_object::NUMERIC, 2 /* Hidden */}}); + sigfield.dict.insert({"T", {pdf_object::STRING, "Signature1"}}); + sigfield.dict.insert({"Rect", {std::vector<pdf_object>{ + {pdf_object::NUMERIC, 0}, + {pdf_object::NUMERIC, 0}, + {pdf_object::NUMERIC, 0}, + {pdf_object::NUMERIC, 0}, + }}}); + + auto sigfield_n = pdf.allocate(); + pdf.update(sigfield_n, [&] { pdf.document += pdf_serialize(sigfield); }); + + auto pages_ref = root.dict.find("Pages"); + if (pages_ref == root.dict.end() || pages_ref->second.type != pdf_object::REFERENCE) + return "invalid Pages reference"; + auto page = pdf_get_first_page(pdf, pages_ref->second.n, pages_ref->second.generation); + if (page.type != pdf_object::DICT) + return "invalid or unsupported page tree"; + + auto& annots = page.dict["Annots"]; + if (annots.type != pdf_object::ARRAY) { + // TODO indirectly referenced arrays might not be that hard to support + if (annots.type != pdf_object::END) + return "unexpected Annots"; + + annots = {pdf_object::ARRAY}; + } + annots.array.emplace_back(pdf_object::REFERENCE, sigfield_n, 0); + pdf.update(page.n, [&] { pdf.document += pdf_serialize(page); }); + + // 8.6.1 Interactive Form Dictionary + if (root.dict.count("AcroForm")) + return "the document already contains forms, they would be overwritten"; + + root.dict["AcroForm"] = {std::map<std::string, pdf_object>{ + {"Fields", {std::vector<pdf_object>{ + {pdf_object::REFERENCE, sigfield_n, 0} + }}}, + {"SigFlags", {pdf_object::NUMERIC, 3 /* SignaturesExist | AppendOnly */}} + }}; + + // Upgrade the document version for SHA-256 etc. + if (pdf.version(root) < 16) + root.dict["Version"] = {pdf_object::NAME, "1.6"}; + + pdf.update(root_ref->second.n, [&] { pdf.document += pdf_serialize(root); }); + pdf.flush_updates(); + + // Now that we know the length of everything, store byte ranges of what we're about to sign, + // which must be everything but the resulting signature itself + size_t tail_off = sign_off + sign_len, tail_len = pdf.document.size() - tail_off; + auto ranges = ssprintf("[0 %zu %zu %zu]", sign_off, tail_off, tail_len); + if (ranges.length() > byterange_len) + return "not enough space reserved for /ByteRange"; + pdf.document.replace(byterange_off, std::min(ranges.length(), byterange_len), ranges); + return pdf_fill_in_signature(pdf.document, sign_off, sign_len); +} + +// ------------------------------------------------------------------------------------------------- + +__attribute__((format(printf, 2, 3))) +static void die(int status, const char* format, ...) 
{ + va_list ap; + va_start(ap, format); + if (isatty(fileno(stderr))) + vfprintf(stderr, ssprintf("\x1b[31m%s\x1b[0m\n", format).c_str(), ap); + else + vfprintf(stderr, ssprintf("%s\n", format).c_str(), ap); + va_end(ap); + exit(status); +} + +int main(int argc, char* argv[]) { + auto invocation_name = argv[0]; + auto usage = [=] { + die(1, "Usage: %s [-h] [-r RESERVATION] INPUT-FILENAME OUTPUT-FILENAME PKCS12-PATH PKCS12-PASS", + invocation_name); + }; + + static struct option opts[] = { + {"help", no_argument, 0, 'h'}, + {"version", no_argument, 0, 'V'}, + {"reservation", required_argument, 0, 'r'}, + {nullptr, 0, 0, 0}, + }; + + // Reserved space in bytes for the certificate, digest, encrypted digest, ... + long reservation = 4096; + while (1) { + int option_index = 0; + auto c = getopt_long(argc, const_cast<char* const*>(argv), "hVr:", opts, &option_index); + if (c == -1) + break; + + char* end = nullptr; + switch (c) { + case 'r': + errno = 0, reservation = strtol(optarg, &end, 10); + if (errno || *end || reservation <= 0 || reservation > USHRT_MAX) + die(1, "%s: must be a positive number", optarg); + break; + case 'V': + die(0, "%s", PROJECT_NAME " " PROJECT_VERSION); + break; + case 'h': + default: + usage(); + } + } + + argv += optind; + argc -= optind; + + if (argc != 4) + usage(); + + const char* input_path = argv[0]; + const char* output_path = argv[1]; + pkcs12_path = argv[2]; + pkcs12_pass = argv[3]; + + std::string pdf_document; + if (auto fp = fopen(input_path, "rb")) { + int c; + while ((c = fgetc(fp)) != EOF) + pdf_document += c; + if (ferror(fp)) + die(1, "%s: %s", input_path, strerror(errno)); + fclose(fp); + } else { + die(1, "%s: %s", input_path, strerror(errno)); + } + + auto err = pdf_sign(pdf_document, ushort(reservation)); + if (!err.empty()) { + die(2, "Error: %s", err.c_str()); + } + + if (auto fp = fopen(output_path, "wb")) { + auto written = fwrite(pdf_document.c_str(), pdf_document.size(), 1, fp); + if (fclose(fp) || written != 1) { + (void) unlink(output_path); + die(3, "%s: %s", output_path, strerror(errno)); + } + } else { + die(3, "%s: %s", output_path, strerror(errno)); + } + return 0; +}
diff --git a/pdf/pdf.go b/pdf/pdf.go new file mode 100644 index 0000000..1fcdaa4 --- /dev/null +++ b/pdf/pdf.go @@ -0,0 +1,1663 @@ +// +// Copyright (c) 2018 - 2024, Přemysl Eric Janouch <p@janouch.name> +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +// + +// Package pdf signs PDF documents and provides some processing utilities.
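+//
+// A minimal signing sketch using this package (file names are hypothetical,
+// error handling is elided for brevity):
+//
+//	doc, _ := os.ReadFile("input.pdf")
+//	p12, _ := os.ReadFile("key-pair.p12")
+//	key, certs, _ := PKCS12Parse(p12, "")
+//	signed, _ := Sign(doc, key, certs, 4096)
+//	_ = os.WriteFile("output.pdf", signed, 0666)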
+package pdf + +import ( + "bytes" + "compress/zlib" + "encoding/binary" + "encoding/hex" + "errors" + "fmt" + "math" + "regexp" + "sort" + "strconv" + "strings" + "time" + + "crypto" + "crypto/ecdsa" + "crypto/rsa" + "crypto/x509" + + "go.mozilla.org/pkcs7" + "golang.org/x/crypto/pkcs12" +) + +type ObjectKind int + +const ( + End ObjectKind = iota + NL + Comment + Nil + Bool + Numeric + Keyword + Name + String + + // simple tokens + BArray + EArray + BDict + EDict + + // higher-level objects + Array + Dict + Stream + Indirect + Reference +) + +// Object is a PDF token/object thingy. Objects may be composed either from +// one or a sequence of tokens. The PDF Reference doesn't actually speak +// of tokens, though ISO 32000-1:2008 does. +type Object struct { + Kind ObjectKind + + String string // Comment/Keyword/Name/String + Number float64 // Bool, Numeric + Array []Object // Array, Indirect + Dict map[string]Object // Dict, Stream + Stream []byte // Stream + N, Generation uint // Indirect, Reference +} + +// IsInteger checks if the PDF object is an integer number. +func (o *Object) IsInteger() bool { + _, f := math.Modf(o.Number) + return o.Kind == Numeric && f == 0 +} + +// IsUint checks if the PDF object is an integer number that fits into a uint. +func (o *Object) IsUint() bool { + return o.IsInteger() && o.Number >= 0 && o.Number <= float64(^uint(0)) +} + +// A slew of constructors that will hopefully get all inlined. + +// New returns a new Object of the given kind, with default values. +func New(kind ObjectKind) Object { return Object{Kind: kind} } + +func NewComment(c string) Object { return Object{Kind: Comment, String: c} } +func NewKeyword(k string) Object { return Object{Kind: Keyword, String: k} } + +func NewBool(b bool) Object { + var b64 float64 + if b { + b64 = 1 + } + return Object{Kind: Bool, Number: b64} +} + +func NewNumeric(n float64) Object { return Object{Kind: Numeric, Number: n} } +func NewName(n string) Object { return Object{Kind: Name, String: n} } +func NewString(s string) Object { return Object{Kind: String, String: s} } + +func NewArray(a []Object) Object { + return Object{Kind: Array, Array: a} +} + +func NewDict(d map[string]Object) Object { + if d == nil { + d = make(map[string]Object) + } + return Object{Kind: Dict, Dict: d} +} + +func NewStream(d map[string]Object, s []byte) Object { + if d == nil { + d = make(map[string]Object) + } + return Object{Kind: Stream, Dict: d, Stream: s} +} + +func NewIndirect(o Object, n, generation uint) Object { + return Object{Kind: Indirect, N: n, Generation: generation, + Array: []Object{o}} +} + +func NewReference(n, generation uint) Object { + return Object{Kind: Reference, N: n, Generation: generation} +} + +func newError(msg string) (Object, error) { return New(End), errors.New(msg) } + +// ----------------------------------------------------------------------------- + +const ( + octAlphabet = "01234567" + decAlphabet = "0123456789" + hexAlphabet = "0123456789abcdefABCDEF" + whitespace = "\t\n\f\r " + delimiters = "()<>[]{}/%" +) + +// Lexer is a basic lexical analyser for the Portable Document Format, +// giving limited error information. 
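+//
+// For instance, a Lexer over []byte("1 0 R") yields a Numeric 1, a Numeric 0,
+// a Keyword "R", and finally an End Object once the buffer runs out.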
+type Lexer struct { + P []byte // input buffer +} + +func (lex *Lexer) read() (byte, bool) { + if len(lex.P) > 0 { + ch := lex.P[0] + lex.P = lex.P[1:] + return ch, true + } + return 0, false +} + +func (lex *Lexer) peek() (byte, bool) { + if len(lex.P) > 0 { + return lex.P[0], true + } + return 0, false +} + +func (lex *Lexer) eatNewline(ch byte) bool { + if ch == '\r' { + if ch, _ := lex.peek(); ch == '\n' { + lex.read() + } + return true + } + return ch == '\n' +} + +func (lex *Lexer) unescape(ch byte) byte { + switch ch { + case 'n': + return '\n' + case 'r': + return '\r' + case 't': + return '\t' + case 'b': + return '\b' + case 'f': + return '\f' + } + if strings.IndexByte(octAlphabet, ch) >= 0 { + octal := []byte{ch} + if ch, _ := lex.peek(); strings.IndexByte(octAlphabet, ch) >= 0 { + octal = append(octal, ch) + lex.read() + } + if ch, _ := lex.peek(); strings.IndexByte(octAlphabet, ch) >= 0 { + octal = append(octal, ch) + lex.read() + } + u, _ := strconv.ParseUint(string(octal), 8, 8) + return byte(u) + } + return ch +} + +func (lex *Lexer) string() (Object, error) { + var value []byte + parens := 1 + for { + ch, ok := lex.read() + if !ok { + return newError("unexpected end of string") + } + if lex.eatNewline(ch) { + ch = '\n' + } else if ch == '(' { + parens++ + } else if ch == ')' { + if parens--; parens == 0 { + break + } + } else if ch == '\\' { + if ch, ok = lex.read(); !ok { + return newError("unexpected end of string") + } else if lex.eatNewline(ch) { + continue + } else { + ch = lex.unescape(ch) + } + } + value = append(value, ch) + } + return NewString(string(value)), nil +} + +func (lex *Lexer) stringHex() (Object, error) { + var value, buf []byte + for { + ch, ok := lex.read() + if !ok { + return newError("unexpected end of hex string") + } else if ch == '>' { + break + } else if strings.IndexByte(hexAlphabet, ch) < 0 { + return newError("invalid hex string") + } else if buf = append(buf, ch); len(buf) == 2 { + u, _ := strconv.ParseUint(string(buf), 16, 8) + value = append(value, byte(u)) + buf = nil + } + } + if len(buf) > 0 { + u, _ := strconv.ParseUint(string(buf)+"0", 16, 8) + value = append(value, byte(u)) + } + return NewString(string(value)), nil +} + +func (lex *Lexer) name() (Object, error) { + var value []byte + for { + ch, ok := lex.peek() + if !ok || strings.IndexByte(whitespace+delimiters, ch) >= 0 { + break + } + lex.read() + if ch == '#' { + var hexa []byte + if ch, _ := lex.peek(); strings.IndexByte(hexAlphabet, ch) >= 0 { + hexa = append(hexa, ch) + lex.read() + } + if ch, _ := lex.peek(); strings.IndexByte(hexAlphabet, ch) >= 0 { + hexa = append(hexa, ch) + lex.read() + } + if len(hexa) != 2 { + return newError("invalid name hexa escape") + } + u, _ := strconv.ParseUint(string(hexa), 16, 8) + ch = byte(u) + } + value = append(value, ch) + } + if len(value) == 0 { + return newError("unexpected end of name") + } + return NewName(string(value)), nil +} + +func (lex *Lexer) comment() (Object, error) { + var value []byte + for { + ch, ok := lex.peek() + if !ok || ch == '\r' || ch == '\n' { + break + } + value = append(value, ch) + lex.read() + } + return NewComment(string(value)), nil +} + +// XXX: Maybe invalid numbers should rather be interpreted as keywords.
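+// (For example, a lone "-" currently fails with "invalid number" rather than
+// coming out as a Keyword.)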
+func (lex *Lexer) number() (Object, error) {
+	var value []byte
+	ch, ok := lex.peek()
+	if ch == '-' {
+		value = append(value, ch)
+		lex.read()
+	}
+	real, digits := false, false
+	for {
+		ch, ok = lex.peek()
+		if !ok {
+			break
+		} else if strings.IndexByte(decAlphabet, ch) >= 0 {
+			digits = true
+		} else if ch == '.' && !real {
+			real = true
+		} else {
+			break
+		}
+		value = append(value, ch)
+		lex.read()
+	}
+	if !digits {
+		return newError("invalid number")
+	}
+	f, _ := strconv.ParseFloat(string(value), 64)
+	return NewNumeric(f), nil
+}
+
+func (lex *Lexer) Next() (Object, error) {
+	ch, ok := lex.peek()
+	if !ok {
+		return New(End), nil
+	}
+	if strings.IndexByte("-0123456789.", ch) >= 0 {
+		return lex.number()
+	}
+
+	// {} end up being keywords; we might want to error out on those.
+	var value []byte
+	for {
+		ch, ok := lex.peek()
+		if !ok || strings.IndexByte(whitespace+delimiters, ch) >= 0 {
+			break
+		}
+		value = append(value, ch)
+		lex.read()
+	}
+	switch v := string(value); v {
+	case "":
+	case "null":
+		return New(Nil), nil
+	case "true":
+		return NewBool(true), nil
+	case "false":
+		return NewBool(false), nil
+	default:
+		return NewKeyword(v), nil
+	}
+
+	switch ch, _ := lex.read(); ch {
+	case '/':
+		return lex.name()
+	case '%':
+		return lex.comment()
+	case '(':
+		return lex.string()
+	case '[':
+		return New(BArray), nil
+	case ']':
+		return New(EArray), nil
+	case '<':
+		if ch, _ := lex.peek(); ch == '<' {
+			lex.read()
+			return New(BDict), nil
+		}
+		return lex.stringHex()
+	case '>':
+		if ch, _ := lex.peek(); ch == '>' {
+			lex.read()
+			return New(EDict), nil
+		}
+		return newError("unexpected '>'")
+	default:
+		if lex.eatNewline(ch) {
+			return New(NL), nil
+		}
+		if strings.IndexByte(whitespace, ch) >= 0 {
+			return lex.Next()
+		}
+		return newError("unexpected input")
+	}
+}
+
+// -----------------------------------------------------------------------------
+
+// FIXME: Lines /should not/ be longer than 255 characters;
+// some wrapping is in order.
+func (o *Object) Serialize() string {
+	switch o.Kind {
+	case NL:
+		return "\n"
+	case Nil:
+		return "null"
+	case Bool:
+		if o.Number != 0 {
+			return "true"
+		}
+		return "false"
+	case Numeric:
+		return strconv.FormatFloat(o.Number, 'f', -1, 64)
+	case Keyword:
+		return o.String
+	case Name:
+		escaped := []byte{'/'}
+		for _, ch := range []byte(o.String) {
+			if ch == '#' || strings.IndexByte(delimiters+whitespace, ch) >= 0 {
+				escaped = append(escaped, fmt.Sprintf("#%02x", ch)...)
+			} else {
+				escaped = append(escaped, ch)
+			}
+		}
+		return string(escaped)
+	case String:
+		escaped := []byte{'('}
+		for _, ch := range []byte(o.String) {
+			if ch == '\\' || ch == '(' || ch == ')' {
+				escaped = append(escaped, '\\')
+			}
+			escaped = append(escaped, ch)
+		}
+		return string(append(escaped, ')'))
+	case BArray:
+		return "["
+	case EArray:
+		return "]"
+	case BDict:
+		return "<<"
+	case EDict:
+		return ">>"
+	case Array:
+		var v []string
+		for _, i := range o.Array {
+			v = append(v, i.Serialize())
+		}
+		return "[ " + strings.Join(v, " ") + " ]"
+	case Dict:
+		b := bytes.NewBuffer(nil)
+		var keys []string
+		for k := range o.Dict {
+			keys = append(keys, k)
+		}
+		sort.Strings(keys)
+		for _, k := range keys {
+			v := o.Dict[k]
+			// FIXME: The key is also supposed to be escaped by Serialize.
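+			// For instance, a {"Size": 1} dictionary serializes to
+			// "<< /Size 1 >>"; the keys are sorted above to make the
+			// output deterministic.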
+ fmt.Fprint(b, " /", k, " ", v.Serialize()) + } + return "<<" + b.String() + " >>" + case Stream: + d := NewDict(o.Dict) + d.Dict["Length"] = NewNumeric(float64(len(o.Stream))) + return d.Serialize() + "\nstream\n" + string(o.Stream) + "\nendstream" + case Indirect: + return fmt.Sprintf("%d %d obj\n%s\nendobj", o.N, o.Generation, + o.Array[0].Serialize()) + case Reference: + return fmt.Sprintf("%d %d R", o.N, o.Generation) + default: + panic("unsupported token for serialization") + } +} + +// ----------------------------------------------------------------------------- + +type ref struct { + offset int64 // file offset, or N of the next free entry, or index + generation uint // object generation + compressed *uint // PDF 1.5: N of the containing compressed object + nonfree bool // whether this N is taken (for a good zero value) +} + +// Updater is a utility class to help read and possibly incrementally update +// PDF files. +type Updater struct { + // cross-reference table + xref []ref + + // current cross-reference table size, correlated to len(xref) + xrefSize uint + + // list of updated objects + // TODO(p): A map to bool makes this simpler to work with. + // The same with another map to struct{} somewhere in this code. + updated map[uint]struct{} + + // PDF document data + Document []byte + + // the new trailer dictionary to be written, initialized with the old one + Trailer map[string]Object +} + +// ListIndirect returns the whole cross-reference table as Reference Objects. +func (u *Updater) ListIndirect() []Object { + result := []Object{} + for i := 0; i < len(u.xref); i++ { + if u.xref[i].nonfree { + result = append(result, NewReference(uint(i), u.xref[i].generation)) + } + } + return result +} + +func (u *Updater) parseStream(lex *Lexer, stack *[]Object) (Object, error) { + lenStack := len(*stack) + if lenStack < 1 { + return newError("missing stream dictionary") + } + dict := (*stack)[lenStack-1] + if dict.Kind != Dict { + return newError("stream not preceded by a dictionary") + } + + *stack = (*stack)[:lenStack-1] + length, ok := dict.Dict["Length"] + if !ok { + return newError("missing stream Length") + } + length, err := u.Dereference(length) + if err != nil { + return length, err + } + if !length.IsUint() || length.Number > math.MaxInt { + return newError("stream Length not an unsigned integer") + } + + // Expect exactly one newline. + if nl, err := lex.Next(); err != nil { + return nl, err + } else if nl.Kind != NL { + return newError("stream does not start with a newline") + } + + size := int(length.Number) + if len(lex.P) < size { + return newError("stream is longer than the document") + } + + dict.Kind = Stream + dict.Stream = lex.P[:size] + lex.P = lex.P[size:] + + // Skip any number of trailing newlines or comments. 
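+	// (u.parse consumes NL and Comment tokens on its own, so the next
+	// object it returns should be the "endstream" keyword.)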
+ if end, err := u.parse(lex, stack); err != nil { + return end, err + } else if end.Kind != Keyword || end.String != "endstream" { + return newError("improperly terminated stream") + } + return dict, nil +} + +func (u *Updater) parseIndirect(lex *Lexer, stack *[]Object) (Object, error) { + lenStack := len(*stack) + if lenStack < 2 { + return newError("missing object ID pair") + } + + n := (*stack)[lenStack-2] + g := (*stack)[lenStack-1] + *stack = (*stack)[:lenStack-2] + + if !g.IsUint() || !n.IsUint() { + return newError("invalid object ID pair") + } + + var inner []Object + for { + object, _ := u.parse(lex, &inner) + if object.Kind == End { + return newError("object doesn't end") + } + if object.Kind == Keyword && object.String == "endobj" { + break + } + inner = append(inner, object) + } + if len(inner) != 1 { + return newError("indirect objects must contain exactly one object") + } + return NewIndirect(inner[0], uint(n.Number), uint(g.Number)), nil +} + +func (u *Updater) parseR(stack *[]Object) (Object, error) { + lenStack := len(*stack) + if lenStack < 2 { + return newError("missing reference ID pair") + } + + n := (*stack)[lenStack-2] + g := (*stack)[lenStack-1] + *stack = (*stack)[:lenStack-2] + + if !g.IsUint() || !n.IsUint() { + return newError("invalid reference ID pair") + } + return NewReference(uint(n.Number), uint(g.Number)), nil +} + +// parse reads an object at the lexer's position. Not a strict parser. +// +// TODO(p): We should fix all uses of this not to eat the error. +func (u *Updater) parse(lex *Lexer, stack *[]Object) (Object, error) { + switch token, err := lex.Next(); token.Kind { + case NL, Comment: + // These are not important to parsing, + // not even for this procedure's needs. + return u.parse(lex, stack) + case BArray: + var array []Object + for { + object, _ := u.parse(lex, &array) + if object.Kind == End { + return newError("array doesn't end") + } + if object.Kind == EArray { + break + } + array = append(array, object) + } + return NewArray(array), nil + case BDict: + var array []Object + for { + object, _ := u.parse(lex, &array) + if object.Kind == End { + return newError("dictionary doesn't end") + } + if object.Kind == EDict { + break + } + array = append(array, object) + } + if len(array)%2 != 0 { + return newError("unbalanced dictionary") + } + dict := make(map[string]Object) + for i := 0; i < len(array); i += 2 { + if array[i].Kind != Name { + return newError("invalid dictionary key type") + } + dict[array[i].String] = array[i+1] + } + return NewDict(dict), nil + case Keyword: + switch token.String { + case "stream": + // Appears in the document body, + // typically needs to access the cross-reference table. + return u.parseStream(lex, stack) + case "obj": + return u.parseIndirect(lex, stack) + case "R": + return u.parseR(stack) + } + fallthrough + default: + return token, err + } +} + +func (u *Updater) loadXrefEntry( + n uint, r ref, loadedEntries map[uint]struct{}) { + if _, ok := loadedEntries[n]; ok { + return + } + if lenXref := uint(len(u.xref)); n >= lenXref { + u.xref = append(u.xref, make([]ref, n-lenXref+1)...) 
+ } + loadedEntries[n] = struct{}{} + + u.xref[n] = r +} + +func (u *Updater) loadXrefStream( + lex *Lexer, stack []Object, loadedEntries map[uint]struct{}) ( + Object, error) { + var object Object + for { + var err error + if object, err = u.parse(lex, &stack); err != nil { + return New(End), fmt.Errorf("invalid xref table: %s", err) + } else if object.Kind == End { + return newError("invalid xref table") + } + + // For the sake of simplicity, keep stacking until we find an object. + if object.Kind == Indirect { + break + } + + stack = append(stack, object) + } + + // ISO 32000-2:2020 7.5.8.2 Cross-reference stream dictionary + stream := object.Array[0] + if stream.Kind != Stream { + return newError("invalid xref table") + } + if typ, ok := stream.Dict["Type"]; !ok || + typ.Kind != Name || typ.String != "XRef" { + return newError("invalid xref stream") + } + + data, err := u.GetStreamData(stream) + if err != nil { + return New(End), fmt.Errorf("invalid xref stream: %s", err) + } + + size, ok := stream.Dict["Size"] + if !ok || !size.IsUint() || size.Number <= 0 { + return newError("invalid or missing cross-reference stream Size") + } + + type pair struct{ start, count uint } + pairs := []pair{} + if index, ok := stream.Dict["Index"]; !ok { + pairs = append(pairs, pair{0, uint(size.Number)}) + } else { + if index.Kind != Array || len(index.Array)%2 != 0 { + return newError("invalid cross-reference stream Index") + } + + a := index.Array + for i := 0; i < len(a); i += 2 { + if !a[i].IsUint() || !a[i+1].IsUint() { + return newError("invalid cross-reference stream Index") + } + pairs = append(pairs, pair{uint(a[i].Number), uint(a[i+1].Number)}) + } + } + + w, ok := stream.Dict["W"] + if !ok || w.Kind != Array || len(w.Array) != 3 || + !w.Array[0].IsUint() || !w.Array[1].IsUint() || !w.Array[2].IsUint() { + return newError("invalid or missing cross-reference stream W") + } + + w1 := uint(w.Array[0].Number) + w2 := uint(w.Array[1].Number) + w3 := uint(w.Array[2].Number) + if w2 == 0 { + return newError("invalid cross-reference stream W") + } + + unit := w1 + w2 + w3 + if uint(len(data))%unit != 0 { + return newError("invalid cross-reference stream length") + } + + readField := func(data []byte, width uint) (uint, []byte) { + var n uint + for ; width != 0; width-- { + n = n<<8 | uint(data[0]) + data = data[1:] + } + return n, data + } + + // ISO 32000-2:2020 7.5.8.3 Cross-reference stream data + for _, pair := range pairs { + for i := uint(0); i < pair.count; i++ { + if uint(len(data)) < unit { + return newError("premature cross-reference stream EOF") + } + + var f1, f2, f3 uint = 1, 0, 0 + if w1 > 0 { + f1, data = readField(data, w1) + } + f2, data = readField(data, w2) + if w3 > 0 { + f3, data = readField(data, w3) + } + + var r ref + switch f1 { + case 0: + r.offset = int64(f2) + r.generation = f3 + case 1: + r.offset = int64(f2) + r.generation = f3 + r.nonfree = true + case 2: + r.offset = int64(f3) + r.compressed = &f2 + r.nonfree = true + default: + // TODO(p): It should be treated as a reference to + // the null object. We can't currently represent that. 
+ return newError("unsupported cross-reference stream contents") + } + + u.loadXrefEntry(pair.start+i, r, loadedEntries) + } + } + + stream.Kind = Dict + stream.Stream = nil + return stream, nil +} + +func (u *Updater) loadXref(lex *Lexer, loadedEntries map[uint]struct{}) ( + Object, error) { + var throwawayStack []Object + if object, _ := u.parse(lex, + &throwawayStack); object.Kind != Keyword || object.String != "xref" { + return u.loadXrefStream(lex, []Object{object}, loadedEntries) + } + for { + object, _ := u.parse(lex, &throwawayStack) + if object.Kind == End { + return newError("unexpected EOF while looking for the trailer") + } + if object.Kind == Keyword && object.String == "trailer" { + break + } + + second, _ := u.parse(lex, &throwawayStack) + if !object.IsUint() || !second.IsUint() { + return newError("invalid xref section header") + } + + start, count := uint(object.Number), uint(second.Number) + for i := uint(0); i < count; i++ { + off, _ := u.parse(lex, &throwawayStack) + gen, _ := u.parse(lex, &throwawayStack) + key, _ := u.parse(lex, &throwawayStack) + if !off.IsInteger() || off.Number < 0 || + off.Number > float64(len(u.Document)) || + !gen.IsInteger() || gen.Number < 0 || gen.Number > 65535 || + key.Kind != Keyword { + return newError("invalid xref entry") + } + + free := true + if key.String == "n" { + free = false + } else if key.String != "f" { + return newError("invalid xref entry") + } + + u.loadXrefEntry(start+i, ref{ + offset: int64(off.Number), + generation: uint(gen.Number), + nonfree: !free, + }, loadedEntries) + } + } + + trailer, _ := u.parse(lex, &throwawayStack) + if trailer.Kind != Dict { + return newError("invalid trailer dictionary") + } + return trailer, nil +} + +// ----------------------------------------------------------------------------- + +var trailerRE = regexp.MustCompile(`(?s:.*)\sstartxref\s+(\d+)\s+%%EOF`) + +// NewUpdater initializes an Updater, building the cross-reference table and +// preparing a new trailer dictionary. +func NewUpdater(document []byte) (*Updater, error) { + u := &Updater{Document: document} + u.updated = make(map[uint]struct{}) + + // We only need to look for startxref roughly within + // the last kibibyte of the document. + haystack := u.Document + if len(haystack) > 1024 { + haystack = haystack[len(haystack)-1024:] + } + + m := trailerRE.FindSubmatch(haystack) + if m == nil { + return nil, errors.New("cannot find startxref") + } + + xrefOffset, _ := strconv.ParseInt(string(m[1]), 10, 64) + lastXrefOffset := xrefOffset + loadedXrefs := make(map[int64]struct{}) + loadedEntries := make(map[uint]struct{}) + + for { + if _, ok := loadedXrefs[xrefOffset]; ok { + return nil, errors.New("circular xref offsets") + } + if xrefOffset >= int64(len(u.Document)) { + return nil, errors.New("invalid xref offset") + } + + lex := Lexer{u.Document[xrefOffset:]} + trailer, err := u.loadXref(&lex, loadedEntries) + if err != nil { + return nil, err + } + + if len(loadedXrefs) == 0 { + u.Trailer = trailer.Dict + } + loadedXrefs[xrefOffset] = struct{}{} + + // TODO(p): Descend into XRefStm here first, if present, + // which is also a linked list. + + // We allow for mixed cross-reference tables and streams + // within a single Prev list, although this should never occur. + prevOffset, ok := trailer.Dict["Prev"] + if !ok { + break + } + // FIXME: Do not read offsets and sizes as floating point numbers. 
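+		// (A float64 mantissa has 53 bits, so offsets remain exact up to
+		// roughly 9 petabytes; still, they are conceptually integers.)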
+ if !prevOffset.IsInteger() { + return nil, errors.New("invalid Prev offset") + } + xrefOffset = int64(prevOffset.Number) + } + + u.Trailer["Prev"] = NewNumeric(float64(lastXrefOffset)) + + lastSize, ok := u.Trailer["Size"] + if !ok || !lastSize.IsInteger() || lastSize.Number <= 0 { + return nil, errors.New("invalid or missing cross-reference table Size") + } + u.xrefSize = uint(lastSize.Number) + return u, nil +} + +var versionRE = regexp.MustCompile( + `(?:^|[\r\n])%(?:!PS-Adobe-\d\.\d )?PDF-(\d)\.(\d)[\r\n]`) + +// Version extracts the claimed PDF version as a positive decimal number, +// e.g. 17 for PDF 1.7. Returns zero on failure. +func (u *Updater) Version(root *Object) int { + if version, ok := root.Dict["Version"]; ok && version.Kind == Name { + if v := version.String; len(v) == 3 && v[1] == '.' && + v[0] >= '0' && v[0] <= '9' && v[2] >= '0' && v[2] <= '9' { + return int(v[0]-'0')*10 + int(v[2]-'0') + } + } + + // We only need to look for the comment roughly within + // the first kibibyte of the document. + haystack := u.Document + if len(haystack) > 1024 { + haystack = haystack[:1024] + } + if m := versionRE.FindSubmatch(haystack); m != nil { + return int(m[1][0]-'0')*10 + int(m[2][0]-'0') + } + return 0 +} + +func (u *Updater) getFromObjStm(nObjStm, n uint) (Object, error) { + if nObjStm == n { + return newError("ObjStm recursion") + } + + stream, err := u.Get(nObjStm, 0) + if err != nil { + return stream, err + } + if stream.Kind != Stream { + return newError("invalid ObjStm") + } + if typ, ok := stream.Dict["Type"]; !ok || + typ.Kind != Name || typ.String != "ObjStm" { + return newError("invalid ObjStm") + } + + data, err := u.GetStreamData(stream) + if err != nil { + return New(End), fmt.Errorf("invalid ObjStm: %s", err) + } + entryN, ok := stream.Dict["N"] + if !ok || !entryN.IsUint() || entryN.Number <= 0 { + return newError("invalid ObjStm N") + } + entryFirst, ok := stream.Dict["First"] + if !ok || !entryFirst.IsUint() || entryFirst.Number <= 0 { + return newError("invalid ObjStm First") + } + + // NOTE: This means descending into that stream if n is not found here. + // It is meant to be an object reference. 
+	if extends, ok := stream.Dict["Extends"]; ok && extends.Kind != Nil {
+		return newError("ObjStm extensions are unsupported")
+	}
+
+	count := uint(entryN.Number)
+	first := uint(entryFirst.Number)
+	if first > uint(len(data)) {
+		return newError("invalid ObjStm First")
+	}
+
+	lex1 := Lexer{data[:first]}
+	data = data[first:]
+
+	type pair struct{ n, offset uint }
+	pairs := []pair{}
+	for i := uint(0); i < count; i++ {
+		var throwawayStack []Object
+		objN, _ := u.parse(&lex1, &throwawayStack)
+		objOffset, _ := u.parse(&lex1, &throwawayStack)
+		if !objN.IsUint() || !objOffset.IsUint() {
+			return newError("invalid ObjStm pairs")
+		}
+		pairs = append(pairs, pair{uint(objN.Number), uint(objOffset.Number)})
+	}
+	for i, pair := range pairs {
+		if pair.offset > uint(len(data)) ||
+			i > 0 && pairs[i-1].offset >= pair.offset {
+			return newError("invalid ObjStm pairs")
+		}
+	}
+
+	for i, pair := range pairs {
+		if pair.n != n {
+			continue
+		}
+
+		if i+1 < len(pairs) {
+			data = data[pair.offset:pairs[i+1].offset]
+		} else {
+			data = data[pair.offset:]
+		}
+
+		lex2 := Lexer{data}
+		var stack []Object
+		for {
+			object, err := u.parse(&lex2, &stack)
+			if err != nil {
+				return object, err
+			} else if object.Kind == End {
+				break
+			} else {
+				stack = append(stack, object)
+			}
+		}
+		if len(stack) == 0 {
+			return newError("empty ObjStm object")
+		}
+		return stack[0], nil
+	}
+	return newError("object not found in ObjStm")
+}
+
+// Get retrieves an object by its number and generation--may return
+// Nil or End with an error.
+func (u *Updater) Get(n, generation uint) (Object, error) {
+	if n >= u.xrefSize {
+		return New(Nil), nil
+	}
+
+	ref := u.xref[n]
+	if !ref.nonfree || ref.generation != generation {
+		return New(Nil), nil
+	}
+
+	if ref.compressed != nil {
+		return u.getFromObjStm(*ref.compressed, n)
+	} else if ref.offset >= int64(len(u.Document)) {
+		return New(Nil), nil
+	}
+
+	lex := Lexer{u.Document[ref.offset:]}
+	var stack []Object
+	for {
+		object, err := u.parse(&lex, &stack)
+		if object.Kind == End {
+			return object, err
+		}
+		if object.Kind != Indirect {
+			stack = append(stack, object)
+		} else if object.N != n || object.Generation != generation {
+			return newError("object mismatch")
+		} else {
+			return object.Array[0], nil
+		}
+	}
+}
+
+// Dereference dereferences Reference objects, and passes the other kinds through.
+func (u *Updater) Dereference(o Object) (Object, error) {
+	if o.Kind != Reference {
+		return o, nil
+	}
+	return u.Get(o.N, o.Generation)
+}
+
+// Allocate allocates a new object number.
+func (u *Updater) Allocate() uint {
+	n := u.xrefSize
+	u.xrefSize++
+
+	if u.xrefSize == 0 {
+		panic("overflow")
+	} else if lenXref := uint(len(u.xref)); lenXref < u.xrefSize {
+		u.xref = append(u.xref, make([]ref, u.xrefSize-lenXref)...)
+	}
+
+	// We don't make sure it gets a subsection in the update yet because we
+	// make no attempts at fixing the linked list of free items either.
+	return n
+}
+
+// BytesWriter is an interface over a subset of bytes.Buffer methods.
+type BytesWriter interface {
+	Bytes() []byte
+	Len() int
+	Write(p []byte) (n int, err error)
+	WriteByte(c byte) error
+	WriteRune(r rune) (n int, err error)
+	WriteString(s string) (n int, err error)
+}
+
+// Update appends an updated object to the end of the document.
+// The fill callback must write exactly one PDF object.
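+//
+// A hypothetical use, overwriting an allocated object with a new dictionary:
+//
+//	u.Update(n, func(buf BytesWriter) {
+//		buf.WriteString(NewDict(map[string]Object{
+//			"Type": NewName("Catalog"),
+//		}).Serialize())
+//	})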
+func (u *Updater) Update(n uint, fill func(buf BytesWriter)) { + oldRef := u.xref[n] + u.updated[n] = struct{}{} + u.xref[n] = ref{ + offset: int64(len(u.Document) + 1), + generation: oldRef.generation, + nonfree: true, + } + + buf := bytes.NewBuffer(u.Document) + fmt.Fprintf(buf, "\n%d %d obj\n", n, oldRef.generation) + + // Separately so that the callback can use w.Len() to get current offset. + fill(buf) + + buf.WriteString("\nendobj") + u.Document = buf.Bytes() +} + +func (u *Updater) flushXRefStm(updated []uint, buf *bytes.Buffer) { + // The cross-reference stream has to point to itself. + // XXX: We only duplicate Update code here due to how we currently buffer. + n := u.Allocate() + updated = append(updated, n) + + u.updated[n] = struct{}{} + u.xref[n] = ref{ + offset: int64(buf.Len() + 1), + generation: 0, + nonfree: true, + } + + index, b := []Object{}, []byte{} + write := func(f1 byte, f2, f3 uint64) { + b = append(b, f1) + b = binary.BigEndian.AppendUint64(b, f2) + b = binary.BigEndian.AppendUint64(b, f3) + } + for i := 0; i < len(updated); { + start, stop := updated[i], updated[i]+1 + for i++; i < len(updated) && updated[i] == stop; i++ { + stop++ + } + + index = append(index, + NewNumeric(float64(start)), NewNumeric(float64(stop-start))) + for ; start < stop; start++ { + ref := u.xref[start] + if ref.compressed != nil { + write(2, uint64(*ref.compressed), uint64(ref.offset)) + } else if ref.nonfree { + write(1, uint64(ref.offset), uint64(ref.generation)) + } else { + write(0, uint64(ref.offset), uint64(ref.generation)) + } + } + } + + u.Trailer["Size"] = NewNumeric(float64(u.xrefSize)) + u.Trailer["Index"] = NewArray(index) + u.Trailer["W"] = NewArray([]Object{ + NewNumeric(1), NewNumeric(8), NewNumeric(8), + }) + + for _, key := range []string{ + "Filter", "DecodeParms", "F", "FFilter", "FDecodeParms", "DL"} { + delete(u.Trailer, key) + } + + stream := NewStream(u.Trailer, b) + fmt.Fprintf(buf, "\n%d 0 obj\n%s\nendobj", n, stream.Serialize()) +} + +func (u *Updater) flushXRefTable(updated []uint, buf *bytes.Buffer) { + buf.WriteString("\nxref\n") + for i := 0; i < len(updated); { + start, stop := updated[i], updated[i]+1 + for i++; i < len(updated) && updated[i] == stop; i++ { + stop++ + } + + fmt.Fprintf(buf, "%d %d\n", start, stop-start) + for ; start < stop; start++ { + // XXX: We should warn about any object streams here. + ref := u.xref[start] + if ref.nonfree && ref.compressed == nil { + fmt.Fprintf(buf, "%010d %05d n \n", ref.offset, ref.generation) + } else { + fmt.Fprintf(buf, "%010d %05d f \n", ref.offset, ref.generation) + } + } + } + + // Taking literally "Each cross-reference section begins with a line + // containing the keyword xref. Following this line are one or more + // cross-reference subsections." from 3.4.3 in PDF Reference. + if len(updated) == 0 { + fmt.Fprintf(buf, "%d %d\n", 0, 0) + } + + u.Trailer["Size"] = NewNumeric(float64(u.xrefSize)) + trailer := NewDict(u.Trailer) + fmt.Fprintf(buf, "trailer\n%s", trailer.Serialize()) +} + +// FlushUpdates writes an updated cross-reference table and trailer, or stream. +func (u *Updater) FlushUpdates() { + updated := make([]uint, 0, len(u.updated)) + for n := range u.updated { + updated = append(updated, n) + } + sort.Slice(updated, func(i, j int) bool { + return updated[i] < updated[j] + }) + + // It does not seem to be possible to upgrade a PDF file + // from trailer dictionaries to cross-reference streams, + // so keep continuity either way. 
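+	// Which of the two gets written is decided below by the Type entry
+	// that a cross-reference stream leaves in the trailer dictionary.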
+	//
+	// (Downgrading from cross-reference streams using XRefStm would not
+	// create a true hybrid-reference file, although it should work.)
+	buf := bytes.NewBuffer(u.Document)
+	startXref := buf.Len() + 1 /* '\n' */
+	if typ, _ := u.Trailer["Type"]; typ.Kind == Name && typ.String == "XRef" {
+		u.flushXRefStm(updated, buf)
+	} else {
+		u.flushXRefTable(updated, buf)
+	}
+
+	fmt.Fprintf(buf, "\nstartxref\n%d\n%%%%EOF\n", startXref)
+	u.Document = buf.Bytes()
+	u.updated = make(map[uint]struct{})
+
+	u.Trailer["Prev"] = NewNumeric(float64(startXref))
+}
+
+// -----------------------------------------------------------------------------
+
+// NewDate makes a PDF object representing the given point in time.
+func NewDate(ts time.Time) Object {
+	buf := ts.AppendFormat(nil, "D:20060102150405")
+	// "Z07'00'" doesn't work, so we need to do some of it manually.
+	if _, offset := ts.Zone(); offset != 0 {
+		o := ts.AppendFormat(nil, "-0700")
+		buf = append(buf, o[0], o[1], o[2], '\'', o[3], o[4], '\'')
+	} else {
+		buf = append(buf, 'Z')
+	}
+	return NewString(string(buf))
+}
+
+// GetStreamData returns the actual data stored in a stream object,
+// applying any filters.
+func (u *Updater) GetStreamData(stream Object) ([]byte, error) {
+	if f, ok := stream.Dict["F"]; ok && f.Kind != Nil {
+		return nil, errors.New("stream data in other files are unsupported")
+	}
+
+	// Support just enough to decode a common cross-reference stream.
+	if filter, ok := stream.Dict["Filter"]; !ok {
+		return stream.Stream, nil
+	} else if filter.Kind != Name || filter.String != "FlateDecode" {
+		return nil, errors.New("unsupported stream Filter")
+	}
+
+	// TODO(p): Support << /Columns N /Predictor 12 >>
+	// which usually appears in files with cross-reference streams.
+	if parms, ok := stream.Dict["DecodeParms"]; ok && parms.Kind != Nil {
+		return nil, errors.New("DecodeParms are not supported")
+	}
+
+	r, err := zlib.NewReader(bytes.NewReader(stream.Stream))
+	if err != nil {
+		return nil, err
+	}
+
+	var b bytes.Buffer
+	_, err = b.ReadFrom(r)
+	return b.Bytes(), err
+}
+
+// GetFirstPage retrieves the first page of the given page (sub)tree reference,
+// or returns a Nil object if unsuccessful.
+func (u *Updater) GetFirstPage(node Object) Object {
+	obj, err := u.Dereference(node)
+	if err != nil || obj.Kind != Dict {
+		return New(Nil)
+	}
+
+	// Out of convenience; these aren't filled normally.
+	obj.N = node.N
+	obj.Generation = node.Generation
+
+	if typ, ok := obj.Dict["Type"]; !ok || typ.Kind != Name {
+		return New(Nil)
+	} else if typ.String == "Page" {
+		return obj
+	} else if typ.String != "Pages" {
+		return New(Nil)
+	}
+
+	// XXX: Technically speaking, this may be an indirect reference.
+	// The correct way to solve this seems to be having Updater include
+	// a wrapper around "obj.Dict". Does that still apply in Go, though?
+	kids, ok := obj.Dict["Kids"]
+	if !ok || kids.Kind != Array || len(kids.Array) == 0 ||
+		kids.Array[0].Kind != Reference {
+		return New(Nil)
+	}
+
+	// XXX: Nothing prevents us from recursing in an evil circular graph.
+	return u.GetFirstPage(kids.Array[0])
+}
+
+// -----------------------------------------------------------------------------
+
+// PKCS12Parse parses and verifies PKCS#12 data.
+func PKCS12Parse(p12 []byte, password string) (
+	crypto.PrivateKey, []*x509.Certificate, error) {
+	// The pkcs12.Decode function doesn't support included intermediate
+	// certificates, so we need to do some processing manually.
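+	// pkcs12.ToPEM flattens the file into a list of PEM blocks, e.g.
+	// PRIVATE KEY, CERTIFICATE, CERTIFICATE, ... which are sorted out below.
+	// (Such a file can be produced with openssl pkcs12 -export, optionally
+	// adding intermediates via -certfile; see test.sh for a minimal example.)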
+	blocks, err := pkcs12.ToPEM(p12, password)
+	if err != nil {
+		return nil, nil, err
+	}
+
+	// b.Type is literally CERTIFICATE or PRIVATE KEY; the Headers only contain
+	// a localKeyId field. The private key and its certificate seem to share
+	// the same localKeyId value, but since the leaf certificate should also be
+	// the first one in the PKCS#12 file, that value isn't needed here.
+	var allX509Blocks [][]byte
+	var allCertBlocks [][]byte
+	for _, b := range blocks {
+		// The CERTIFICATE and PRIVATE KEY constants are defined locally in the
+		// pkcs12 package. crypto/tls/tls.go seems to use only literals for
+		// these, and also accepts words in front, such as RSA PRIVATE KEY.
+		switch b.Type {
+		case "PRIVATE KEY":
+			allX509Blocks = append(allX509Blocks, b.Bytes)
+		case "CERTIFICATE":
+			allCertBlocks = append(allCertBlocks, b.Bytes)
+		}
+	}
+	switch {
+	case len(allX509Blocks) == 0:
+		return nil, nil, errors.New("missing private key")
+	case len(allX509Blocks) > 1:
+		return nil, nil, errors.New("more than one private key")
+	case len(allCertBlocks) == 0:
+		return nil, nil, errors.New("missing certificate")
+	}
+
+	// The PKCS#12 file may only contain PKCS#8-wrapped private keys, but the
+	// pkcs12 package unwraps them to simple PKCS#1/EC while converting to PEM.
+	var key crypto.PrivateKey
+	if key, err = x509.ParsePKCS1PrivateKey(allX509Blocks[0]); err != nil {
+		if key, err = x509.ParseECPrivateKey(allX509Blocks[0]); err != nil {
+			return nil, nil, errors.New("failed to parse private key")
+		}
+	}
+
+	x509Certs, err := x509.ParseCertificates(allCertBlocks[0])
+	if err != nil {
+		return nil, nil, err
+	}
+	if len(x509Certs) != 1 {
+		return nil, nil,
+			errors.New("expected exactly one certificate in the first bag")
+	}
+
+	for _, cert := range allCertBlocks[1:] {
+		toAdd, err := x509.ParseCertificates(cert)
+		if err != nil {
+			return nil, nil, err
+		}
+		x509Certs = append(x509Certs, toAdd...)
+	}
+
+	// Copied from crypto/tls/tls.go.
+	switch pub := x509Certs[0].PublicKey.(type) {
+	case *rsa.PublicKey:
+		priv, ok := key.(*rsa.PrivateKey)
+		if !ok {
+			return nil, nil,
+				errors.New("private key type does not match public key type")
+		}
+		if pub.N.Cmp(priv.N) != 0 {
+			return nil, nil,
+				errors.New("private key does not match public key")
+		}
+	case *ecdsa.PublicKey:
+		priv, ok := key.(*ecdsa.PrivateKey)
+		if !ok {
+			return nil, nil,
+				errors.New("private key type does not match public key type")
+		}
+		if pub.X.Cmp(priv.X) != 0 || pub.Y.Cmp(priv.Y) != 0 {
+			return nil, nil,
+				errors.New("private key does not match public key")
+		}
+	default:
+		return nil, nil, errors.New("unknown public key algorithm")
+	}
+	return key, x509Certs, nil
+}
+
+// FillInSignature signs PDF contents and writes the signature into the given
+// window that has been reserved for this specific purpose.
+// This is a very low-level function.
+func FillInSignature(document []byte, signOff, signLen int,
+	key crypto.PrivateKey, certs []*x509.Certificate) error {
+	if signOff < 0 || signOff > len(document) ||
+		signLen < 2 || signOff+signLen > len(document) {
+		return errors.New("invalid signing window")
+	}
+
+	pkcsError := func(message interface{}) error {
+		return fmt.Errorf("key/cert: %s", message)
+	}
+
+	// Prevent useless signatures--makes pdfsig from poppler happy at least
+	// (and NSS by extension).
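+	//
+	// test.sh generates a certificate passing these checks roughly like this:
+	//
+	//	keyUsage = digitalSignature
+	//	extendedKeyUsage = emailProtection
+	//
+	// (in the extensions file passed to openssl x509 -extfile).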
+	x509Cert := certs[0]
+	if x509Cert.KeyUsage&(x509.KeyUsageDigitalSignature|
+		x509.KeyUsageContentCommitment /* renamed non-repudiation */) == 0 {
+		return pkcsError("the certificate's key usage must include " +
+			"digital signatures or non-repudiation")
+	}
+
+	extOK := false
+	for _, u := range x509Cert.ExtKeyUsage {
+		if u == x509.ExtKeyUsageAny || u == x509.ExtKeyUsageEmailProtection {
+			extOK = true
+		}
+	}
+	if len(x509Cert.ExtKeyUsage) > 0 && !extOK {
+		return pkcsError("the certificate's extended key usage " +
+			"must include S/MIME")
+	}
+
+	// XXX: We'd like to stream to the hash manually instead of copying data.
+	data := make([]byte, len(document)-signLen)
+	copy(data, document[:signOff])
+	copy(data[signOff:], document[signOff+signLen:])
+
+	signedData, err := pkcs7.NewSignedData(data)
+	if err != nil {
+		return err
+	}
+	// The default digest is SHA1, which is mildly insecure now.
+	signedData.SetDigestAlgorithm(pkcs7.OIDDigestAlgorithmSHA256)
+	if err := signedData.AddSignerChain(
+		x509Cert, key, certs[1:], pkcs7.SignerInfoConfig{}); err != nil {
+		return err
+	}
+
+	signedData.Detach()
+	sig, err := signedData.Finish()
+	if err != nil {
+		return err
+	}
+
+	/*
+		Debugging: ioutil.WriteFile("pdf_signature.der", sig, 0666)
+		openssl cms -inform DER -in pdf_signature.der -noout -cmsout -print
+		Context: https://stackoverflow.com/a/29253469
+	*/
+
+	if len(sig)*2 > signLen-2 /* hexstring quotes */ {
+		// The obvious solution is to increase the allocation... or spend
+		// a week reading specifications while losing all faith in humanity
+		// as a species, and skip the pkcs7 package entirely.
+		return fmt.Errorf("not enough space reserved for the signature "+
+			"(%d nibbles vs %d nibbles)", signLen-2, len(sig)*2)
+	}
+
+	hex.Encode(document[signOff+1:], sig)
+	return nil
+}
+
+// https://www.adobe.com/devnet-docs/acrobatetk/tools/DigSig/Acrobat_DigitalSignatures_in_PDF.pdf
+// https://www.adobe.com/content/dam/acom/en/devnet/acrobat/pdfs/pdf_reference_1-7.pdf
+// https://www.adobe.com/content/dam/acom/en/devnet/acrobat/pdfs/PPKAppearances.pdf
+
+// Sign signs the given document, growing and returning the passed-in slice.
+// There must be at least one certificate; the first one must match
+// the private key, and the certificates must form a chain.
+//
+// A good default for the reservation is around 4096 (the value is in bytes).
+//
+// The presumption here is that the document is valid and that it doesn't
+// employ cross-reference streams from PDF 1.5, or at least constitutes
+// a hybrid-reference file. The results with PDF 2.0 (2017) are currently
+// unknown as the standard costs money.
+func Sign(document []byte, key crypto.PrivateKey, certs []*x509.Certificate,
+	reservation int) ([]byte, error) {
+	pdf, err := NewUpdater(document)
+	if err != nil {
+		return nil, err
+	}
+
+	rootRef, ok := pdf.Trailer["Root"]
+	if !ok || rootRef.Kind != Reference {
+		return nil, errors.New("trailer does not contain a reference to Root")
+	}
+	root, err := pdf.Dereference(rootRef)
+	if err != nil {
+		return nil, fmt.Errorf("Root dictionary retrieval failed: %s", err)
+	}
+	if root.Kind != Dict {
+		return nil, errors.New("invalid Root dictionary reference")
+	}
+
+	// 8.7 Digital Signatures - /signature dictionary/
+	sigdictN := pdf.Allocate()
+	var byterangeOff, byterangeLen, signOff, signLen int
+	pdf.Update(sigdictN, func(buf BytesWriter) {
+		// The timestamp is important for Adobe Acrobat Reader DC.
+		// The ideal would be to use RFC 3161.
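+		// Serialized, this produces e.g. (D:20060102150405+01'00');
+		// see NewDate above for the details.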
+ now := NewDate(time.Now()) + buf.WriteString("<< /Type/Sig /Filter/Adobe.PPKLite" + + " /SubFilter/adbe.pkcs7.detached\n" + + " /M" + now.Serialize() + " /ByteRange ") + + byterangeOff = buf.Len() + byterangeLen = 32 // fine for a gigabyte + buf.Write(bytes.Repeat([]byte{' '}, byterangeLen)) + buf.WriteString("\n /Contents <") + + signOff = buf.Len() + signLen = reservation * 2 // cert, digest, encrypted digest, ... + buf.Write(bytes.Repeat([]byte{'0'}, signLen)) + buf.WriteString("> >>") + + // We actually need to exclude the hexstring quotes from signing. + signOff -= 1 + signLen += 2 + }) + + sigfield := NewDict(map[string]Object{ + // 8.6.3 Field Types - Signature Fields + "FT": NewName("Sig"), + "V": NewReference(sigdictN, 0), + // 8.4.5 Annotations Types - Widget Annotations + // We can merge the Signature Annotation and omit Kids here. + "Subtype": NewName("Widget"), + "F": NewNumeric(2 /* Hidden */), + "T": NewString("Signature1"), + "Rect": NewArray([]Object{ + NewNumeric(0), NewNumeric(0), NewNumeric(0), NewNumeric(0), + }), + }) + + sigfieldN := pdf.Allocate() + pdf.Update(sigfieldN, func(buf BytesWriter) { + buf.WriteString(sigfield.Serialize()) + }) + + pagesRef, ok := root.Dict["Pages"] + if !ok || pagesRef.Kind != Reference { + return nil, errors.New("invalid Pages reference") + } + page := pdf.GetFirstPage(pagesRef) + if page.Kind != Dict { + return nil, errors.New("invalid or unsupported page tree") + } + + annots := page.Dict["Annots"] + if annots.Kind != Array { + // TODO(p): Indirectly referenced arrays might not be + // that hard to support. + if annots.Kind != End { + return nil, errors.New("unexpected Annots") + } + annots = NewArray(nil) + } + annots.Array = append(annots.Array, NewReference(sigfieldN, 0)) + + page.Dict["Annots"] = annots + pdf.Update(page.N, func(buf BytesWriter) { + buf.WriteString(page.Serialize()) + }) + + // 8.6.1 Interactive Form Dictionary + if acroform, ok := root.Dict["AcroForm"]; ok && acroform.Kind != Nil { + return nil, errors.New("the document already contains forms, " + + "they would be overwritten") + } + + root.Dict["AcroForm"] = NewDict(map[string]Object{ + "Fields": NewArray([]Object{NewReference(sigfieldN, 0)}), + "SigFlags": NewNumeric(3 /* SignaturesExist | AppendOnly */), + }) + + // Upgrade the document version for SHA-256 etc. + if pdf.Version(&root) < 16 { + root.Dict["Version"] = NewName("1.6") + } + + pdf.Update(rootRef.N, func(buf BytesWriter) { + buf.WriteString(root.Serialize()) + }) + pdf.FlushUpdates() + + // Now that we know the length of everything, store byte ranges of + // what we're about to sign, which must be everything but the resulting + // signature itself. 
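+	//
+	// E.g., /ByteRange [0 1000 5000 300] covers bytes 0..999 and 5000..5299,
+	// skipping the 4000-byte gap that holds the /Contents hexstring.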
+	tailOff := signOff + signLen
+	tailLen := len(pdf.Document) - tailOff
+
+	ranges := fmt.Sprintf("[0 %d %d %d]", signOff, tailOff, tailLen)
+	if len(ranges) > byterangeLen {
+		return nil, errors.New("not enough space reserved for /ByteRange")
+	}
+	copy(pdf.Document[byterangeOff:], []byte(ranges))
+	if err := FillInSignature(pdf.Document, signOff, signLen,
+		key, certs); err != nil {
+		return nil, err
+	}
+	return pdf.Document, nil
+}
@@ -0,0 +1,86 @@
+#!/bin/sh -e
+# Test basic functionality of both versions
+# Usage: ./test.sh builddir/pdf-simple-sign cmd/pdf-simple-sign/pdf-simple-sign

+log() { echo "`tput sitm`-- $1`tput sgr0`"; }
+die() { echo "`tput bold`-- $1`tput sgr0`"; exit 1; }
+
+# Get rid of old test files
+rm -rf tmp
+mkdir tmp
+
+# Create documents in various tools
+log "Creating source documents"
+inkscape --pipe --export-filename=tmp/cairo.pdf --export-pdf-version=1.4 \
+<<'EOF' 2>/dev/null || :
+<svg xmlns="http://www.w3.org/2000/svg"><text x="5" y="10">Hello</text></svg>
+EOF
+
+date > tmp/lowriter.txt
+if command -v gropdf >/dev/null
+then groff -T pdf < tmp/lowriter.txt > tmp/groff.pdf
+fi
+lowriter --convert-to pdf tmp/lowriter.txt --outdir tmp >/dev/null || :
+convert rose: tmp/imagemagick.pdf || :
+
+# Create a root CA certificate pair
+log "Creating certificates"
+openssl req -newkey rsa:2048 -subj "/CN=Test CA" -nodes \
+	-keyout tmp/ca.key.pem -x509 -out tmp/ca.cert.pem 2>/dev/null
+
+# Create a private NSS database and insert our test CA there
+rm -rf tmp/nssdir
+mkdir tmp/nssdir
+certutil -N --empty-password -d sql:tmp/nssdir
+certutil -d sql:tmp/nssdir -A -n root -t ,C, -a -i tmp/ca.cert.pem
+
+# Create a leaf certificate pair
+cat > tmp/cert.cfg <<'EOF'
+[smime]
+basicConstraints = CA:FALSE
+keyUsage = digitalSignature
+extendedKeyUsage = emailProtection
+nsCertType = email
+EOF
+
+openssl req -newkey rsa:2048 -subj "/CN=Test Leaf" -nodes \
+	-keyout tmp/key.pem -out tmp/cert.csr 2>/dev/null
+openssl x509 -req -in tmp/cert.csr -out tmp/cert.pem \
+	-CA tmp/ca.cert.pem -CAkey tmp/ca.key.pem -set_serial 1 \
+	-extensions smime -extfile tmp/cert.cfg 2>/dev/null
+openssl verify -CAfile tmp/ca.cert.pem tmp/cert.pem >/dev/null
+
+# The second line accommodates the Go signer,
+# which doesn't support SHA-256 within its pkcs12 handling
+openssl pkcs12 -inkey tmp/key.pem -in tmp/cert.pem \
+	-certpbe PBE-SHA1-3DES -keypbe PBE-SHA1-3DES -macalg sha1 \
+	-export -passout pass: -out tmp/key-pair.p12
+
+for tool in "$@"; do
+	rm -f tmp/*.signed.pdf
+	for source in tmp/*.pdf; do
+		log "Testing $tool with $source"
+		result=${source%.pdf}.signed.pdf
+		$tool "$source" "$result" tmp/key-pair.p12 ""
+		pdfsig -nssdir sql:tmp/nssdir "$result" | grep Validation
+
+		# Only some of our generators use PDF versions higher than 1.5
+		log "Testing $tool for version detection"
+		grep -q "/Version /1[.]6" "$result" \
+			|| grep -q "^%PDF-1[.][67]" "$result" \
+			|| die "Version detection seems to misbehave (no upgrade)"
+	done
+
+	log "Testing $tool for expected failures"
+	$tool "$result" "$source.fail.pdf" tmp/key-pair.p12 "" \
+		&& die "Double signing shouldn't succeed"
+	$tool -r 1 "$source" "$source.fail.pdf" tmp/key-pair.p12 "" \
+		&& die "Too low a reservation shouldn't succeed"
+
+	sed '1s/%PDF-1../%PDF-1.7/' "$source" > "$source.alt"
+	$tool "$source.alt" "$result.alt" tmp/key-pair.p12 ""
+	grep -q "/Version /1[.]6" "$result.alt" \
+		&& die "Version detection seems to misbehave (downgraded)"
+done
+
+log "OK"