aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.clang-format8
-rw-r--r--.gitignore8
-rw-r--r--LICENSE12
-rw-r--r--NEWS29
-rw-r--r--README.adoc70
-rw-r--r--cmd/extfs-pdf/main.go141
-rw-r--r--cmd/pdf-simple-sign/main.go76
-rw-r--r--go.mod8
-rw-r--r--go.sum13
-rw-r--r--meson.build23
-rw-r--r--pdf-simple-sign.adoc80
-rw-r--r--pdf-simple-sign.cpp1017
-rw-r--r--pdf/pdf.go1663
-rwxr-xr-xtest.sh86
14 files changed, 3234 insertions, 0 deletions
diff --git a/.clang-format b/.clang-format
new file mode 100644
index 0000000..13cbee9
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,8 @@
+BasedOnStyle: Chromium
+ColumnLimit: 100
+IndentCaseLabels: false
+AccessModifierOffset: -2
+ContinuationIndentWidth: 2
+SpaceAfterTemplateKeyword: false
+SpaceAfterCStyleCast: true
+SpacesBeforeTrailingComments: 2
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..d046c48
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,8 @@
+/builddir
+/pdf-simple-sign.cflags
+/pdf-simple-sign.config
+/pdf-simple-sign.creator
+/pdf-simple-sign.creator.user
+/pdf-simple-sign.cxxflags
+/pdf-simple-sign.files
+/pdf-simple-sign.includes
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..7511f3e
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,12 @@
+Copyright (c) 2017 - 2024, Přemysl Eric Janouch <p@janouch.name>
+
+Permission to use, copy, modify, and/or distribute this software for any
+purpose with or without fee is hereby granted.
+
+THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
diff --git a/NEWS b/NEWS
new file mode 100644
index 0000000..610609f
--- /dev/null
+++ b/NEWS
@@ -0,0 +1,29 @@
+1.1.1 (2020-09-06)
+
+ * Fix a dysfunctional example in the manual
+
+ * Go: write the xref table in a deterministic order
+
+ * Add a trivial test suite, based on pdfsig from poppler-utils
+
+
+1.1 (2020-09-05)
+
+ * Make it possible to change the signature reservation with an option
+
+ * Return errors rather than mangle documents in some cases,
+ notably with pre-existing PDF forms
+
+ * Avoid downgrading the document's PDF version to 1.6
+
+ * A few fixes for PDF parsing and serialisation
+
+ * Add an instructive man page
+
+ * Add a native Go port of the utility, also usable as a library
+
+
+1.0 (2018-08-03)
+
+ * Initial release
+
diff --git a/README.adoc b/README.adoc
new file mode 100644
index 0000000..10e581f
--- /dev/null
+++ b/README.adoc
@@ -0,0 +1,70 @@
+pdf-simple-sign
+===============
+
+'pdf-simple-sign' is a simple PDF signer intended for documents produced by
+the Cairo library (≤ 1.17.4 or using PDF 1.4), GNU troff, ImageMagick,
+or similar.
+
+I don't aim to extend the functionality any further. The project is fairly
+self-contained and it should be easy to grasp and change to suit to your needs.
+
+Packages
+--------
+Regular releases are sporadic. git master should be stable enough.
+You can get a package with the latest development version using Arch Linux's
+https://aur.archlinux.org/packages/pdf-simple-sign-git[AUR],
+or as a https://git.janouch.name/p/nixexprs[Nix derivation].
+
+Documentation
+-------------
+See the link:pdf-simple-sign.adoc[man page] for information about usage.
+The rest of this README will concern itself with externalities.
+
+image:https://pkg.go.dev/badge/janouch.name/pdf-simple-sign@master/pdf["PkgGoDev", link="https://pkg.go.dev/janouch.name/pdf-simple-sign@master/pdf"]
+
+Building
+--------
+Build dependencies: Meson, Asciidoctor, a C++11 compiler, pkg-config +
+Runtime dependencies: libcrypto (OpenSSL 1.1 API)
+
+ $ git clone https://git.janouch.name/p/pdf-simple-sign.git
+ $ cd pdf-simple-sign
+ $ meson builddir
+ $ cd builddir
+ $ ninja
+
+In addition to the C++ version, also included is a native Go port,
+which has enhanced PDF 1.5 support:
+
+----
+$ go install janouch.name/pdf-simple-sign/cmd/pdf-simple-sign@master
+----
+
+and a crude external VFS for Midnight Commander, that may be used to extract
+all streams from a given PDF file:
+
+----
+$ GOBIN=$HOME/.local/share/mc/extfs.d \
+ go install janouch.name/pdf-simple-sign/cmd/extfs-pdf@master
+----
+
+To enable the VFS, edit your _~/.config/mc/mc.ext.ini_ to contain:
+
+----
+[pdf]
+Type=^PDF
+Open=%cd %p/extfs-pdf://
+----
+
+Contributing and Support
+------------------------
+Use https://git.janouch.name/p/pdf-simple-sign to report bugs, request features,
+or submit pull requests. `git send-email` is tolerated. If you want to discuss
+the project, feel free to join me at ircs://irc.janouch.name, channel #dev.
+
+Bitcoin donations are accepted at: 12r5uEWEgcHC46xd64tt3hHt9EUvYYDHe9
+
+License
+-------
+This software is released under the terms of the 0BSD license, the text of which
+is included within the package along with the list of authors.
diff --git a/cmd/extfs-pdf/main.go b/cmd/extfs-pdf/main.go
new file mode 100644
index 0000000..eab3e2b
--- /dev/null
+++ b/cmd/extfs-pdf/main.go
@@ -0,0 +1,141 @@
+//
+// Copyright (c) 2021 - 2024, Přemysl Eric Janouch <p@janouch.name>
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+//
+
+// extfs-pdf is an external VFS plugin for Midnight Commander.
+// More serious image extractors should rewrite this to use pdfimages(1).
+package main
+
+import (
+ "flag"
+ "fmt"
+ "os"
+ "time"
+
+ "janouch.name/pdf-simple-sign/pdf"
+)
+
+func die(status int, format string, args ...interface{}) {
+ os.Stderr.WriteString(fmt.Sprintf(format+"\n", args...))
+ os.Exit(status)
+}
+
+func usage() {
+ die(1, "Usage: %s [-h] COMMAND DOCUMENT [ARG...]", os.Args[0])
+}
+
+func streamSuffix(o *pdf.Object) string {
+ if filter, _ := o.Dict["Filter"]; filter.Kind == pdf.Name {
+ switch filter.String {
+ case "JBIG2Decode":
+ // This is the file extension used by pdfimages(1).
+ // This is not a complete JBIG2 standalone file.
+ return "jb2e"
+ case "JPXDecode":
+ return "jp2"
+ case "DCTDecode":
+ return "jpg"
+ case "FlateDecode":
+ return "zz"
+ default:
+ return filter.String
+ }
+ }
+ return "stream"
+}
+
+func list(mtime time.Time, updater *pdf.Updater) {
+ stamp := mtime.Local().Format("01-02-2006 15:04:05")
+ for _, o := range updater.ListIndirect() {
+ object, err := updater.Get(o.N, o.Generation)
+ size := 0
+ if err != nil {
+ fmt.Fprintf(os.Stderr, "%s\n", err)
+ } else {
+ // Accidental transformation, retrieving original data is more work.
+ size = len(object.Serialize())
+ }
+ fmt.Printf("-r--r--r-- 1 0 0 %d %s n%dg%d\n",
+ size, stamp, o.N, o.Generation)
+ if object.Kind == pdf.Stream {
+ fmt.Printf("-r--r--r-- 1 0 0 %d %s n%dg%d.%s\n", len(object.Stream),
+ stamp, o.N, o.Generation, streamSuffix(&object))
+ }
+ }
+}
+
+func copyout(updater *pdf.Updater, storedFilename, extractTo string) {
+ var (
+ n, generation uint
+ suffix string
+ )
+ m, err := fmt.Sscanf(storedFilename, "n%dg%d%s", &n, &generation, &suffix)
+ if m < 2 {
+ die(3, "%s: %s", storedFilename, err)
+ }
+
+ object, err := updater.Get(n, generation)
+ if err != nil {
+ die(3, "%s: %s", storedFilename, err)
+ }
+
+ content := []byte(object.Serialize())
+ if suffix != "" {
+ content = object.Stream
+ }
+ if err = os.WriteFile(extractTo, content, 0666); err != nil {
+ die(3, "%s", err)
+ }
+}
+
+func main() {
+ flag.Usage = usage
+ flag.Parse()
+ if flag.NArg() < 2 {
+ usage()
+ }
+
+ command, documentPath := flag.Arg(0), flag.Arg(1)
+ doc, err := os.ReadFile(documentPath)
+ if err != nil {
+ die(1, "%s", err)
+ }
+
+ mtime := time.UnixMilli(0)
+ if info, err := os.Stat(documentPath); err == nil {
+ mtime = info.ModTime()
+ }
+
+ updater, err := pdf.NewUpdater(doc)
+ if err != nil {
+ die(2, "%s", err)
+ }
+
+ switch command {
+ default:
+ die(1, "unsupported command: %s", command)
+ case "list":
+ if flag.NArg() != 2 {
+ usage()
+ } else {
+ list(mtime, updater)
+ }
+ case "copyout":
+ if flag.NArg() != 4 {
+ usage()
+ } else {
+ copyout(updater, flag.Arg(2), flag.Arg(3))
+ }
+ }
+}
diff --git a/cmd/pdf-simple-sign/main.go b/cmd/pdf-simple-sign/main.go
new file mode 100644
index 0000000..5141a12
--- /dev/null
+++ b/cmd/pdf-simple-sign/main.go
@@ -0,0 +1,76 @@
+//
+// Copyright (c) 2018 - 2020, Přemysl Eric Janouch <p@janouch.name>
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+//
+
+// pdf-simple-sign is a simple PDF signer.
+package main
+
+import (
+ "flag"
+ "fmt"
+ "io/ioutil"
+ "os"
+
+ "janouch.name/pdf-simple-sign/pdf"
+)
+
+// #include <unistd.h>
+import "C"
+
+func isatty(fd uintptr) bool { return C.isatty(C.int(fd)) != 0 }
+
+func die(status int, format string, args ...interface{}) {
+ msg := fmt.Sprintf(format+"\n", args...)
+ if isatty(os.Stderr.Fd()) {
+ msg = "\x1b[0;31m" + msg + "\x1b[m"
+ }
+ os.Stderr.WriteString(msg)
+ os.Exit(status)
+}
+
+func usage() {
+ die(1, "Usage: %s [-h] [-r RESERVATION] INPUT-FILENAME OUTPUT-FILENAME "+
+ "PKCS12-PATH PKCS12-PASS", os.Args[0])
+}
+
+var reservation = flag.Int(
+ "r", 4096, "signature reservation as a number of bytes")
+
+func main() {
+ flag.Usage = usage
+ flag.Parse()
+ if flag.NArg() != 4 {
+ usage()
+ }
+
+ inputPath, outputPath := flag.Arg(0), flag.Arg(1)
+ doc, err := ioutil.ReadFile(inputPath)
+ if err != nil {
+ die(1, "%s", err)
+ }
+ p12, err := ioutil.ReadFile(flag.Arg(2))
+ if err != nil {
+ die(2, "%s", err)
+ }
+ key, certs, err := pdf.PKCS12Parse(p12, flag.Arg(3))
+ if err != nil {
+ die(3, "%s", err)
+ }
+ if doc, err = pdf.Sign(doc, key, certs, *reservation); err != nil {
+ die(4, "error: %s", err)
+ }
+ if err = ioutil.WriteFile(outputPath, doc, 0666); err != nil {
+ die(5, "%s", err)
+ }
+}
diff --git a/go.mod b/go.mod
new file mode 100644
index 0000000..0e84ffc
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,8 @@
+module janouch.name/pdf-simple-sign
+
+go 1.17
+
+require (
+ go.mozilla.org/pkcs7 v0.0.0-20210826202110-33d05740a352
+ golang.org/x/crypto v0.10.0
+)
diff --git a/go.sum b/go.sum
new file mode 100644
index 0000000..4cf11b0
--- /dev/null
+++ b/go.sum
@@ -0,0 +1,13 @@
+go.mozilla.org/pkcs7 v0.0.0-20200128120323-432b2356ecb1 h1:A/5uWzF44DlIgdm/PQFwfMkW0JX+cIcQi/SwLAmZP5M=
+go.mozilla.org/pkcs7 v0.0.0-20200128120323-432b2356ecb1/go.mod h1:SNgMg+EgDFwmvSmLRTNKC5fegJjB7v23qTQ0XLGUNHk=
+go.mozilla.org/pkcs7 v0.0.0-20210826202110-33d05740a352 h1:CCriYyAfq1Br1aIYettdHZTy8mBTIPo7We18TuO/bak=
+go.mozilla.org/pkcs7 v0.0.0-20210826202110-33d05740a352/go.mod h1:SNgMg+EgDFwmvSmLRTNKC5fegJjB7v23qTQ0XLGUNHk=
+golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
+golang.org/x/crypto v0.0.0-20200728195943-123391ffb6de h1:ikNHVSjEfnvz6sxdSPCaPt572qowuyMDMJLLm3Db3ig=
+golang.org/x/crypto v0.0.0-20200728195943-123391ffb6de/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
+golang.org/x/crypto v0.10.0 h1:LKqV2xt9+kDzSTfOhx4FrkEBcMrAgHSYgzywV9zcGmM=
+golang.org/x/crypto v0.10.0/go.mod h1:o4eNf7Ede1fv+hwOwZsTHl9EsPFO6q6ZvYR8vYfY45I=
+golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
+golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
+golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
diff --git a/meson.build b/meson.build
new file mode 100644
index 0000000..d68a99d
--- /dev/null
+++ b/meson.build
@@ -0,0 +1,23 @@
+project('pdf-simple-sign', 'cpp', default_options : ['cpp_std=c++11'],
+ version : '1.1.1')
+
+conf = configuration_data()
+conf.set_quoted('PROJECT_NAME', meson.project_name())
+conf.set_quoted('PROJECT_VERSION', meson.project_version())
+configure_file(output : 'config.h', configuration : conf)
+
+cryptodep = dependency('libcrypto')
+executable('pdf-simple-sign', 'pdf-simple-sign.cpp',
+ install : true,
+ dependencies : cryptodep)
+
+asciidoctor = find_program('asciidoctor')
+foreach page : ['pdf-simple-sign']
+ custom_target('manpage for ' + page,
+ input: page + '.adoc', output: page + '.1',
+ command: [asciidoctor, '-b', 'manpage',
+ '-a', 'release-version=' + meson.project_version(),
+ '@INPUT@', '-o', '@OUTPUT@'],
+ install: true,
+ install_dir: join_paths(get_option('mandir'), 'man1'))
+endforeach
diff --git a/pdf-simple-sign.adoc b/pdf-simple-sign.adoc
new file mode 100644
index 0000000..4ab1bc5
--- /dev/null
+++ b/pdf-simple-sign.adoc
@@ -0,0 +1,80 @@
+pdf-simple-sign(1)
+==================
+:doctype: manpage
+:manmanual: pdf-simple-sign Manual
+:mansource: pdf-simple-sign {release-version}
+
+Name
+----
+pdf-simple-sign - a simple PDF signer
+
+Synopsis
+--------
+*pdf-simple-sign* [_OPTION_]... _INPUT.pdf_ _OUTPUT.pdf_ _KEY-PAIR.p12_ _PASSWORD_
+
+Description
+-----------
+*pdf-simple-sign* is a simple PDF signer intended for documents produced by
+the Cairo library, GNU troff, ImageMagick, or similar. As such, it currently
+comes with some restrictions:
+
+ * the document may not have any forms or signatures already, as they would be
+ overwritten,
+ * the document may not employ cross-reference streams, or must constitute
+ a hybrid-reference file at least.
+
+The key and certificate pair is accepted in the PKCS#12 format. The _PASSWORD_
+must be supplied on the command line, and may be empty if it is not needed.
+
+The signature is attached to the first page and has no appearance.
+
+If signature data don't fit within the default reservation of 4 kibibytes,
+you might need to adjust it using the *-r* option, or throw out any unnecessary
+intermediate certificates.
+
+Options
+-------
+*-r* _RESERVATION_, *--reservation*=_RESERVATION_::
+ Set aside _RESERVATION_ amount of bytes for the resulting signature.
+ Feel free to try a few values in a loop. The program itself has no
+ conceptions about the data, so it can't make accurate predictions.
+
+*-h*, *--help*::
+ Display a help message and exit.
+
+*-V*, *--version*::
+ Output version information and exit.
+
+Examples
+--------
+Create a self-signed certificate, make a document containing the current date,
+sign it and verify the attached signature:
+
+ $ openssl req -newkey rsa:2048 -subj /CN=Test -nodes \
+ -keyout key.pem -x509 -addext keyUsage=digitalSignature \
+ -out cert.pem 2>/dev/null
+ $ openssl pkcs12 -inkey key.pem -in cert.pem \
+ -export -passout pass: -out key-pair.p12
+ $ date | groff -T pdf > test.pdf
+ $ pdf-simple-sign test.pdf test.signed.pdf key-pair.p12 ""
+ $ pdfsig test.signed.pdf
+ Digital Signature Info of: test.signed.pdf
+ Signature #1:
+ - Signer Certificate Common Name: Test
+ - Signer full Distinguished Name: CN=Test
+ - Signing Time: Sep 05 2020 19:41:22
+ - Signing Hash Algorithm: SHA-256
+ - Signature Type: adbe.pkcs7.detached
+ - Signed Ranges: [0 - 6522], [14716 - 15243]
+ - Total document signed
+ - Signature Validation: Signature is Valid.
+ - Certificate Validation: Certificate issuer isn't Trusted.
+
+Reporting bugs
+--------------
+Use https://git.janouch.name/p/pdf-simple-sign to report bugs, request features,
+or submit pull requests.
+
+See also
+--------
+*openssl*(1), *pdfsig*(1)
diff --git a/pdf-simple-sign.cpp b/pdf-simple-sign.cpp
new file mode 100644
index 0000000..8b9d1fe
--- /dev/null
+++ b/pdf-simple-sign.cpp
@@ -0,0 +1,1017 @@
+// vim: set sw=2 ts=2 sts=2 et tw=100:
+//
+// pdf-simple-sign: simple PDF signer
+//
+// Copyright (c) 2017 - 2020, Přemysl Eric Janouch <p@janouch.name>
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+//
+
+#include <cmath>
+#include <cstdio>
+#undef NDEBUG
+#include <cassert>
+
+#include <map>
+#include <memory>
+#include <regex>
+#include <set>
+#include <vector>
+
+#if defined __GLIBCXX__ && __GLIBCXX__ < 20140422
+#error Need libstdc++ >= 4.9 for <regex>
+#endif
+
+#include <getopt.h>
+#include <openssl/err.h>
+#include <openssl/pkcs12.h>
+#include <openssl/x509v3.h>
+#include <unistd.h>
+
+#include "config.h"
+
+// -------------------------------------------------------------------------------------------------
+
+using uint = unsigned int;
+using ushort = unsigned short;
+
+static std::string concatenate(const std::vector<std::string>& v, const std::string& delim) {
+ std::string res;
+ if (v.empty())
+ return res;
+ for (const auto& s : v)
+ res += s + delim;
+ return res.substr(0, res.length() - delim.length());
+}
+
+template<typename... Args>
+std::string ssprintf(const std::string& format, Args... args) {
+ size_t size = std::snprintf(nullptr, 0, format.c_str(), args...) + 1;
+ std::unique_ptr<char[]> buf(new char[size]);
+ std::snprintf(buf.get(), size, format.c_str(), args...);
+ return std::string(buf.get(), buf.get() + size - 1);
+}
+
+// -------------------------------------------------------------------------------------------------
+
+/// PDF token/object thingy. Objects may be composed either from one or a sequence of tokens.
+/// The PDF Reference doesn't actually speak of tokens, though ISO 32000-1:2008 does.
+struct pdf_object {
+ enum type {
+ END, NL, COMMENT, NIL, BOOL, NUMERIC, KEYWORD, NAME, STRING,
+ // Simple tokens
+ B_ARRAY, E_ARRAY, B_DICT, E_DICT,
+ // Higher-level objects
+ ARRAY, DICT, OBJECT, REFERENCE,
+ } type = END;
+
+ std::string string; ///< END (error message), COMMENT/KEYWORD/NAME/STRING
+ double number = 0.; ///< BOOL, NUMERIC
+ std::vector<pdf_object> array; ///< ARRAY, OBJECT
+ std::map<std::string, pdf_object> dict; ///< DICT, in the future also STREAM
+ uint n = 0, generation = 0; ///< OBJECT, REFERENCE
+
+ pdf_object(enum type type = END) : type(type) {}
+ pdf_object(enum type type, double v) : type(type), number(v) {}
+ pdf_object(enum type type, const std::string& v) : type(type), string(v) {}
+ pdf_object(enum type type, uint n, uint g) : type(type), n(n), generation(g) {}
+ pdf_object(const std::vector<pdf_object>& array) : type(ARRAY), array(array) {}
+ pdf_object(const std::map<std::string, pdf_object>& dict) : type(DICT), dict(dict) {}
+
+ pdf_object(const pdf_object&) = default;
+ pdf_object(pdf_object&&) = default;
+ pdf_object& operator=(const pdf_object&) = default;
+ pdf_object& operator=(pdf_object&&) = default;
+
+ /// Return whether this is a number without a fractional part
+ bool is_integer() const {
+ double tmp;
+ return type == NUMERIC && std::modf(number, &tmp) == 0.;
+ }
+};
+
+/// Basic lexical analyser for the Portable Document Format, giving limited error information
+struct pdf_lexer {
+ const unsigned char* p;
+ pdf_lexer(const char* s) : p(reinterpret_cast<const unsigned char*>(s)) {}
+
+ static constexpr const char* oct_alphabet = "01234567";
+ static constexpr const char* dec_alphabet = "0123456789";
+ static constexpr const char* hex_alphabet = "0123456789abcdefABCDEF";
+ static constexpr const char* whitespace = "\t\n\f\r ";
+ static constexpr const char* delimiters = "()<>[]{}/%";
+
+ bool eat_newline(int ch) {
+ if (ch == '\r') {
+ if (*p == '\n') p++;
+ return true;
+ }
+ return ch == '\n';
+ }
+
+ pdf_object string() {
+ std::string value;
+ int parens = 1;
+ while (1) {
+ if (!*p) return {pdf_object::END, "unexpected end of string"};
+ auto ch = *p++;
+ if (eat_newline(ch)) ch = '\n';
+ else if (ch == '(') { parens++; }
+ else if (ch == ')') { if (!--parens) break; }
+ else if (ch == '\\') {
+ if (!*p) return {pdf_object::END, "unexpected end of string"};
+ switch ((ch = *p++)) {
+ case 'n': ch = '\n'; break;
+ case 'r': ch = '\r'; break;
+ case 't': ch = '\t'; break;
+ case 'b': ch = '\b'; break;
+ case 'f': ch = '\f'; break;
+ default:
+ if (eat_newline(ch))
+ continue;
+ std::string octal;
+ if (ch && strchr(oct_alphabet, ch)) {
+ octal += ch;
+ if (*p && strchr(oct_alphabet, *p)) octal += *p++;
+ if (*p && strchr(oct_alphabet, *p)) octal += *p++;
+ ch = std::stoi(octal, nullptr, 8);
+ }
+ }
+ }
+ value += ch;
+ }
+ return {pdf_object::STRING, value};
+ }
+
+ pdf_object string_hex() {
+ std::string value, buf;
+ while (*p != '>') {
+ if (!*p) return {pdf_object::END, "unexpected end of hex string"};
+ if (!strchr(hex_alphabet, *p))
+ return {pdf_object::END, "invalid hex string"};
+ buf += *p++;
+ if (buf.size() == 2) {
+ value += char(std::stoi(buf, nullptr, 16));
+ buf.clear();
+ }
+ }
+ p++;
+ if (!buf.empty()) value += char(std::stoi(buf + '0', nullptr, 16));
+ return {pdf_object::STRING, value};
+ }
+
+ pdf_object name() {
+ std::string value;
+ while (!strchr(whitespace, *p) && !strchr(delimiters, *p)) {
+ auto ch = *p++;
+ if (ch == '#') {
+ std::string hexa;
+ if (*p && strchr(hex_alphabet, *p)) hexa += *p++;
+ if (*p && strchr(hex_alphabet, *p)) hexa += *p++;
+ if (hexa.size() != 2)
+ return {pdf_object::END, "invalid name hexa escape"};
+ ch = char(std::stoi(hexa, nullptr, 16));
+ }
+ value += ch;
+ }
+ if (value.empty()) return {pdf_object::END, "unexpected end of name"};
+ return {pdf_object::NAME, value};
+ }
+
+ pdf_object comment() {
+ std::string value;
+ while (*p && *p != '\r' && *p != '\n')
+ value += *p++;
+ return {pdf_object::COMMENT, value};
+ }
+
+ // XXX maybe invalid numbers should rather be interpreted as keywords
+ pdf_object number() {
+ std::string value;
+ if (*p == '-')
+ value += *p++;
+ bool real = false, digits = false;
+ while (*p) {
+ if (strchr(dec_alphabet, *p))
+ digits = true;
+ else if (*p == '.' && !real)
+ real = true;
+ else
+ break;
+ value += *p++;
+ }
+ if (!digits) return {pdf_object::END, "invalid number"};
+ return {pdf_object::NUMERIC, std::stod(value, nullptr)};
+ }
+
+ pdf_object next() {
+ if (!*p)
+ return {pdf_object::END};
+ if (strchr("-0123456789.", *p))
+ return number();
+
+ // {} end up being keywords, we might want to error out on those
+ std::string value;
+ while (!strchr(whitespace, *p) && !strchr(delimiters, *p))
+ value += *p++;
+ if (!value.empty()) {
+ if (value == "null") return {pdf_object::NIL};
+ if (value == "true") return {pdf_object::BOOL, 1};
+ if (value == "false") return {pdf_object::BOOL, 0};
+ return {pdf_object::KEYWORD, value};
+ }
+
+ switch (char ch = *p++) {
+ case '/': return name();
+ case '%': return comment();
+ case '(': return string();
+ case '[': return {pdf_object::B_ARRAY};
+ case ']': return {pdf_object::E_ARRAY};
+ case '<':
+ if (*p++ == '<')
+ return {pdf_object::B_DICT};
+ p--;
+ return string_hex();
+ case '>':
+ if (*p++ == '>')
+ return {pdf_object::E_DICT};
+ p--;
+ return {pdf_object::END, "unexpected '>'"};
+ default:
+ if (eat_newline(ch))
+ return {pdf_object::NL};
+ if (strchr(whitespace, ch))
+ return next();
+ return {pdf_object::END, "unexpected input"};
+ }
+ }
+};
+
+// FIXME lines /should not/ be longer than 255 characters, some wrapping is in order
+static std::string pdf_serialize(const pdf_object& o) {
+ switch (o.type) {
+ case pdf_object::NL: return "\n";
+ case pdf_object::NIL: return "null";
+ case pdf_object::BOOL: return o.number ? "true" : "false";
+ case pdf_object::NUMERIC: {
+ if (o.is_integer()) return std::to_string((long long) o.number);
+ return std::to_string(o.number);
+ }
+ case pdf_object::KEYWORD: return o.string;
+ case pdf_object::NAME: {
+ std::string escaped = "/";
+ for (char c : o.string) {
+ if (c == '#' || strchr(pdf_lexer::delimiters, c) || strchr(pdf_lexer::whitespace, c))
+ escaped += ssprintf("#%02x", c);
+ else
+ escaped += c;
+ }
+ return escaped;
+ }
+ case pdf_object::STRING: {
+ std::string escaped;
+ for (char c : o.string) {
+ if (c == '\\' || c == '(' || c == ')')
+ escaped += '\\';
+ escaped += c;
+ }
+ return "(" + escaped + ")";
+ }
+ case pdf_object::B_ARRAY: return "[";
+ case pdf_object::E_ARRAY: return "]";
+ case pdf_object::B_DICT: return "<<";
+ case pdf_object::E_DICT: return ">>";
+ case pdf_object::ARRAY: {
+ std::vector<std::string> v;
+ for (const auto& i : o.array)
+ v.push_back(pdf_serialize(i));
+ return "[ " + concatenate(v, " ") + " ]";
+ }
+ case pdf_object::DICT: {
+ std::string s;
+ for (const auto i : o.dict)
+ // FIXME the key is also supposed to be escaped by pdf_serialize()
+ s += " /" + i.first + " " + pdf_serialize(i.second);
+ return "<<" + s + " >>";
+ }
+ case pdf_object::OBJECT:
+ return ssprintf("%u %u obj\n", o.n, o.generation) + pdf_serialize(o.array.at(0)) + "\nendobj";
+ case pdf_object::REFERENCE:
+ return ssprintf("%u %u R", o.n, o.generation);
+ default:
+ assert(!"unsupported token for serialization");
+ }
+}
+
+// -------------------------------------------------------------------------------------------------
+
+/// Utility class to help read and possibly incrementally update PDF files
+class pdf_updater {
+ struct ref {
+ size_t offset = 0; ///< File offset or N of the next free entry
+ uint generation = 0; ///< Object generation
+ bool free = true; ///< Whether this N has been deleted
+ };
+
+ std::vector<ref> xref; ///< Cross-reference table
+ size_t xref_size = 0; ///< Current cross-reference table size, correlated to xref.size()
+ std::set<uint> updated; ///< List of updated objects
+
+ pdf_object parse_obj(pdf_lexer& lex, std::vector<pdf_object>& stack) const;
+ pdf_object parse_R(std::vector<pdf_object>& stack) const;
+ pdf_object parse(pdf_lexer& lex, std::vector<pdf_object>& stack) const;
+ std::string load_xref(pdf_lexer& lex, std::set<uint>& loaded_entries);
+
+public:
+ /// The new trailer dictionary to be written, initialized with the old one
+ std::map<std::string, pdf_object> trailer;
+
+ std::string& document;
+ pdf_updater(std::string& document) : document(document) {}
+
+ /// Build the cross-reference table and prepare a new trailer dictionary
+ std::string initialize();
+ /// Try to extract the claimed PDF version as a positive decimal number, e.g. 17 for PDF 1.7.
+ /// Returns zero on failure.
+ int version(const pdf_object& root) const;
+ /// Retrieve an object by its number and generation -- may return NIL or END with an error
+ pdf_object get(uint n, uint generation) const;
+ /// Allocate a new object number
+ uint allocate();
+ /// Append an updated object to the end of the document
+ void update(uint n, std::function<void()> fill);
+ /// Write an updated cross-reference table and trailer
+ void flush_updates();
+};
+
+// -------------------------------------------------------------------------------------------------
+
+/// If the object is an error, forward its message, otherwise return err.
+static std::string pdf_error(const pdf_object& o, const char* err) {
+ if (o.type != pdf_object::END || o.string.empty()) return err;
+ return o.string;
+}
+
+pdf_object pdf_updater::parse_obj(pdf_lexer& lex, std::vector<pdf_object>& stack) const {
+ if (stack.size() < 2)
+ return {pdf_object::END, "missing object ID pair"};
+
+ auto g = stack.back(); stack.pop_back();
+ auto n = stack.back(); stack.pop_back();
+ if (!g.is_integer() || g.number < 0 || g.number > UINT_MAX ||
+ !n.is_integer() || n.number < 0 || n.number > UINT_MAX)
+ return {pdf_object::END, "invalid object ID pair"};
+
+ pdf_object obj{pdf_object::OBJECT};
+ obj.n = n.number;
+ obj.generation = g.number;
+
+ while (1) {
+ auto object = parse(lex, obj.array);
+ if (object.type == pdf_object::END)
+ return {pdf_object::END, pdf_error(object, "object doesn't end")};
+ if (object.type == pdf_object::KEYWORD && object.string == "endobj")
+ break;
+ obj.array.push_back(std::move(object));
+ }
+ return obj;
+}
+
+pdf_object pdf_updater::parse_R(std::vector<pdf_object>& stack) const {
+ if (stack.size() < 2)
+ return {pdf_object::END, "missing reference ID pair"};
+
+ auto g = stack.back(); stack.pop_back();
+ auto n = stack.back(); stack.pop_back();
+ if (!g.is_integer() || g.number < 0 || g.number > UINT_MAX ||
+ !n.is_integer() || n.number < 0 || n.number > UINT_MAX)
+ return {pdf_object::END, "invalid reference ID pair"};
+
+ pdf_object ref{pdf_object::REFERENCE};
+ ref.n = n.number;
+ ref.generation = g.number;
+ return ref;
+}
+
+/// Read an object at the lexer's position. Not a strict parser.
+pdf_object pdf_updater::parse(pdf_lexer& lex, std::vector<pdf_object>& stack) const {
+ auto token = lex.next();
+ switch (token.type) {
+ case pdf_object::NL:
+ case pdf_object::COMMENT:
+ // These are not important to parsing, not even for this procedure's needs
+ return parse(lex, stack);
+ case pdf_object::B_ARRAY: {
+ std::vector<pdf_object> array;
+ while (1) {
+ auto object = parse(lex, array);
+ if (object.type == pdf_object::END)
+ return {pdf_object::END, pdf_error(object, "array doesn't end")};
+ if (object.type == pdf_object::E_ARRAY)
+ break;
+ array.push_back(std::move(object));
+ }
+ return array;
+ }
+ case pdf_object::B_DICT: {
+ std::vector<pdf_object> array;
+ while (1) {
+ auto object = parse(lex, array);
+ if (object.type == pdf_object::END)
+ return {pdf_object::END, pdf_error(object, "dictionary doesn't end")};
+ if (object.type == pdf_object::E_DICT)
+ break;
+ array.push_back(std::move(object));
+ }
+ if (array.size() % 2)
+ return {pdf_object::END, "unbalanced dictionary"};
+ std::map<std::string, pdf_object> dict;
+ for (size_t i = 0; i < array.size(); i += 2) {
+ if (array[i].type != pdf_object::NAME)
+ return {pdf_object::END, "invalid dictionary key type"};
+ dict.insert({array[i].string, std::move(array[i + 1])});
+ }
+ return dict;
+ }
+ case pdf_object::KEYWORD:
+ // Appears in the document body, typically needs to access the cross-reference table
+ // TODO use the xref to read /Length etc. once we actually need to read such objects;
+ // presumably streams can use the pdf_object::string member
+ if (token.string == "stream") return {pdf_object::END, "streams are not supported yet"};
+ if (token.string == "obj") return parse_obj(lex, stack);
+ if (token.string == "R") return parse_R(stack);
+ return token;
+ default:
+ return token;
+ }
+}
+
+/// Load one cross-reference section: the "xref" keyword followed by any number
+/// of subsections, stopping just before the "trailer" keyword.
+/// Entries already present in loaded_entries (i.e., from newer sections) take
+/// precedence and are not overwritten.
+/// Returns an error message, or an empty string on success.
+std::string pdf_updater::load_xref(pdf_lexer& lex, std::set<uint>& loaded_entries) {
+ std::vector<pdf_object> throwaway_stack;
+ {
+ auto keyword = parse(lex, throwaway_stack);
+ if (keyword.type != pdf_object::KEYWORD || keyword.string != "xref")
+ return "invalid xref table";
+ }
+ while (1) {
+ auto object = parse(lex, throwaway_stack);
+ if (object.type == pdf_object::END)
+ return pdf_error(object, "unexpected EOF while looking for the trailer");
+ if (object.type == pdf_object::KEYWORD && object.string == "trailer")
+ break;
+
+ // Each subsection starts with "first-object-number entry-count"
+ auto second = parse(lex, throwaway_stack);
+ if (!object.is_integer() || object.number < 0 || object.number > UINT_MAX ||
+ !second.is_integer() || second.number < 0 || second.number > UINT_MAX)
+ return "invalid xref section header";
+
+ const size_t start = object.number;
+ const size_t count = second.number;
+ for (size_t i = 0; i < count; i++) {
+ // An entry is "offset generation n|f"; offsets must lie within the
+ // document and generations fit in 16 bits (see 3.4.3 in PDF Reference)
+ auto off = parse(lex, throwaway_stack);
+ auto gen = parse(lex, throwaway_stack);
+ auto key = parse(lex, throwaway_stack);
+ if (!off.is_integer() || off.number < 0 || off.number > document.length() ||
+ !gen.is_integer() || gen.number < 0 || gen.number > 65535 ||
+ key.type != pdf_object::KEYWORD)
+ return "invalid xref entry";
+
+ bool free = true;
+ if (key.string == "n")
+ free = false;
+ else if (key.string != "f")
+ return "invalid xref entry";
+
+ // Do not let older sections override entries loaded from newer ones
+ auto n = start + i;
+ if (loaded_entries.count(n))
+ continue;
+ if (n >= xref.size())
+ xref.resize(n + 1);
+ loaded_entries.insert(n);
+
+ auto& ref = xref[n];
+ ref.generation = gen.number;
+ ref.offset = off.number;
+ ref.free = free;
+ }
+ }
+ return "";
+}
+
+// -------------------------------------------------------------------------------------------------
+
+/// Locate the newest cross-reference section through the trailing "startxref",
+/// then load the entire chain of sections linked through /Prev, filling in the
+/// xref, trailer and xref_size members.
+/// Returns an error message, or an empty string on success.
+std::string pdf_updater::initialize() {
+ // We only need to look for startxref roughly within the last kibibyte of the document
+ // (the greedy [\s\S]* prefix makes the regex match the last startxref in the window)
+ static std::regex haystack_re(R"([\s\S]*\sstartxref\s+(\d+)\s+%%EOF)");
+ std::string haystack = document.substr(document.length() < 1024 ? 0 : document.length() - 1024);
+
+ std::smatch m;
+ if (!std::regex_search(haystack, m, haystack_re, std::regex_constants::match_continuous))
+ return "cannot find startxref";
+
+ size_t xref_offset = std::stoul(m.str(1)), last_xref_offset = xref_offset;
+ std::set<size_t> loaded_xrefs;
+ std::set<uint> loaded_entries;
+
+ std::vector<pdf_object> throwaway_stack;
+ while (1) {
+ // Guard against malformed documents: the /Prev chain must neither loop
+ // nor point outside the file
+ if (loaded_xrefs.count(xref_offset))
+ return "circular xref offsets";
+ if (xref_offset >= document.length())
+ return "invalid xref offset";
+
+ pdf_lexer lex(document.c_str() + xref_offset);
+ auto err = load_xref(lex, loaded_entries);
+ if (!err.empty()) return err;
+
+ auto trailer = parse(lex, throwaway_stack);
+ if (trailer.type != pdf_object::DICT)
+ return pdf_error(trailer, "invalid trailer dictionary");
+ // Only the trailer of the newest section becomes the basis for ours
+ if (loaded_xrefs.empty())
+ this->trailer = trailer.dict;
+ loaded_xrefs.insert(xref_offset);
+
+ const auto prev_offset = trailer.dict.find("Prev");
+ if (prev_offset == trailer.dict.end())
+ break;
+ // FIXME do not read offsets and sizes as floating point numbers
+ if (!prev_offset->second.is_integer() || prev_offset->second.number < 0)
+ return "invalid Prev offset";
+ xref_offset = prev_offset->second.number;
+ }
+
+ // Continue the chain: our update's trailer points back at the newest
+ // pre-existing cross-reference section
+ trailer["Prev"] = {pdf_object::NUMERIC, double(last_xref_offset)};
+ const auto last_size = trailer.find("Size");
+ if (last_size == trailer.end() || !last_size->second.is_integer() ||
+ last_size->second.number <= 0)
+ return "invalid or missing cross-reference table Size";
+
+ xref_size = last_size->second.number;
+ return "";
+}
+
+/// Return the document's version multiplied by ten (e.g., 16 for PDF 1.6),
+/// taken from the optional /Version name of the /Root dictionary when present,
+/// otherwise from the %PDF-x.y header comment. Returns 0 when undeterminable.
+int pdf_updater::version(const pdf_object& root) const {
+  auto version = root.dict.find("Version");
+  if (version != root.dict.end() && version->second.type == pdf_object::NAME) {
+    const auto& v = version->second.string;
+    // Check the length explicitly instead of relying on short-circuiting
+    // around operator[]'s NUL terminator, and cast for isdigit(),
+    // whose behaviour is undefined for negative char values
+    if (v.size() == 3 && isdigit((unsigned char) v[0]) && v[1] == '.' &&
+        isdigit((unsigned char) v[2]))
+      return (v[0] - '0') * 10 + (v[2] - '0');
+  }
+
+  // We only need to look for the comment roughly within the first kibibyte of the document
+  static std::regex version_re(R"((?:^|[\r\n])%(?:!PS-Adobe-\d\.\d )?PDF-(\d)\.(\d)[\r\n])");
+  std::string haystack = document.substr(0, 1024);
+
+  std::smatch m;
+  if (std::regex_search(haystack, m, version_re, std::regex_constants::match_default))
+    return std::stoul(m.str(1)) * 10 + std::stoul(m.str(2));
+
+  return 0;
+}
+
+/// Retrieve an object by its number and generation, parsing it out of the
+/// document at its cross-referenced offset.
+/// Returns a NIL object when the reference is unknown, freed, or mismatched,
+/// or an END object carrying an error message on parse failure.
+pdf_object pdf_updater::get(uint n, uint generation) const {
+ if (n >= xref_size)
+ return {pdf_object::NIL};
+
+ const auto& ref = xref[n];
+ if (ref.free || ref.generation != generation || ref.offset >= document.length())
+ return {pdf_object::NIL};
+
+ pdf_lexer lex(document.c_str() + ref.offset);
+ std::vector<pdf_object> stack;
+ while (1) {
+ // Keep accumulating tokens until parse() resolves an "obj" keyword
+ // into a complete OBJECT (or fails)
+ auto object = parse(lex, stack);
+ if (object.type == pdf_object::END)
+ return object;
+ if (object.type != pdf_object::OBJECT)
+ stack.push_back(std::move(object));
+ else if (object.n != n || object.generation != generation)
+ return {pdf_object::END, "object mismatch"};
+ else
+ // The OBJECT wraps its contents as the sole element of "array"
+ return std::move(object.array.at(0));
+ }
+}
+
+/// Reserve a fresh object number, growing the cross-reference table as needed.
+uint pdf_updater::allocate() {
+  assert(xref_size < UINT_MAX);
+
+  const auto next = xref_size++;
+  if (xref.size() < xref_size)
+    xref.resize(xref_size);
+
+  // No subsection is guaranteed for it in the update yet, since no attempt
+  // is made at repairing the linked list of free items either
+  return next;
+}
+
+/// Append an updated version of object "n" to the document, with the body
+/// produced by the "fill" callback, and mark it for the next xref section.
+void pdf_updater::update(uint n, std::function<void()> fill) {
+  auto& ref = xref.at(n);
+  ref.free = false;
+  // The object body starts right past the separating newline
+  ref.offset = document.length() + 1;
+  updated.insert(n);
+
+  document += ssprintf("\n%u %u obj\n", n, ref.generation);
+  // The callback runs after the header so it may consult document.length()
+  // for the current offset
+  fill();
+  document += "\nendobj";
+}
+
+/// Append a cross-reference section covering all objects registered through
+/// update(), followed by the trailer dictionary and "startxref".
+void pdf_updater::flush_updates() {
+ // Group consecutive object numbers into subsections ("start count" pairs);
+ // "updated" is an ordered set, so a single pass suffices
+ std::map<uint, size_t> groups;
+ for (auto i = updated.cbegin(); i != updated.cend(); ) {
+ size_t start = *i, count = 1;
+ while (++i != updated.cend() && *i == start + count)
+ count++;
+ groups[start] = count;
+ }
+
+ // Taking literally "Each cross-reference section begins with a line containing the keyword xref.
+ // Following this line are one or more cross-reference subsections." from 3.4.3 in PDF Reference
+ if (groups.empty())
+ groups[0] = 0;
+
+ // The offset skips past the separating newline appended below
+ auto startxref = document.length() + 1;
+ document += "\nxref\n";
+ for (const auto& g : groups) {
+ document += ssprintf("%u %zu\n", g.first, g.second);
+ for (size_t i = 0; i < g.second; i++) {
+ // Fixed-width entry format: 10-digit offset, 5-digit generation, n/f
+ auto& ref = xref[g.first + i];
+ document += ssprintf("%010zu %05u %c \n", ref.offset, ref.generation, "nf"[!!ref.free]);
+ }
+ }
+
+ trailer["Size"] = {pdf_object::NUMERIC, double(xref_size)};
+ document +=
+ "trailer\n" + pdf_serialize(trailer) + ssprintf("\nstartxref\n%zu\n%%%%EOF\n", startxref);
+}
+
+// -------------------------------------------------------------------------------------------------
+
+/// Make a PDF object representing the given point in time
+static pdf_object pdf_date(time_t timestamp) {
+ struct tm parts;
+ assert(localtime_r(&timestamp, &parts));
+
+ char buf[64];
+ assert(strftime(buf, sizeof buf, "D:%Y%m%d%H%M%S", &parts));
+
+ std::string offset = "Z";
+ auto offset_min = parts.tm_gmtoff / 60;
+ if (parts.tm_gmtoff < 0)
+ offset = ssprintf("-%02ld'%02ld'", -offset_min / 60, -offset_min % 60);
+ if (parts.tm_gmtoff > 0)
+ offset = ssprintf("+%02ld'%02ld'", +offset_min / 60, +offset_min % 60);
+ return {pdf_object::STRING, buf + offset};
+}
+
+/// Walk down the page tree from the given node, returning the dictionary of
+/// the document's first page (with n/generation filled in), or NIL on failure.
+static pdf_object pdf_get_first_page(pdf_updater& pdf, uint node_n, uint node_generation) {
+ auto obj = pdf.get(node_n, node_generation);
+ if (obj.type != pdf_object::DICT)
+ return {pdf_object::NIL};
+
+ // Out of convenience; these aren't filled normally
+ obj.n = node_n;
+ obj.generation = node_generation;
+
+ // A /Page node terminates the search; only /Pages nodes are descended into
+ auto type = obj.dict.find("Type");
+ if (type == obj.dict.end() || type->second.type != pdf_object::NAME)
+ return {pdf_object::NIL};
+ if (type->second.string == "Page")
+ return obj;
+ if (type->second.string != "Pages")
+ return {pdf_object::NIL};
+
+ // XXX technically speaking, this may be an indirect reference. The correct way to solve this
+ // seems to be having "pdf_updater" include a wrapper around "obj.dict.find"
+ auto kids = obj.dict.find("Kids");
+ if (kids == obj.dict.end() || kids->second.type != pdf_object::ARRAY ||
+ kids->second.array.empty() ||
+ kids->second.array.at(0).type != pdf_object::REFERENCE)
+ return {pdf_object::NIL};
+
+ // XXX nothing prevents us from recursing in an evil circular graph
+ return pdf_get_first_page(pdf, kids->second.array.at(0).n, kids->second.array.at(0).generation);
+}
+
+// -------------------------------------------------------------------------------------------------
+
+static std::string pkcs12_path, pkcs12_pass;
+
+// /All/ bytes are checked, except for the signature hexstring itself
+/// Compute a PKCS#7 detached signature over the document, excluding the
+/// reserved hexstring window at [sign_off, sign_off + sign_len), and write it
+/// into that window as hexadecimal. Returns an error message, or "" on success.
+static std::string pdf_fill_in_signature(std::string& document, size_t sign_off, size_t sign_len) {
+ size_t tail_off = sign_off + sign_len, tail_len = document.size() - tail_off;
+ if (pkcs12_path.empty())
+ return "undefined path to the signing key";
+
+ auto pkcs12_fp = fopen(pkcs12_path.c_str(), "r");
+ if (!pkcs12_fp)
+ return pkcs12_path + ": " + strerror(errno);
+
+ // Abandon hope, all ye who enter OpenSSL! Half of it is undocumented.
+ OpenSSL_add_all_algorithms();
+ ERR_load_crypto_strings();
+ ERR_clear_error();
+
+ // All resources are declared up front so the error path can free them all
+ PKCS12* p12 = nullptr;
+ EVP_PKEY* private_key = nullptr;
+ X509* certificate = nullptr;
+ STACK_OF(X509)* chain = nullptr;
+ PKCS7* p7 = nullptr;
+ int len = 0, sign_flags = PKCS7_DETACHED | PKCS7_BINARY | PKCS7_NOSMIMECAP | PKCS7_PARTIAL;
+ BIO* p7bio = nullptr;
+ unsigned char* buf = nullptr;
+
+ // OpenSSL error reasons will usually be of more value than any distinction I can come up with
+ std::string err = "OpenSSL failure";
+
+ if (!(p12 = d2i_PKCS12_fp(pkcs12_fp, nullptr)) ||
+ !PKCS12_parse(p12, pkcs12_pass.c_str(), &private_key, &certificate, &chain)) {
+ err = pkcs12_path + ": parse failure";
+ goto error;
+ }
+ if (!private_key || !certificate) {
+ err = pkcs12_path + ": must contain a private key and a valid certificate chain";
+ goto error;
+ }
+ // Prevent useless signatures -- makes pdfsig from poppler happy at least (and NSS by extension)
+ if (!(X509_get_key_usage(certificate) & (KU_DIGITAL_SIGNATURE | KU_NON_REPUDIATION))) {
+ err = "the certificate's key usage must include digital signatures or non-repudiation";
+ goto error;
+ }
+ if (!(X509_get_extended_key_usage(certificate) & (XKU_SMIME | XKU_ANYEKU))) {
+ err = "the certificate's extended key usage must include S/MIME";
+ goto error;
+ }
+#if 0 // This happily ignores XKU_ANYEKU and I want my tiny world to make a tiny bit more sense
+ if (X509_check_purpose(certificate, X509_PURPOSE_SMIME_SIGN, false /* not a CA certificate */)) {
+ err = "the certificate can't be used for S/MIME digital signatures";
+ goto error;
+ }
+#endif
+
+ // The default digest is SHA1, which is mildly insecure now -- hence using PKCS7_sign_add_signer
+ if (!(p7 = PKCS7_sign(nullptr, nullptr, nullptr, nullptr, sign_flags)) ||
+ !PKCS7_sign_add_signer(p7, certificate, private_key, EVP_sha256(), sign_flags))
+ goto error;
+ // For RFC 3161, this is roughly how a timestamp token would be attached (see Appendix A):
+ // PKCS7_add_attribute(signer_info, NID_id_smime_aa_timeStampToken, V_ASN1_SEQUENCE, value)
+ for (int i = 0; i < sk_X509_num(chain); i++)
+ if (!PKCS7_add_certificate(p7, sk_X509_value(chain, i)))
+ goto error;
+
+ // Adaptation of the innards of the undocumented PKCS7_final() -- I didn't feel like making
+ // a copy of the whole document. Hopefully this writes directly into a digest BIO.
+ if (!(p7bio = PKCS7_dataInit(p7, nullptr)) ||
+ (ssize_t) sign_off != BIO_write(p7bio, document.data(), sign_off) ||
+ (ssize_t) tail_len != BIO_write(p7bio, document.data() + tail_off, tail_len) ||
+ BIO_flush(p7bio) != 1 || !PKCS7_dataFinal(p7, p7bio))
+ goto error;
+
+#if 0
+ {
+ // Debugging: openssl cms -inform PEM -in pdf_signature.pem -noout -cmsout -print
+ // Context: https://stackoverflow.com/a/29253469
+ auto fp = fopen("pdf_signature.pem", "wb");
+ assert(PEM_write_PKCS7(fp, p7) && !fclose(fp));
+ }
+#endif
+
+ if ((len = i2d_PKCS7(p7, &buf)) < 0)
+ goto error;
+ if (size_t(len) * 2 > sign_len - 2 /* hexstring quotes */) {
+ // The obvious solution is to increase the allocation... or spend a week reading specifications
+ // while losing all faith in humanity as a species, and skip the PKCS7 API entirely
+ err = ssprintf("not enough space reserved for the signature (%zu nibbles vs %zu nibbles)",
+ sign_len - 2, size_t(len) * 2);
+ goto error;
+ }
+ // Write the DER blob as lowercase hex between the hexstring quotes
+ // (sign_off points at the opening "<", hence the +1/+2 offsets)
+ for (int i = 0; i < len; i++) {
+ document[sign_off + 2 * i + 1] = "0123456789abcdef"[buf[i] / 16];
+ document[sign_off + 2 * i + 2] = "0123456789abcdef"[buf[i] % 16];
+ }
+ err.clear();
+
+error:
+ // All of these deallocators accept nullptr arguments
+ OPENSSL_free(buf);
+ BIO_free_all(p7bio);
+ PKCS7_free(p7);
+ sk_X509_pop_free(chain, X509_free);
+ X509_free(certificate);
+ EVP_PKEY_free(private_key);
+ PKCS12_free(p12);
+
+ // In any case, clear the error stack (it's a queue, really) to avoid confusion elsewhere
+ while (auto code = ERR_get_error())
+ if (auto reason = ERR_reason_error_string(code))
+ err = err + "; " + reason;
+
+ fclose(pkcs12_fp);
+ return err;
+}
+
+// -------------------------------------------------------------------------------------------------
+
+/// The presumption here is that the document is valid and that it doesn't employ cross-reference
+/// streams from PDF 1.5, or at least constitutes a hybrid-reference file. The results with
+/// PDF 2.0 (2017) are currently unknown as the standard costs money.
+///
+/// https://www.adobe.com/devnet-docs/acrobatetk/tools/DigSig/Acrobat_DigitalSignatures_in_PDF.pdf
+/// https://www.adobe.com/content/dam/acom/en/devnet/acrobat/pdfs/pdf_reference_1-7.pdf
+/// https://www.adobe.com/content/dam/acom/en/devnet/acrobat/pdfs/PPKAppearances.pdf
+static std::string pdf_sign(std::string& document, ushort reservation) {
+ pdf_updater pdf(document);
+ auto err = pdf.initialize();
+ if (!err.empty())
+ return err;
+
+ auto root_ref = pdf.trailer.find("Root");
+ if (root_ref == pdf.trailer.end() || root_ref->second.type != pdf_object::REFERENCE)
+ return "trailer does not contain a reference to Root";
+ auto root = pdf.get(root_ref->second.n, root_ref->second.generation);
+ if (root.type != pdf_object::DICT)
+ return "invalid Root dictionary reference";
+
+ // 8.7 Digital Signatures - /signature dictionary/
+ // Append the signature dictionary with fixed-size placeholders for
+ // /ByteRange and /Contents, whose values can only be known afterwards
+ auto sigdict_n = pdf.allocate();
+ size_t byterange_off = 0, byterange_len = 0, sign_off = 0, sign_len = 0;
+ pdf.update(sigdict_n, [&] {
+ // The timestamp is important for Adobe Acrobat Reader DC. The ideal would be to use RFC 3161.
+ pdf.document.append("<< /Type/Sig /Filter/Adobe.PPKLite /SubFilter/adbe.pkcs7.detached\n"
+ " /M" + pdf_serialize(pdf_date(time(nullptr))) + " /ByteRange ");
+ byterange_off = pdf.document.size();
+ pdf.document.append((byterange_len = 32 /* fine for a gigabyte */), ' ');
+ pdf.document.append("\n /Contents <");
+ sign_off = pdf.document.size();
+ pdf.document.append((sign_len = reservation * 2), '0');
+ pdf.document.append("> >>");
+
+ // We actually need to exclude the hexstring quotes from signing
+ sign_off -= 1;
+ sign_len += 2;
+ });
+
+ // 8.6.3 Field Types - Signature Fields
+ pdf_object sigfield{pdf_object::DICT};
+ sigfield.dict.insert({"FT", {pdf_object::NAME, "Sig"}});
+ sigfield.dict.insert({"V", {pdf_object::REFERENCE, sigdict_n, 0}});
+ // 8.4.5 Annotations Types - Widget Annotations
+ // We can merge the Signature Annotation and omit Kids here
+ sigfield.dict.insert({"Subtype", {pdf_object::NAME, "Widget"}});
+ sigfield.dict.insert({"F", {pdf_object::NUMERIC, 2 /* Hidden */}});
+ sigfield.dict.insert({"T", {pdf_object::STRING, "Signature1"}});
+ // A zero-sized rectangle, since the annotation is hidden anyway
+ sigfield.dict.insert({"Rect", {std::vector<pdf_object>{
+ {pdf_object::NUMERIC, 0},
+ {pdf_object::NUMERIC, 0},
+ {pdf_object::NUMERIC, 0},
+ {pdf_object::NUMERIC, 0},
+ }}});
+
+ auto sigfield_n = pdf.allocate();
+ pdf.update(sigfield_n, [&] { pdf.document += pdf_serialize(sigfield); });
+
+ // The annotation must be attached to the first page of the document
+ auto pages_ref = root.dict.find("Pages");
+ if (pages_ref == root.dict.end() || pages_ref->second.type != pdf_object::REFERENCE)
+ return "invalid Pages reference";
+ auto page = pdf_get_first_page(pdf, pages_ref->second.n, pages_ref->second.generation);
+ if (page.type != pdf_object::DICT)
+ return "invalid or unsupported page tree";
+
+ auto& annots = page.dict["Annots"];
+ if (annots.type != pdf_object::ARRAY) {
+ // TODO indirectly referenced arrays might not be that hard to support
+ if (annots.type != pdf_object::END)
+ return "unexpected Annots";
+
+ annots = {pdf_object::ARRAY};
+ }
+ annots.array.emplace_back(pdf_object::REFERENCE, sigfield_n, 0);
+ pdf.update(page.n, [&] { pdf.document += pdf_serialize(page); });
+
+ // 8.6.1 Interactive Form Dictionary
+ if (root.dict.count("AcroForm"))
+ return "the document already contains forms, they would be overwritten";
+
+ root.dict["AcroForm"] = {std::map<std::string, pdf_object>{
+ {"Fields", {std::vector<pdf_object>{
+ {pdf_object::REFERENCE, sigfield_n, 0}
+ }}},
+ {"SigFlags", {pdf_object::NUMERIC, 3 /* SignaturesExist | AppendOnly */}}
+ }};
+
+ // Upgrade the document version for SHA-256 etc.
+ if (pdf.version(root) < 16)
+ root.dict["Version"] = {pdf_object::NAME, "1.6"};
+
+ pdf.update(root_ref->second.n, [&] { pdf.document += pdf_serialize(root); });
+ pdf.flush_updates();
+
+ // Now that we know the length of everything, store byte ranges of what we're about to sign,
+ // which must be everything but the resulting signature itself
+ size_t tail_off = sign_off + sign_len, tail_len = pdf.document.size() - tail_off;
+ auto ranges = ssprintf("[0 %zu %zu %zu]", sign_off, tail_off, tail_len);
+ if (ranges.length() > byterange_len)
+ return "not enough space reserved for /ByteRange";
+ pdf.document.replace(byterange_off, std::min(ranges.length(), byterange_len), ranges);
+ return pdf_fill_in_signature(pdf.document, sign_off, sign_len);
+}
+
+// -------------------------------------------------------------------------------------------------
+
+__attribute__((format(printf, 2, 3)))
+static void die(int status, const char* format, ...) {
+ va_list ap;
+ va_start(ap, format);
+ if (isatty(fileno(stderr)))
+ vfprintf(stderr, ssprintf("\x1b[31m%s\x1b[0m\n", format).c_str(), ap);
+ else
+ vfprintf(stderr, format, ap);
+ va_end(ap);
+ exit(status);
+}
+
+int main(int argc, char* argv[]) {
+ auto invocation_name = argv[0];
+ auto usage = [=] {
+ die(1, "Usage: %s [-h] [-r RESERVATION] INPUT-FILENAME OUTPUT-FILENAME PKCS12-PATH PKCS12-PASS",
+ invocation_name);
+ };
+
+ static struct option opts[] = {
+ {"help", no_argument, 0, 'h'},
+ {"version", no_argument, 0, 'V'},
+ {"reservation", required_argument, 0, 'r'},
+ {nullptr, 0, 0, 0},
+ };
+
+ // Reserved space in bytes for the certificate, digest, encrypted digest, ...
+ long reservation = 4096;
+ while (1) {
+ int option_index = 0;
+ auto c = getopt_long(argc, const_cast<char* const*>(argv), "hVr:", opts, &option_index);
+ if (c == -1)
+ break;
+
+ char* end = nullptr;
+ switch (c) {
+ case 'r':
+ // strtol() signals failure through errno as well as the end pointer
+ errno = 0, reservation = strtol(optarg, &end, 10);
+ if (errno || *end || reservation <= 0 || reservation > USHRT_MAX)
+ die(1, "%s: must be a positive number", optarg);
+ break;
+ case 'V':
+ die(0, "%s", PROJECT_NAME " " PROJECT_VERSION);
+ break;
+ case 'h':
+ default:
+ usage();
+ }
+ }
+
+ argv += optind;
+ argc -= optind;
+
+ if (argc != 4)
+ usage();
+
+ const char* input_path = argv[0];
+ const char* output_path = argv[1];
+ pkcs12_path = argv[2];
+ pkcs12_pass = argv[3];
+
+ // Read the whole input document into memory
+ std::string pdf_document;
+ if (auto fp = fopen(input_path, "rb")) {
+ int c;
+ while ((c = fgetc(fp)) != EOF)
+ pdf_document += c;
+ if (ferror(fp))
+ die(1, "%s: %s", input_path, strerror(errno));
+ fclose(fp);
+ } else {
+ die(1, "%s: %s", input_path, strerror(errno));
+ }
+
+ // All modifications happen in memory; the input file is left untouched
+ auto err = pdf_sign(pdf_document, ushort(reservation));
+ if (!err.empty()) {
+ die(2, "Error: %s", err.c_str());
+ }
+
+ // Remove the output file on failure rather than leave a partial write behind
+ if (auto fp = fopen(output_path, "wb")) {
+ auto written = fwrite(pdf_document.c_str(), pdf_document.size(), 1, fp);
+ if (fclose(fp) || written != 1) {
+ (void) unlink(output_path);
+ die(3, "%s: %s", output_path, strerror(errno));
+ }
+ } else {
+ die(3, "%s: %s", output_path, strerror(errno));
+ }
+ return 0;
+}
diff --git a/pdf/pdf.go b/pdf/pdf.go
new file mode 100644
index 0000000..1fcdaa4
--- /dev/null
+++ b/pdf/pdf.go
@@ -0,0 +1,1663 @@
+//
+// Copyright (c) 2018 - 2024, Přemysl Eric Janouch <p@janouch.name>
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+//
+
+// Package pdf signs PDF documents and provides some processing utilities.
+package pdf
+
+import (
+ "bytes"
+ "compress/zlib"
+ "encoding/binary"
+ "encoding/hex"
+ "errors"
+ "fmt"
+ "math"
+ "regexp"
+ "sort"
+ "strconv"
+ "strings"
+ "time"
+
+ "crypto"
+ "crypto/ecdsa"
+ "crypto/rsa"
+ "crypto/x509"
+
+ "go.mozilla.org/pkcs7"
+ "golang.org/x/crypto/pkcs12"
+)
+
+// ObjectKind distinguishes the kinds of tokens and objects produced by
+// the Lexer and the parser.
+type ObjectKind int
+
+const (
+	// End marks the end of input, or is combined with an error.
+	End ObjectKind = iota
+	NL
+	Comment
+	Nil
+	Bool
+	Numeric
+	Keyword
+	Name
+	String
+
+	// simple tokens
+	BArray
+	EArray
+	BDict
+	EDict
+
+	// higher-level objects
+	Array
+	Dict
+	Stream
+	Indirect
+	Reference
+)
+
+// Object is a PDF token/object thingy. Objects may be composed either from
+// one or a sequence of tokens. The PDF Reference doesn't actually speak
+// of tokens, though ISO 32000-1:2008 does.
+type Object struct {
+	Kind ObjectKind
+
+	String        string            // Comment/Keyword/Name/String
+	Number        float64           // Bool, Numeric
+	Array         []Object          // Array, Indirect
+	Dict          map[string]Object // Dict, Stream
+	Stream        []byte            // Stream
+	N, Generation uint              // Indirect, Reference
+}
+
+// IsInteger checks if the PDF object is an integer number.
+func (o *Object) IsInteger() bool {
+	if o.Kind != Numeric {
+		return false
+	}
+	_, frac := math.Modf(o.Number)
+	return frac == 0
+}
+
+// IsUint checks if the PDF object is an integer number that fits into a uint.
+func (o *Object) IsUint() bool {
+	return o.IsInteger() &&
+		o.Number >= 0 && o.Number <= float64(^uint(0))
+}
+
+// A slew of constructors that will hopefully get all inlined.
+
+// New returns a new Object of the given kind, with default values.
+func New(kind ObjectKind) Object { return Object{Kind: kind} }
+
+// NewComment returns a new Comment Object with the given text.
+func NewComment(c string) Object { return Object{Kind: Comment, String: c} }
+
+// NewKeyword returns a new Keyword Object with the given text.
+func NewKeyword(k string) Object { return Object{Kind: Keyword, String: k} }
+
+// NewBool returns a new Bool Object; the value is stored in the Number field.
+func NewBool(b bool) Object {
+	var b64 float64
+	if b {
+		b64 = 1
+	}
+	return Object{Kind: Bool, Number: b64}
+}
+
+// NewNumeric returns a new Numeric Object with the given value.
+func NewNumeric(n float64) Object { return Object{Kind: Numeric, Number: n} }
+
+// NewName returns a new Name Object with the given (unescaped) text.
+func NewName(n string) Object { return Object{Kind: Name, String: n} }
+
+// NewString returns a new String Object with the given (unescaped) text.
+func NewString(s string) Object { return Object{Kind: String, String: s} }
+
+// NewArray returns a new Array Object with the given elements.
+func NewArray(a []Object) Object {
+	return Object{Kind: Array, Array: a}
+}
+
+// NewDict returns a new Dict Object; a nil map is replaced by an empty one.
+func NewDict(d map[string]Object) Object {
+	if d == nil {
+		d = make(map[string]Object)
+	}
+	return Object{Kind: Dict, Dict: d}
+}
+
+// NewStream returns a new Stream Object with the given dictionary and data;
+// a nil map is replaced by an empty one.
+func NewStream(d map[string]Object, s []byte) Object {
+	if d == nil {
+		d = make(map[string]Object)
+	}
+	return Object{Kind: Stream, Dict: d, Stream: s}
+}
+
+// NewIndirect wraps an Object in a new Indirect Object with the given
+// object number and generation.
+func NewIndirect(o Object, n, generation uint) Object {
+	return Object{Kind: Indirect, N: n, Generation: generation,
+		Array: []Object{o}}
+}
+
+// NewReference returns a new Reference Object pointing at the given
+// object number and generation.
+func NewReference(n, generation uint) Object {
+	return Object{Kind: Reference, N: n, Generation: generation}
+}
+
+// newError pairs an End Object with a new error carrying the given message.
+func newError(msg string) (Object, error) { return New(End), errors.New(msg) }
+
+// -----------------------------------------------------------------------------
+
+// Character classes used by the Lexer; delimiters and whitespace terminate
+// most other tokens.
+const (
+	octAlphabet = "01234567"
+	decAlphabet = "0123456789"
+	hexAlphabet = "0123456789abcdefABCDEF"
+	whitespace  = "\t\n\f\r "
+	delimiters  = "()<>[]{}/%"
+)
+
+// Lexer is a basic lexical analyser for the Portable Document Format,
+// giving limited error information.
+type Lexer struct {
+	P []byte // input buffer
+}
+
+// read consumes and returns the next byte of input, if any remains.
+func (lex *Lexer) read() (byte, bool) {
+	if len(lex.P) == 0 {
+		return 0, false
+	}
+	ch := lex.P[0]
+	lex.P = lex.P[1:]
+	return ch, true
+}
+
+// peek returns the next byte of input without consuming it, if any remains.
+func (lex *Lexer) peek() (byte, bool) {
+	if len(lex.P) == 0 {
+		return 0, false
+	}
+	return lex.P[0], true
+}
+
+// eatNewline tells whether the already-consumed ch starts a newline sequence,
+// also consuming the LF of a CR LF pair.
+func (lex *Lexer) eatNewline(ch byte) bool {
+	switch ch {
+	case '\n':
+		return true
+	case '\r':
+		if next, _ := lex.peek(); next == '\n' {
+			lex.read()
+		}
+		return true
+	}
+	return false
+}
+
+// unescape decodes a byte that followed a backslash in a literal string:
+// either a named escape, a one to three digit octal code, or the byte itself.
+// Both the backslash and ch have already been consumed by the caller.
+func (lex *Lexer) unescape(ch byte) byte {
+	switch ch {
+	case 'n':
+		return '\n'
+	case 'r':
+		return '\r'
+	case 't':
+		return '\t'
+	case 'b':
+		return '\b'
+	case 'f':
+		return '\f'
+	}
+	if strings.IndexByte(octAlphabet, ch) >= 0 {
+		// ch is the first octal digit; up to two more may follow
+		// (the unconditional lex.read() that used to sit here would
+		// wrongly discard the byte after a short escape such as "\0")
+		octal := []byte{ch}
+		if ch, _ := lex.peek(); strings.IndexByte(octAlphabet, ch) >= 0 {
+			octal = append(octal, ch)
+			lex.read()
+		}
+		if ch, _ := lex.peek(); strings.IndexByte(octAlphabet, ch) >= 0 {
+			octal = append(octal, ch)
+			lex.read()
+		}
+		u, _ := strconv.ParseUint(string(octal), 8, 8)
+		return byte(u)
+	}
+	return ch
+}
+
+// string reads a literal string token; the opening '(' has been consumed.
+// Backslash escapes are decoded, newlines normalized to LF, and balanced
+// parentheses are allowed to nest.
+func (lex *Lexer) string() (Object, error) {
+	var value []byte
+	parens := 1
+	for {
+		ch, ok := lex.read()
+		if !ok {
+			return newError("unexpected end of string")
+		}
+		if lex.eatNewline(ch) {
+			ch = '\n'
+		} else if ch == '(' {
+			parens++
+		} else if ch == ')' {
+			if parens--; parens == 0 {
+				break
+			}
+		} else if ch == '\\' {
+			// A backslash before a newline elides both of them
+			if ch, ok = lex.read(); !ok {
+				return newError("unexpected end of string")
+			} else if lex.eatNewline(ch) {
+				continue
+			} else {
+				ch = lex.unescape(ch)
+			}
+		}
+		value = append(value, ch)
+	}
+	return NewString(string(value)), nil
+}
+
+// stringHex reads a hexadecimal string token; the opening '<' has been
+// consumed. Digits are decoded in pairs; a trailing odd digit is padded
+// with a zero.
+func (lex *Lexer) stringHex() (Object, error) {
+	var value, buf []byte
+	for {
+		ch, ok := lex.read()
+		if !ok {
+			return newError("unexpected end of hex string")
+		} else if ch == '>' {
+			break
+		} else if strings.IndexByte(hexAlphabet, ch) < 0 {
+			return newError("invalid hex string")
+		} else if buf = append(buf, ch); len(buf) == 2 {
+			u, _ := strconv.ParseUint(string(buf), 16, 8)
+			value = append(value, byte(u))
+			buf = nil
+		}
+	}
+	// An odd number of digits: the last one is the high nibble
+	if len(buf) > 0 {
+		u, _ := strconv.ParseUint(string(buf)+"0", 16, 8)
+		value = append(value, byte(u))
+	}
+	return NewString(string(value)), nil
+}
+
+// name reads a name token; the initial '/' has been consumed.
+// Any #xx hexadecimal escapes are decoded to the bytes they denote.
+func (lex *Lexer) name() (Object, error) {
+	var value []byte
+	for {
+		ch, ok := lex.peek()
+		if !ok || strings.IndexByte(whitespace+delimiters, ch) >= 0 {
+			break
+		}
+		lex.read()
+		if ch == '#' {
+			var hexa []byte
+			if ch, _ := lex.peek(); strings.IndexByte(hexAlphabet, ch) >= 0 {
+				hexa = append(hexa, ch)
+				lex.read()
+			}
+			if ch, _ := lex.peek(); strings.IndexByte(hexAlphabet, ch) >= 0 {
+				hexa = append(hexa, ch)
+				lex.read()
+			}
+			if len(hexa) != 2 {
+				return newError("invalid name hexa escape")
+			}
+			// Decode the two collected digits (previously this mistakenly
+			// parsed the accumulated name "value" instead of "hexa")
+			u, _ := strconv.ParseUint(string(hexa), 16, 8)
+			ch = byte(u)
+		}
+		value = append(value, ch)
+	}
+	if len(value) == 0 {
+		return newError("unexpected end of name")
+	}
+	return NewName(string(value)), nil
+}
+
+// comment reads the rest of the line following a '%' sign,
+// which has already been consumed.
+func (lex *Lexer) comment() (Object, error) {
+	var text []byte
+	for {
+		ch, ok := lex.peek()
+		if !ok || ch == '\r' || ch == '\n' {
+			break
+		}
+		lex.read()
+		text = append(text, ch)
+	}
+	return NewComment(string(text)), nil
+}
+
+// number reads a numeric token: an optional minus sign followed by decimal
+// digits with at most one decimal point.
+// XXX: Maybe invalid numbers should rather be interpreted as keywords.
+func (lex *Lexer) number() (Object, error) {
+	var value []byte
+	ch, ok := lex.peek()
+	if ch == '-' {
+		value = append(value, ch)
+		lex.read()
+	}
+	real, digits := false, false
+	for {
+		ch, ok = lex.peek()
+		if !ok {
+			break
+		} else if strings.IndexByte(decAlphabet, ch) >= 0 {
+			digits = true
+		} else if ch == '.' && !real {
+			real = true
+		} else {
+			break
+		}
+		value = append(value, ch)
+		lex.read()
+	}
+	// A sign or a point alone does not make a number
+	if !digits {
+		return newError("invalid number")
+	}
+	f, _ := strconv.ParseFloat(string(value), 64)
+	return NewNumeric(f), nil
+}
+
+// Next reads and returns the next token from the input,
+// an End Object at the end of it.
+func (lex *Lexer) Next() (Object, error) {
+	ch, ok := lex.peek()
+	if !ok {
+		return New(End), nil
+	}
+	if strings.IndexByte("-0123456789.", ch) >= 0 {
+		return lex.number()
+	}
+
+	// Collect a run of regular characters; empty means ch is whitespace
+	// or a delimiter, dealt with in the second switch below.
+	// {} end up being keywords, we might want to error out on those.
+	var value []byte
+	for {
+		ch, ok := lex.peek()
+		if !ok || strings.IndexByte(whitespace+delimiters, ch) >= 0 {
+			break
+		}
+		value = append(value, ch)
+		lex.read()
+	}
+	switch v := string(value); v {
+	case "":
+	case "null":
+		return New(Nil), nil
+	case "true":
+		return NewBool(true), nil
+	case "false":
+		return NewBool(false), nil
+	default:
+		return NewKeyword(v), nil
+	}
+
+	switch ch, _ := lex.read(); ch {
+	case '/':
+		return lex.name()
+	case '%':
+		return lex.comment()
+	case '(':
+		return lex.string()
+	case '[':
+		return New(BArray), nil
+	case ']':
+		return New(EArray), nil
+	case '<':
+		// "<<" begins a dictionary, a lone "<" a hexadecimal string
+		if ch, _ := lex.peek(); ch == '<' {
+			lex.read()
+			return New(BDict), nil
+		}
+		return lex.stringHex()
+	case '>':
+		if ch, _ := lex.peek(); ch == '>' {
+			lex.read()
+			return New(EDict), nil
+		}
+		return newError("unexpected '>'")
+	default:
+		if lex.eatNewline(ch) {
+			return New(NL), nil
+		}
+		// Other whitespace is skipped over entirely
+		if strings.IndexByte(whitespace, ch) >= 0 {
+			return lex.Next()
+		}
+		return newError("unexpected input")
+	}
+}
+
+// -----------------------------------------------------------------------------
+
+// Serialize converts the Object back to its textual PDF representation.
+// It panics on Objects of kinds that have no such representation.
+// FIXME: Lines /should not/ be longer than 255 characters,
+// some wrapping is in order.
+func (o *Object) Serialize() string {
+	switch o.Kind {
+	case NL:
+		return "\n"
+	case Nil:
+		return "null"
+	case Bool:
+		if o.Number != 0 {
+			return "true"
+		}
+		return "false"
+	case Numeric:
+		return strconv.FormatFloat(o.Number, 'f', -1, 64)
+	case Keyword:
+		return o.String
+	case Name:
+		escaped := []byte{'/'}
+		for _, ch := range []byte(o.String) {
+			// The escape character itself, delimiters and whitespace
+			// must all be written as #xx escapes (the byte used to be
+			// emitted raw, followed by bare hex digits)
+			if ch == '#' || strings.IndexByte(delimiters+whitespace, ch) >= 0 {
+				escaped = append(escaped, fmt.Sprintf("#%02x", ch)...)
+			} else {
+				escaped = append(escaped, ch)
+			}
+		}
+		return string(escaped)
+	case String:
+		escaped := []byte{'('}
+		for _, ch := range []byte(o.String) {
+			if ch == '\\' || ch == '(' || ch == ')' {
+				escaped = append(escaped, '\\')
+			}
+			escaped = append(escaped, ch)
+		}
+		return string(append(escaped, ')'))
+	case BArray:
+		return "["
+	case EArray:
+		return "]"
+	case BDict:
+		return "<<"
+	case EDict:
+		return ">>"
+	case Array:
+		var v []string
+		for _, i := range o.Array {
+			v = append(v, i.Serialize())
+		}
+		return "[ " + strings.Join(v, " ") + " ]"
+	case Dict:
+		// Sort the keys to get deterministic output
+		b := bytes.NewBuffer(nil)
+		var keys []string
+		for k := range o.Dict {
+			keys = append(keys, k)
+		}
+		sort.Strings(keys)
+		for _, k := range keys {
+			v := o.Dict[k]
+			// FIXME: The key is also supposed to be escaped by Serialize.
+			fmt.Fprint(b, " /", k, " ", v.Serialize())
+		}
+		return "<<" + b.String() + " >>"
+	case Stream:
+		// NB: NewDict doesn't copy the map, so this also sets /Length
+		// in the receiver's dictionary
+		d := NewDict(o.Dict)
+		d.Dict["Length"] = NewNumeric(float64(len(o.Stream)))
+		return d.Serialize() + "\nstream\n" + string(o.Stream) + "\nendstream"
+	case Indirect:
+		return fmt.Sprintf("%d %d obj\n%s\nendobj", o.N, o.Generation,
+			o.Array[0].Serialize())
+	case Reference:
+		return fmt.Sprintf("%d %d R", o.N, o.Generation)
+	default:
+		panic("unsupported token for serialization")
+	}
+}
+
+// -----------------------------------------------------------------------------
+
+// ref is a single cross-reference table entry.
+type ref struct {
+	offset     int64 // file offset, or N of the next free entry, or index
+	generation uint  // object generation
+	compressed *uint // PDF 1.5: N of the containing compressed object
+	nonfree    bool  // whether this N is taken (for a good zero value)
+}
+
+// Updater is a utility class to help read and possibly incrementally update
+// PDF files.
+type Updater struct {
+	// cross-reference table
+	xref []ref
+
+	// current cross-reference table size, correlated to len(xref)
+	xrefSize uint
+
+	// list of updated objects
+	// TODO(p): A map to bool makes this simpler to work with.
+	// The same with another map to struct{} somewhere in this code.
+	updated map[uint]struct{}
+
+	// PDF document data
+	Document []byte
+
+	// the new trailer dictionary to be written, initialized with the old one
+	Trailer map[string]Object
+}
+
+// ListIndirect returns the whole cross-reference table as Reference Objects.
+func (u *Updater) ListIndirect() []Object {
+	refs := []Object{}
+	for n, entry := range u.xref {
+		if entry.nonfree {
+			refs = append(refs, NewReference(uint(n), entry.generation))
+		}
+	}
+	return refs
+}
+
+// parseStream completes a stream object after the "stream" keyword:
+// the stream dictionary is expected on top of the stack, its Length entry
+// is resolved and validated, that many raw bytes are sliced off the lexer's
+// input, and the trailing "endstream" keyword is consumed.
+// Returns the dictionary converted into a Stream object.
+func (u *Updater) parseStream(lex *Lexer, stack *[]Object) (Object, error) {
+ lenStack := len(*stack)
+ if lenStack < 1 {
+  return newError("missing stream dictionary")
+ }
+ dict := (*stack)[lenStack-1]
+ if dict.Kind != Dict {
+  return newError("stream not preceded by a dictionary")
+ }
+
+ *stack = (*stack)[:lenStack-1]
+ length, ok := dict.Dict["Length"]
+ if !ok {
+  return newError("missing stream Length")
+ }
+ // Length may be an indirect reference, so resolve it first.
+ length, err := u.Dereference(length)
+ if err != nil {
+  return length, err
+ }
+ if !length.IsUint() || length.Number > math.MaxInt {
+  return newError("stream Length not an unsigned integer")
+ }
+
+ // Expect exactly one newline.
+ if nl, err := lex.Next(); err != nil {
+  return nl, err
+ } else if nl.Kind != NL {
+  return newError("stream does not start with a newline")
+ }
+
+ size := int(length.Number)
+ if len(lex.P) < size {
+  return newError("stream is longer than the document")
+ }
+
+ // The data is not copied; it aliases the document's byte slice.
+ dict.Kind = Stream
+ dict.Stream = lex.P[:size]
+ lex.P = lex.P[size:]
+
+ // Skip any number of trailing newlines or comments.
+ if end, err := u.parse(lex, stack); err != nil {
+  return end, err
+ } else if end.Kind != Keyword || end.String != "endstream" {
+  return newError("improperly terminated stream")
+ }
+ return dict, nil
+}
+
+// parseIndirect completes an indirect object ("N G obj ... endobj"):
+// the object and generation numbers have already been pushed onto the stack,
+// and the body must contain exactly one object.
+func (u *Updater) parseIndirect(lex *Lexer, stack *[]Object) (Object, error) {
+ lenStack := len(*stack)
+ if lenStack < 2 {
+  return newError("missing object ID pair")
+ }
+
+ n := (*stack)[lenStack-2]
+ g := (*stack)[lenStack-1]
+ *stack = (*stack)[:lenStack-2]
+
+ if !g.IsUint() || !n.IsUint() {
+  return newError("invalid object ID pair")
+ }
+
+ var inner []Object
+ for {
+  object, _ := u.parse(lex, &inner)
+  if object.Kind == End {
+   return newError("object doesn't end")
+  }
+  if object.Kind == Keyword && object.String == "endobj" {
+   break
+  }
+  inner = append(inner, object)
+ }
+ if len(inner) != 1 {
+  return newError("indirect objects must contain exactly one object")
+ }
+ return NewIndirect(inner[0], uint(n.Number), uint(g.Number)), nil
+}
+
+// parseR handles the "R" keyword: the two operands preceding it on the
+// stack form an indirect reference, which replaces them.
+func (u *Updater) parseR(stack *[]Object) (Object, error) {
+ top := len(*stack)
+ if top < 2 {
+  return newError("missing reference ID pair")
+ }
+
+ id, gen := (*stack)[top-2], (*stack)[top-1]
+ *stack = (*stack)[:top-2]
+ if !gen.IsUint() || !id.IsUint() {
+  return newError("invalid reference ID pair")
+ }
+ return NewReference(uint(id.Number), uint(gen.Number)), nil
+}
+
+// parse reads an object at the lexer's position. Not a strict parser.
+//
+// The stack is only consulted by postfix constructs ("obj", "R", "stream"),
+// which consume operands that were parsed before them.
+//
+// TODO(p): We should fix all uses of this not to eat the error.
+func (u *Updater) parse(lex *Lexer, stack *[]Object) (Object, error) {
+ switch token, err := lex.Next(); token.Kind {
+ case NL, Comment:
+  // These are not important to parsing,
+  // not even for this procedure's needs.
+  return u.parse(lex, stack)
+ case BArray:
+  // Collect elements until the matching "]" token.
+  var array []Object
+  for {
+   object, _ := u.parse(lex, &array)
+   if object.Kind == End {
+    return newError("array doesn't end")
+   }
+   if object.Kind == EArray {
+    break
+   }
+   array = append(array, object)
+  }
+  return NewArray(array), nil
+ case BDict:
+  // Collect alternating keys and values until the matching ">>" token.
+  var array []Object
+  for {
+   object, _ := u.parse(lex, &array)
+   if object.Kind == End {
+    return newError("dictionary doesn't end")
+   }
+   if object.Kind == EDict {
+    break
+   }
+   array = append(array, object)
+  }
+  if len(array)%2 != 0 {
+   return newError("unbalanced dictionary")
+  }
+  dict := make(map[string]Object)
+  for i := 0; i < len(array); i += 2 {
+   if array[i].Kind != Name {
+    return newError("invalid dictionary key type")
+   }
+   dict[array[i].String] = array[i+1]
+  }
+  return NewDict(dict), nil
+ case Keyword:
+  switch token.String {
+  case "stream":
+   // Appears in the document body,
+   // typically needs to access the cross-reference table.
+   return u.parseStream(lex, stack)
+  case "obj":
+   return u.parseIndirect(lex, stack)
+  case "R":
+   return u.parseR(stack)
+  }
+  fallthrough
+ default:
+  // Pass all other tokens (and any lexer error) through unchanged.
+  return token, err
+ }
+}
+
+// loadXrefEntry records r as cross-reference entry number n, growing the
+// table as necessary. Because sections are processed from newest to oldest,
+// only the first occurrence of each number is kept.
+func (u *Updater) loadXrefEntry(
+ n uint, r ref, loadedEntries map[uint]struct{}) {
+ if _, seen := loadedEntries[n]; seen {
+  return
+ }
+ loadedEntries[n] = struct{}{}
+
+ if have := uint(len(u.xref)); n >= have {
+  u.xref = append(u.xref, make([]ref, n-have+1)...)
+ }
+ u.xref[n] = r
+}
+
+// loadXrefStream reads a PDF 1.5 cross-reference stream whose "N G obj"
+// header has been partially parsed into stack, filling u.xref with any
+// entries not already loaded from a newer section.
+// Returns the stream's dictionary to be used as a trailer.
+func (u *Updater) loadXrefStream(
+ lex *Lexer, stack []Object, loadedEntries map[uint]struct{}) (
+ Object, error) {
+ var object Object
+ for {
+  var err error
+  if object, err = u.parse(lex, &stack); err != nil {
+   return New(End), fmt.Errorf("invalid xref table: %s", err)
+  } else if object.Kind == End {
+   return newError("invalid xref table")
+  }
+
+  // For the sake of simplicity, keep stacking until we find an object.
+  if object.Kind == Indirect {
+   break
+  }
+
+  stack = append(stack, object)
+ }
+
+ // ISO 32000-2:2020 7.5.8.2 Cross-reference stream dictionary
+ stream := object.Array[0]
+ if stream.Kind != Stream {
+  return newError("invalid xref table")
+ }
+ if typ, ok := stream.Dict["Type"]; !ok ||
+  typ.Kind != Name || typ.String != "XRef" {
+  return newError("invalid xref stream")
+ }
+
+ data, err := u.GetStreamData(stream)
+ if err != nil {
+  return New(End), fmt.Errorf("invalid xref stream: %s", err)
+ }
+
+ size, ok := stream.Dict["Size"]
+ if !ok || !size.IsUint() || size.Number <= 0 {
+  return newError("invalid or missing cross-reference stream Size")
+ }
+
+ // Index lists (start, count) subsection ranges; it defaults to [0 Size].
+ type pair struct{ start, count uint }
+ pairs := []pair{}
+ if index, ok := stream.Dict["Index"]; !ok {
+  pairs = append(pairs, pair{0, uint(size.Number)})
+ } else {
+  if index.Kind != Array || len(index.Array)%2 != 0 {
+   return newError("invalid cross-reference stream Index")
+  }
+
+  a := index.Array
+  for i := 0; i < len(a); i += 2 {
+   if !a[i].IsUint() || !a[i+1].IsUint() {
+    return newError("invalid cross-reference stream Index")
+   }
+   pairs = append(pairs, pair{uint(a[i].Number), uint(a[i+1].Number)})
+  }
+ }
+
+ // W gives the byte widths of the three fields of each entry.
+ w, ok := stream.Dict["W"]
+ if !ok || w.Kind != Array || len(w.Array) != 3 ||
+  !w.Array[0].IsUint() || !w.Array[1].IsUint() || !w.Array[2].IsUint() {
+  return newError("invalid or missing cross-reference stream W")
+ }
+
+ w1 := uint(w.Array[0].Number)
+ w2 := uint(w.Array[1].Number)
+ w3 := uint(w.Array[2].Number)
+ if w2 == 0 {
+  return newError("invalid cross-reference stream W")
+ }
+
+ unit := w1 + w2 + w3
+ if uint(len(data))%unit != 0 {
+  return newError("invalid cross-reference stream length")
+ }
+
+ // readField decodes a big-endian unsigned field of the given byte width.
+ readField := func(data []byte, width uint) (uint, []byte) {
+  var n uint
+  for ; width != 0; width-- {
+   n = n<<8 | uint(data[0])
+   data = data[1:]
+  }
+  return n, data
+ }
+
+ // ISO 32000-2:2020 7.5.8.3 Cross-reference stream data
+ for _, pair := range pairs {
+  for i := uint(0); i < pair.count; i++ {
+   if uint(len(data)) < unit {
+    return newError("premature cross-reference stream EOF")
+   }
+
+   // Field 1 defaults to type 1 (in-use entry) when W[0] is zero.
+   var f1, f2, f3 uint = 1, 0, 0
+   if w1 > 0 {
+    f1, data = readField(data, w1)
+   }
+   f2, data = readField(data, w2)
+   if w3 > 0 {
+    f3, data = readField(data, w3)
+   }
+
+   var r ref
+   switch f1 {
+   case 0: // free entry
+    r.offset = int64(f2)
+    r.generation = f3
+   case 1: // in-use entry, stored directly in the file
+    r.offset = int64(f2)
+    r.generation = f3
+    r.nonfree = true
+   case 2: // in-use entry, stored in object stream f2 at index f3
+    r.offset = int64(f3)
+    r.compressed = &f2
+    r.nonfree = true
+   default:
+    // TODO(p): It should be treated as a reference to
+    // the null object. We can't currently represent that.
+    return newError("unsupported cross-reference stream contents")
+   }
+
+   u.loadXrefEntry(pair.start+i, r, loadedEntries)
+  }
+ }
+
+ // Return the dictionary alone; the raw stream data is no longer needed.
+ stream.Kind = Dict
+ stream.Stream = nil
+ return stream, nil
+}
+
+// loadXref loads one cross-reference section located at the lexer's position,
+// either a classic xref table or a PDF 1.5 cross-reference stream,
+// and returns the associated trailer dictionary.
+func (u *Updater) loadXref(lex *Lexer, loadedEntries map[uint]struct{}) (
+ Object, error) {
+ var throwawayStack []Object
+ // Without the literal "xref" keyword, assume a cross-reference stream,
+ // passing along whatever token was just consumed.
+ if object, _ := u.parse(lex,
+  &throwawayStack); object.Kind != Keyword || object.String != "xref" {
+  return u.loadXrefStream(lex, []Object{object}, loadedEntries)
+ }
+ for {
+  object, _ := u.parse(lex, &throwawayStack)
+  if object.Kind == End {
+   return newError("unexpected EOF while looking for the trailer")
+  }
+  if object.Kind == Keyword && object.String == "trailer" {
+   break
+  }
+
+  // Each subsection starts with "start count" and holds count entries.
+  second, _ := u.parse(lex, &throwawayStack)
+  if !object.IsUint() || !second.IsUint() {
+   return newError("invalid xref section header")
+  }
+
+  start, count := uint(object.Number), uint(second.Number)
+  for i := uint(0); i < count; i++ {
+   off, _ := u.parse(lex, &throwawayStack)
+   gen, _ := u.parse(lex, &throwawayStack)
+   key, _ := u.parse(lex, &throwawayStack)
+   if !off.IsInteger() || off.Number < 0 ||
+    off.Number > float64(len(u.Document)) ||
+    !gen.IsInteger() || gen.Number < 0 || gen.Number > 65535 ||
+    key.Kind != Keyword {
+    return newError("invalid xref entry")
+   }
+
+   // "n" marks an in-use entry, "f" a free one.
+   free := true
+   if key.String == "n" {
+    free = false
+   } else if key.String != "f" {
+    return newError("invalid xref entry")
+   }
+
+   u.loadXrefEntry(start+i, ref{
+    offset: int64(off.Number),
+    generation: uint(gen.Number),
+    nonfree: !free,
+   }, loadedEntries)
+  }
+ }
+
+ trailer, _ := u.parse(lex, &throwawayStack)
+ if trailer.Kind != Dict {
+  return newError("invalid trailer dictionary")
+ }
+ return trailer, nil
+}
+
+// -----------------------------------------------------------------------------
+
+// trailerRE matches the tail of a PDF file, capturing the byte offset of
+// the last cross-reference section from the startxref keyword.
+var trailerRE = regexp.MustCompile(`(?s:.*)\sstartxref\s+(\d+)\s+%%EOF`)
+
+// NewUpdater initializes an Updater, building the cross-reference table and
+// preparing a new trailer dictionary.
+func NewUpdater(document []byte) (*Updater, error) {
+ u := &Updater{Document: document}
+ u.updated = make(map[uint]struct{})
+
+ // We only need to look for startxref roughly within
+ // the last kibibyte of the document.
+ haystack := u.Document
+ if len(haystack) > 1024 {
+  haystack = haystack[len(haystack)-1024:]
+ }
+
+ m := trailerRE.FindSubmatch(haystack)
+ if m == nil {
+  return nil, errors.New("cannot find startxref")
+ }
+
+ // Walk the chain of cross-reference sections from newest to oldest;
+ // loadXrefEntry gives precedence to the newer entries.
+ xrefOffset, _ := strconv.ParseInt(string(m[1]), 10, 64)
+ lastXrefOffset := xrefOffset
+ loadedXrefs := make(map[int64]struct{})
+ loadedEntries := make(map[uint]struct{})
+
+ for {
+  if _, ok := loadedXrefs[xrefOffset]; ok {
+   return nil, errors.New("circular xref offsets")
+  }
+  if xrefOffset >= int64(len(u.Document)) {
+   return nil, errors.New("invalid xref offset")
+  }
+
+  lex := Lexer{u.Document[xrefOffset:]}
+  trailer, err := u.loadXref(&lex, loadedEntries)
+  if err != nil {
+   return nil, err
+  }
+
+  // Only the newest trailer dictionary is kept for the update.
+  if len(loadedXrefs) == 0 {
+   u.Trailer = trailer.Dict
+  }
+  loadedXrefs[xrefOffset] = struct{}{}
+
+  // TODO(p): Descend into XRefStm here first, if present,
+  // which is also a linked list.
+
+  // We allow for mixed cross-reference tables and streams
+  // within a single Prev list, although this should never occur.
+  prevOffset, ok := trailer.Dict["Prev"]
+  if !ok {
+   break
+  }
+  // FIXME: Do not read offsets and sizes as floating point numbers.
+  if !prevOffset.IsInteger() {
+   return nil, errors.New("invalid Prev offset")
+  }
+  xrefOffset = int64(prevOffset.Number)
+ }
+
+ u.Trailer["Prev"] = NewNumeric(float64(lastXrefOffset))
+
+ lastSize, ok := u.Trailer["Size"]
+ if !ok || !lastSize.IsInteger() || lastSize.Number <= 0 {
+  return nil, errors.New("invalid or missing cross-reference table Size")
+ }
+ u.xrefSize = uint(lastSize.Number)
+ return u, nil
+}
+
+// versionRE matches the %PDF-X.Y header comment, optionally preceded by
+// a PostScript DSC marker, near the start of the document.
+var versionRE = regexp.MustCompile(
+ `(?:^|[\r\n])%(?:!PS-Adobe-\d\.\d )?PDF-(\d)\.(\d)[\r\n]`)
+
+// Version extracts the claimed PDF version as a positive decimal number,
+// e.g. 17 for PDF 1.7. Returns zero on failure.
+// A Version entry in the Root dictionary takes precedence over the header.
+func (u *Updater) Version(root *Object) int {
+ if version, ok := root.Dict["Version"]; ok && version.Kind == Name {
+  if v := version.String; len(v) == 3 && v[1] == '.' &&
+   v[0] >= '0' && v[0] <= '9' && v[2] >= '0' && v[2] <= '9' {
+   return int(v[0]-'0')*10 + int(v[2]-'0')
+  }
+ }
+
+ // We only need to look for the comment roughly within
+ // the first kibibyte of the document.
+ haystack := u.Document
+ if len(haystack) > 1024 {
+  haystack = haystack[:1024]
+ }
+ if m := versionRE.FindSubmatch(haystack); m != nil {
+  return int(m[1][0]-'0')*10 + int(m[2][0]-'0')
+ }
+ return 0
+}
+
+// getFromObjStm extracts the object numbered n out of the object stream
+// (ObjStm) stored as object number nObjStm.
+// Chained object streams (the Extends entry) are not supported.
+func (u *Updater) getFromObjStm(nObjStm, n uint) (Object, error) {
+ if nObjStm == n {
+  return newError("ObjStm recursion")
+ }
+
+ stream, err := u.Get(nObjStm, 0)
+ if err != nil {
+  return stream, err
+ }
+ if stream.Kind != Stream {
+  return newError("invalid ObjStm")
+ }
+ if typ, ok := stream.Dict["Type"]; !ok ||
+  typ.Kind != Name || typ.String != "ObjStm" {
+  return newError("invalid ObjStm")
+ }
+
+ data, err := u.GetStreamData(stream)
+ if err != nil {
+  return New(End), fmt.Errorf("invalid ObjStm: %s", err)
+ }
+ entryN, ok := stream.Dict["N"]
+ if !ok || !entryN.IsUint() || entryN.Number <= 0 {
+  return newError("invalid ObjStm N")
+ }
+ entryFirst, ok := stream.Dict["First"]
+ if !ok || !entryFirst.IsUint() || entryFirst.Number <= 0 {
+  return newError("invalid ObjStm First")
+ }
+
+ // NOTE: This means descending into that stream if n is not found here.
+ // It is meant to be an object reference.
+ if extends, ok := stream.Dict["Extends"]; ok && extends.Kind != Nil {
+  return newError("ObjStm extensions are unsupported")
+ }
+
+ count := uint(entryN.Number)
+ first := uint(entryFirst.Number)
+ if first > uint(len(data)) {
+  return newError("invalid ObjStm First")
+ }
+
+ // The part before First holds N pairs of "number offset" integers;
+ // the offsets are relative to First.
+ lex1 := Lexer{data[:first]}
+ data = data[first:]
+
+ type pair struct{ n, offset uint }
+ pairs := []pair{}
+ for i := uint(0); i < count; i++ {
+  var throwawayStack []Object
+  objN, _ := u.parse(&lex1, &throwawayStack)
+  objOffset, _ := u.parse(&lex1, &throwawayStack)
+  if !objN.IsUint() || !objOffset.IsUint() {
+   return newError("invalid ObjStm pairs")
+  }
+  pairs = append(pairs, pair{uint(objN.Number), uint(objOffset.Number)})
+ }
+ // Offsets must be strictly increasing and within bounds.
+ for i, pair := range pairs {
+  if pair.offset > uint(len(data)) ||
+   i > 0 && pairs[i-1].offset >= pair.offset {
+   return newError("invalid ObjStm pairs")
+  }
+ }
+
+ for i, pair := range pairs {
+  if pair.n != n {
+   continue
+  }
+
+  if i+1 < len(pairs) {
+   data = data[pair.offset:pairs[i+1].offset]
+  } else {
+   data = data[pair.offset:]
+  }
+
+  lex2 := Lexer{data}
+  var stack []Object
+  for {
+   object, err := u.parse(&lex2, &stack)
+   if err != nil {
+    return object, err
+   } else if object.Kind == End {
+    break
+   } else {
+    stack = append(stack, object)
+   }
+  }
+  if len(stack) == 0 {
+   return newError("empty ObjStm object")
+  }
+  return stack[0], nil
+ }
+ return newError("object not found in ObjStm")
+}
+
+// Get retrieves an object by its number and generation--may return
+// Nil or End with an error. Free, mismatched, and out-of-range entries
+// all yield Nil without an error.
+func (u *Updater) Get(n, generation uint) (Object, error) {
+ if n >= u.xrefSize {
+  return New(Nil), nil
+ }
+
+ ref := u.xref[n]
+ if !ref.nonfree || ref.generation != generation {
+  return New(Nil), nil
+ }
+
+ // PDF 1.5 compressed objects live inside an object stream.
+ if ref.compressed != nil {
+  return u.getFromObjStm(*ref.compressed, n)
+ } else if ref.offset >= int64(len(u.Document)) {
+  return New(Nil), nil
+ }
+
+ lex := Lexer{u.Document[ref.offset:]}
+ var stack []Object
+ for {
+  object, err := u.parse(&lex, &stack)
+  if object.Kind == End {
+   return object, err
+  }
+  if object.Kind != Indirect {
+   stack = append(stack, object)
+  } else if object.N != n || object.Generation != generation {
+   return newError("object mismatch")
+  } else {
+   return object.Array[0], nil
+  }
+ }
+}
+
+// Dereference dereferences Reference objects, and passes the other kinds
+// through unchanged. Like Get, it may return Nil or End with an error.
+func (u *Updater) Dereference(o Object) (Object, error) {
+ if o.Kind != Reference {
+  return o, nil
+ }
+ return u.Get(o.N, o.Generation)
+}
+
+// Allocate returns a fresh object number, growing the cross-reference
+// table to cover it.
+func (u *Updater) Allocate() uint {
+ allocated := u.xrefSize
+ u.xrefSize++
+ if u.xrefSize == 0 {
+  panic("overflow")
+ }
+ if have := uint(len(u.xref)); have < u.xrefSize {
+  u.xref = append(u.xref, make([]ref, u.xrefSize-have)...)
+ }
+
+ // We don't make sure it gets a subsection in the update yet because we
+ // make no attempts at fixing the linked list of free items either.
+ return allocated
+}
+
+// BytesWriter is an interface over a subset of bytes.Buffer methods.
+// Update callbacks receive it so that they can query the current offset
+// via Len while writing.
+type BytesWriter interface {
+ Bytes() []byte
+ Len() int
+ Write(p []byte) (n int, err error)
+ WriteByte(c byte) error
+ WriteRune(r rune) (n int, err error)
+ WriteString(s string) (n int, err error)
+}
+
+// Update appends an updated object to the end of the document.
+// The fill callback must write exactly one PDF object.
+// The change only becomes effective once FlushUpdates is called.
+func (u *Updater) Update(n uint, fill func(buf BytesWriter)) {
+ oldRef := u.xref[n]
+ u.updated[n] = struct{}{}
+ // The +1 accounts for the '\n' separator written below.
+ u.xref[n] = ref{
+  offset: int64(len(u.Document) + 1),
+  generation: oldRef.generation,
+  nonfree: true,
+ }
+
+ buf := bytes.NewBuffer(u.Document)
+ fmt.Fprintf(buf, "\n%d %d obj\n", n, oldRef.generation)
+
+ // Separately so that the callback can use w.Len() to get current offset.
+ fill(buf)
+
+ buf.WriteString("\nendobj")
+ u.Document = buf.Bytes()
+}
+
+// flushXRefStm writes a cross-reference stream covering all updated objects
+// to buf, which holds the document produced so far.
+// The fixed field widths of W [1 8 8] are wasteful but simple.
+func (u *Updater) flushXRefStm(updated []uint, buf *bytes.Buffer) {
+ // The cross-reference stream has to point to itself.
+ // XXX: We only duplicate Update code here due to how we currently buffer.
+ n := u.Allocate()
+ updated = append(updated, n)
+
+ u.updated[n] = struct{}{}
+ u.xref[n] = ref{
+  offset: int64(buf.Len() + 1),
+  generation: 0,
+  nonfree: true,
+ }
+
+ // write appends one fixed-width entry to the raw stream data.
+ index, b := []Object{}, []byte{}
+ write := func(f1 byte, f2, f3 uint64) {
+  b = append(b, f1)
+  b = binary.BigEndian.AppendUint64(b, f2)
+  b = binary.BigEndian.AppendUint64(b, f3)
+ }
+ // Emit contiguous runs of object numbers as (start, count) subsections.
+ for i := 0; i < len(updated); {
+  start, stop := updated[i], updated[i]+1
+  for i++; i < len(updated) && updated[i] == stop; i++ {
+   stop++
+  }
+
+  index = append(index,
+   NewNumeric(float64(start)), NewNumeric(float64(stop-start)))
+  for ; start < stop; start++ {
+   ref := u.xref[start]
+   if ref.compressed != nil {
+    write(2, uint64(*ref.compressed), uint64(ref.offset))
+   } else if ref.nonfree {
+    write(1, uint64(ref.offset), uint64(ref.generation))
+   } else {
+    write(0, uint64(ref.offset), uint64(ref.generation))
+   }
+  }
+ }
+
+ u.Trailer["Size"] = NewNumeric(float64(u.xrefSize))
+ u.Trailer["Index"] = NewArray(index)
+ u.Trailer["W"] = NewArray([]Object{
+  NewNumeric(1), NewNumeric(8), NewNumeric(8),
+ })
+
+ // Remove any filter-related entries inherited from the previous trailer,
+ // since we write the stream data uncompressed.
+ for _, key := range []string{
+  "Filter", "DecodeParms", "F", "FFilter", "FDecodeParms", "DL"} {
+  delete(u.Trailer, key)
+ }
+
+ stream := NewStream(u.Trailer, b)
+ fmt.Fprintf(buf, "\n%d 0 obj\n%s\nendobj", n, stream.Serialize())
+}
+
+// flushXRefTable writes a classic cross-reference table and trailer covering
+// all updated objects to buf, which holds the document produced so far.
+func (u *Updater) flushXRefTable(updated []uint, buf *bytes.Buffer) {
+ buf.WriteString("\nxref\n")
+ // Emit contiguous runs of object numbers as subsections.
+ for i := 0; i < len(updated); {
+  start, stop := updated[i], updated[i]+1
+  for i++; i < len(updated) && updated[i] == stop; i++ {
+   stop++
+  }
+
+  fmt.Fprintf(buf, "%d %d\n", start, stop-start)
+  for ; start < stop; start++ {
+   // XXX: We should warn about any object streams here.
+   ref := u.xref[start]
+   if ref.nonfree && ref.compressed == nil {
+    fmt.Fprintf(buf, "%010d %05d n \n", ref.offset, ref.generation)
+   } else {
+    fmt.Fprintf(buf, "%010d %05d f \n", ref.offset, ref.generation)
+   }
+  }
+ }
+
+ // Taking literally "Each cross-reference section begins with a line
+ // containing the keyword xref. Following this line are one or more
+ // cross-reference subsections." from 3.4.3 in PDF Reference.
+ if len(updated) == 0 {
+  fmt.Fprintf(buf, "%d %d\n", 0, 0)
+ }
+
+ u.Trailer["Size"] = NewNumeric(float64(u.xrefSize))
+ trailer := NewDict(u.Trailer)
+ fmt.Fprintf(buf, "trailer\n%s", trailer.Serialize())
+}
+
+// FlushUpdates writes an updated cross-reference table and trailer, or stream.
+func (u *Updater) FlushUpdates() {
+ updated := make([]uint, 0, len(u.updated))
+ for n := range u.updated {
+  updated = append(updated, n)
+ }
+ sort.Slice(updated, func(a, b int) bool { return updated[a] < updated[b] })
+
+ // It does not seem to be possible to upgrade a PDF file
+ // from trailer dictionaries to cross-reference streams,
+ // so keep continuity either way.
+ //
+ // (Downgrading from cross-reference streams using XRefStm would not
+ // create a true hybrid-reference file, although it should work.)
+ buf := bytes.NewBuffer(u.Document)
+ startXref := buf.Len() + 1 /* '\n' */
+ typ := u.Trailer["Type"]
+ if typ.Kind == Name && typ.String == "XRef" {
+  u.flushXRefStm(updated, buf)
+ } else {
+  u.flushXRefTable(updated, buf)
+ }
+
+ fmt.Fprintf(buf, "\nstartxref\n%d\n%%%%EOF\n", startXref)
+ u.Document = buf.Bytes()
+ u.updated = make(map[uint]struct{})
+
+ // Chain any subsequent update off the section we have just written.
+ u.Trailer["Prev"] = NewNumeric(float64(startXref))
+}
+
+// -----------------------------------------------------------------------------
+
+// NewDate makes a PDF string object representing the given point in time,
+// in the D:YYYYMMDDHHmmSS(Z|±HH'mm') format.
+func NewDate(ts time.Time) Object {
+ text := ts.AppendFormat(nil, "D:20060102150405")
+ // "Z07'00'" doesn't work, we need to do some of it manually.
+ _, offset := ts.Zone()
+ if offset == 0 {
+  text = append(text, 'Z')
+ } else {
+  zone := ts.AppendFormat(nil, "-0700")
+  text = append(text, zone[0], zone[1], zone[2], '\'', zone[3], zone[4], '\'')
+ }
+ return NewString(string(text))
+}
+
+// GetStreamData returns the actual data stored in a stream object,
+// applying any filters. Only FlateDecode without DecodeParms is supported,
+// and external file references (F) are rejected.
+func (u *Updater) GetStreamData(stream Object) ([]byte, error) {
+ if f, ok := stream.Dict["F"]; ok && f.Kind != Nil {
+  return nil, errors.New("stream data in other files are unsupported")
+ }
+
+ // Support just enough to decode a common cross-reference stream.
+ if filter, ok := stream.Dict["Filter"]; !ok {
+  return stream.Stream, nil
+ } else if filter.Kind != Name || filter.String != "FlateDecode" {
+  return nil, errors.New("unsupported stream Filter")
+ }
+
+ // TODO(p): Support << /Columns N /Predictor 12 >>
+ // which usually appears in files with cross-reference streams.
+ if parms, ok := stream.Dict["DecodeParms"]; ok && parms.Kind != Nil {
+  return nil, errors.New("DecodeParms are not supported")
+ }
+
+ r, err := zlib.NewReader(bytes.NewReader(stream.Stream))
+ if err != nil {
+  return nil, err
+ }
+
+ var b bytes.Buffer
+ _, err = b.ReadFrom(r)
+ return b.Bytes(), err
+}
+
+// GetFirstPage retrieves the first page of the given page (sub)tree reference,
+// or returns a Nil object if unsuccessful.
+// It recurses down the first Kids entry of each Pages node.
+func (u *Updater) GetFirstPage(node Object) Object {
+ obj, err := u.Dereference(node)
+ if err != nil || obj.Kind != Dict {
+  return New(Nil)
+ }
+
+ // Out of convenience; these aren't filled normally.
+ obj.N = node.N
+ obj.Generation = node.Generation
+
+ if typ, ok := obj.Dict["Type"]; !ok || typ.Kind != Name {
+  return New(Nil)
+ } else if typ.String == "Page" {
+  return obj
+ } else if typ.String != "Pages" {
+  return New(Nil)
+ }
+
+ // XXX: Technically speaking, this may be an indirect reference.
+ // The correct way to solve this seems to be having Updater include
+ // a wrapper around "obj.Dict". Though does it still apply in Golang?
+ kids, ok := obj.Dict["Kids"]
+ if !ok || kids.Kind != Array || len(kids.Array) == 0 ||
+  kids.Array[0].Kind != Reference {
+  return New(Nil)
+ }
+
+ // XXX: Nothing prevents us from recursing in an evil circular graph.
+ return u.GetFirstPage(kids.Array[0])
+}
+
+// -----------------------------------------------------------------------------
+
+// PKCS12Parse parses and verifies PKCS#12 data, returning the private key
+// and the certificate chain with the leaf certificate first.
+// The private key must match the leaf certificate's public key.
+func PKCS12Parse(p12 []byte, password string) (
+ crypto.PrivateKey, []*x509.Certificate, error) {
+ // The pkcs12.Decode function doesn't support included intermediate
+ // certificates, we need to do some processing manually.
+ blocks, err := pkcs12.ToPEM(p12, password)
+ if err != nil {
+  return nil, nil, err
+ }
+
+ // b.Type is literally CERTIFICATE or PRIVATE KEY, the Headers only contain
+ // a localKeyId field. It seems like the pkey and the cert share the same
+ // localKeyId value. Though the leaf certificate should also be the first
+ // one in the PKCS#12 file, so I probably don't need that value.
+ var allX509Blocks [][]byte
+ var allCertBlocks [][]byte
+ for _, b := range blocks {
+  // CERTIFICATE, PRIVATE KEY constants are defined locally in the pkcs12
+  // package. crypto/tls/tls.go seems to only use literals for these and
+  // also accepts words in front such as RSA PRIVATE KEY.
+  switch b.Type {
+  case "PRIVATE KEY":
+   allX509Blocks = append(allX509Blocks, b.Bytes)
+  case "CERTIFICATE":
+   allCertBlocks = append(allCertBlocks, b.Bytes)
+  }
+ }
+ switch {
+ case len(allX509Blocks) == 0:
+  return nil, nil, errors.New("missing private key")
+ case len(allX509Blocks) > 1:
+  return nil, nil, errors.New("more than one private key")
+ case len(allCertBlocks) == 0:
+  return nil, nil, errors.New("missing certificate")
+ }
+
+ // The PKCS#12 file may only contain PKCS#8-wrapped private keys but the
+ // pkcs12 package unwraps them to simple PKCS#1/EC while converting to PEM.
+ // NOTE(fix): the EC fallback previously inverted its error check
+ // (err == nil), rejecting every valid EC key; fail only when both the
+ // PKCS#1 and the EC parse have failed.
+ var key crypto.PrivateKey
+ if key, err = x509.ParsePKCS1PrivateKey(allX509Blocks[0]); err != nil {
+  if key, err = x509.ParseECPrivateKey(allX509Blocks[0]); err != nil {
+   return nil, nil, errors.New("failed to parse private key")
+  }
+ }
+
+ x509Certs, err := x509.ParseCertificates(allCertBlocks[0])
+ if err != nil {
+  return nil, nil, err
+ }
+ if len(x509Certs) != 1 {
+  return nil, nil,
+   errors.New("expected exactly one certificate in the first bag")
+ }
+
+ // Append any intermediate certificates from the remaining bags.
+ for _, cert := range allCertBlocks[1:] {
+  toAdd, err := x509.ParseCertificates(cert)
+  if err != nil {
+   return nil, nil, err
+  }
+  x509Certs = append(x509Certs, toAdd...)
+ }
+
+ // Copied from crypto/tls/tls.go: check that the key pair matches.
+ switch pub := x509Certs[0].PublicKey.(type) {
+ case *rsa.PublicKey:
+  priv, ok := key.(*rsa.PrivateKey)
+  if !ok {
+   return nil, nil,
+    errors.New("private key type does not match public key type")
+  }
+  if pub.N.Cmp(priv.N) != 0 {
+   return nil, nil,
+    errors.New("private key does not match public key")
+  }
+ case *ecdsa.PublicKey:
+  priv, ok := key.(*ecdsa.PrivateKey)
+  if !ok {
+   return nil, nil,
+    errors.New("private key type does not match public key type")
+  }
+  if pub.X.Cmp(priv.X) != 0 || pub.Y.Cmp(priv.Y) != 0 {
+   return nil, nil,
+    errors.New("private key does not match public key")
+  }
+ default:
+  return nil, nil, errors.New("unknown public key algorithm")
+ }
+ return key, x509Certs, nil
+}
+
+// FillInSignature signs PDF contents and writes the signature into the given
+// window that has been reserved for this specific purpose.
+// This is a very low-level function.
+// The signature is a detached PKCS#7/CMS SignedData blob, hex-encoded into
+// the window between its hexstring quotes.
+func FillInSignature(document []byte, signOff, signLen int,
+ key crypto.PrivateKey, certs []*x509.Certificate) error {
+ if signOff < 0 || signOff > len(document) ||
+  signLen < 2 || signOff+signLen > len(document) {
+  return errors.New("invalid signing window")
+ }
+
+ pkcsError := func(message interface{}) error {
+  return fmt.Errorf("key/cert: %s", message)
+ }
+
+ // Prevent useless signatures--makes pdfsig from poppler happy at least
+ // (and NSS by extension).
+ x509Cert := certs[0]
+ if x509Cert.KeyUsage&(x509.KeyUsageDigitalSignature|
+  x509.KeyUsageContentCommitment /* renamed non-repudiation */) == 0 {
+  return pkcsError("the certificate's key usage must include " +
+   "digital signatures or non-repudiation")
+ }
+
+ extOK := false
+ for _, u := range x509Cert.ExtKeyUsage {
+  if u == x509.ExtKeyUsageAny || u == x509.ExtKeyUsageEmailProtection {
+   extOK = true
+  }
+ }
+ if len(x509Cert.ExtKeyUsage) > 0 && !extOK {
+  return pkcsError("the certificate's extended key usage " +
+   "must include S/MIME")
+ }
+
+ // The signed data is the document with the signature window excised.
+ // XXX: We'd like to stream to the hash manually instead of copying data.
+ data := make([]byte, len(document)-signLen)
+ copy(data, document[:signOff])
+ copy(data[signOff:], document[signOff+signLen:])
+
+ signedData, err := pkcs7.NewSignedData(data)
+ if err != nil {
+  return err
+ }
+ // The default digest is SHA1, which is mildly insecure now.
+ signedData.SetDigestAlgorithm(pkcs7.OIDDigestAlgorithmSHA256)
+ if err := signedData.AddSignerChain(
+  x509Cert, key, certs[1:], pkcs7.SignerInfoConfig{}); err != nil {
+  return err
+ }
+
+ signedData.Detach()
+ sig, err := signedData.Finish()
+ if err != nil {
+  return err
+ }
+
+ /*
+  Debugging: ioutil.WriteFile("pdf_signature.der", sig, 0666)
+  openssl cms -inform PEM -in pdf_signature.pem -noout -cmsout -print
+  Context: https://stackoverflow.com/a/29253469
+ */
+
+ if len(sig)*2 > signLen-2 /* hexstring quotes */ {
+  // The obvious solution is to increase the allocation... or spend
+  // a week reading specifications while losing all faith in humanity
+  // as a species, and skip the pkcs7 package entirely.
+  return fmt.Errorf("not enough space reserved for the signature "+
+   "(%d nibbles vs %d nibbles)", signLen-2, len(sig)*2)
+ }
+
+ // Write past the opening '<' of the reserved hexstring window.
+ hex.Encode(document[signOff+1:], sig)
+ return nil
+}
+
+// https://www.adobe.com/devnet-docs/acrobatetk/tools/DigSig/Acrobat_DigitalSignatures_in_PDF.pdf
+// https://www.adobe.com/content/dam/acom/en/devnet/acrobat/pdfs/pdf_reference_1-7.pdf
+// https://www.adobe.com/content/dam/acom/en/devnet/acrobat/pdfs/PPKAppearances.pdf
+
+// Sign signs the given document, growing and returning the passed-in slice.
+// There must be at least one certificate, matching the private key.
+// The certificates must form a chain.
+//
+// A good default for the reservation is around 4096 (the value is in bytes).
+//
+// The presumption here is that the document is valid and that it doesn't
+// employ cross-reference streams from PDF 1.5, or at least constitutes
+// a hybrid-reference file. The results with PDF 2.0 (2017) are currently
+// unknown as the standard costs money.
+func Sign(document []byte, key crypto.PrivateKey, certs []*x509.Certificate,
+ reservation int) ([]byte, error) {
+ pdf, err := NewUpdater(document)
+ if err != nil {
+  return nil, err
+ }
+
+ rootRef, ok := pdf.Trailer["Root"]
+ if !ok || rootRef.Kind != Reference {
+  return nil, errors.New("trailer does not contain a reference to Root")
+ }
+ root, err := pdf.Dereference(rootRef)
+ if err != nil {
+  return nil, fmt.Errorf("Root dictionary retrieval failed: %s", err)
+ }
+ if root.Kind != Dict {
+  return nil, errors.New("invalid Root dictionary reference")
+ }
+
+ // 8.7 Digital Signatures - /signature dictionary/
+ // The ByteRange and Contents values are placeholders to be filled in
+ // once the final document length is known.
+ sigdictN := pdf.Allocate()
+ var byterangeOff, byterangeLen, signOff, signLen int
+ pdf.Update(sigdictN, func(buf BytesWriter) {
+  // The timestamp is important for Adobe Acrobat Reader DC.
+  // The ideal would be to use RFC 3161.
+  now := NewDate(time.Now())
+  buf.WriteString("<< /Type/Sig /Filter/Adobe.PPKLite" +
+   " /SubFilter/adbe.pkcs7.detached\n" +
+   " /M" + now.Serialize() + " /ByteRange ")
+
+  byterangeOff = buf.Len()
+  byterangeLen = 32 // fine for a gigabyte
+  buf.Write(bytes.Repeat([]byte{' '}, byterangeLen))
+  buf.WriteString("\n /Contents <")
+
+  signOff = buf.Len()
+  signLen = reservation * 2 // cert, digest, encrypted digest, ...
+  buf.Write(bytes.Repeat([]byte{'0'}, signLen))
+  buf.WriteString("> >>")
+
+  // We actually need to exclude the hexstring quotes from signing.
+  signOff -= 1
+  signLen += 2
+ })
+
+ sigfield := NewDict(map[string]Object{
+  // 8.6.3 Field Types - Signature Fields
+  "FT": NewName("Sig"),
+  "V": NewReference(sigdictN, 0),
+  // 8.4.5 Annotations Types - Widget Annotations
+  // We can merge the Signature Annotation and omit Kids here.
+  "Subtype": NewName("Widget"),
+  "F": NewNumeric(2 /* Hidden */),
+  "T": NewString("Signature1"),
+  "Rect": NewArray([]Object{
+   NewNumeric(0), NewNumeric(0), NewNumeric(0), NewNumeric(0),
+  }),
+ })
+
+ sigfieldN := pdf.Allocate()
+ pdf.Update(sigfieldN, func(buf BytesWriter) {
+  buf.WriteString(sigfield.Serialize())
+ })
+
+ pagesRef, ok := root.Dict["Pages"]
+ if !ok || pagesRef.Kind != Reference {
+  return nil, errors.New("invalid Pages reference")
+ }
+ page := pdf.GetFirstPage(pagesRef)
+ if page.Kind != Dict {
+  return nil, errors.New("invalid or unsupported page tree")
+ }
+
+ // Attach the signature field's widget annotation to the first page.
+ annots := page.Dict["Annots"]
+ if annots.Kind != Array {
+  // TODO(p): Indirectly referenced arrays might not be
+  // that hard to support.
+  if annots.Kind != End {
+   return nil, errors.New("unexpected Annots")
+  }
+  annots = NewArray(nil)
+ }
+ annots.Array = append(annots.Array, NewReference(sigfieldN, 0))
+
+ page.Dict["Annots"] = annots
+ pdf.Update(page.N, func(buf BytesWriter) {
+  buf.WriteString(page.Serialize())
+ })
+
+ // 8.6.1 Interactive Form Dictionary
+ if acroform, ok := root.Dict["AcroForm"]; ok && acroform.Kind != Nil {
+  return nil, errors.New("the document already contains forms, " +
+   "they would be overwritten")
+ }
+
+ root.Dict["AcroForm"] = NewDict(map[string]Object{
+  "Fields": NewArray([]Object{NewReference(sigfieldN, 0)}),
+  "SigFlags": NewNumeric(3 /* SignaturesExist | AppendOnly */),
+ })
+
+ // Upgrade the document version for SHA-256 etc.
+ if pdf.Version(&root) < 16 {
+  root.Dict["Version"] = NewName("1.6")
+ }
+
+ pdf.Update(rootRef.N, func(buf BytesWriter) {
+  buf.WriteString(root.Serialize())
+ })
+ pdf.FlushUpdates()
+
+ // Now that we know the length of everything, store byte ranges of
+ // what we're about to sign, which must be everything but the resulting
+ // signature itself.
+ tailOff := signOff + signLen
+ tailLen := len(pdf.Document) - tailOff
+
+ ranges := fmt.Sprintf("[0 %d %d %d]", signOff, tailOff, tailLen)
+ if len(ranges) > byterangeLen {
+  return nil, errors.New("not enough space reserved for /ByteRange")
+ }
+ copy(pdf.Document[byterangeOff:], []byte(ranges))
+ if err := FillInSignature(pdf.Document, signOff, signLen,
+  key, certs); err != nil {
+  return nil, err
+ }
+ return pdf.Document, nil
+}
diff --git a/test.sh b/test.sh
new file mode 100755
index 0000000..40bd165
--- /dev/null
+++ b/test.sh
@@ -0,0 +1,86 @@
#!/bin/sh
# Test basic functionality of both versions
# Usage: ./test.sh builddir/pdf-simple-sign cmd/pdf-simple-sign/pdf-simple-sign

# Set the errexit option here rather than in the shebang,
# so that it also takes effect when run as "sh test.sh".
set -e

# log MESSAGE -- print an italicised progress message to stdout.
log() { printf '%s-- %s%s\n' "$(tput sitm)" "$1" "$(tput sgr0)"; }
# die MESSAGE -- print an emboldened error message to stderr and abort.
die() { printf '%s-- %s%s\n' "$(tput bold)" "$1" "$(tput sgr0)" >&2; exit 1; }
+
# Get rid of old test files
rm -rf tmp
mkdir tmp

# Create documents in various tools; each generator is optional
# ("|| :" ignores a missing tool), the signing loop below simply
# picks up whatever PDFs were actually produced
log "Creating source documents"
inkscape --pipe --export-filename=tmp/cairo.pdf --export-pdf-version=1.4 \
<<'EOF' 2>/dev/null || :
<svg xmlns="http://www.w3.org/2000/svg"><text x="5" y="10">Hello</text></svg>
EOF

date > tmp/lowriter.txt
if command -v gropdf >/dev/null
then groff -T pdf < tmp/lowriter.txt > tmp/groff.pdf
fi
lowriter --convert-to pdf tmp/lowriter.txt --outdir tmp >/dev/null || :
convert rose: tmp/imagemagick.pdf || :

# Create a root CA certificate pair
log "Creating certificates"
openssl req -newkey rsa:2048 -subj "/CN=Test CA" -nodes \
  -keyout tmp/ca.key.pem -x509 -out tmp/ca.cert.pem 2>/dev/null

# Create a private NSS database and insert our test CA there,
# so that pdfsig can validate the signatures (",C," trusts it as an SSL CA
# in the second, e-mail, trust slot used for document signing)
rm -rf tmp/nssdir
mkdir tmp/nssdir
certutil -N --empty-password -d sql:tmp/nssdir
certutil -d sql:tmp/nssdir -A -n root -t ,C, -a -i tmp/ca.cert.pem

# Create a leaf certificate pair with S/MIME-style signing extensions
cat > tmp/cert.cfg <<'EOF'
[smime]
basicConstraints = CA:FALSE
keyUsage = digitalSignature
extendedKeyUsage = emailProtection
nsCertType = email
EOF

openssl req -newkey rsa:2048 -subj "/CN=Test Leaf" -nodes \
  -keyout tmp/key.pem -out tmp/cert.csr 2>/dev/null
openssl x509 -req -in tmp/cert.csr -out tmp/cert.pem \
  -CA tmp/ca.cert.pem -CAkey tmp/ca.key.pem -set_serial 1 \
  -extensions smime -extfile tmp/cert.cfg 2>/dev/null
openssl verify -CAfile tmp/ca.cert.pem tmp/cert.pem >/dev/null

# The second line accommodates the Go signer,
# which doesn't support SHA-256 within pkcs12 handling
openssl pkcs12 -inkey tmp/key.pem -in tmp/cert.pem \
  -certpbe PBE-SHA1-3DES -keypbe PBE-SHA1-3DES -macalg sha1 \
  -export -passout pass: -out tmp/key-pair.p12

# NOTE(review): $tool is left unquoted below, so it only works for
# whitespace-free tool paths -- confirm word splitting isn't relied upon.
for tool in "$@"; do
  rm -f tmp/*.signed.pdf
  for source in tmp/*.pdf; do
    log "Testing $tool with $source"
    result=${source%.pdf}.signed.pdf
    $tool "$source" "$result" tmp/key-pair.p12 ""
    pdfsig -nssdir sql:tmp/nssdir "$result" | grep Validation

    # Only some of our generators use PDF versions higher than 1.5
    log "Testing $tool for version detection"
    grep -q "/Version /1[.]6" "$result" \
      || grep -q "^%PDF-1[.][67]" "$result" \
      || die "Version detection seems to misbehave (no upgrade)"
  done

  # "cmd && die" deliberately doesn't trip "set -e" when cmd fails;
  # $source and $result keep their values from the last loop iteration
  log "Testing $tool for expected failures"
  $tool "$result" "$source.fail.pdf" tmp/key-pair.p12 "" \
    && die "Double signing shouldn't succeed"
  $tool -r 1 "$source" "$source.fail.pdf" tmp/key-pair.p12 "" \
    && die "Too low reservations shouldn't succeed"

  # Forcing a higher version in the header must suppress the /Version
  # upgrade ("[.]" matches a literal dot, consistent with the check above)
  sed '1s/%PDF-1../%PDF-1.7/' "$source" > "$source.alt"
  $tool "$source.alt" "$result.alt" tmp/key-pair.p12 ""
  grep -q "/Version /1[.]6" "$result.alt" \
    && die "Version detection seems to misbehave (downgraded)"
done

log "OK"