From ceea7dca2f4ff356e136b9a72611b9b66d402eaa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C5=99emysl=20Eric=20Janouch?= Date: Wed, 12 Aug 2020 07:29:57 +0200 Subject: C++: split out a library with a trivial interface Closes #2. --- meson.build | 9 +- pdf-simple-sign.cpp | 895 +-------------------------------------------------- pdf-simple-sign.h | 28 ++ pdf.cpp | 906 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 950 insertions(+), 888 deletions(-) create mode 100644 pdf-simple-sign.h create mode 100644 pdf.cpp diff --git a/meson.build b/meson.build index 0446cc3..6544cdb 100644 --- a/meson.build +++ b/meson.build @@ -1,5 +1,12 @@ project('pdf-simple-sign', 'cpp', default_options : ['cpp_std=c++11']) cryptodep = dependency('libcrypto') -executable('pdf-simple-sign', 'pdf-simple-sign.cpp', + +executable('pdf-simple-sign', 'pdf-simple-sign.cpp', 'pdf.cpp', + install : true, + dependencies : cryptodep) + +install_headers('pdf-simple-sign.h') +library('pdf-simple-sign', 'pdf.cpp', + soversion : 0, install : true, dependencies : cryptodep) diff --git a/pdf-simple-sign.cpp b/pdf-simple-sign.cpp index 818270e..8f3fe3f 100644 --- a/pdf-simple-sign.cpp +++ b/pdf-simple-sign.cpp @@ -2,7 +2,7 @@ // // pdf-simple-sign: simple PDF signer // -// Copyright (c) 2017, Přemysl Eric Janouch +// Copyright (c) 2017 - 2020, Přemysl Eric Janouch // // Permission to use, copy, modify, and/or distribute this software for any // purpose with or without fee is hereby granted. 
@@ -17,899 +17,22 @@ // #include -#include -#undef NDEBUG -#include - -#include -#include -#include -#include -#include - -#if defined __GLIBCXX__ && __GLIBCXX__ < 20140422 -#error Need libstdc++ >= 4.9 for -#endif +#include +#include +#include +#include #include #include -#include -#include -#include - -// ------------------------------------------------------------------------------------------------- - -using uint = unsigned int; - -static std::string concatenate(const std::vector& v, const std::string& delim) { - std::string res; - if (v.empty()) - return res; - for (const auto& s : v) - res += s + delim; - return res.substr(0, res.length() - delim.length()); -} - -template -std::string ssprintf(const std::string& format, Args... args) { - size_t size = std::snprintf(nullptr, 0, format.c_str(), args... ) + 1; - std::unique_ptr buf(new char[size]); - std::snprintf(buf.get(), size, format.c_str(), args...); - return std::string(buf.get(), buf.get() + size - 1); -} - -// ------------------------------------------------------------------------------------------------- - -/// PDF token/object thingy. Objects may be composed either from one or a sequence of tokens. -/// The PDF Reference doesn't actually speak of tokens. 
-struct pdf_object { - enum type { - END, NL, COMMENT, NIL, BOOL, NUMERIC, KEYWORD, NAME, STRING, - // Simple tokens - B_ARRAY, E_ARRAY, B_DICT, E_DICT, - // Higher-level objects - ARRAY, DICT, OBJECT, REFERENCE, - } type = END; - - std::string string; ///< END (error message), COMMENT/KEYWORD/NAME/STRING - double number = 0.; ///< BOOL, NUMERIC - std::vector array; ///< ARRAY, OBJECT - std::map dict; ///< DICT, in the future also STREAM - uint n = 0, generation = 0; ///< OBJECT, REFERENCE - - pdf_object(enum type type = END) : type(type) {} - pdf_object(enum type type, double v) : type(type), number(v) {} - pdf_object(enum type type, const std::string& v) : type(type), string(v) {} - pdf_object(enum type type, uint n, uint g) : type(type), n(n), generation(g) {} - pdf_object(const std::vector& array) : type(ARRAY), array(array) {} - pdf_object(const std::map& dict) : type(DICT), dict(dict) {} - - pdf_object(const pdf_object&) = default; - pdf_object(pdf_object&&) = default; - pdf_object& operator=(const pdf_object&) = default; - pdf_object& operator=(pdf_object&&) = default; - - /// Return whether this is a number without a fractional part - bool is_integer() const { - double tmp; - return type == NUMERIC && std::modf(number, &tmp) == 0.; - } -}; - -/// Basic lexical analyser for the Portable Document Format, giving limited error information -struct pdf_lexer { - const unsigned char* p; - pdf_lexer(const char* s) : p(reinterpret_cast(s)) {} - - static constexpr const char* oct_alphabet = "01234567"; - static constexpr const char* dec_alphabet = "0123456789"; - static constexpr const char* hex_alphabet = "0123456789abcdefABCDEF"; - static constexpr const char* whitespace = "\t\n\f\r "; - static constexpr const char* delimiters = "()<>[]{}/%"; - - bool eat_newline(int ch) { - if (ch == '\r') { - if (*p == '\n') p++; - return true; - } - return ch == '\n'; - } - - pdf_object string() { - std::string value; - int parens = 1; - while (1) { - if (!*p) return 
{pdf_object::END, "unexpected end of string"}; - auto ch = *p++; - if (eat_newline(ch)) ch = '\n'; - else if (ch == '(') { parens++; } - else if (ch == ')') { if (!--parens) break; } - else if (ch == '\\') { - if (!*p) return {pdf_object::END, "unexpected end of string"}; - switch ((ch = *p++)) { - case 'n': ch = '\n'; break; - case 'r': ch = '\r'; break; - case 't': ch = '\t'; break; - case 'b': ch = '\b'; break; - case 'f': ch = '\f'; break; - default: - if (eat_newline(ch)) - continue; - std::string octal; - if (ch && strchr(oct_alphabet, ch)) { - octal += ch; - if (*p && strchr(oct_alphabet, *p)) octal += *p++; - if (*p && strchr(oct_alphabet, *p)) octal += *p++; - ch = std::stoi(octal, nullptr, 8); - } - } - } - value += ch; - } - return {pdf_object::STRING, value}; - } - - pdf_object string_hex() { - std::string value, buf; - while (*p != '>') { - if (!*p) return {pdf_object::END, "unexpected end of hex string"}; - if (!strchr(hex_alphabet, *p)) - return {pdf_object::END, "invalid hex string"}; - buf += *p++; - if (buf.size() == 2) { - value += char(std::stoi(buf, nullptr, 16)); - buf.clear(); - } - } - p++; - if (!buf.empty()) value += char(std::stoi(buf + '0', nullptr, 16)); - return {pdf_object::STRING, value}; - } - - pdf_object name() { - std::string value; - while (!strchr(whitespace, *p) && !strchr(delimiters, *p)) { - auto ch = *p++; - if (ch == '#') { - std::string hexa; - if (*p && strchr(hex_alphabet, *p)) hexa += *p++; - if (*p && strchr(hex_alphabet, *p)) hexa += *p++; - if (hexa.size() != 2) - return {pdf_object::END, "invalid name hexa escape"}; - ch = char(std::stoi(hexa, nullptr, 16)); - } - value += ch; - } - if (value.empty()) return {pdf_object::END, "unexpected end of name"}; - return {pdf_object::NAME, value}; - } - - pdf_object comment() { - std::string value; - while (*p && *p != '\r' && *p != '\n') - value += *p++; - return {pdf_object::COMMENT, value}; - } - - // XXX maybe invalid numbers should rather be interpreted as keywords - 
pdf_object number() { - std::string value; - if (*p == '-') - value += *p++; - bool real = false, digits = false; - while (*p) { - if (strchr(dec_alphabet, *p)) - digits = true; - else if (*p == '.' && !real) - real = true; - else - break; - value += *p++; - } - if (!digits) return {pdf_object::END, "invalid number"}; - return {pdf_object::NUMERIC, std::stod(value, nullptr)}; - } - - pdf_object next() { - if (!*p) - return {pdf_object::END}; - if (strchr("-0123456789.", *p)) - return number(); - - // {} end up being keywords, we might want to error out on those - std::string value; - while (!strchr(whitespace, *p) && !strchr(delimiters, *p)) - value += *p++; - if (!value.empty()) { - if (value == "null") return {pdf_object::NIL}; - if (value == "true") return {pdf_object::BOOL, 1}; - if (value == "false") return {pdf_object::BOOL, 0}; - return {pdf_object::KEYWORD, value}; - } - - switch (char ch = *p++) { - case '/': return name(); - case '%': return comment(); - case '(': return string(); - case '[': return {pdf_object::B_ARRAY}; - case ']': return {pdf_object::E_ARRAY}; - case '<': - if (*p++ == '<') - return {pdf_object::B_DICT}; - p--; - return string_hex(); - case '>': - if (*p++ == '>') - return {pdf_object::E_DICT}; - p--; - return {pdf_object::END, "unexpected '>'"}; - default: - if (eat_newline(ch)) - return {pdf_object::NL}; - if (strchr(whitespace, ch)) - return next(); - return {pdf_object::END, "unexpected input"}; - } - } -}; - -// FIXME lines /should not/ be longer than 255 characters, some wrapping is in order -static std::string pdf_serialize(const pdf_object& o) { - switch (o.type) { - case pdf_object::NL: return "\n"; - case pdf_object::NIL: return "null"; - case pdf_object::BOOL: return o.number ? 
"true" : "false"; - case pdf_object::NUMERIC: - { - if (o.is_integer()) return std::to_string((long long) o.number); - return std::to_string(o.number); - } - case pdf_object::KEYWORD: return o.string; - case pdf_object::NAME: - { - std::string escaped = "/"; - for (char c : o.string) { - if (c == '#' || strchr(pdf_lexer::delimiters, c) || strchr(pdf_lexer::whitespace, c)) - escaped += ssprintf("#%02x", c); - else - escaped += c; - } - return escaped; - } - case pdf_object::STRING: - { - std::string escaped; - for (char c : o.string) { - if (c == '\\' || c == '(' || c == ')') - escaped += '\\'; - escaped += c; - } - return "(" + escaped + ")"; - } - case pdf_object::B_ARRAY: return "["; - case pdf_object::E_ARRAY: return "]"; - case pdf_object::B_DICT: return "<<"; - case pdf_object::E_DICT: return ">>"; - case pdf_object::ARRAY: - { - std::vector v; - for (const auto& i : o.array) - v.push_back(pdf_serialize(i)); - return "[ " + concatenate(v, " ") + " ]"; - } - case pdf_object::DICT: - { - std::string s; - for (const auto i : o.dict) - // FIXME the key is also supposed to be escaped by pdf_serialize() - s += " /" + i.first + " " + pdf_serialize(i.second); - return "<<" + s + " >>"; - } - case pdf_object::OBJECT: - return ssprintf("%u %u obj\n", o.n, o.generation) + pdf_serialize(o.array.at(0)) + "\nendobj"; - case pdf_object::REFERENCE: - return ssprintf("%u %u R", o.n, o.generation); - default: - assert(!"unsupported token for serialization"); - } -} - -// ------------------------------------------------------------------------------------------------- - -/// Utility class to help read and possibly incrementally update PDF files -class pdf_updater { - struct ref { - size_t offset = 0; ///< File offset or N of the next free entry - uint generation = 0; ///< Object generation - bool free = true; ///< Whether this N has been deleted - }; - - std::vector xref; ///< Cross-reference table - size_t xref_size = 0; ///< Current cross-reference table size, correlated to 
xref.size() - std::set updated; ///< List of updated objects - - pdf_object parse_obj(pdf_lexer& lex, std::vector& stack) const; - pdf_object parse_R(std::vector& stack) const; - pdf_object parse(pdf_lexer& lex, std::vector& stack) const; - std::string load_xref(pdf_lexer& lex, std::set& loaded_entries); - -public: - /// The new trailer dictionary to be written, initialized with the old one - std::map trailer; - - std::string& document; - pdf_updater(std::string& document) : document(document) {} - - /// Build the cross-reference table and prepare a new trailer dictionary - std::string initialize(); - /// Retrieve an object by its number and generation -- may return NIL or END with an error - pdf_object get(uint n, uint generation) const; - /// Allocate a new object number - uint allocate(); - /// Append an updated object to the end of the document - void update(uint n, std::function fill); - /// Write an updated cross-reference table and trailer - void flush_updates(); -}; - -// ------------------------------------------------------------------------------------------------- - -/// If the object is an error, forward its message, otherwise return err. 
-static std::string pdf_error(const pdf_object& o, const char* err) { - if (o.type != pdf_object::END || o.string.empty()) return err; - return o.string; -} - -pdf_object pdf_updater::parse_obj(pdf_lexer& lex, std::vector& stack) const { - if (stack.size() < 2) - return {pdf_object::END, "missing object ID pair"}; - - auto g = stack.back(); stack.pop_back(); - auto n = stack.back(); stack.pop_back(); - if (!g.is_integer() || g.number < 0 || g.number > UINT_MAX - || !n.is_integer() || n.number < 0 || n.number > UINT_MAX) - return {pdf_object::END, "invalid object ID pair"}; - - pdf_object obj{pdf_object::OBJECT}; - obj.n = n.number; - obj.generation = g.number; - - while (1) { - auto object = parse(lex, obj.array); - if (object.type == pdf_object::END) - return {pdf_object::END, pdf_error(object, "object doesn't end")}; - if (object.type == pdf_object::KEYWORD && object.string == "endobj") - break; - obj.array.push_back(std::move(object)); - } - return obj; -} - -pdf_object pdf_updater::parse_R(std::vector& stack) const { - if (stack.size() < 2) - return {pdf_object::END, "missing reference ID pair"}; - - auto g = stack.back(); stack.pop_back(); - auto n = stack.back(); stack.pop_back(); - if (!g.is_integer() || g.number < 0 || g.number > UINT_MAX - || !n.is_integer() || n.number < 0 || n.number > UINT_MAX) - return {pdf_object::END, "invalid reference ID pair"}; - - pdf_object ref{pdf_object::REFERENCE}; - ref.n = n.number; - ref.generation = g.number; - return ref; -} - -/// Read an object at the lexer's position. Not a strict parser. 
-pdf_object pdf_updater::parse(pdf_lexer& lex, std::vector& stack) const { - auto token = lex.next(); - switch (token.type) { - case pdf_object::NL: - case pdf_object::COMMENT: - // These are not important to parsing, not even for this procedure's needs - return parse(lex, stack); - case pdf_object::B_ARRAY: - { - std::vector array; - while (1) { - auto object = parse(lex, array); - if (object.type == pdf_object::END) - return {pdf_object::END, pdf_error(object, "array doesn't end")}; - if (object.type == pdf_object::E_ARRAY) - break; - array.push_back(std::move(object)); - } - return array; - } - case pdf_object::B_DICT: - { - std::vector array; - while (1) { - auto object = parse(lex, array); - if (object.type == pdf_object::END) - return {pdf_object::END, pdf_error(object, "dictionary doesn't end")}; - if (object.type == pdf_object::E_DICT) - break; - array.push_back(std::move(object)); - } - if (array.size() % 2) - return {pdf_object::END, "unbalanced dictionary"}; - std::map dict; - for (size_t i = 0; i < array.size(); i += 2) { - if (array[i].type != pdf_object::NAME) - return {pdf_object::END, "invalid dictionary key type"}; - dict.insert({array[i].string, std::move(array[i + 1])}); - } - return dict; - } - case pdf_object::KEYWORD: - // Appears in the document body, typically needs to access the cross-reference table - // TODO use the xref to read /Length etc. 
once we actually need to read such objects; - // presumably streams can use the pdf_object::string member - if (token.string == "stream") return {pdf_object::END, "streams are not supported yet"}; - if (token.string == "obj") return parse_obj(lex, stack); - if (token.string == "R") return parse_R(stack); - return token; - default: - return token; - } -} - -std::string pdf_updater::load_xref(pdf_lexer& lex, std::set& loaded_entries) { - std::vector throwaway_stack; - { - auto keyword = parse(lex, throwaway_stack); - if (keyword.type != pdf_object::KEYWORD || keyword.string != "xref") - return "invalid xref table"; - } - while (1) { - auto object = parse(lex, throwaway_stack); - if (object.type == pdf_object::END) - return pdf_error(object, "unexpected EOF while looking for the trailer"); - if (object.type == pdf_object::KEYWORD && object.string == "trailer") - break; - - auto second = parse(lex, throwaway_stack); - if (!object.is_integer() || object.number < 0 || object.number > UINT_MAX - || !second.is_integer() || second.number < 0 || second.number > UINT_MAX) - return "invalid xref section header"; - - const size_t start = object.number; - const size_t count = second.number; - for (size_t i = 0; i < count; i++) { - auto off = parse(lex, throwaway_stack); - auto gen = parse(lex, throwaway_stack); - auto key = parse(lex, throwaway_stack); - if (!off.is_integer() || off.number < 0 || off.number > document.length() - || !gen.is_integer() || gen.number < 0 || gen.number > 65535 - || key.type != pdf_object::KEYWORD) - return "invalid xref entry"; - - bool free = true; - if (key.string == "n") - free = false; - else if (key.string != "f") - return "invalid xref entry"; - - auto n = start + i; - if (loaded_entries.count(n)) - continue; - if (n >= xref.size()) - xref.resize(n + 1); - loaded_entries.insert(n); - - auto& ref = xref[n]; - ref.generation = gen.number; - ref.offset = off.number; - ref.free = free; - } - } - return ""; -} - -// 
------------------------------------------------------------------------------------------------- - -std::string pdf_updater::initialize() { - // We only need to look for startxref roughly within the last kibibyte of the document - static std::regex haystack_re("[\\s\\S]*\\sstartxref\\s+(\\d+)\\s+%%EOF"); - std::string haystack = document.substr(document.length() < 1024 ? 0 : document.length() - 1024); - - std::smatch m; - if (!std::regex_search(haystack, m, haystack_re, std::regex_constants::match_continuous)) - return "cannot find startxref"; - - size_t xref_offset = std::stoul(m.str(1)), last_xref_offset = xref_offset; - std::set loaded_xrefs; - std::set loaded_entries; - - std::vector throwaway_stack; - while (1) { - if (loaded_xrefs.count(xref_offset)) - return "circular xref offsets"; - if (xref_offset >= document.length()) - return "invalid xref offset"; - - pdf_lexer lex(document.c_str() + xref_offset); - auto err = load_xref(lex, loaded_entries); - if (!err.empty()) return err; - - auto trailer = parse(lex, throwaway_stack); - if (trailer.type != pdf_object::DICT) - return pdf_error(trailer, "invalid trailer dictionary"); - if (loaded_xrefs.empty()) - this->trailer = trailer.dict; - loaded_xrefs.insert(xref_offset); - - const auto prev_offset = trailer.dict.find("Prev"); - if (prev_offset == trailer.dict.end()) - break; - // FIXME we don't check for size_t over or underflow - if (!prev_offset->second.is_integer()) - return "invalid Prev offset"; - xref_offset = prev_offset->second.number; - } - - trailer["Prev"] = {pdf_object::NUMERIC, double(last_xref_offset)}; - const auto last_size = trailer.find("Size"); - if (last_size == trailer.end() || !last_size->second.is_integer() || - last_size->second.number <= 0) - return "invalid or missing cross-reference table Size"; - - xref_size = last_size->second.number; - return ""; -} - -pdf_object pdf_updater::get(uint n, uint generation) const { - if (n >= xref_size) - return {pdf_object::NIL}; - - const auto& ref 
= xref[n]; - if (ref.free || ref.generation != generation || ref.offset >= document.length()) - return {pdf_object::NIL}; - - pdf_lexer lex(document.c_str() + ref.offset); - std::vector stack; - while (1) { - auto object = parse(lex, stack); - if (object.type == pdf_object::END) - return object; - if (object.type != pdf_object::OBJECT) - stack.push_back(std::move(object)); - else if (object.n != n || object.generation != generation) - return {pdf_object::END, "object mismatch"}; - else - return std::move(object.array.at(0)); - } -} - -uint pdf_updater::allocate() { - assert(xref_size < UINT_MAX); - - auto n = xref_size++; - if (xref.size() < xref_size) - xref.resize(xref_size); - - // We don't make sure it gets a subsection in the update yet because we - // make no attempts at fixing the linked list of free items either - return n; -} - -void pdf_updater::update(uint n, std::function fill) { - auto& ref = xref.at(n); - ref.offset = document.length() + 1; - ref.free = false; - updated.insert(n); - - document += ssprintf("\n%u %u obj\n", n, ref.generation); - // Separately so that the callback can use document.length() to get the current offset - fill(); - document += "\nendobj"; -} - -void pdf_updater::flush_updates() { - std::map groups; - for (auto i = updated.cbegin(); i != updated.cend(); ) { - size_t start = *i, count = 1; - while (++i != updated.cend() && *i == start + count) - count++; - groups[start] = count; - } - - // Taking literally "Each cross-reference section begins with a line containing the keyword xref. - // Following this line are one or more cross-reference subsections." 
from 3.4.3 in PDF Reference - if (groups.empty()) - groups[0] = 0; - - auto startxref = document.length() + 1; - document += "\nxref\n"; - for (const auto& g : groups) { - document += ssprintf("%u %zu\n", g.first, g.second); - for (size_t i = 0; i < g.second; i++) { - auto& ref = xref[g.first + i]; - document += ssprintf("%010zu %05u %c \n", ref.offset, ref.generation, "nf"[!!ref.free]); - } - } - - trailer["Size"] = {pdf_object::NUMERIC, double(xref_size)}; - document += "trailer\n" + pdf_serialize(trailer) - + ssprintf("\nstartxref\n%zu\n%%%%EOF\n", startxref); -} - -// ------------------------------------------------------------------------------------------------- - -/// Make a PDF object representing the given point in time -static pdf_object pdf_date(time_t timestamp) { - struct tm parts; - assert(localtime_r(×tamp, &parts)); - - char buf[64]; - assert(strftime(buf, sizeof buf, "D:%Y%m%d%H%M%S", &parts)); - - std::string offset = "Z"; - auto offset_min = parts.tm_gmtoff / 60; - if (parts.tm_gmtoff < 0) - offset = ssprintf("-%02ld'%02ld'", -offset_min / 60, -offset_min % 60); - if (parts.tm_gmtoff > 0) - offset = ssprintf("+%02ld'%02ld'", +offset_min / 60, +offset_min % 60); - return {pdf_object::STRING, buf + offset}; -} - -static pdf_object pdf_get_first_page(pdf_updater& pdf, uint node_n, uint node_generation) { - auto obj = pdf.get(node_n, node_generation); - if (obj.type != pdf_object::DICT) - return {pdf_object::NIL}; - - // Out of convenience; these aren't filled normally - obj.n = node_n; - obj.generation = node_generation; - - auto type = obj.dict.find("Type"); - if (type == obj.dict.end() || type->second.type != pdf_object::NAME) - return {pdf_object::NIL}; - if (type->second.string == "Page") - return obj; - if (type->second.string != "Pages") - return {pdf_object::NIL}; - - // XXX technically speaking, this may be an indirect reference. 
The correct way to solve this - // seems to be having "pdf_updater" include a wrapper around "obj.dict.find" - auto kids = obj.dict.find("Kids"); - if (kids == obj.dict.end() || kids->second.type != pdf_object::ARRAY - || kids->second.array.empty() - || kids->second.array.at(0).type != pdf_object::REFERENCE) - return {pdf_object::NIL}; - - // XXX nothing prevents us from recursing in an evil circular graph - return pdf_get_first_page(pdf, kids->second.array.at(0).n, kids->second.array.at(0).generation); -} - -// ------------------------------------------------------------------------------------------------- - -static std::string pkcs12_path, pkcs12_pass; - -// /All/ bytes are checked, except for the signature hexstring itself -static std::string pdf_fill_in_signature(std::string& document, size_t sign_off, size_t sign_len) { - size_t tail_off = sign_off + sign_len, tail_len = document.size() - tail_off; - if (pkcs12_path.empty()) - return "undefined path to the signing key"; - - auto pkcs12_fp = fopen(pkcs12_path.c_str(), "r"); - if (!pkcs12_fp) - return pkcs12_path + ": " + strerror(errno); - - // Abandon hope, all ye who enter OpenSSL! Half of it is undocumented. 
- OpenSSL_add_all_algorithms(); - ERR_load_crypto_strings(); - ERR_clear_error(); - - PKCS12* p12 = nullptr; - EVP_PKEY* private_key = nullptr; - X509* certificate = nullptr; - STACK_OF(X509)* chain = nullptr; - PKCS7* p7 = nullptr; - int len = 0, sign_flags = PKCS7_DETACHED | PKCS7_BINARY | PKCS7_NOSMIMECAP | PKCS7_PARTIAL; - BIO* p7bio = nullptr; - unsigned char* buf = nullptr; - - // OpenSSL error reasons will usually be of more value than any distinction I can come up with - std::string err = "OpenSSL failure"; - - if (!(p12 = d2i_PKCS12_fp(pkcs12_fp, nullptr)) - || !PKCS12_parse(p12, pkcs12_pass.c_str(), &private_key, &certificate, &chain)) { - err = pkcs12_path + ": parse failure"; - goto error; - } - if (!private_key || !certificate) { - err = pkcs12_path + ": must contain a private key and a valid certificate chain"; - goto error; - } - // Prevent useless signatures -- makes pdfsig from poppler happy at least (and NSS by extension) - if (!(X509_get_key_usage(certificate) & (KU_DIGITAL_SIGNATURE | KU_NON_REPUDIATION))) { - err = "the certificate's key usage must include digital signatures or non-repudiation"; - goto error; - } - if (!(X509_get_extended_key_usage(certificate) & (XKU_SMIME | XKU_ANYEKU))) { - err = "the certificate's extended key usage must include S/MIME"; - goto error; - } -#if 0 // This happily ignores XKU_ANYEKU and I want my tiny world to make a tiny bit more sense - if (X509_check_purpose(certificate, X509_PURPOSE_SMIME_SIGN, false /* not a CA certificate */)) { - err = "the certificate can't be used for S/MIME digital signatures"; - goto error; - } -#endif - - // The default digest is SHA1, which is mildly insecure now -- hence using PKCS7_sign_add_signer - if (!(p7 = PKCS7_sign(nullptr, nullptr, nullptr, nullptr, sign_flags)) - || !PKCS7_sign_add_signer(p7, certificate, private_key, EVP_sha256(), sign_flags)) - goto error; - // For RFC 3161, this is roughly how a timestamp token would be attached (see Appendix A): - // 
PKCS7_add_attribute(signer_info, NID_id_smime_aa_timeStampToken, V_ASN1_SEQUENCE, value) - for (int i = 0; i < sk_X509_num(chain); i++) - if (!PKCS7_add_certificate(p7, sk_X509_value(chain, i))) - goto error; - - // Adaptation of the innards of the undocumented PKCS7_final() -- I didn't feel like making - // a copy of the whole document. Hopefully this writes directly into a digest BIO. - if (!(p7bio = PKCS7_dataInit(p7, nullptr)) - || (ssize_t) sign_off != BIO_write(p7bio, document.data(), sign_off) - || (ssize_t) tail_len != BIO_write(p7bio, document.data() + tail_off, tail_len) - || BIO_flush(p7bio) != 1 || !PKCS7_dataFinal(p7, p7bio)) - goto error; - -#if 0 - { - // Debugging: openssl cms -inform PEM -in pdf_signature.pem -noout -cmsout -print - // Context: https://stackoverflow.com/a/29253469 - auto fp = fopen("pdf_signature.pem", "wb"); - assert(PEM_write_PKCS7(fp, p7) && !fclose(fp)); - } -#endif - - if ((len = i2d_PKCS7(p7, &buf)) < 0) - goto error; - if (size_t(len) * 2 > sign_len - 2 /* hexstring quotes */) { - // The obvious solution is to increase the allocation... 
or spend a week reading specifications - // while losing all faith in humanity as a species, and skip the PKCS7 API entirely - err = ssprintf("not enough space reserved for the signature (%zu nibbles vs %zu nibbles)", - sign_len - 2, size_t(len) * 2); - goto error; - } - for (int i = 0; i < len; i++) { - document[sign_off + 2 * i + 1] = "0123456789abcdef"[buf[i] / 16]; - document[sign_off + 2 * i + 2] = "0123456789abcdef"[buf[i] % 16]; - } - err.clear(); - -error: - OPENSSL_free(buf); - BIO_free_all(p7bio); - PKCS7_free(p7); - sk_X509_pop_free(chain, X509_free); - X509_free(certificate); - EVP_PKEY_free(private_key); - PKCS12_free(p12); - - // In any case, clear the error stack (it's a queue, really) to avoid confusion elsewhere - while (auto code = ERR_get_error()) - if (auto reason = ERR_reason_error_string(code)) - err = err + "; " + reason; - - fclose(pkcs12_fp); - return err; -} - -// ------------------------------------------------------------------------------------------------- - -/// The presumption here is that the document is valid and that it doesn't employ cross-reference -/// streams from PDF 1.5, or at least constitutes a hybrid-reference file. The results with -/// PDF 2.0 (2017) are currently unknown as the standard costs money. -/// -/// Carelessly assumes that the version of the original document is at most PDF 1.6. 
-/// -/// https://www.adobe.com/devnet-docs/acrobatetk/tools/DigSig/Acrobat_DigitalSignatures_in_PDF.pdf -/// https://www.adobe.com/content/dam/acom/en/devnet/acrobat/pdfs/pdf_reference_1-7.pdf -/// https://www.adobe.com/content/dam/acom/en/devnet/acrobat/pdfs/PPKAppearances.pdf -static std::string pdf_sign(std::string& document) { - pdf_updater pdf(document); - auto err = pdf.initialize(); - if (!err.empty()) - return err; - - auto root_ref = pdf.trailer.find("Root"); - if (root_ref == pdf.trailer.end() || root_ref->second.type != pdf_object::REFERENCE) - return "trailer does not contain a reference to Root"; - auto root = pdf.get(root_ref->second.n, root_ref->second.generation); - if (root.type != pdf_object::DICT) - return "invalid Root dictionary reference"; - - // 8.7 Digital Signatures - /signature dictionary/ - auto sigdict_n = pdf.allocate(); - size_t byterange_off = 0, byterange_len = 0, sign_off = 0, sign_len = 0; - pdf.update(sigdict_n, [&]{ - // The timestamp is important for Adobe Acrobat Reader DC. The ideal would be to use RFC 3161. - pdf.document.append("<< /Type/Sig /Filter/Adobe.PPKLite /SubFilter/adbe.pkcs7.detached\n" - " /M" + pdf_serialize(pdf_date(time(nullptr))) + " /ByteRange "); - byterange_off = pdf.document.size(); - pdf.document.append((byterange_len = 32 /* fine for a gigabyte */), ' '); - pdf.document.append("\n /Contents <"); - sign_off = pdf.document.size(); - pdf.document.append((sign_len = 8192 /* certificate, digest, encrypted digest, ... 
*/), '0'); - pdf.document.append("> >>"); - - // We actually need to exclude the hexstring quotes from signing - sign_off -= 1; - sign_len += 2; - }); - - // 8.6.3 Field Types - Signature Fields - pdf_object sigfield{pdf_object::DICT}; - sigfield.dict.insert({"FT", {pdf_object::NAME, "Sig"}}); - sigfield.dict.insert({"V", {pdf_object::REFERENCE, sigdict_n, 0}}); - // 8.4.5 Annotations Types - Widget Annotations - // We can merge the Signature Annotation and omit Kids here - sigfield.dict.insert({"Subtype", {pdf_object::NAME, "Widget"}}); - sigfield.dict.insert({"F", {pdf_object::NUMERIC, 2 /* Hidden */}}); - sigfield.dict.insert({"T", {pdf_object::STRING, "Signature1"}}); - sigfield.dict.insert({"Rect", {std::vector{ - {pdf_object::NUMERIC, 0}, - {pdf_object::NUMERIC, 0}, - {pdf_object::NUMERIC, 0}, - {pdf_object::NUMERIC, 0}, - }}}); - - auto sigfield_n = pdf.allocate(); - pdf.update(sigfield_n, [&]{ pdf.document += pdf_serialize(sigfield); }); - - auto pages_ref = root.dict.find("Pages"); - if (pages_ref == root.dict.end() || pages_ref->second.type != pdf_object::REFERENCE) - return "invalid Pages reference"; - auto page = pdf_get_first_page(pdf, pages_ref->second.n, pages_ref->second.generation); - if (page.type != pdf_object::DICT) - return "invalid or unsupported page tree"; - - // XXX assuming this won't be an indirectly referenced array - auto& annots = page.dict["Annots"]; - if (annots.type != pdf_object::ARRAY) - annots = {pdf_object::ARRAY}; - annots.array.emplace_back(pdf_object::REFERENCE, sigfield_n, 0); - pdf.update(page.n, [&]{ pdf.document += pdf_serialize(page); }); - - // 8.6.1 Interactive Form Dictionary - // XXX assuming there are no forms already, overwriting everything - root.dict["AcroForm"] = {std::map{ - {"Fields", {std::vector{ - {pdf_object::REFERENCE, sigfield_n, 0} - }}}, - {"SigFlags", {pdf_object::NUMERIC, 3 /* SignaturesExist | AppendOnly */}} - }}; - - // Upgrade the document version for SHA-256 etc. 
- // XXX assuming that it's not newer than 1.6 already -- while Cairo can't currently use a newer - // version that 1.5, it's not a bad idea to use cairo_pdf_surface_restrict_to_version() - root.dict["Version"] = {pdf_object::NAME, "1.6"}; - pdf.update(root_ref->second.n, [&]{ pdf.document += pdf_serialize(root); }); - pdf.flush_updates(); - - // Now that we know the length of everything, store byte ranges of what we're about to sign, - // which must be everything but the resulting signature itself - size_t tail_off = sign_off + sign_len, tail_len = pdf.document.size() - tail_off; - auto ranges = ssprintf("[0 %zu %zu %zu]", sign_off, tail_off, tail_len); - if (ranges.length() > byterange_len) - return "not enough space reserved for /ByteRange"; - pdf.document.replace(byterange_off, std::min(ranges.length(), byterange_len), ranges); - return pdf_fill_in_signature(pdf.document, sign_off, sign_len); -} -// ------------------------------------------------------------------------------------------------- +#include "pdf-simple-sign.h" __attribute__((format(printf, 2, 3))) static void die(int status, const char* format, ...) 
{ va_list ap; va_start(ap, format); if (isatty(fileno(stderr))) - vfprintf(stderr, ssprintf("\x1b[31m%s\x1b[0m\n", format).c_str(), ap); + vfprintf(stderr, ("\x1b[31m" + std::string(format) + "\x1b[0m\n").c_str(), ap); else vfprintf(stderr, format, ap); va_end(ap); @@ -949,8 +72,6 @@ int main(int argc, char* argv[]) { const char* input_path = argv[0]; const char* output_path = argv[1]; - pkcs12_path = argv[2]; - pkcs12_pass = argv[3]; std::string pdf_document; if (auto fp = fopen(input_path, "rb")) { @@ -964,7 +85,7 @@ int main(int argc, char* argv[]) { die(1, "%s: %s", input_path, strerror(errno)); } - auto err = pdf_sign(pdf_document); + auto err = pdf_simple_sign(pdf_document, argv[2], argv[3]); if (!err.empty()) { die(2, "Error: %s", err.c_str()); } diff --git a/pdf-simple-sign.h b/pdf-simple-sign.h new file mode 100644 index 0000000..022731c --- /dev/null +++ b/pdf-simple-sign.h @@ -0,0 +1,28 @@ +// vim: set sw=2 ts=2 sts=2 et tw=100: +// +// pdf-simple-sign: simple PDF signer +// +// Copyright (c) 2017 - 2020, Přemysl Eric Janouch +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +// + +#pragma once + +#include + +/// Sign basic PDF documents, as generated by e.g. Cairo, using the key-certificate pair +/// stored in the PKCS#12 file named `pkcs12_path`, with password `pkcs12_pass`. +/// Returns a non-empty error string on failure. 
+std::string pdf_simple_sign(std::string& document, + const std::string& pkcs12_path, + const std::string& pkcs12_pass); diff --git a/pdf.cpp b/pdf.cpp new file mode 100644 index 0000000..867c3d4 --- /dev/null +++ b/pdf.cpp @@ -0,0 +1,906 @@ +// vim: set sw=2 ts=2 sts=2 et tw=100: +// +// pdf-simple-sign: simple PDF signer +// +// Copyright (c) 2017 - 2020, Přemysl Eric Janouch +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +// + +#include +#undef NDEBUG +#include + +#include +#include +#include +#include +#include + +#if defined __GLIBCXX__ && __GLIBCXX__ < 20140422 +#error Need libstdc++ >= 4.9 for +#endif + +#include +#include +#include + +#include "pdf-simple-sign.h" + +namespace { + +using uint = unsigned int; + +static std::string concatenate(const std::vector& v, const std::string& delim) { + std::string res; + if (v.empty()) + return res; + for (const auto& s : v) + res += s + delim; + return res.substr(0, res.length() - delim.length()); +} + +template +static std::string ssprintf(const std::string& format, Args... args) { + size_t size = std::snprintf(nullptr, 0, format.c_str(), args... 
) + 1; + std::unique_ptr buf(new char[size]); + std::snprintf(buf.get(), size, format.c_str(), args...); + return std::string(buf.get(), buf.get() + size - 1); +} + +// ------------------------------------------------------------------------------------------------- + +/// PDF token/object thingy. Objects may be composed either from one or a sequence of tokens. +/// The PDF Reference doesn't actually speak of tokens. +struct pdf_object { + enum type { + END, NL, COMMENT, NIL, BOOL, NUMERIC, KEYWORD, NAME, STRING, + // Simple tokens + B_ARRAY, E_ARRAY, B_DICT, E_DICT, + // Higher-level objects + ARRAY, DICT, OBJECT, REFERENCE, + } type = END; + + std::string string; ///< END (error message), COMMENT/KEYWORD/NAME/STRING + double number = 0.; ///< BOOL, NUMERIC + std::vector array; ///< ARRAY, OBJECT + std::map dict; ///< DICT, in the future also STREAM + uint n = 0, generation = 0; ///< OBJECT, REFERENCE + + pdf_object(enum type type = END) : type(type) {} + pdf_object(enum type type, double v) : type(type), number(v) {} + pdf_object(enum type type, const std::string& v) : type(type), string(v) {} + pdf_object(enum type type, uint n, uint g) : type(type), n(n), generation(g) {} + pdf_object(const std::vector& array) : type(ARRAY), array(array) {} + pdf_object(const std::map& dict) : type(DICT), dict(dict) {} + + pdf_object(const pdf_object&) = default; + pdf_object(pdf_object&&) = default; + pdf_object& operator=(const pdf_object&) = default; + pdf_object& operator=(pdf_object&&) = default; + + /// Return whether this is a number without a fractional part + bool is_integer() const { + double tmp; + return type == NUMERIC && std::modf(number, &tmp) == 0.; + } +}; + +/// Basic lexical analyser for the Portable Document Format, giving limited error information +struct pdf_lexer { + const unsigned char* p; + pdf_lexer(const char* s) : p(reinterpret_cast(s)) {} + + static constexpr const char* oct_alphabet = "01234567"; + static constexpr const char* dec_alphabet = 
"0123456789"; + static constexpr const char* hex_alphabet = "0123456789abcdefABCDEF"; + static constexpr const char* whitespace = "\t\n\f\r "; + static constexpr const char* delimiters = "()<>[]{}/%"; + + bool eat_newline(int ch) { + if (ch == '\r') { + if (*p == '\n') p++; + return true; + } + return ch == '\n'; + } + + pdf_object string() { + std::string value; + int parens = 1; + while (1) { + if (!*p) return {pdf_object::END, "unexpected end of string"}; + auto ch = *p++; + if (eat_newline(ch)) ch = '\n'; + else if (ch == '(') { parens++; } + else if (ch == ')') { if (!--parens) break; } + else if (ch == '\\') { + if (!*p) return {pdf_object::END, "unexpected end of string"}; + switch ((ch = *p++)) { + case 'n': ch = '\n'; break; + case 'r': ch = '\r'; break; + case 't': ch = '\t'; break; + case 'b': ch = '\b'; break; + case 'f': ch = '\f'; break; + default: + if (eat_newline(ch)) + continue; + std::string octal; + if (ch && strchr(oct_alphabet, ch)) { + octal += ch; + if (*p && strchr(oct_alphabet, *p)) octal += *p++; + if (*p && strchr(oct_alphabet, *p)) octal += *p++; + ch = std::stoi(octal, nullptr, 8); + } + } + } + value += ch; + } + return {pdf_object::STRING, value}; + } + + pdf_object string_hex() { + std::string value, buf; + while (*p != '>') { + if (!*p) return {pdf_object::END, "unexpected end of hex string"}; + if (!strchr(hex_alphabet, *p)) + return {pdf_object::END, "invalid hex string"}; + buf += *p++; + if (buf.size() == 2) { + value += char(std::stoi(buf, nullptr, 16)); + buf.clear(); + } + } + p++; + if (!buf.empty()) value += char(std::stoi(buf + '0', nullptr, 16)); + return {pdf_object::STRING, value}; + } + + pdf_object name() { + std::string value; + while (!strchr(whitespace, *p) && !strchr(delimiters, *p)) { + auto ch = *p++; + if (ch == '#') { + std::string hexa; + if (*p && strchr(hex_alphabet, *p)) hexa += *p++; + if (*p && strchr(hex_alphabet, *p)) hexa += *p++; + if (hexa.size() != 2) + return {pdf_object::END, "invalid name hexa 
escape"}; + ch = char(std::stoi(hexa, nullptr, 16)); + } + value += ch; + } + if (value.empty()) return {pdf_object::END, "unexpected end of name"}; + return {pdf_object::NAME, value}; + } + + pdf_object comment() { + std::string value; + while (*p && *p != '\r' && *p != '\n') + value += *p++; + return {pdf_object::COMMENT, value}; + } + + // XXX maybe invalid numbers should rather be interpreted as keywords + pdf_object number() { + std::string value; + if (*p == '-') + value += *p++; + bool real = false, digits = false; + while (*p) { + if (strchr(dec_alphabet, *p)) + digits = true; + else if (*p == '.' && !real) + real = true; + else + break; + value += *p++; + } + if (!digits) return {pdf_object::END, "invalid number"}; + return {pdf_object::NUMERIC, std::stod(value, nullptr)}; + } + + pdf_object next() { + if (!*p) + return {pdf_object::END}; + if (strchr("-0123456789.", *p)) + return number(); + + // {} end up being keywords, we might want to error out on those + std::string value; + while (!strchr(whitespace, *p) && !strchr(delimiters, *p)) + value += *p++; + if (!value.empty()) { + if (value == "null") return {pdf_object::NIL}; + if (value == "true") return {pdf_object::BOOL, 1}; + if (value == "false") return {pdf_object::BOOL, 0}; + return {pdf_object::KEYWORD, value}; + } + + switch (char ch = *p++) { + case '/': return name(); + case '%': return comment(); + case '(': return string(); + case '[': return {pdf_object::B_ARRAY}; + case ']': return {pdf_object::E_ARRAY}; + case '<': + if (*p++ == '<') + return {pdf_object::B_DICT}; + p--; + return string_hex(); + case '>': + if (*p++ == '>') + return {pdf_object::E_DICT}; + p--; + return {pdf_object::END, "unexpected '>'"}; + default: + if (eat_newline(ch)) + return {pdf_object::NL}; + if (strchr(whitespace, ch)) + return next(); + return {pdf_object::END, "unexpected input"}; + } + } +}; + +// FIXME lines /should not/ be longer than 255 characters, some wrapping is in order +static std::string 
pdf_serialize(const pdf_object& o) { + switch (o.type) { + case pdf_object::NL: return "\n"; + case pdf_object::NIL: return "null"; + case pdf_object::BOOL: return o.number ? "true" : "false"; + case pdf_object::NUMERIC: + { + if (o.is_integer()) return std::to_string((long long) o.number); + return std::to_string(o.number); + } + case pdf_object::KEYWORD: return o.string; + case pdf_object::NAME: + { + std::string escaped = "/"; + for (char c : o.string) { + if (c == '#' || strchr(pdf_lexer::delimiters, c) || strchr(pdf_lexer::whitespace, c)) + escaped += ssprintf("#%02x", c); + else + escaped += c; + } + return escaped; + } + case pdf_object::STRING: + { + std::string escaped; + for (char c : o.string) { + if (c == '\\' || c == '(' || c == ')') + escaped += '\\'; + escaped += c; + } + return "(" + escaped + ")"; + } + case pdf_object::B_ARRAY: return "["; + case pdf_object::E_ARRAY: return "]"; + case pdf_object::B_DICT: return "<<"; + case pdf_object::E_DICT: return ">>"; + case pdf_object::ARRAY: + { + std::vector v; + for (const auto& i : o.array) + v.push_back(pdf_serialize(i)); + return "[ " + concatenate(v, " ") + " ]"; + } + case pdf_object::DICT: + { + std::string s; + for (const auto& i : o.dict) + // FIXME the key is also supposed to be escaped by pdf_serialize() + s += " /" + i.first + " " + pdf_serialize(i.second); + return "<<" + s + " >>"; + } + case pdf_object::OBJECT: + return ssprintf("%u %u obj\n", o.n, o.generation) + pdf_serialize(o.array.at(0)) + "\nendobj"; + case pdf_object::REFERENCE: + return ssprintf("%u %u R", o.n, o.generation); + default: + assert(!"unsupported token for serialization"); + } +} + +// ------------------------------------------------------------------------------------------------- + +/// Utility class to help read and possibly incrementally update PDF files +class pdf_updater { + struct ref { + size_t offset = 0; ///< File offset or N of the next free entry + uint generation = 0; ///< Object generation + bool free = 
true; ///< Whether this N has been deleted + }; + + std::vector xref; ///< Cross-reference table + size_t xref_size = 0; ///< Current cross-reference table size, correlated to xref.size() + std::set updated; ///< List of updated objects + + pdf_object parse_obj(pdf_lexer& lex, std::vector& stack) const; + pdf_object parse_R(std::vector& stack) const; + pdf_object parse(pdf_lexer& lex, std::vector& stack) const; + std::string load_xref(pdf_lexer& lex, std::set& loaded_entries); + +public: + /// The new trailer dictionary to be written, initialized with the old one + std::map trailer; + + std::string& document; + pdf_updater(std::string& document) : document(document) {} + + /// Build the cross-reference table and prepare a new trailer dictionary + std::string initialize(); + /// Retrieve an object by its number and generation -- may return NIL or END with an error + pdf_object get(uint n, uint generation) const; + /// Allocate a new object number + uint allocate(); + /// Append an updated object to the end of the document + void update(uint n, std::function fill); + /// Write an updated cross-reference table and trailer + void flush_updates(); +}; + +// ------------------------------------------------------------------------------------------------- + +/// If the object is an error, forward its message, otherwise return err. 
+static std::string pdf_error(const pdf_object& o, const char* err) { + if (o.type != pdf_object::END || o.string.empty()) return err; + return o.string; +} + +pdf_object pdf_updater::parse_obj(pdf_lexer& lex, std::vector& stack) const { + if (stack.size() < 2) + return {pdf_object::END, "missing object ID pair"}; + + auto g = stack.back(); stack.pop_back(); + auto n = stack.back(); stack.pop_back(); + if (!g.is_integer() || g.number < 0 || g.number > UINT_MAX + || !n.is_integer() || n.number < 0 || n.number > UINT_MAX) + return {pdf_object::END, "invalid object ID pair"}; + + pdf_object obj{pdf_object::OBJECT}; + obj.n = n.number; + obj.generation = g.number; + + while (1) { + auto object = parse(lex, obj.array); + if (object.type == pdf_object::END) + return {pdf_object::END, pdf_error(object, "object doesn't end")}; + if (object.type == pdf_object::KEYWORD && object.string == "endobj") + break; + obj.array.push_back(std::move(object)); + } + return obj; +} + +pdf_object pdf_updater::parse_R(std::vector& stack) const { + if (stack.size() < 2) + return {pdf_object::END, "missing reference ID pair"}; + + auto g = stack.back(); stack.pop_back(); + auto n = stack.back(); stack.pop_back(); + if (!g.is_integer() || g.number < 0 || g.number > UINT_MAX + || !n.is_integer() || n.number < 0 || n.number > UINT_MAX) + return {pdf_object::END, "invalid reference ID pair"}; + + pdf_object ref{pdf_object::REFERENCE}; + ref.n = n.number; + ref.generation = g.number; + return ref; +} + +/// Read an object at the lexer's position. Not a strict parser. 
+pdf_object pdf_updater::parse(pdf_lexer& lex, std::vector& stack) const { + auto token = lex.next(); + switch (token.type) { + case pdf_object::NL: + case pdf_object::COMMENT: + // These are not important to parsing, not even for this procedure's needs + return parse(lex, stack); + case pdf_object::B_ARRAY: + { + std::vector array; + while (1) { + auto object = parse(lex, array); + if (object.type == pdf_object::END) + return {pdf_object::END, pdf_error(object, "array doesn't end")}; + if (object.type == pdf_object::E_ARRAY) + break; + array.push_back(std::move(object)); + } + return array; + } + case pdf_object::B_DICT: + { + std::vector array; + while (1) { + auto object = parse(lex, array); + if (object.type == pdf_object::END) + return {pdf_object::END, pdf_error(object, "dictionary doesn't end")}; + if (object.type == pdf_object::E_DICT) + break; + array.push_back(std::move(object)); + } + if (array.size() % 2) + return {pdf_object::END, "unbalanced dictionary"}; + std::map dict; + for (size_t i = 0; i < array.size(); i += 2) { + if (array[i].type != pdf_object::NAME) + return {pdf_object::END, "invalid dictionary key type"}; + dict.insert({array[i].string, std::move(array[i + 1])}); + } + return dict; + } + case pdf_object::KEYWORD: + // Appears in the document body, typically needs to access the cross-reference table + // TODO use the xref to read /Length etc. 
once we actually need to read such objects; + // presumably streams can use the pdf_object::string member + if (token.string == "stream") return {pdf_object::END, "streams are not supported yet"}; + if (token.string == "obj") return parse_obj(lex, stack); + if (token.string == "R") return parse_R(stack); + return token; + default: + return token; + } +} + +std::string pdf_updater::load_xref(pdf_lexer& lex, std::set& loaded_entries) { + std::vector throwaway_stack; + { + auto keyword = parse(lex, throwaway_stack); + if (keyword.type != pdf_object::KEYWORD || keyword.string != "xref") + return "invalid xref table"; + } + while (1) { + auto object = parse(lex, throwaway_stack); + if (object.type == pdf_object::END) + return pdf_error(object, "unexpected EOF while looking for the trailer"); + if (object.type == pdf_object::KEYWORD && object.string == "trailer") + break; + + auto second = parse(lex, throwaway_stack); + if (!object.is_integer() || object.number < 0 || object.number > UINT_MAX + || !second.is_integer() || second.number < 0 || second.number > UINT_MAX) + return "invalid xref section header"; + + const size_t start = object.number; + const size_t count = second.number; + for (size_t i = 0; i < count; i++) { + auto off = parse(lex, throwaway_stack); + auto gen = parse(lex, throwaway_stack); + auto key = parse(lex, throwaway_stack); + if (!off.is_integer() || off.number < 0 || off.number > document.length() + || !gen.is_integer() || gen.number < 0 || gen.number > 65535 + || key.type != pdf_object::KEYWORD) + return "invalid xref entry"; + + bool free = true; + if (key.string == "n") + free = false; + else if (key.string != "f") + return "invalid xref entry"; + + auto n = start + i; + if (loaded_entries.count(n)) + continue; + if (n >= xref.size()) + xref.resize(n + 1); + loaded_entries.insert(n); + + auto& ref = xref[n]; + ref.generation = gen.number; + ref.offset = off.number; + ref.free = free; + } + } + return ""; +} + +// 
------------------------------------------------------------------------------------------------- + +std::string pdf_updater::initialize() { + // We only need to look for startxref roughly within the last kibibyte of the document + static std::regex haystack_re("[\\s\\S]*\\sstartxref\\s+(\\d+)\\s+%%EOF"); + std::string haystack = document.substr(document.length() < 1024 ? 0 : document.length() - 1024); + + std::smatch m; + if (!std::regex_search(haystack, m, haystack_re, std::regex_constants::match_continuous)) + return "cannot find startxref"; + + size_t xref_offset = std::stoul(m.str(1)), last_xref_offset = xref_offset; + std::set loaded_xrefs; + std::set loaded_entries; + + std::vector throwaway_stack; + while (1) { + if (loaded_xrefs.count(xref_offset)) + return "circular xref offsets"; + if (xref_offset >= document.length()) + return "invalid xref offset"; + + pdf_lexer lex(document.c_str() + xref_offset); + auto err = load_xref(lex, loaded_entries); + if (!err.empty()) return err; + + auto trailer = parse(lex, throwaway_stack); + if (trailer.type != pdf_object::DICT) + return pdf_error(trailer, "invalid trailer dictionary"); + if (loaded_xrefs.empty()) + this->trailer = trailer.dict; + loaded_xrefs.insert(xref_offset); + + const auto prev_offset = trailer.dict.find("Prev"); + if (prev_offset == trailer.dict.end()) + break; + // FIXME we don't check for size_t over or underflow + if (!prev_offset->second.is_integer()) + return "invalid Prev offset"; + xref_offset = prev_offset->second.number; + } + + trailer["Prev"] = {pdf_object::NUMERIC, double(last_xref_offset)}; + const auto last_size = trailer.find("Size"); + if (last_size == trailer.end() || !last_size->second.is_integer() || + last_size->second.number <= 0) + return "invalid or missing cross-reference table Size"; + + xref_size = last_size->second.number; + return ""; +} + +pdf_object pdf_updater::get(uint n, uint generation) const { + if (n >= xref_size) + return {pdf_object::NIL}; + + const auto& ref 
= xref[n]; + if (ref.free || ref.generation != generation || ref.offset >= document.length()) + return {pdf_object::NIL}; + + pdf_lexer lex(document.c_str() + ref.offset); + std::vector stack; + while (1) { + auto object = parse(lex, stack); + if (object.type == pdf_object::END) + return object; + if (object.type != pdf_object::OBJECT) + stack.push_back(std::move(object)); + else if (object.n != n || object.generation != generation) + return {pdf_object::END, "object mismatch"}; + else + return std::move(object.array.at(0)); + } +} + +uint pdf_updater::allocate() { + assert(xref_size < UINT_MAX); + + auto n = xref_size++; + if (xref.size() < xref_size) + xref.resize(xref_size); + + // We don't make sure it gets a subsection in the update yet because we + // make no attempts at fixing the linked list of free items either + return n; +} + +void pdf_updater::update(uint n, std::function fill) { + auto& ref = xref.at(n); + ref.offset = document.length() + 1; + ref.free = false; + updated.insert(n); + + document += ssprintf("\n%u %u obj\n", n, ref.generation); + // Separately so that the callback can use document.length() to get the current offset + fill(); + document += "\nendobj"; +} + +void pdf_updater::flush_updates() { + std::map groups; + for (auto i = updated.cbegin(); i != updated.cend(); ) { + size_t start = *i, count = 1; + while (++i != updated.cend() && *i == start + count) + count++; + groups[start] = count; + } + + // Taking literally "Each cross-reference section begins with a line containing the keyword xref. + // Following this line are one or more cross-reference subsections." 
from 3.4.3 in PDF Reference + if (groups.empty()) + groups[0] = 0; + + auto startxref = document.length() + 1; + document += "\nxref\n"; + for (const auto& g : groups) { + document += ssprintf("%u %zu\n", g.first, g.second); + for (size_t i = 0; i < g.second; i++) { + auto& ref = xref[g.first + i]; + document += ssprintf("%010zu %05u %c \n", ref.offset, ref.generation, "nf"[!!ref.free]); + } + } + + trailer["Size"] = {pdf_object::NUMERIC, double(xref_size)}; + document += "trailer\n" + pdf_serialize(trailer) + + ssprintf("\nstartxref\n%zu\n%%%%EOF\n", startxref); +} + +// ------------------------------------------------------------------------------------------------- + +/// Make a PDF object representing the given point in time +static pdf_object pdf_date(time_t timestamp) { + struct tm parts; + assert(localtime_r(×tamp, &parts)); + + char buf[64]; + assert(strftime(buf, sizeof buf, "D:%Y%m%d%H%M%S", &parts)); + + std::string offset = "Z"; + auto offset_min = parts.tm_gmtoff / 60; + if (parts.tm_gmtoff < 0) + offset = ssprintf("-%02ld'%02ld'", -offset_min / 60, -offset_min % 60); + if (parts.tm_gmtoff > 0) + offset = ssprintf("+%02ld'%02ld'", +offset_min / 60, +offset_min % 60); + return {pdf_object::STRING, buf + offset}; +} + +static pdf_object pdf_get_first_page(pdf_updater& pdf, uint node_n, uint node_generation) { + auto obj = pdf.get(node_n, node_generation); + if (obj.type != pdf_object::DICT) + return {pdf_object::NIL}; + + // Out of convenience; these aren't filled normally + obj.n = node_n; + obj.generation = node_generation; + + auto type = obj.dict.find("Type"); + if (type == obj.dict.end() || type->second.type != pdf_object::NAME) + return {pdf_object::NIL}; + if (type->second.string == "Page") + return obj; + if (type->second.string != "Pages") + return {pdf_object::NIL}; + + // XXX technically speaking, this may be an indirect reference. 
The correct way to solve this + // seems to be having "pdf_updater" include a wrapper around "obj.dict.find" + auto kids = obj.dict.find("Kids"); + if (kids == obj.dict.end() || kids->second.type != pdf_object::ARRAY + || kids->second.array.empty() + || kids->second.array.at(0).type != pdf_object::REFERENCE) + return {pdf_object::NIL}; + + // XXX nothing prevents us from recursing in an evil circular graph + return pdf_get_first_page(pdf, kids->second.array.at(0).n, kids->second.array.at(0).generation); +} + +// ------------------------------------------------------------------------------------------------- + +// /All/ bytes are checked, except for the signature hexstring itself +static std::string pdf_fill_in_signature(std::string& document, size_t sign_off, size_t sign_len, + const std::string& pkcs12_path, + const std::string& pkcs12_pass) { + size_t tail_off = sign_off + sign_len, tail_len = document.size() - tail_off; + if (pkcs12_path.empty()) + return "undefined path to the signing key"; + + auto pkcs12_fp = fopen(pkcs12_path.c_str(), "r"); + if (!pkcs12_fp) + return pkcs12_path + ": " + strerror(errno); + + // Abandon hope, all ye who enter OpenSSL! Half of it is undocumented. 
+ OpenSSL_add_all_algorithms(); + ERR_load_crypto_strings(); + ERR_clear_error(); + + PKCS12* p12 = nullptr; + EVP_PKEY* private_key = nullptr; + X509* certificate = nullptr; + STACK_OF(X509)* chain = nullptr; + PKCS7* p7 = nullptr; + int len = 0, sign_flags = PKCS7_DETACHED | PKCS7_BINARY | PKCS7_NOSMIMECAP | PKCS7_PARTIAL; + BIO* p7bio = nullptr; + unsigned char* buf = nullptr; + + // OpenSSL error reasons will usually be of more value than any distinction I can come up with + std::string err = "OpenSSL failure"; + + if (!(p12 = d2i_PKCS12_fp(pkcs12_fp, nullptr)) + || !PKCS12_parse(p12, pkcs12_pass.c_str(), &private_key, &certificate, &chain)) { + err = pkcs12_path + ": parse failure"; + goto error; + } + if (!private_key || !certificate) { + err = pkcs12_path + ": must contain a private key and a valid certificate chain"; + goto error; + } + // Prevent useless signatures -- makes pdfsig from poppler happy at least (and NSS by extension) + if (!(X509_get_key_usage(certificate) & (KU_DIGITAL_SIGNATURE | KU_NON_REPUDIATION))) { + err = "the certificate's key usage must include digital signatures or non-repudiation"; + goto error; + } + if (!(X509_get_extended_key_usage(certificate) & (XKU_SMIME | XKU_ANYEKU))) { + err = "the certificate's extended key usage must include S/MIME"; + goto error; + } +#if 0 // This happily ignores XKU_ANYEKU and I want my tiny world to make a tiny bit more sense + if (X509_check_purpose(certificate, X509_PURPOSE_SMIME_SIGN, false /* not a CA certificate */)) { + err = "the certificate can't be used for S/MIME digital signatures"; + goto error; + } +#endif + + // The default digest is SHA1, which is mildly insecure now -- hence using PKCS7_sign_add_signer + if (!(p7 = PKCS7_sign(nullptr, nullptr, nullptr, nullptr, sign_flags)) + || !PKCS7_sign_add_signer(p7, certificate, private_key, EVP_sha256(), sign_flags)) + goto error; + // For RFC 3161, this is roughly how a timestamp token would be attached (see Appendix A): + // 
PKCS7_add_attribute(signer_info, NID_id_smime_aa_timeStampToken, V_ASN1_SEQUENCE, value) + for (int i = 0; i < sk_X509_num(chain); i++) + if (!PKCS7_add_certificate(p7, sk_X509_value(chain, i))) + goto error; + + // Adaptation of the innards of the undocumented PKCS7_final() -- I didn't feel like making + // a copy of the whole document. Hopefully this writes directly into a digest BIO. + if (!(p7bio = PKCS7_dataInit(p7, nullptr)) + || (ssize_t) sign_off != BIO_write(p7bio, document.data(), sign_off) + || (ssize_t) tail_len != BIO_write(p7bio, document.data() + tail_off, tail_len) + || BIO_flush(p7bio) != 1 || !PKCS7_dataFinal(p7, p7bio)) + goto error; + +#if 0 + { + // Debugging: openssl cms -inform PEM -in pdf_signature.pem -noout -cmsout -print + // Context: https://stackoverflow.com/a/29253469 + auto fp = fopen("pdf_signature.pem", "wb"); + assert(PEM_write_PKCS7(fp, p7) && !fclose(fp)); + } +#endif + + if ((len = i2d_PKCS7(p7, &buf)) < 0) + goto error; + if (size_t(len) * 2 > sign_len - 2 /* hexstring quotes */) { + // The obvious solution is to increase the allocation... 
or spend a week reading specifications + // while losing all faith in humanity as a species, and skip the PKCS7 API entirely + err = ssprintf("not enough space reserved for the signature (%zu nibbles vs %zu nibbles)", + sign_len - 2, size_t(len) * 2); + goto error; + } + for (int i = 0; i < len; i++) { + document[sign_off + 2 * i + 1] = "0123456789abcdef"[buf[i] / 16]; + document[sign_off + 2 * i + 2] = "0123456789abcdef"[buf[i] % 16]; + } + err.clear(); + +error: + OPENSSL_free(buf); + BIO_free_all(p7bio); + PKCS7_free(p7); + sk_X509_pop_free(chain, X509_free); + X509_free(certificate); + EVP_PKEY_free(private_key); + PKCS12_free(p12); + + // In any case, clear the error stack (it's a queue, really) to avoid confusion elsewhere + while (auto code = ERR_get_error()) + if (auto reason = ERR_reason_error_string(code)) + err = err + "; " + reason; + + fclose(pkcs12_fp); + return err; +} + +} // anonymous namespace + +// ------------------------------------------------------------------------------------------------- + +// The presumption here is that the document is valid and that it doesn't employ cross-reference +// streams from PDF 1.5, or at least constitutes a hybrid-reference file. The results with +// PDF 2.0 (2017) are currently unknown as the standard costs money. +// +// Carelessly assumes that the version of the original document is at most PDF 1.6. 
+// +// https://www.adobe.com/devnet-docs/acrobatetk/tools/DigSig/Acrobat_DigitalSignatures_in_PDF.pdf +// https://www.adobe.com/content/dam/acom/en/devnet/acrobat/pdfs/pdf_reference_1-7.pdf +// https://www.adobe.com/content/dam/acom/en/devnet/acrobat/pdfs/PPKAppearances.pdf +std::string pdf_simple_sign(std::string& document, + const std::string& pkcs12_path, + const std::string& pkcs12_pass) { + pdf_updater pdf(document); + auto err = pdf.initialize(); + if (!err.empty()) + return err; + + auto root_ref = pdf.trailer.find("Root"); + if (root_ref == pdf.trailer.end() || root_ref->second.type != pdf_object::REFERENCE) + return "trailer does not contain a reference to Root"; + auto root = pdf.get(root_ref->second.n, root_ref->second.generation); + if (root.type != pdf_object::DICT) + return "invalid Root dictionary reference"; + + // 8.7 Digital Signatures - /signature dictionary/ + auto sigdict_n = pdf.allocate(); + size_t byterange_off = 0, byterange_len = 0, sign_off = 0, sign_len = 0; + pdf.update(sigdict_n, [&]{ + // The timestamp is important for Adobe Acrobat Reader DC. The ideal would be to use RFC 3161. + pdf.document.append("<< /Type/Sig /Filter/Adobe.PPKLite /SubFilter/adbe.pkcs7.detached\n" + " /M" + pdf_serialize(pdf_date(time(nullptr))) + " /ByteRange "); + byterange_off = pdf.document.size(); + pdf.document.append((byterange_len = 32 /* fine for a gigabyte */), ' '); + pdf.document.append("\n /Contents <"); + sign_off = pdf.document.size(); + pdf.document.append((sign_len = 8192 /* certificate, digest, encrypted digest, ... 
*/), '0'); + pdf.document.append("> >>"); + + // We actually need to exclude the hexstring quotes from signing + sign_off -= 1; + sign_len += 2; + }); + + // 8.6.3 Field Types - Signature Fields + pdf_object sigfield{pdf_object::DICT}; + sigfield.dict.insert({"FT", {pdf_object::NAME, "Sig"}}); + sigfield.dict.insert({"V", {pdf_object::REFERENCE, sigdict_n, 0}}); + // 8.4.5 Annotations Types - Widget Annotations + // We can merge the Signature Annotation and omit Kids here + sigfield.dict.insert({"Subtype", {pdf_object::NAME, "Widget"}}); + sigfield.dict.insert({"F", {pdf_object::NUMERIC, 2 /* Hidden */}}); + sigfield.dict.insert({"T", {pdf_object::STRING, "Signature1"}}); + sigfield.dict.insert({"Rect", {std::vector{ + {pdf_object::NUMERIC, 0}, + {pdf_object::NUMERIC, 0}, + {pdf_object::NUMERIC, 0}, + {pdf_object::NUMERIC, 0}, + }}}); + + auto sigfield_n = pdf.allocate(); + pdf.update(sigfield_n, [&]{ pdf.document += pdf_serialize(sigfield); }); + + auto pages_ref = root.dict.find("Pages"); + if (pages_ref == root.dict.end() || pages_ref->second.type != pdf_object::REFERENCE) + return "invalid Pages reference"; + auto page = pdf_get_first_page(pdf, pages_ref->second.n, pages_ref->second.generation); + if (page.type != pdf_object::DICT) + return "invalid or unsupported page tree"; + + // XXX assuming this won't be an indirectly referenced array + auto& annots = page.dict["Annots"]; + if (annots.type != pdf_object::ARRAY) + annots = {pdf_object::ARRAY}; + annots.array.emplace_back(pdf_object::REFERENCE, sigfield_n, 0); + pdf.update(page.n, [&]{ pdf.document += pdf_serialize(page); }); + + // 8.6.1 Interactive Form Dictionary + // XXX assuming there are no forms already, overwriting everything + root.dict["AcroForm"] = {std::map{ + {"Fields", {std::vector{ + {pdf_object::REFERENCE, sigfield_n, 0} + }}}, + {"SigFlags", {pdf_object::NUMERIC, 3 /* SignaturesExist | AppendOnly */}} + }}; + + // Upgrade the document version for SHA-256 etc. 
+ // XXX assuming that it's not newer than 1.6 already -- while Cairo can't currently use a newer + // version than 1.5, it's not a bad idea to use cairo_pdf_surface_restrict_to_version() + root.dict["Version"] = {pdf_object::NAME, "1.6"}; + pdf.update(root_ref->second.n, [&]{ pdf.document += pdf_serialize(root); }); + pdf.flush_updates(); + + // Now that we know the length of everything, store byte ranges of what we're about to sign, + // which must be everything but the resulting signature itself + size_t tail_off = sign_off + sign_len, tail_len = pdf.document.size() - tail_off; + auto ranges = ssprintf("[0 %zu %zu %zu]", sign_off, tail_off, tail_len); + if (ranges.length() > byterange_len) + return "not enough space reserved for /ByteRange"; + pdf.document.replace(byterange_off, std::min(ranges.length(), byterange_len), ranges); + return pdf_fill_in_signature(pdf.document, sign_off, sign_len, pkcs12_path, pkcs12_pass); +} -- cgit v1.2.3