From a2e56611331c62aea48e96ee092d450fb6de3b1f Mon Sep 17 00:00:00 2001
From: Přemysl Janouch
Date: Sun, 12 Feb 2017 20:22:07 +0100
Subject: Add a partial decoder for PDF
---
plugins/pdf.lua | 514 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 514 insertions(+)
create mode 100644 plugins/pdf.lua
diff --git a/plugins/pdf.lua b/plugins/pdf.lua
new file mode 100644
index 0000000..91f8b6b
--- /dev/null
+++ b/plugins/pdf.lua
@@ -0,0 +1,514 @@
+--
+-- pdf.lua: Portable Document Format
+--
+-- Based on PDF Reference, version 1.7
+-- In practice almost useless, I just wanted to learn about the file format.
+-- FIXME: it's also not very robust and doesn't support all documents.
+--
+-- Copyright (c) 2017, Přemysl Janouch
+--
+-- Permission to use, copy, modify, and/or distribute this software for any
+-- purpose with or without fee is hereby granted, provided that the above
+-- copyright notice and this permission notice appear in all copies.
+--
+-- THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+-- WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+-- MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+-- SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+-- WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+-- OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+-- CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+--
+
+local oct_alphabet = "01234567"
+local dec_alphabet = "0123456789"
+local hex_alphabet = "0123456789abcdefABCDEF"
+local whitespace = "\x00\t\n\f\r "
+local delimiters = "()<>[]{}/%"
+
+local strchr = function (s, ch) return s:find (ch, 1, true) end
+
+-- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+local Lexer = {}
+Lexer.__index = Lexer
+
+function Lexer:new (c)
+ return setmetatable ({ c = c }, self)
+end
+
+-- TODO: make it possible to follow a string, we should probably be able to
+-- supply callbacks to the constructor, or a wrapper object;
+-- this will be used for object streams
+function Lexer:getc ()
+ if self.c.eof then return nil end
+ return self.c:read (1)
+end
+
+function Lexer:ungetc ()
+ self.c.position = self.c.position - 1
+end
+
+function Lexer:token (type, value, description)
+ if description then
+ self.c (self.start, self.c.position - 1):mark (description)
+ end
+ return { type=type, value=value,
+ start=self.start, stop=self.c.position - 1 }
+end
+
+-- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+function Lexer:eat_newline (ch)
+ if ch == '\r' then
+ ch = self:getc ()
+ if ch and ch ~= '\n' then self:ungetc () end
+ return true
+ elseif ch == '\n' then
+ return true
+ end
+end
+
+function Lexer:string ()
+ local value, level, ch = "", 1
+::continue::
+ while true do
+ ch = self:getc ()
+ if not ch then return nil
+ elseif ch == '\\' then
+ ch = self:getc ()
+ if not ch then return nil
+ elseif ch == 'n' then ch = '\n'
+ elseif ch == 'r' then ch = '\r'
+ elseif ch == 't' then ch = '\t'
+ elseif ch == 'b' then ch = '\b'
+ elseif ch == 'f' then ch = '\f'
+ elseif self:eat_newline (ch) then goto continue
+ elseif strchr (oct_alphabet, ch) then
+ local buf, i = ch
+ for i = 1, 2 do
+ ch = self:getc ()
+ if not ch then return nil
+ elseif not strchr (oct_alphabet, ch) then
+ self:ungetc ()
+ break
+ end
+ buf = buf .. ch
+ end
+ ch = string.char (tonumber (buf, 8))
+ end
+ elseif self:eat_newline (ch) then
+ ch = '\n'
+ elseif ch == '(' then
+ level = level + 1
+ elseif ch == ')' then
+ level = level - 1
+ if level == 0 then break end
+ end
+ value = value .. ch
+ end
+ return self:token ('string', value, "string literal")
+end
+
+function Lexer:string_hex ()
+ local value, buf, ch = ""
+ while true do
+ ch = self:getc ()
+ if not ch then return nil
+ elseif ch == '>' then
+ break
+ elseif not strchr (hex_alphabet, ch) then
+ return nil
+ elseif buf then
+ value = value .. string.char (tonumber (buf .. ch, 16))
+ buf = nil
+ else
+ buf = ch
+ end
+ end
+ if buf then value = value .. string.char (tonumber (buf .. '0', 16)) end
+ return self:token ('string', value, "string hex")
+end
+
+function Lexer:name ()
+ local value, ch = ""
+ while true do
+ ch = self:getc ()
+ if not ch then break
+ elseif ch == '#' then
+ local ch1, ch2 = self:getc (), self:getc ()
+ if not ch1 or not ch2
+ or not strchr (hex_alphabet, ch1)
+ or not strchr (hex_alphabet, ch2) then
+ return nil
+ end
+ ch = string.char (tonumber (ch1 .. ch2, 16))
+ elseif strchr (whitespace .. delimiters, ch) then
+ self:ungetc ()
+ break
+ end
+ value = value .. ch
+ end
+ if value == "" then return nil end
+ return self:token ('name', value, "name")
+end
+
+function Lexer:comment ()
+ local value, ch = ""
+ while true do
+ ch = self:getc ()
+ if not ch then break
+ elseif ch == '\r' or ch == '\n' then
+ self:ungetc ()
+ break
+ end
+ value = value .. ch
+ end
+ return self:token ('comment', value, "comment")
+end
+
+function Lexer:number (ch)
+ local value, real, digits = "", false, false
+ if ch == '-' then
+ value = ch
+ ch = self:getc ()
+ end
+ while ch do
+ if strchr (dec_alphabet, ch) then
+ digits = true
+ elseif ch == '.' and not real then
+ real = true
+ else
+ self:ungetc ()
+ break
+ end
+ value = value .. ch
+ ch = self:getc ()
+ end
+ -- XXX: perhaps we should instead let it be interpreted as a keyword
+ if not digits then return nil end
+ -- XXX: maybe we should differentiate between integers and real values
+ return self:token ('number', tonumber (value, 10), "number")
+end
+
+function Lexer:get_token ()
+::restart::
+ self.start = self.c.position
+ local ch = self:getc ()
+
+ if not ch then return nil
+ elseif ch == '(' then return self:string ()
+ elseif ch == '[' then return self:token ('begin_array')
+ elseif ch == ']' then return self:token ('end_array')
+ elseif ch == '<' then
+ -- It seems they ran out of paired characters, yet {} is unused
+ ch = self:getc ()
+ if not ch then return nil
+ elseif ch == '<' then return self:token ('begin_dictionary')
+ else
+ self:ungetc ()
+ return self:string_hex ()
+ end
+ elseif ch == '>' then
+ ch = self:getc ()
+ if not ch then return nil
+ elseif ch == '>' then return self:token ('end_dictionary')
+ else return nil end
+ elseif ch == '/' then return self:name ()
+ elseif ch == '%' then return self:comment ()
+ elseif strchr ("-0123456789.", ch) then return self:number (ch)
+ elseif self:eat_newline (ch) then return self:token ('newline')
+ elseif strchr (whitespace, ch) then goto restart
+ else
+ -- {} end up being keywords but we should probably error out
+ local value = ch
+ while true do
+ ch = self:getc ()
+ if not ch then break
+ elseif strchr (whitespace .. delimiters, ch) then
+ self:ungetc ()
+ break
+ end
+ value = value .. ch
+ end
+ if value == "null" then
+ return self:token ('null', nil, "null")
+ elseif value == "true" then
+ return self:token ('boolean', true, "boolean")
+ elseif value == "false" then
+ return self:token ('boolean', false, "boolean")
+ end
+ return self:token ('keyword', value, "keyword")
+ end
+end
+
+-- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+local is_value = function (t)
+ return t == 'null' or t == 'boolean' or t == 'name'
+ or t == 'number' or t == 'string'
+end
+
+-- Retrieve the next thing in the stream, possibly popping values from the stack
+local function get_object (lex, stack, deref)
+::restart::
+ local token = lex:get_token ()
+ if token == nil then return nil
+ elseif token.type == 'begin_array' then
+ local array = {}
+ repeat
+ local object = get_object (lex, array, deref)
+ if not object then error ("array doesn't end") end
+ table.insert (array, object)
+ until object.type == 'end_array'
+ local stop = table.remove (array)
+ return { type='array', value=array, start=token.start, stop=stop.stop }
+ elseif token.type == 'begin_dictionary' then
+ local dict = {}
+ repeat
+ local object = get_object (lex, dict, deref)
+ if not object then error ("dictionary doesn't end") end
+ table.insert (dict, object)
+ until object.type == 'end_dictionary'
+ local stop, kv = table.remove (dict), {}
+ if #dict % 2 == 1 then error ("unbalanced dictionary") end
+ for i = 1, #dict, 2 do
+ local k, v = dict[i], dict[i + 1]
+ if k.type ~= 'name' then error ("invalid dictionary key type") end
+ kv[k.value] = v
+ end
+ return { type='dict', value=kv, start=token.start, stop=stop.stop }
+ elseif token.type == 'keyword' and token.value == 'stream' then
+ if #stack < 1 then error ("no dictionary for stream") end
+ local d = table.remove (stack)
+ if d.type ~= 'dict' then error ("stream not preceded by dictionary") end
+
+ if not lex:eat_newline (lex:getc ()) then
+ error ("'stream' not followed by newline")
+ end
+
+ local len = deref (d.value['Length'])
+ if not len or len.type ~= 'number' then
+ error ("missing stream length")
+ end
+
+ local data, stop = lex.c:read (len.value), get_object (lex, {}, deref)
+ if not stop or stop.type ~= 'keyword' or stop.value ~= 'endstream' then
+ error ("missing 'endstream'")
+ end
+
+ return { type='stream', value={ dict=dict, data=data },
+ start=token.start, stop=stop.stop }
+ elseif token.type == 'keyword' and token.value == 'obj' then
+ if #stack < 2 then error ("missing object ID pair") end
+ local gen, n = table.remove (stack), table.remove (stack)
+ if n.type ~= 'number' or gen.type ~= 'number' then
+ error ("object ID pair must be two integers")
+ end
+
+ local tmp = {}
+ repeat
+ local object = get_object (lex, tmp, deref)
+ if not object then error ("object doesn't end") end
+ table.insert (tmp, object)
+ until object.type == 'keyword' and object.value == 'endobj'
+ local stop = table.remove (tmp)
+
+ if #tmp ~= 1 then error ("objects must contain exactly one value") end
+ local value = table.remove (tmp)
+ return { type='object', n=n.value, gen=gen.value, value=value,
+ start=n.start, stop=stop.stop }
+ elseif token.type == 'keyword' and token.value == 'R' then
+ if #stack < 2 then error ("missing reference ID pair") end
+ local gen, n = table.remove (stack), table.remove (stack)
+ if n.type ~= 'number' or gen.type ~= 'number' then
+ error ("reference ID pair must be two integers")
+ end
+ return { type='reference', value={ n.value, gen.value } }
+ elseif token.type == 'newline' or token.type == 'comment' then
+ -- These are not objects and our callers aren't interested
+ goto restart
+ else
+ return token
+ end
+end
+
+-- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+local detect = function (c)
+ return c:read (5) == "%PDF-"
+end
+
+local decode_xref_subsection = function (lex, start, count, result)
+ if not lex:eat_newline (lex:getc ()) then
+ error ("xref subsection must start on a new line")
+ end
+ for i = 0, count - 1 do
+ local entry = lex.c:read (20)
+ local off, gen, typ = entry:match
+ ("^(%d%d%d%d%d%d%d%d%d%d) (%d%d%d%d%d) ([fn])[\r ][\r\n]$")
+ if not off then error ("invalid xref entry") end
+
+ -- Translated to the extended XRefStm format
+ result[start + i] = {
+ t = typ == 'n' and 1 or 0,
+ o = math.tointeger (off),
+ g = math.tointeger (gen),
+ }
+ end
+end
+
+-- A deref that can't actually resolve anything, for early stages of processing
+local deref_nil = function (x)
+ if not x or x.type == 'reference' then return nil end
+ return x
+end
+
+-- Creates a table with named indexes from the trailer and items indexed by
+-- object numbers containing { XRefStm fields... }
+local decode_xref_normal = function (lex)
+ local result = {}
+ while true do
+ local a = get_object (lex, {}, deref_nil)
+ local b = get_object (lex, {}, deref_nil)
+ if not a or not b then
+ error ("xref section ends too soon")
+ elseif a.type == 'number' and b.type == 'number' then
+ decode_xref_subsection (lex, a.value, b.value, result)
+ elseif a.type == 'keyword' and a.value == 'trailer'
+ and b.type == 'dict' then
+ for k, v in pairs (b.value) do
+ result[k] = v
+ end
+ return result
+ else
+ error ("invalid xref contents")
+ end
+ end
+end
+
+local decode_xref_stream = function (lex, stream)
+ if stream.dict['Type'] ~= 'XRef' then error ("expected an XRef stream") end
+
+ -- TODO: decode a cross-reference stream from stream.{dict,data};
+ -- the compression filter, if present, is always going to be FlateDecode,
+ -- which we'll have to import or implement
+ -- TODO: take care to also cache cross-reference streams by offset when
+ -- they're actually implemented
+ error ("cross-reference streams not implemented")
+end
+
+local decode_xref = function (c)
+ local lex, stack = Lexer:new (c), {}
+ while true do
+ local object = get_object (lex, stack, deref_nil)
+ if object == nil then
+ return nil
+ elseif object.type == 'keyword' and object.value == 'xref' then
+ return decode_xref_normal (lex)
+ elseif object.type == 'stream' then
+ return decode_xref_stream (lex, object)
+ end
+ table.insert (stack, object)
+ end
+end
+
+-- Return all objects found in xref tables as a table indexed by object number,
+-- pointing to a list of generations and overwrites, from newest to oldest.
+local read_all_xrefs = function (c, start_offset)
+ local loaded, result, offset = {}, {}, start_offset
+ while true do
+ -- Prevent an infinite loop with malicious files
+ if loaded[offset] then error ("cyclic cross-reference sections") end
+
+ local xref = decode_xref (c (1 + offset, #c))
+ if not xref then break end
+ for k, v in pairs (xref) do
+ if type (k) == 'number' then
+ if not result[k] then result[k] = {} end
+ table.insert (result[k], v)
+ end
+ end
+ loaded[offset] = true
+
+ -- TODO: when 'XRefStm' is found, it has precedence over this 'Prev',
+ -- and also has its own 'Prev' chain
+ local prev = xref['Prev']
+ if not prev or prev.type ~= 'number' then break end
+ offset = prev.value
+ end
+ return result
+end
+
+local decode = function (c)
+ assert (c.position == 1)
+ if not detect (c ()) then error ("not a PDF file") end
+
+ -- Look for a pointer to the xref section within the last kibibyte
+ -- NOTE: we could probably look backwards for the "trailer" line from here
+ -- but we don't know how long the trailer is and we don't want to regex
+ -- scan the whole file (ignoring that dictionary contents might, possibly
+ -- legally, include the word "trailer" at the beginning of a new line)
+ local tail_len = math.min (1024, #c)
+ local tail = c (#c - tail_len, #c):read (tail_len)
+ local xref_loc = tail:match (".*%sstartxref%s+(%d+)%s+%%%%EOF")
+ if not xref_loc then error ("cannot find trailer") end
+
+ -- We need to decode xref sections in order to be able to resolve indirect
+ -- references to stream lengths
+ local xref = read_all_xrefs (c, math.tointeger (xref_loc))
+ local deref
+
+ -- We have to make sure that we don't decode objects twice as that would
+ -- duplicate all marks, so we simply cache all objects by offset.
+ -- This may be quite the memory load but it seems to be the best thing.
+ local cache = {}
+ local read_object = function (offset)
+ if cache[offset] then return cache[offset] end
+
+ local lex, stack = Lexer:new (c (1 + offset, #c)), {}
+ repeat
+ local object = get_object (lex, stack, deref)
+ if not object then error ("object doesn't end") end
+ table.insert (stack, object)
+ until object.type == 'object'
+
+ local object = table.remove (stack)
+ cache[offset] = object
+ c (offset + object.start, offset + object.stop)
+ :mark ("object " .. object.n .. " " .. object.gen)
+ return object
+ end
+
+ -- Resolve an object -- if it's a reference, look it up in "xref",
+ -- otherwise just return the object as it was passed
+ deref = function (x)
+ if not x or x.type ~= 'reference' then return x end
+ local n, gen = x.value[1], x.value[2]
+
+ -- TODO: we should also ignore object numbers >= trailer /Size
+ local bin = xref[n]
+ if not bin then return nil end
+ local entry = bin[1]
+ if not entry or entry.t ~= 1 or entry.g ~= gen then return nil end
+
+ local object = read_object (entry.o)
+ if not object or object.n ~= n or object.gen ~= gen then return nil end
+ return object.value
+ end
+
+ -- Read all objects accessible from the current version of the document
+ for n, bin in pairs (xref) do
+ local entry = bin[1]
+ if entry and entry.t == 1 then
+ read_object (entry.o)
+ end
+ end
+
+ -- TODO: we should actually try to decode even unreferenced objects.
+ -- The problem with decoding content from previous versions of the
+ -- document is that we must ignore xref updates from newer versions.
+ -- The version information needs to be propagated everywhere.
+end
+
+hex.register { type="pdf", detect=detect, decode=decode }
--
cgit v1.2.3-70-g09d2