From 9c77aac640e11ca5df144e3c943f48dfeab8485a Mon Sep 17 00:00:00 2001
From: Přemysl Eric Janouch
Date: Mon, 29 Nov 2021 22:23:41 +0100
Subject: Add a tool to extract information from JPEG images
---
tools/.gitignore | 1 +
tools/Makefile | 2 +-
tools/jpeginfo.c | 463 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 465 insertions(+), 1 deletion(-)
create mode 100644 tools/jpeginfo.c
diff --git a/tools/.gitignore b/tools/.gitignore
index 8463b6e..5888ef6 100644
--- a/tools/.gitignore
+++ b/tools/.gitignore
@@ -1 +1,2 @@
/pnginfo
+/jpeginfo
diff --git a/tools/Makefile b/tools/Makefile
index c7fa1a5..a60c17d 100644
--- a/tools/Makefile
+++ b/tools/Makefile
@@ -4,7 +4,7 @@ CFLAGS = -g -O2 -Wall -Wextra `pkg-config --cflags $(deps)`
LDLIBS = -ljq `pkg-config --libs $(deps)`
deps = libpng
-targets = pnginfo
+targets = pnginfo jpeginfo
all: $(targets)
clean:
diff --git a/tools/jpeginfo.c b/tools/jpeginfo.c
new file mode 100644
index 0000000..fcfda0e
--- /dev/null
+++ b/tools/jpeginfo.c
@@ -0,0 +1,463 @@
+//
+// jpeginfo.c: acquire information about JPEG files in JSON format
+//
+// Copyright (c) 2021, Přemysl Eric Janouch
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+//
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+// --- JPEG --------------------------------------------------------------------
+// See: https://www.w3.org/Graphics/JPEG/itu-t81.pdf
+
+enum {
+ TEM = 0x01,
+ SOF0 = 0xC0, SOF1, SOF2, SOF3,
+ DHT = 0xC4,
+ SOF5, SOF6, SOF7,
+ JPG = 0xC8,
+ SOF9, SOF10, SOF11,
+ DAC = 0xCC,
+ SOF13, SOF14, SOF15,
+
+ RST0 = 0xD0, RST1, RST2, RST3, RST4, RST5, RST6, RST7,
+
+ SOI = 0xD8,
+ EOI = 0xD9,
+ SOS = 0xDA,
+ DQT = 0xDB,
+ DNL = 0xDC,
+ DRI = 0xDD,
+ DHP = 0xDE,
+ EXP = 0xDF,
+
+ APP0 = 0xE0, APP1, APP2, APP3, APP4, APP5, APP6, APP7,
+ APP8, APP9, APP10, APP11, APP12, APP13, APP14, APP15,
+
+ JPG0 = 0xF0, JPG1, JPG2, JPG3, JPG4, JPG5, JPG6, JPG7,
+ JPG8, JPG9, JPG10, JPG11, JPG12, JPG13,
+
+ COM = 0xFE
+};
+
+// The rest is "RES (Reserved)", except for 0xFF (filler) and 0x00 (invalid).
+static const char *marker_ids[0xFF] = {
+ [TEM] = "TEM",
+ [SOF0] = "SOF0", [SOF1] = "SOF1", [SOF2] = "SOF2", [SOF3] = "SOF3",
+ [DHT] = "DHT", [SOF5] = "SOF5", [SOF6] = "SOF6", [SOF7] = "SOF7",
+ [JPG] = "JPG", [SOF9] = "SOF9", [SOF10] = "SOF10", [SOF11] = "SOF11",
+ [DAC] = "DAC", [SOF13] = "SOF13", [SOF14] = "SOF14", [SOF15] = "SOF15",
+ [RST0] = "RST0", [RST1] = "RST1", [RST2] = "RST2", [RST3] = "RST3",
+ [RST4] = "RST4", [RST5] = "RST5", [RST6] = "RST6", [RST7] = "RST7",
+ [SOI] = "SOI", [EOI] = "EOI", [SOS] = "SOS", [DQT] = "DQT",
+ [DNL] = "DNL", [DRI] = "DRI", [DHP] = "DHP", [EXP] = "EXP",
+ [APP0] = "APP0", [APP1] = "APP1", [APP2] = "APP2", [APP3] = "APP3",
+ [APP4] = "APP4", [APP5] = "APP5", [APP6] = "APP6", [APP7] = "APP7",
+ [APP8] = "APP8", [APP9] = "APP9", [APP10] = "APP10", [APP11] = "APP11",
+ [APP12] = "APP12", [APP13] = "APP13", [APP14] = "APP14", [APP15] = "APP15",
+ [JPG0] = "JPG0", [JPG1] = "JPG1", [JPG2] = "JPG2", [JPG3] = "JPG3",
+ [JPG4] = "JPG4", [JPG5] = "JPG5", [JPG6] = "JPG6", [JPG7] = "JPG7",
+ [JPG8] = "JPG8", [JPG9] = "JPG9", [JPG10] = "JPG10", [JPG11] = "JPG11",
+ [JPG12] = "JPG12", [JPG13] = "JPG13", [COM] = "COM"
+};
+
+/*
+// The rest is "RES (Reserved)", except for 0xFF (filler) and 0x00 (invalid).
+static const char *marker_descriptions[0xFF] = {
+ [TEM] = "For temporary private use in arithmetic coding",
+ [SOF0] = "Baseline DCT",
+ [SOF1] = "Extended sequential DCT",
+ [SOF2] = "Progressive DCT",
+ [SOF3] = "Lossless (sequential)",
+ [DHT] = "Define Huffman table(s)",
+ [SOF5] = "Differential sequential DCT",
+ [SOF6] = "Differential progressive DCT",
+ [SOF7] = "Differential lossless (sequential)",
+ [JPG] = "Reserved for JPEG extensions",
+ [SOF9] = "Extended sequential DCT",
+ [SOF10] = "Progressive DCT",
+ [SOF11] = "Lossless (sequential)",
+ [DAC] = "Define arithmetic coding conditioning(s)",
+ [SOF13] = "Differential sequential DCT",
+ [SOF14] = "Differential progressive DCT",
+ [SOF15] = "Differential lossless (sequential)",
+ [RST0] = "Restart with module 8 count 0",
+ [RST1] = "Restart with module 8 count 1",
+ [RST2] = "Restart with module 8 count 2",
+ [RST3] = "Restart with module 8 count 3",
+ [RST4] = "Restart with module 8 count 4",
+ [RST5] = "Restart with module 8 count 5",
+ [RST6] = "Restart with module 8 count 6",
+ [RST7] = "Restart with module 8 count 7",
+ [SOI] = "Start of image",
+ [EOI] = "End of image",
+ [SOS] = "Start of scan",
+ [DQT] = "Define quantization table(s)",
+ [DNL] = "Define number of lines",
+ [DRI] = "Define restart interval",
+ [DHP] = "Define hierarchical progression",
+ [EXP] = "Expand reference component(s)",
+ [APP0] = "Reserved for application segments, 0",
+ [APP1] = "Reserved for application segments, 1",
+ [APP2] = "Reserved for application segments, 2",
+ [APP3] = "Reserved for application segments, 3",
+ [APP4] = "Reserved for application segments, 4",
+ [APP5] = "Reserved for application segments, 5",
+ [APP6] = "Reserved for application segments, 6",
+ [APP7] = "Reserved for application segments, 7",
+ [APP8] = "Reserved for application segments, 8",
+ [APP9] = "Reserved for application segments, 9",
+ [APP10] = "Reserved for application segments, 10",
+ [APP11] = "Reserved for application segments, 11",
+ [APP12] = "Reserved for application segments, 12",
+ [APP13] = "Reserved for application segments, 13",
+ [APP14] = "Reserved for application segments, 14",
+ [APP15] = "Reserved for application segments, 15",
+ [JPG0] = "Reserved for JPEG extensions, 0",
+ [JPG1] = "Reserved for JPEG extensions, 1",
+ [JPG2] = "Reserved for JPEG extensions, 2",
+ [JPG3] = "Reserved for JPEG extensions, 3",
+ [JPG4] = "Reserved for JPEG extensions, 4",
+ [JPG5] = "Reserved for JPEG extensions, 5",
+ [JPG6] = "Reserved for JPEG extensions, 6",
+ [JPG7] = "Reserved for JPEG extensions, 7",
+ [JPG8] = "Reserved for JPEG extensions, 8",
+ [JPG9] = "Reserved for JPEG extensions, 9",
+ [JPG10] = "Reserved for JPEG extensions, 10",
+ [JPG11] = "Reserved for JPEG extensions, 11",
+ [JPG12] = "Reserved for JPEG extensions, 12",
+ [JPG13] = "Reserved for JPEG extensions, 13",
+ [COM] = "Comment",
+};
+*/
+
+// --- Analysis ----------------------------------------------------------------
+// Because the JPEG file format is simple, just do it manually.
+
+static jv
+add_to_subarray(jv o, const char *key, jv value)
+{
+ // Invalid values are not allocated, and we use up any valid one.
+ // Beware that jv_get() returns jv_null() rather than jv_invalid().
+ jv a = jv_object_get(jv_copy(o), jv_string(key));
+ return jv_set(o, jv_string(key),
+ jv_is_valid(jv_copy(a)) ? jv_array_append(a, value) : JV_ARRAY(value));
+}
+
+static jv
+add_warning(jv o, const char *message)
+{
+ return add_to_subarray(o, "warnings", jv_string(message));
+}
+
+static jv
+add_error(jv o, const char *message)
+{
+ return jv_object_set(o, jv_string("error"), jv_string(message));
+}
+
+struct data {
+ bool ended;
+ char *exif, *icc;
+ size_t exif_len, icc_len;
+ int icc_sequence, icc_done;
+};
+
+static const uint8_t *
+parse_marker(uint8_t marker, const uint8_t *p, const uint8_t *end,
+ struct data *data, jv *o)
+{
+ // Suspected: MJPEG? Undetected format recursion, e.g., thumbnails?
+ // Found: Random metadata! Multi-Picture Format!
+ if ((data->ended = marker == EOI) && p != end)
+ *o = add_warning(*o, "trailing data");
+
+ // These markers stand alone, not starting a marker segment.
+ switch (marker) {
+ case RST0:
+ case RST1:
+ case RST2:
+ case RST3:
+ case RST4:
+ case RST5:
+ case RST6:
+ case RST7:
+ *o = add_warning(*o, "unexpected restart marker");
+ // Fall-through
+ case SOI:
+ case EOI:
+ case TEM:
+ return p;
+ }
+
+ uint16_t length = p[0] << 8 | p[1];
+ const uint8_t *payload = p + 2;
+ if ((p += length) > end) {
+ *o = add_error(*o, "runaway marker segment");
+ return NULL;
+ }
+
+ switch (marker) {
+ case SOF0:
+ case SOF1:
+ case SOF2:
+ case SOF3:
+ case SOF5:
+ case SOF6:
+ case SOF7:
+ case SOF9:
+ case SOF10:
+ case SOF11:
+ case SOF13:
+ case SOF14:
+ case SOF15:
+ case DHP: // B.2.2 and B.3.2.
+ // As per B.2.5, Y can be zero, then there needs to be a DNL segment.
+ *o = add_to_subarray(*o, "info", JV_OBJECT(
+ jv_string("bits"), jv_number(payload[0]),
+ jv_string("height"), jv_number(payload[1] << 8 | payload[2]),
+ jv_string("width"), jv_number(payload[3] << 8 | payload[4]),
+ jv_string("components"), jv_number(payload[5])
+ ));
+ return p;
+ }
+
+ // See B.1.1.5, we can brute-force our way through the entropy-coded data.
+ if (marker == SOS) {
+ while (p + 2 <= end && (p[0] != 0xFF || p[1] < 0xC0 || p[1] > 0xFE ||
+ (p[1] >= RST0 && p[1] <= RST7)))
+ p++;
+ return p;
+ }
+
+ // "The interpretation is left to the application."
+ if (marker == COM) {
+ int superascii = 0;
+ char *buf = calloc(3, p - payload), *bufp = buf;
+ for (const uint8_t *q = payload; q < p; q++) {
+ if (*q < 128) {
+ *bufp++ = *q;
+ } else {
+ superascii++;
+ *bufp++ = 0xC0 | (*q >> 6);
+ *bufp++ = 0x80 | (*q & 0x3F);
+ }
+ }
+ *bufp++ = 0;
+ *o = add_to_subarray(*o, "comments", jv_string(buf));
+ free(buf);
+
+ if (superascii)
+ *o = add_warning(*o, "super-ASCII comments");
+ }
+
+ // These mostly contain an ASCII string header, following JPEG FIF:
+ //
+ // "Application-specific APP0 marker segments are identified
+ // by a zero terminated string which identifies the application
+ // (not 'JFIF' or 'JFXX')."
+ if (marker >= APP0 && marker <= APP15) {
+ const uint8_t *nul = memchr(payload, 0, p - payload);
+ int unprintable = !nul;
+ if (nul) {
+ for (const uint8_t *q = payload; q < nul; q++)
+ unprintable += *q < 32 || *q >= 127;
+ }
+ *o = add_to_subarray(*o, "apps",
+ unprintable ? jv_null() : jv_string((const char *) payload));
+ }
+
+ // CIPA DC-007 (Multi-Picture Format)
+ // http://fileformats.archiveteam.org/wiki/Multi-Picture_Format
+ // TODO(p): Handle by properly skipping trailing data (use MPF offsets).
+
+ // CIPA DC-006 (Stereo Still Image Format for Digital Cameras)
+ // http://fileformats.archiveteam.org/wiki/Multi-Picture_Format
+ // TODO(p): Handle by properly skipping trailing data (use Stim offsets).
+
+ // https://www.w3.org/Graphics/JPEG/jfif3.pdf
+ if (marker == APP0 && p - payload >= 14 && !memcmp(payload, "JFIF\0", 5)) {
+ payload += 5;
+
+ jv units = jv_number(payload[2]);
+ switch (payload[2]) {
+ break; case 0: units = jv_null();
+ break; case 1: units = jv_string("DPI");
+ break; case 2: units = jv_string("dots per cm");
+ }
+
+ // The rest is picture data.
+ *o = add_to_subarray(*o, "JFIF", JV_OBJECT(
+ jv_string("version"), jv_number(payload[0] * 100 + payload[1]),
+ jv_string("units"), units,
+ jv_string("density-x"), jv_number(payload[3] << 8 | payload[4]),
+ jv_string("density-y"), jv_number(payload[5] << 8 | payload[6]),
+ jv_string("thumbnail-w"), jv_number(payload[7]),
+ jv_string("thumbnail-h"), jv_number(payload[8])
+ ));
+ }
+
+ // https://www.w3.org/Graphics/JPEG/jfif3.pdf
+ if (marker == APP0 && p - payload >= 6 && !memcmp(payload, "JFXX\0", 5)) {
+ payload += 5;
+
+ jv extension = jv_number(payload[0]);
+ switch (payload[0]) {
+ break; case 0x10: extension = jv_string("JPEG thumbnail");
+ break; case 0x11: extension = jv_string("Paletted thumbnail");
+ break; case 0x13: extension = jv_string("RGB thumbnail");
+ }
+
+ // The rest is picture data.
+ *o = add_to_subarray(*o, "JFXX",
+ JV_OBJECT(jv_string("extension"), extension));
+ }
+
+ // https://www.cipa.jp/std/documents/e/DC-008-2012_E.pdf 4.7.2
+ if (marker == APP1 && p - payload >= 6 && !memcmp(payload, "Exif\0", 5)) {
+ payload += 6;
+
+ if (payload[-1] != 0)
+ *o = add_warning(*o, "weirdly padded Exif header");
+
+ if (data->exif) {
+ *o = add_warning(*o, "multiple Exifs");
+ } else {
+ data->exif = realloc(data->exif, (data->exif_len = p - payload));
+ memcpy(data->exif, payload, data->exif_len);
+ }
+ }
+
+ // https://www.color.org/specification/ICC1v43_2010-12.pdf B.4
+ if (marker == APP2 && p - payload >= 14 &&
+ !memcmp(payload, "ICC_PROFILE\0", 12) && !data->icc_done &&
+ payload[12] == ++data->icc_sequence && payload[13] >= payload[12]) {
+ payload += 14;
+
+ size_t icc_longer = data->icc_len + (p - payload);
+ data->icc = realloc(data->icc, icc_longer);
+ memcpy(data->icc + data->icc_len, payload, p - payload);
+ data->icc_len = icc_longer;
+
+ data->icc_done = payload[-1] == data->icc_sequence;
+ }
+ return p;
+}
+
+static jv
+parse_jpeg(jv o, const uint8_t *p, size_t len)
+{
+ struct data data = {};
+ const uint8_t *end = p + len;
+ jv markers = jv_array();
+ while (p) {
+ // This is an expectable condition, use a simple warning.
+ if (p + 2 > end) {
+ if (!data.ended)
+ o = add_warning(o, "unexpected EOF");
+ break;
+ }
+ if (*p++ != 0xFF || *p == 0) {
+ o = add_error(o, "no marker found where one was expected");
+ break;
+ }
+
+ // Markers may be preceded by fill bytes.
+ if (*p == 0xFF) {
+ o = jv_object_set(o, jv_string("fillers"), jv_bool(true));
+ continue;
+ }
+
+ uint8_t marker = *p++;
+ markers = jv_array_append(markers,
+ jv_string(marker_ids[marker] ? marker_ids[marker] : "RES"));
+ p = parse_marker(marker, p, end, &data, &o);
+ }
+
+ if (data.exif) {
+ // TODO(p): Decode.
+ free(data.exif);
+ }
+
+ if (data.icc) {
+ if (data.icc_done) {
+ // TODO(p): Decode.
+ } else {
+ o = add_warning(o, "bad ICC profile sequence");
+ }
+ free(data.icc);
+ }
+
+ return jv_set(o, jv_string("markers"), markers);
+}
+
+static jv
+do_file(const char *filename, jv o)
+{
+ const char *err = NULL;
+ FILE *fp = fopen(filename, "rb");
+ if (!fp) {
+ err = strerror(errno);
+ goto error;
+ }
+
+ uint8_t *data = NULL, buf[256 << 10];
+ size_t n, len = 0;
+ while ((n = fread(buf, sizeof *buf, sizeof buf / sizeof *buf, fp))) {
+ data = realloc(data, len + n);
+ memcpy(data + len, buf, n);
+ len += n;
+ }
+ if (ferror(fp)) {
+ err = strerror(errno);
+ goto error_read;
+ }
+
+#if 0
+ // Not sure if I want to ensure their existence...
+ o = jv_object_set(o, jv_string("info"), jv_array());
+ o = jv_object_set(o, jv_string("warnings"), jv_array());
+#endif
+
+ o = parse_jpeg(o, data, len);
+error_read:
+ fclose(fp);
+ free(data);
+error:
+ if (err)
+ o = add_error(o, err);
+ return o;
+}
+
+int
+main(int argc, char *argv[])
+{
+ // XXX: Can't use `xargs -P0`, there's a risk of non-atomic writes.
+ // Usage: find . -iname *.png -print0 | xargs -0 ./pnginfo
+ for (int i = 1; i < argc; i++) {
+ const char *filename = argv[i];
+
+ jv o = jv_object();
+ o = jv_object_set(o, jv_string("filename"), jv_string(filename));
+ o = do_file(filename, o);
+ jv_dumpf(o, stdout, 0 /* Might consider JV_PRINT_SORTED. */);
+ fputc('\n', stdout);
+ }
+ return 0;
+}
--
cgit v1.2.3-70-g09d2