aboutsummaryrefslogtreecommitdiff
path: root/tools/info.c
diff options
context:
space:
mode:
Diffstat (limited to 'tools/info.c')
-rw-r--r--tools/info.c286
1 files changed, 286 insertions, 0 deletions
diff --git a/tools/info.c b/tools/info.c
new file mode 100644
index 0000000..440939f
--- /dev/null
+++ b/tools/info.c
@@ -0,0 +1,286 @@
+//
+// info.c: acquire information about JPEG/TIFF/BMFF/WebP files in JSON format
+//
+// Copyright (c) 2021 - 2023, Přemysl Eric Janouch <p@janouch.name>
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+//
+
+#include "info.h"
+
+#include <jv.h>
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+// --- ISO/IEC base media file format ------------------------------------------
+// ISO/IEC 14496-12:2015(E), used to be publicly available, now there's only:
+// https://mpeg.chiariglione.org/standards/mpeg-4/iso-base-media-file-format/text-isoiec-14496-12-5th-edition
+// but people have managed to archive the final version as well:
+// https://b.goeswhere.com/ISO_IEC_14496-12_2015.pdf
+//
+// ISO/IEC 23008-12:2017 Information technology -
+// High efficiency coding and media delivery in heterogeneous environments -
+// Part 12: Image File Format + Cor 1:2020 Technical Corrigendum 1
+// https://standards.iso.org/ittf/PubliclyAvailableStandards/
+
+static jv
+parse_bmff_box(jv o, const char *type, const uint8_t *data, size_t len)
+{
+ // TODO(p): Parse out "uuid"'s uint8_t[16] initial field, present as hex.
+ // TODO(p): Parse out "ftyp" contents: 14496-12:2015 4.3
+ // TODO(p): Parse out other important boxes: 14496-12:2015 8+
+ return add_to_subarray(o, "boxes", jv_string(type));
+}
+
+static bool
+detect_bmff(const uint8_t *p, size_t len)
+{
+ // 4.2 Object Structure--this box need not be present, nor at the beginning
+ // TODO(p): What does `aligned(8)` mean? It's probably in bits.
+ return len >= 8 && !memcmp(p + 4, "ftyp", 4);
+}
+
+static jv
+parse_bmff(jv o, const uint8_t *p, size_t len)
+{
+ if (!detect_bmff(p, len))
+ return add_error(o, "not BMFF at all or unsupported");
+
+ const uint8_t *end = p + len;
+ while (p < end) {
+ if (end - p < 8) {
+ o = add_warning(o, "box framing mismatch");
+ break;
+ }
+
+ char type[5] = "";
+ memcpy(type, p + 4, 4);
+
+ uint64_t box_size = u32be(p);
+ const uint8_t *data = p + 8;
+ if (box_size == 1) {
+ if (end - p < 16) {
+ o = add_warning(o, "unexpected EOF");
+ break;
+ }
+ box_size = u64be(data);
+ data += 8;
+ } else if (!box_size)
+ box_size = end - p;
+
+ if (box_size > (uint64_t) (end - p)) {
+ o = add_warning(o, "unexpected EOF");
+ break;
+ }
+
+ size_t data_len = box_size - (data - p);
+ o = parse_bmff_box(o, type, data, data_len);
+ p += box_size;
+ }
+ return o;
+}
+
+// --- WebP --------------------------------------------------------------------
+// libwebp won't let us simply iterate over all chunks, so handroll it.
+//
+// https://github.com/webmproject/libwebp/blob/master/doc/webp-container-spec.txt
+// https://github.com/webmproject/libwebp/blob/master/doc/webp-lossless-bitstream-spec.txt
+// https://datatracker.ietf.org/doc/html/rfc6386
+//
+// Pretty versions, hopefully not outdated:
+// https://developers.google.com/speed/webp/docs/riff_container
+// https://developers.google.com/speed/webp/docs/webp_lossless_bitstream_specification
+
+static bool
+detect_webp(const uint8_t *p, size_t len)
+{
+ return len >= 12 && !memcmp(p, "RIFF", 4) && !memcmp(p + 8, "WEBP", 4);
+}
+
+static jv
+parse_webp_vp8(jv o, const uint8_t *p, size_t len)
+{
+ if (len < 10 || (p[0] & 1) != 0 /* key frame */ ||
+ p[3] != 0x9d || p[4] != 0x01 || p[5] != 0x2a) {
+ return add_warning(o, "invalid VP8 chunk");
+ }
+
+ o = jv_set(o, jv_string("width"), jv_number(u16le(p + 6) & 0x3fff));
+ o = jv_set(o, jv_string("height"), jv_number(u16le(p + 8) & 0x3fff));
+ return o;
+}
+
+static jv
+parse_webp_vp8l(jv o, const uint8_t *p, size_t len)
+{
+ if (len < 5 || p[0] != 0x2f)
+ return add_warning(o, "invalid VP8L chunk");
+
+ // Reading LSB-first from a little endian value means reading in order.
+ uint32_t header = u32le(p + 1);
+ o = jv_set(o, jv_string("width"), jv_number((header & 0x3fff) + 1));
+ header >>= 14;
+ o = jv_set(o, jv_string("height"), jv_number((header & 0x3fff) + 1));
+ header >>= 14;
+ o = jv_set(o, jv_string("alpha_is_used"), jv_bool(header & 1));
+ return o;
+}
+
+static jv
+parse_webp_vp8x(jv o, const uint8_t *p, size_t len)
+{
+ if (len < 10)
+ return add_warning(o, "invalid VP8X chunk");
+
+ // Most of the fields in this chunk are duplicate or inferrable.
+ // Probably not worth decoding or verifying.
+ // TODO(p): For animations, we need to use the width and height from here.
+ uint8_t flags = p[0];
+ o = jv_set(o, jv_string("animation"), jv_bool((flags >> 1) & 1));
+ return o;
+}
+
+static jv
+parse_webp(jv o, const uint8_t *p, size_t len)
+{
+ if (!detect_webp(p, len))
+ return add_error(o, "not a WEBP file");
+
+ // TODO(p): This can still be parseable.
+ // TODO(p): Warn on trailing data.
+ uint32_t size = u32le(p + 4);
+ if (8 + size < len)
+ return add_error(o, "truncated file");
+
+ const uint8_t *end = p + 8 + size;
+ p += 12;
+
+ jv chunks = jv_array();
+ while (p < end) {
+ if (end - p < 8) {
+ o = add_warning(o, "framing mismatch");
+ printf("%ld", end - p);
+ break;
+ }
+
+ uint32_t chunk_size = u32le(p + 4);
+ uint32_t chunk_advance = (chunk_size + 1) & ~1;
+ if (p + 8 + chunk_advance > end) {
+ o = add_warning(o, "runaway chunk payload");
+ break;
+ }
+
+ char fourcc[5] = "";
+ memcpy(fourcc, p, 4);
+ chunks = jv_array_append(chunks, jv_string(fourcc));
+ p += 8;
+
+ // TODO(p): Decode more chunks.
+ if (!strcmp(fourcc, "VP8 "))
+ o = parse_webp_vp8(o, p, chunk_size);
+ if (!strcmp(fourcc, "VP8L"))
+ o = parse_webp_vp8l(o, p, chunk_size);
+ if (!strcmp(fourcc, "VP8X"))
+ o = parse_webp_vp8x(o, p, chunk_size);
+ if (!strcmp(fourcc, "EXIF"))
+ o = parse_exif(o, p, chunk_size);
+ if (!strcmp(fourcc, "ICCP"))
+ o = parse_icc(o, p, chunk_size);
+ p += chunk_advance;
+ }
+ return jv_set(o, jv_string("chunks"), chunks);
+}
+
+// --- I/O ---------------------------------------------------------------------
+
+static struct {
+ const char *name;
+ bool (*detect) (const uint8_t *, size_t);
+ jv (*parse) (jv, const uint8_t *, size_t);
+} formats[] = {
+ {"JPEG", detect_jpeg, parse_jpeg},
+ {"TIFF", detect_tiff, parse_tiff},
+ {"BMFF", detect_bmff, parse_bmff},
+ {"WebP", detect_webp, parse_webp},
+};
+
+static jv
+parse_any(jv o, const uint8_t *p, size_t len)
+{
+ // TODO(p): Also see if the file extension is appropriate.
+ for (size_t i = 0; i < sizeof formats / sizeof *formats; i++) {
+ if (!formats[i].detect(p, len))
+ continue;
+ if (getenv("INFO_IDENTIFY"))
+ o = jv_set(o, jv_string("format"), jv_string(formats[i].name));
+ return formats[i].parse(o, p, len);
+ }
+ return add_error(o, "unsupported file format");
+}
+
+static jv
+do_file(const char *filename, jv o)
+{
+ const char *err = NULL;
+ FILE *fp = fopen(filename, "rb");
+ if (!fp) {
+ err = strerror(errno);
+ goto error;
+ }
+
+ uint8_t *data = NULL, buf[256 << 10];
+ size_t n, len = 0;
+ while ((n = fread(buf, sizeof *buf, sizeof buf / sizeof *buf, fp))) {
+ data = realloc(data, len + n);
+ memcpy(data + len, buf, n);
+ len += n;
+ }
+ if (ferror(fp)) {
+ err = strerror(errno);
+ goto error_read;
+ }
+
+#if 0
+ // Not sure if I want to ensure their existence...
+ o = jv_object_set(o, jv_string("info"), jv_array());
+ o = jv_object_set(o, jv_string("warnings"), jv_array());
+#endif
+
+ o = parse_any(o, data, len);
+error_read:
+ fclose(fp);
+ free(data);
+error:
+ if (err)
+ o = add_error(o, err);
+ return o;
+}
+
+int
+main(int argc, char *argv[])
+{
+ // XXX: Can't use `xargs -P0`, there's a risk of non-atomic writes.
+ // Usage: find . -print0 | xargs -0 ./info
+ for (int i = 1; i < argc; i++) {
+ const char *filename = argv[i];
+
+ jv o = jv_object();
+ o = jv_object_set(o, jv_string("filename"), jv_string(filename));
+ o = do_file(filename, o);
+ jv_dumpf(o, stdout, 0 /* JV_PRINT_SORTED would discard information. */);
+ fputc('\n', stdout);
+ }
+ return 0;
+}