From 42a57b3271575fd323068bf8b9108d00f0b4a5b3 Mon Sep 17 00:00:00 2001 From: Přemysl Eric Janouch
Date: Fri, 22 Dec 2023 23:46:27 +0100 Subject: WIP: Global duplicate search --- main.go | 124 +++++++++++++++++++++++++++++++++++++++++++++++++++++- public/gallery.js | 57 ++++++++++++++++++++++++- public/style.css | 3 ++ 3 files changed, 180 insertions(+), 4 deletions(-) diff --git a/main.go b/main.go index d1fdbb9..4f574cc 100644 --- a/main.go +++ b/main.go @@ -446,7 +446,8 @@ func getSimilar(sha1 string, pixels int64, distance int) ( // // If there's a dhash, there should also be thumbnail dimensions, // so not bothering with IFNULL on them. - rows, err := db.Query(`SELECT sha1, width * height, thumbw, thumbh + rows, err := db.Query(` + SELECT sha1, width * height, IFNULL(thumbw, 0), IFNULL(thumbh, 0) FROM image WHERE hamming(dhash, (SELECT dhash FROM image WHERE sha1 = ?)) = ? AND sha1 <> ?`, sha1, distance, sha1) @@ -497,7 +498,8 @@ func handleAPISimilar(w http.ResponseWriter, r *http.Request) { } var width, height int64 - err := db.QueryRow(`SELECT width, height, thumbw, thumbh + err := db.QueryRow(` + SELECT width, height, IFNULL(thumbw, 0), IFNULL(thumbh, 0) FROM image WHERE sha1 = ?`, params.SHA1).Scan(&width, &height, &result.Info.ThumbW, &result.Info.ThumbH) if err != nil { @@ -522,6 +524,123 @@ func handleAPISimilar(w http.ResponseWriter, r *http.Request) { // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +type webDuplicateImage struct { + SHA1 string `json:"sha1"` + ThumbW int64 `json:"thumbW"` + ThumbH int64 `json:"thumbH"` + Occurences int64 `json:"occurences"` +} + +type webDuplicateGroup struct { + Main webDuplicateImage `json:"main"` + Similar []webDuplicateImage `json:"similar"` +} + +func getDuplicateSimilar(stmt *sql.Stmt, sha1 string, dhash int64) ( + result []webDuplicateImage, err error) { + rows, err := stmt.Query(dhash, sha1) + if err != nil { + return nil, err + } + defer rows.Close() + + result = []webDuplicateImage{} + for rows.Next() { + var image webDuplicateImage + if err = rows.Scan(&image.SHA1, &image.ThumbW, &image.ThumbH, + &image.Occurences); err != nil { + return nil, err + } + result = append(result, image) + } + return result, rows.Err() +} + +// A hamming distance of zero (direct dhash match) will be more than sufficient. +const duplicatesCTE = `WITH + multiplied(sha1, count) AS ( + SELECT sha1, COUNT(*) AS count FROM node + GROUP BY sha1 HAVING count > 1 + ), + similarized(sha1, count) AS ( + SELECT i1.sha1, COUNT(*) AS count FROM image AS i1 + JOIN image AS i2 ON i1.dhash = i2.dhash AND i1.sha1 <> i2.sha1 + GROUP BY i1.sha1 + ), + duplicates(sha1) AS ( + SELECT sha1 FROM multiplied + UNION + SELECT sha1 FROM similarized + )` + +func getDuplicates() (result []webDuplicateGroup, err error) { + stmt, err := db.Prepare(` + SELECT i.sha1, IFNULL(i.thumbw, 0), IFNULL(i.thumbh, 0), + COUNT(*) AS count + FROM image AS i + JOIN node AS n ON n.sha1 = i.sha1 + WHERE i.dhash = ? AND i.sha1 <> ? + GROUP BY n.sha1`) + if err != nil { + return nil, err + } + + // FIXME: Never duplicate images. + rows, err := db.Query(duplicatesCTE + ` + SELECT i.sha1, IFNULL(i.thumbw, 0), IFNULL(i.thumbh, 0), + i.dhash, COUNT(*) AS count + FROM image AS i + JOIN duplicates AS d ON d.sha1 = i.sha1 + JOIN node AS n ON n.sha1 = i.sha1 + GROUP BY n.sha1`) + if err != nil { + return nil, err + } + defer rows.Close() + + result = []webDuplicateGroup{} + for rows.Next() { + var ( + image webDuplicateImage + similar []webDuplicateImage + dhash int64 + ) + if err = rows.Scan(&image.SHA1, &image.ThumbW, &image.ThumbH, + &dhash, &image.Occurences); err != nil { + return nil, err + } + if similar, err = getDuplicateSimilar( + stmt, image.SHA1, dhash); err != nil { + return nil, err + } + result = append(result, webDuplicateGroup{ + Main: image, + Similar: similar, + }) + } + return result, rows.Err() +} + +func handleAPIDuplicates(w http.ResponseWriter, r *http.Request) { + var params struct{} + if err := json.NewDecoder(r.Body).Decode(¶ms); err != nil { + http.Error(w, err.Error(), http.StatusBadRequest) + return + } + + result, err := getDuplicates() + if err != nil { + http.Error(w, err.Error(), http.StatusInternalServerError) + return + } + + if err := json.NewEncoder(w).Encode(result); err != nil { + log.Println(err) + } +} + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // cmdRun runs a web UI against GD on ADDRESS. func cmdRun(args []string) error { if len(args) != 2 { @@ -543,6 +662,7 @@ func cmdRun(args []string) error { http.HandleFunc("/api/browse", handleAPIBrowse) http.HandleFunc("/api/info", handleAPIInfo) http.HandleFunc("/api/similar", handleAPISimilar) + http.HandleFunc("/api/duplicates", handleAPIDuplicates) host, port, err := net.SplitHostPort(address) if err != nil { diff --git a/public/gallery.js b/public/gallery.js index ee11858..009eb1e 100644 --- a/public/gallery.js +++ b/public/gallery.js @@ -213,7 +213,7 @@ let View = { m(m.route.Link, { href: `/similar/:key`, params: {key: ViewModel.sha1}, - }, "Similar") + }, "Similar"), ]), m('.body', {}, [view, m(ViewBar)]), ]) @@ -302,7 +302,7 @@ let Similar = { m(m.route.Link, { href: `/view/:key`, params: {key: SimilarModel.sha1}, - }, "View") + }, "View"), ]), m('.body', {}, m(SimilarList)), ]) @@ -311,6 +311,58 @@ let Similar = { // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +let DuplicatesModel = { + entries: [], + + async reload() { + this.entries = await call('duplicates', {}) + }, +} + +let DuplicatesThumbnail = { + view(vnode) { + const info = vnode.attrs.info + return [ + m(m.route.Link, {href: `/similar/${info.sha1}`}, + m('img', {src: `/thumb/${info.sha1}`, + width: info.thumbW, height: info.thumbH})), + info.occurences, + ] + }, +} + +let DuplicatesList = { + view(vnode) { + if (DuplicatesModel.entries.length == 0) + return "No duplicates" + + return m('.duplicates', {}, DuplicatesModel.entries.map(entry => + m('.row', [ + m(DuplicatesThumbnail, {info: entry.main}), + entry.similar.map(entry => + m(DuplicatesThumbnail, {info: entry})), + ]), + )) + }, +} + +let Duplicates = { + oninit(vnode) { + DuplicatesModel.reload() + }, + + view(vnode) { + return m('.container', {}, [ + m('.header', {}, [ + "Duplicates", + ]), + m('.body', {}, m(DuplicatesList)), + ]) + }, +} + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + window.addEventListener('load', () => { m.route(document.body, "/browse/", { // The path doesn't need to be escaped, perhaps change that (":key..."). @@ -318,6 +370,7 @@ window.addEventListener('load', () => { "/browse/:key": Browse, "/view/:key": View, "/similar/:key": Similar, + "/duplicates": Duplicates, "/tags": undefined, "/tags/:space": undefined, diff --git a/public/style.css b/public/style.css index d6c2e3f..d18735d 100644 --- a/public/style.css +++ b/public/style.css @@ -54,3 +54,6 @@ ul.sidebar li.child a { .similar h2 { margin: 1em 0 0.5em 0; padding: 0; font-size: 1.2rem; } .similar .row { display: flex; } .similar .row ul { margin: 0; padding: 0 0 0 1.25em; list-style-type: "- "; } + +.duplicates { padding: .5rem; flex-grow: 1; overflow: auto; } +.duplicates .row { display: flex; } -- cgit v1.2.3-70-g09d2