From 42a57b3271575fd323068bf8b9108d00f0b4a5b3 Mon Sep 17 00:00:00 2001 From: Přemysl Eric Janouch
Date: Fri, 22 Dec 2023 23:46:27 +0100 Subject: WIP: Global duplicate search --- main.go | 124 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 122 insertions(+), 2 deletions(-) (limited to 'main.go') diff --git a/main.go b/main.go index d1fdbb9..4f574cc 100644 --- a/main.go +++ b/main.go @@ -446,7 +446,8 @@ func getSimilar(sha1 string, pixels int64, distance int) ( // // If there's a dhash, there should also be thumbnail dimensions, // so not bothering with IFNULL on them. - rows, err := db.Query(`SELECT sha1, width * height, thumbw, thumbh + rows, err := db.Query(` + SELECT sha1, width * height, IFNULL(thumbw, 0), IFNULL(thumbh, 0) FROM image WHERE hamming(dhash, (SELECT dhash FROM image WHERE sha1 = ?)) = ? AND sha1 <> ?`, sha1, distance, sha1) @@ -497,7 +498,8 @@ func handleAPISimilar(w http.ResponseWriter, r *http.Request) { } var width, height int64 - err := db.QueryRow(`SELECT width, height, thumbw, thumbh + err := db.QueryRow(` + SELECT width, height, IFNULL(thumbw, 0), IFNULL(thumbh, 0) FROM image WHERE sha1 = ?`, params.SHA1).Scan(&width, &height, &result.Info.ThumbW, &result.Info.ThumbH) if err != nil { @@ -522,6 +524,123 @@ func handleAPISimilar(w http.ResponseWriter, r *http.Request) { // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +type webDuplicateImage struct { + SHA1 string `json:"sha1"` + ThumbW int64 `json:"thumbW"` + ThumbH int64 `json:"thumbH"` + Occurences int64 `json:"occurences"` +} + +type webDuplicateGroup struct { + Main webDuplicateImage `json:"main"` + Similar []webDuplicateImage `json:"similar"` +} + +func getDuplicateSimilar(stmt *sql.Stmt, sha1 string, dhash int64) ( + result []webDuplicateImage, err error) { + rows, err := stmt.Query(dhash, sha1) + if err != nil { + return nil, err + } + defer rows.Close() + + result = []webDuplicateImage{} + for rows.Next() { + var image webDuplicateImage + if err = rows.Scan(&image.SHA1, &image.ThumbW, &image.ThumbH, + 
&image.Occurences); err != nil { + return nil, err + } + result = append(result, image) + } + return result, rows.Err() +} + +// A hamming distance of zero (direct dhash match) will be more than sufficient. +const duplicatesCTE = `WITH + multiplied(sha1, count) AS ( + SELECT sha1, COUNT(*) AS count FROM node + GROUP BY sha1 HAVING count > 1 + ), + similarized(sha1, count) AS ( + SELECT i1.sha1, COUNT(*) AS count FROM image AS i1 + JOIN image AS i2 ON i1.dhash = i2.dhash AND i1.sha1 <> i2.sha1 + GROUP BY i1.sha1 + ), + duplicates(sha1) AS ( + SELECT sha1 FROM multiplied + UNION + SELECT sha1 FROM similarized + )` + +func getDuplicates() (result []webDuplicateGroup, err error) { + stmt, err := db.Prepare(` + SELECT i.sha1, IFNULL(i.thumbw, 0), IFNULL(i.thumbh, 0), + COUNT(*) AS count + FROM image AS i + JOIN node AS n ON n.sha1 = i.sha1 + WHERE i.dhash = ? AND i.sha1 <> ? + GROUP BY n.sha1`) + if err != nil { + return nil, err + } + + // FIXME: Never duplicate images. + rows, err := db.Query(duplicatesCTE + ` + SELECT i.sha1, IFNULL(i.thumbw, 0), IFNULL(i.thumbh, 0), + i.dhash, COUNT(*) AS count + FROM image AS i + JOIN duplicates AS d ON d.sha1 = i.sha1 + JOIN node AS n ON n.sha1 = i.sha1 + GROUP BY n.sha1`) + if err != nil { + return nil, err + } + defer rows.Close() + + result = []webDuplicateGroup{} + for rows.Next() { + var ( + image webDuplicateImage + similar []webDuplicateImage + dhash int64 + ) + if err = rows.Scan(&image.SHA1, &image.ThumbW, &image.ThumbH, + &dhash, &image.Occurences); err != nil { + return nil, err + } + if similar, err = getDuplicateSimilar( + stmt, image.SHA1, dhash); err != nil { + return nil, err + } + result = append(result, webDuplicateGroup{ + Main: image, + Similar: similar, + }) + } + return result, rows.Err() +} + +func handleAPIDuplicates(w http.ResponseWriter, r *http.Request) { + var params struct{} + if err := json.NewDecoder(r.Body).Decode(&params); err != nil { + http.Error(w, err.Error(), http.StatusBadRequest) + return + } + 
+ result, err := getDuplicates() + if err != nil { + http.Error(w, err.Error(), http.StatusInternalServerError) + return + } + + if err := json.NewEncoder(w).Encode(result); err != nil { + log.Println(err) + } +} + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // cmdRun runs a web UI against GD on ADDRESS. func cmdRun(args []string) error { if len(args) != 2 { @@ -543,6 +662,7 @@ func cmdRun(args []string) error { http.HandleFunc("/api/browse", handleAPIBrowse) http.HandleFunc("/api/info", handleAPIInfo) http.HandleFunc("/api/similar", handleAPISimilar) + http.HandleFunc("/api/duplicates", handleAPIDuplicates) host, port, err := net.SplitHostPort(address) if err != nil { -- cgit v1.2.3-70-g09d2