diff options
author | Přemysl Eric Janouch <p@janouch.name> | 2023-12-22 23:46:27 +0100 |
---|---|---|
committer | Přemysl Eric Janouch <p@janouch.name> | 2023-12-22 23:46:27 +0100 |
commit | 42a57b3271575fd323068bf8b9108d00f0b4a5b3 (patch) | |
tree | 0b5e3e25d68d1d49e1f533cb3a7e1141796b1d42 /main.go | |
parent | 0b5d388af213680d0def2c03d3c8814c3e2ceaa2 (diff) | |
download | gallery-42a57b3271575fd323068bf8b9108d00f0b4a5b3.tar.gz gallery-42a57b3271575fd323068bf8b9108d00f0b4a5b3.tar.xz gallery-42a57b3271575fd323068bf8b9108d00f0b4a5b3.zip |
WIP: Global duplicate search
Diffstat (limited to 'main.go')
-rw-r--r-- | main.go | 124 |
1 files changed, 122 insertions, 2 deletions
@@ -446,7 +446,8 @@ func getSimilar(sha1 string, pixels int64, distance int) ( // // If there's a dhash, there should also be thumbnail dimensions, // so not bothering with IFNULL on them. - rows, err := db.Query(`SELECT sha1, width * height, thumbw, thumbh + rows, err := db.Query(` + SELECT sha1, width * height, IFNULL(thumbw, 0), IFNULL(thumbh, 0) FROM image WHERE hamming(dhash, (SELECT dhash FROM image WHERE sha1 = ?)) = ? AND sha1 <> ?`, sha1, distance, sha1) @@ -497,7 +498,8 @@ func handleAPISimilar(w http.ResponseWriter, r *http.Request) { } var width, height int64 - err := db.QueryRow(`SELECT width, height, thumbw, thumbh + err := db.QueryRow(` + SELECT width, height, IFNULL(thumbw, 0), IFNULL(thumbh, 0) FROM image WHERE sha1 = ?`, params.SHA1).Scan(&width, &height, &result.Info.ThumbW, &result.Info.ThumbH) if err != nil { @@ -522,6 +524,123 @@ func handleAPISimilar(w http.ResponseWriter, r *http.Request) { // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +type webDuplicateImage struct { + SHA1 string `json:"sha1"` + ThumbW int64 `json:"thumbW"` + ThumbH int64 `json:"thumbH"` + Occurences int64 `json:"occurences"` +} + +type webDuplicateGroup struct { + Main webDuplicateImage `json:"main"` + Similar []webDuplicateImage `json:"similar"` +} + +func getDuplicateSimilar(stmt *sql.Stmt, sha1 string, dhash int64) ( + result []webDuplicateImage, err error) { + rows, err := stmt.Query(dhash, sha1) + if err != nil { + return nil, err + } + defer rows.Close() + + result = []webDuplicateImage{} + for rows.Next() { + var image webDuplicateImage + if err = rows.Scan(&image.SHA1, &image.ThumbW, &image.ThumbH, + &image.Occurences); err != nil { + return nil, err + } + result = append(result, image) + } + return result, rows.Err() +} + +// A hamming distance of zero (direct dhash match) will be more than sufficient. 
+const duplicatesCTE = `WITH + multiplied(sha1, count) AS ( + SELECT sha1, COUNT(*) AS count FROM node + GROUP BY sha1 HAVING count > 1 + ), + similarized(sha1, count) AS ( + SELECT i1.sha1, COUNT(*) AS count FROM image AS i1 + JOIN image AS i2 ON i1.dhash = i2.dhash AND i1.sha1 <> i2.sha1 + GROUP BY i1.sha1 + ), + duplicates(sha1) AS ( + SELECT sha1 FROM multiplied + UNION + SELECT sha1 FROM similarized + )` + +func getDuplicates() (result []webDuplicateGroup, err error) { + stmt, err := db.Prepare(` + SELECT i.sha1, IFNULL(i.thumbw, 0), IFNULL(i.thumbh, 0), + COUNT(*) AS count + FROM image AS i + JOIN node AS n ON n.sha1 = i.sha1 + WHERE i.dhash = ? AND i.sha1 <> ? + GROUP BY n.sha1`) + if err != nil { + return nil, err + } + + // FIXME: Never duplicate images. + rows, err := db.Query(duplicatesCTE + ` + SELECT i.sha1, IFNULL(i.thumbw, 0), IFNULL(i.thumbh, 0), + i.dhash, COUNT(*) AS count + FROM image AS i + JOIN duplicates AS d ON d.sha1 = i.sha1 + JOIN node AS n ON n.sha1 = i.sha1 + GROUP BY n.sha1`) + if err != nil { + return nil, err + } + defer rows.Close() + + result = []webDuplicateGroup{} + for rows.Next() { + var ( + image webDuplicateImage + similar []webDuplicateImage + dhash int64 + ) + if err = rows.Scan(&image.SHA1, &image.ThumbW, &image.ThumbH, + &dhash, &image.Occurences); err != nil { + return nil, err + } + if similar, err = getDuplicateSimilar( + stmt, image.SHA1, dhash); err != nil { + return nil, err + } + result = append(result, webDuplicateGroup{ + Main: image, + Similar: similar, + }) + } + return result, rows.Err() +} + +func handleAPIDuplicates(w http.ResponseWriter, r *http.Request) { + var params struct{} + if err := json.NewDecoder(r.Body).Decode(&params); err != nil { + http.Error(w, err.Error(), http.StatusBadRequest) + return + } + + result, err := getDuplicates() + if err != nil { + http.Error(w, err.Error(), http.StatusInternalServerError) + return + } + + if err := json.NewEncoder(w).Encode(result); err != nil { + log.Println(err) + 
} +} + +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // cmdRun runs a web UI against GD on ADDRESS. func cmdRun(args []string) error { if len(args) != 2 { @@ -543,6 +662,7 @@ func cmdRun(args []string) error { http.HandleFunc("/api/browse", handleAPIBrowse) http.HandleFunc("/api/info", handleAPIInfo) http.HandleFunc("/api/similar", handleAPISimilar) + http.HandleFunc("/api/duplicates", handleAPIDuplicates) host, port, err := net.SplitHostPort(address) if err != nil { |