From e0a64f2e1dcf9effdb9e0d6aea0eca7124bafadb Mon Sep 17 00:00:00 2001
From: Přemysl Eric Janouch
Date: Sat, 23 Dec 2023 01:32:06 +0100
Subject: WIP: Global duplicate search

---
 main.go | 115 +++++++++++++++++++++++++++++++++++++++-------------------------
 1 file changed, 70 insertions(+), 45 deletions(-)

(limited to 'main.go')

diff --git a/main.go b/main.go
index 4f574cc..ae0a5f7 100644
--- a/main.go
+++ b/main.go
@@ -531,14 +531,24 @@ type webDuplicateImage struct {
 	Occurences int64 `json:"occurences"`
 }
 
-type webDuplicateGroup struct {
-	Main    webDuplicateImage   `json:"main"`
-	Similar []webDuplicateImage `json:"similar"`
-}
-
-func getDuplicateSimilar(stmt *sql.Stmt, sha1 string, dhash int64) (
+// A hamming distance of zero (direct dhash match) will be more than sufficient.
+const duplicatesCTE = `WITH
+	duplicated(dhash, count) AS (
+		SELECT dhash, COUNT(*) AS count FROM image
+		GROUP BY dhash HAVING count > 1
+	),
+	multipathed(sha1, count) AS (
+		SELECT n.sha1, COUNT(*) AS count FROM node AS n
+		JOIN image AS i ON i.sha1 = n.sha1
+		WHERE i.dhash IS NULL
+		OR i.dhash NOT IN (SELECT dhash FROM duplicated)
+		GROUP BY n.sha1 HAVING count > 1
+	)
+`
+
+func getDuplicatesSimilar(stmt *sql.Stmt, dhash int64) (
 	result []webDuplicateImage, err error) {
-	rows, err := stmt.Query(dhash, sha1)
+	rows, err := stmt.Query(dhash)
 	if err != nil {
 		return nil, err
 	}
@@ -556,67 +566,75 @@ func getDuplicateSimilar(stmt *sql.Stmt, sha1 string, dhash int64) (
 	return result, rows.Err()
 }
 
-// A hamming distance of zero (direct dhash match) will be more than sufficient.
-const duplicatesCTE = `WITH
-	multiplied(sha1, count) AS (
-		SELECT sha1, COUNT(*) AS count FROM node
-		GROUP BY sha1 HAVING count > 1
-	),
-	similarized(sha1, count) AS (
-		SELECT i1.sha1, COUNT(*) AS count FROM image AS i1
-		JOIN image AS i2 ON i1.dhash = i2.dhash AND i1.sha1 <> i2.sha1
-		GROUP BY i1.sha1
-	),
-	duplicates(sha1) AS (
-		SELECT sha1 FROM multiplied
-		UNION
-		SELECT sha1 FROM similarized
-	)`
-
-func getDuplicates() (result []webDuplicateGroup, err error) {
+func getDuplicates1(result [][]webDuplicateImage) (
+	[][]webDuplicateImage, error) {
 	stmt, err := db.Prepare(`
 		SELECT i.sha1, IFNULL(i.thumbw, 0), IFNULL(i.thumbh, 0),
-			COUNT(*) AS count
+			COUNT(*) AS occurences
 		FROM image AS i
 		JOIN node AS n ON n.sha1 = i.sha1
-		WHERE i.dhash = ? AND i.sha1 <> ?
+		WHERE i.dhash = ?
 		GROUP BY n.sha1`)
 	if err != nil {
 		return nil, err
 	}
+	defer stmt.Close()
+
+	rows, err := db.Query(duplicatesCTE + `SELECT dhash FROM duplicated`)
+	if err != nil {
+		return nil, err
+	}
+	defer rows.Close()
+
+	for rows.Next() {
+		var (
+			group []webDuplicateImage
+			dhash int64
+		)
+		if err = rows.Scan(&dhash); err != nil {
+			return nil, err
+		}
+		if group, err = getDuplicatesSimilar(stmt, dhash); err != nil {
+			return nil, err
+		}
+		result = append(result, group)
+	}
+	return result, rows.Err()
+}
 
-	// FIXME: Never duplicate images.
-	rows, err := db.Query(duplicatesCTE + `
+func getDuplicates2(result [][]webDuplicateImage) (
+	[][]webDuplicateImage, error) {
+	stmt, err := db.Prepare(`
 		SELECT i.sha1, IFNULL(i.thumbw, 0), IFNULL(i.thumbh, 0),
-			i.dhash, COUNT(*) AS count
+			COUNT(*) AS occurences
 		FROM image AS i
-		JOIN duplicates AS d ON d.sha1 = i.sha1
 		JOIN node AS n ON n.sha1 = i.sha1
+		WHERE i.sha1 = ?
 		GROUP BY n.sha1`)
 	if err != nil {
 		return nil, err
 	}
+	defer stmt.Close()
+
+	rows, err := db.Query(duplicatesCTE + `SELECT sha1 FROM multipathed`)
+	if err != nil {
+		return nil, err
+	}
 	defer rows.Close()
 
-	result = []webDuplicateGroup{}
 	for rows.Next() {
 		var (
-			image   webDuplicateImage
-			similar []webDuplicateImage
-			dhash   int64
+			image webDuplicateImage
+			sha1  string
 		)
-		if err = rows.Scan(&image.SHA1, &image.ThumbW, &image.ThumbH,
-			&dhash, &image.Occurences); err != nil {
+		if err = rows.Scan(&sha1); err != nil {
 			return nil, err
 		}
-		if similar, err = getDuplicateSimilar(
-			stmt, image.SHA1, dhash); err != nil {
+		if err := stmt.QueryRow(sha1).Scan(&image.SHA1,
+			&image.ThumbW, &image.ThumbH, &image.Occurences); err != nil {
 			return nil, err
 		}
-		result = append(result, webDuplicateGroup{
-			Main:    image,
-			Similar: similar,
-		})
+		result = append(result, []webDuplicateImage{image})
 	}
 	return result, rows.Err()
 }
@@ -628,8 +646,15 @@ func handleAPIDuplicates(w http.ResponseWriter, r *http.Request) {
 		return
 	}
 
-	result, err := getDuplicates()
-	if err != nil {
+	var (
+		result [][]webDuplicateImage
+		err    error
+	)
+	if result, err = getDuplicates1(result); err != nil {
+		http.Error(w, err.Error(), http.StatusInternalServerError)
+		return
+	}
+	if result, err = getDuplicates2(result); err != nil {
 		http.Error(w, err.Error(), http.StatusInternalServerError)
 		return
 	}
-- 
cgit v1.2.3-70-g09d2