summaryrefslogtreecommitdiff
path: root/main.go
diff options
context:
space:
mode:
authorPřemysl Eric Janouch <p@janouch.name>2023-12-22 23:46:27 +0100
committerPřemysl Eric Janouch <p@janouch.name>2023-12-22 23:46:27 +0100
commit42a57b3271575fd323068bf8b9108d00f0b4a5b3 (patch)
tree0b5e3e25d68d1d49e1f533cb3a7e1141796b1d42 /main.go
parent0b5d388af213680d0def2c03d3c8814c3e2ceaa2 (diff)
downloadgallery-42a57b3271575fd323068bf8b9108d00f0b4a5b3.tar.gz
gallery-42a57b3271575fd323068bf8b9108d00f0b4a5b3.tar.xz
gallery-42a57b3271575fd323068bf8b9108d00f0b4a5b3.zip
WIP: Global duplicate search
Diffstat (limited to 'main.go')
-rw-r--r--main.go124
1 files changed, 122 insertions, 2 deletions
diff --git a/main.go b/main.go
index d1fdbb9..4f574cc 100644
--- a/main.go
+++ b/main.go
@@ -446,7 +446,8 @@ func getSimilar(sha1 string, pixels int64, distance int) (
//
// If there's a dhash, there should also be thumbnail dimensions,
// so not bothering with IFNULL on them.
- rows, err := db.Query(`SELECT sha1, width * height, thumbw, thumbh
+ rows, err := db.Query(`
+ SELECT sha1, width * height, IFNULL(thumbw, 0), IFNULL(thumbh, 0)
FROM image
WHERE hamming(dhash, (SELECT dhash FROM image WHERE sha1 = ?)) = ?
AND sha1 <> ?`, sha1, distance, sha1)
@@ -497,7 +498,8 @@ func handleAPISimilar(w http.ResponseWriter, r *http.Request) {
}
var width, height int64
- err := db.QueryRow(`SELECT width, height, thumbw, thumbh
+ err := db.QueryRow(`
+ SELECT width, height, IFNULL(thumbw, 0), IFNULL(thumbh, 0)
FROM image WHERE sha1 = ?`, params.SHA1).Scan(&width, &height,
&result.Info.ThumbW, &result.Info.ThumbH)
if err != nil {
@@ -522,6 +524,123 @@ func handleAPISimilar(w http.ResponseWriter, r *http.Request) {
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+type webDuplicateImage struct { // One image entry in the duplicate-search API response.
+ SHA1 string `json:"sha1"` // Image hash, scanned from the image.sha1 column.
+ ThumbW int64 `json:"thumbW"` // Thumbnail width; the queries substitute 0 when the column is NULL.
+ ThumbH int64 `json:"thumbH"` // Thumbnail height; the queries substitute 0 when the column is NULL.
+ Occurences int64 `json:"occurences"` // COUNT(*) of node rows joined to this image; the key spelling (sic) is part of the JSON output.
+}
+
+type webDuplicateGroup struct { // An image together with the other images that share its dhash.
+ Main webDuplicateImage `json:"main"` // The image the group is built around.
+ Similar []webDuplicateImage `json:"similar"` // Images with an equal dhash but a different sha1.
+}
+
+func getDuplicateSimilar(stmt *sql.Stmt, sha1 string, dhash int64) (
+ result []webDuplicateImage, err error) {
+ rows, err := stmt.Query(dhash, sha1)
+ if err != nil {
+ return nil, err
+ }
+ defer rows.Close()
+
+ result = []webDuplicateImage{}
+ for rows.Next() {
+ var image webDuplicateImage
+ if err = rows.Scan(&image.SHA1, &image.ThumbW, &image.ThumbH,
+ &image.Occurences); err != nil {
+ return nil, err
+ }
+ result = append(result, image)
+ }
+ return result, rows.Err()
+}
+
+// duplicatesCTE is a WITH-clause prefix defining "duplicates": the sha1 of every image that either has more than one node row or shares its dhash with another image. A hamming distance of zero (direct dhash match) will be more than sufficient.
+const duplicatesCTE = `WITH
+ multiplied(sha1, count) AS (
+ SELECT sha1, COUNT(*) AS count FROM node
+ GROUP BY sha1 HAVING count > 1
+ ),
+ similarized(sha1, count) AS (
+ SELECT i1.sha1, COUNT(*) AS count FROM image AS i1
+ JOIN image AS i2 ON i1.dhash = i2.dhash AND i1.sha1 <> i2.sha1
+ GROUP BY i1.sha1
+ ),
+ duplicates(sha1) AS (
+ SELECT sha1 FROM multiplied
+ UNION
+ SELECT sha1 FROM similarized
+ )`
+
+// getDuplicates lists every image selected by the "duplicates" CTE, each
+// paired with the other images that share its dhash.
+//
+// For each row of the outer query it builds a webDuplicateGroup whose Main
+// holds the image (with the count of node rows joined to it) and whose
+// Similar is the result of getDuplicateSimilar for that image. One extra
+// query is therefore executed per outer row, all through a single prepared
+// statement.
+//
+// On a prepare, query, scan or nested-lookup failure it returns a nil slice
+// and the error. An iteration error from the outer row set is returned
+// together with the groups collected so far.
+func getDuplicates() (result []webDuplicateGroup, err error) {
+	stmt, err := db.Prepare(`
+ SELECT i.sha1, IFNULL(i.thumbw, 0), IFNULL(i.thumbh, 0),
+ COUNT(*) AS count
+ FROM image AS i
+ JOIN node AS n ON n.sha1 = i.sha1
+ WHERE i.dhash = ? AND i.sha1 <> ?
+ GROUP BY n.sha1`)
+	if err != nil {
+		return nil, err
+	}
+	// The prepared statement holds database resources until closed;
+	// release it on every return path instead of leaking one per call.
+	defer stmt.Close()
+
+	// FIXME: Never duplicate images.
+	rows, err := db.Query(duplicatesCTE + `
+ SELECT i.sha1, IFNULL(i.thumbw, 0), IFNULL(i.thumbh, 0),
+ i.dhash, COUNT(*) AS count
+ FROM image AS i
+ JOIN duplicates AS d ON d.sha1 = i.sha1
+ JOIN node AS n ON n.sha1 = i.sha1
+ GROUP BY n.sha1`)
+	if err != nil {
+		return nil, err
+	}
+	defer rows.Close()
+
+	// Start from an empty, non-nil slice so that the JSON encoding of an
+	// empty result is [] rather than null.
+	result = []webDuplicateGroup{}
+	for rows.Next() {
+		var (
+			image   webDuplicateImage
+			similar []webDuplicateImage
+			dhash   int64
+		)
+		if err = rows.Scan(&image.SHA1, &image.ThumbW, &image.ThumbH,
+			&dhash, &image.Occurences); err != nil {
+			return nil, err
+		}
+		if similar, err = getDuplicateSimilar(
+			stmt, image.SHA1, dhash); err != nil {
+			return nil, err
+		}
+		result = append(result, webDuplicateGroup{
+			Main:    image,
+			Similar: similar,
+		})
+	}
+	return result, rows.Err()
+}
+
+// handleAPIDuplicates serves the duplicate-search endpoint.
+//
+// The endpoint takes no parameters, yet the request body must still decode
+// as JSON into an empty struct; a decoding failure is answered with
+// 400 Bad Request. A failure in getDuplicates is answered with
+// 500 Internal Server Error. Otherwise the groups are written to the
+// response as JSON, and an encoding failure is only logged.
+func handleAPIDuplicates(w http.ResponseWriter, r *http.Request) {
+	var params struct{}
+	decodeErr := json.NewDecoder(r.Body).Decode(&params)
+	if decodeErr != nil {
+		http.Error(w, decodeErr.Error(), http.StatusBadRequest)
+		return
+	}
+
+	groups, lookupErr := getDuplicates()
+	if lookupErr != nil {
+		http.Error(w, lookupErr.Error(), http.StatusInternalServerError)
+		return
+	}
+
+	encodeErr := json.NewEncoder(w).Encode(groups)
+	if encodeErr != nil {
+		log.Println(encodeErr)
+	}
+}
+
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
// cmdRun runs a web UI against GD on ADDRESS.
func cmdRun(args []string) error {
if len(args) != 2 {
@@ -543,6 +662,7 @@ func cmdRun(args []string) error {
http.HandleFunc("/api/browse", handleAPIBrowse)
http.HandleFunc("/api/info", handleAPIInfo)
http.HandleFunc("/api/similar", handleAPISimilar)
+ http.HandleFunc("/api/duplicates", handleAPIDuplicates)
host, port, err := net.SplitHostPort(address)
if err != nil {