author     Přemysl Eric Janouch <p@janouch.name>  2023-12-23 01:32:06 +0100
committer  Přemysl Eric Janouch <p@janouch.name>  2023-12-23 01:32:06 +0100
commit     e0a64f2e1dcf9effdb9e0d6aea0eca7124bafadb (patch)
tree       c4b94e793f4a96eb608147fc20f5b97be20ff3f2
parent     42a57b3271575fd323068bf8b9108d00f0b4a5b3 (diff)
WIP: Global duplicate search
-rw-r--r--  main.go            | 115
-rw-r--r--  public/gallery.js  |   9
2 files changed, 73 insertions, 51 deletions
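
The commit flattens the duplicate listing: instead of webDuplicateGroup objects pairing a "main" image with its "similar" images, the /api/duplicates handler now returns plain groups, i.e. an array of arrays of webDuplicateImage, and gallery.js renders each inner array as one row of thumbnails. Below is a minimal sketch of the new payload shape; of the JSON tags only "occurences" appears in the diff, the remaining tags and the thumbnail field types are assumptions.

// Hypothetical sketch of the new /api/duplicates payload; only the
// "occurences" tag is confirmed by the diff below, the rest is assumed.
package main

import (
	"encoding/json"
	"fmt"
)

type webDuplicateImage struct {
	SHA1       string `json:"sha1"`
	ThumbW     int64  `json:"thumbw"`
	ThumbH     int64  `json:"thumbh"`
	Occurences int64  `json:"occurences"`
}

func main() {
	// One inner slice per duplicate group; gallery.js renders each group
	// as a single row of thumbnails.
	groups := [][]webDuplicateImage{
		{{SHA1: "aaaa", Occurences: 1}, {SHA1: "bbbb", Occurences: 2}},
		{{SHA1: "cccc", Occurences: 2}},
	}
	out, err := json.MarshalIndent(groups, "", "  ")
	if err != nil {
		panic(err)
	}
	fmt.Println(string(out))
}
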
diff --git a/main.go b/main.go
index 4f574cc..ae0a5f7 100644
--- a/main.go
+++ b/main.go
@@ -531,14 +531,24 @@ type webDuplicateImage struct {
Occurences int64 `json:"occurences"`
}
-type webDuplicateGroup struct {
- Main webDuplicateImage `json:"main"`
- Similar []webDuplicateImage `json:"similar"`
-}
-
-func getDuplicateSimilar(stmt *sql.Stmt, sha1 string, dhash int64) (
+// A hamming distance of zero (direct dhash match) will be more than sufficient.
+const duplicatesCTE = `WITH
+ duplicated(dhash, count) AS (
+ SELECT dhash, COUNT(*) AS count FROM image
+ GROUP BY dhash HAVING count > 1
+ ),
+ multipathed(sha1, count) AS (
+ SELECT n.sha1, COUNT(*) AS count FROM node AS n
+ JOIN image AS i ON i.sha1 = n.sha1
+ WHERE i.dhash IS NULL
+ OR i.dhash NOT IN (SELECT dhash FROM duplicated)
+ GROUP BY n.sha1 HAVING count > 1
+ )
+`
+
+func getDuplicatesSimilar(stmt *sql.Stmt, dhash int64) (
result []webDuplicateImage, err error) {
- rows, err := stmt.Query(dhash, sha1)
+ rows, err := stmt.Query(dhash)
if err != nil {
return nil, err
}
@@ -556,67 +566,75 @@ func getDuplicateSimilar(stmt *sql.Stmt, sha1 string, dhash int64) (
return result, rows.Err()
}
-// A hamming distance of zero (direct dhash match) will be more than sufficient.
-const duplicatesCTE = `WITH
- multiplied(sha1, count) AS (
- SELECT sha1, COUNT(*) AS count FROM node
- GROUP BY sha1 HAVING count > 1
- ),
- similarized(sha1, count) AS (
- SELECT i1.sha1, COUNT(*) AS count FROM image AS i1
- JOIN image AS i2 ON i1.dhash = i2.dhash AND i1.sha1 <> i2.sha1
- GROUP BY i1.sha1
- ),
- duplicates(sha1) AS (
- SELECT sha1 FROM multiplied
- UNION
- SELECT sha1 FROM similarized
- )`
-
-func getDuplicates() (result []webDuplicateGroup, err error) {
+func getDuplicates1(result [][]webDuplicateImage) (
+ [][]webDuplicateImage, error) {
stmt, err := db.Prepare(`
SELECT i.sha1, IFNULL(i.thumbw, 0), IFNULL(i.thumbh, 0),
- COUNT(*) AS count
+ COUNT(*) AS occurences
FROM image AS i
JOIN node AS n ON n.sha1 = i.sha1
- WHERE i.dhash = ? AND i.sha1 <> ?
+ WHERE i.dhash = ?
GROUP BY n.sha1`)
if err != nil {
return nil, err
}
+ defer stmt.Close()
+
+ rows, err := db.Query(duplicatesCTE + `SELECT dhash FROM duplicated`)
+ if err != nil {
+ return nil, err
+ }
+ defer rows.Close()
+
+ for rows.Next() {
+ var (
+ group []webDuplicateImage
+ dhash int64
+ )
+ if err = rows.Scan(&dhash); err != nil {
+ return nil, err
+ }
+ if group, err = getDuplicatesSimilar(stmt, dhash); err != nil {
+ return nil, err
+ }
+ result = append(result, group)
+ }
+ return result, rows.Err()
+}
- // FIXME: Never duplicate images.
- rows, err := db.Query(duplicatesCTE + `
+func getDuplicates2(result [][]webDuplicateImage) (
+ [][]webDuplicateImage, error) {
+ stmt, err := db.Prepare(`
SELECT i.sha1, IFNULL(i.thumbw, 0), IFNULL(i.thumbh, 0),
- i.dhash, COUNT(*) AS count
+ COUNT(*) AS occurences
FROM image AS i
- JOIN duplicates AS d ON d.sha1 = i.sha1
JOIN node AS n ON n.sha1 = i.sha1
+ WHERE i.sha1 = ?
GROUP BY n.sha1`)
if err != nil {
return nil, err
}
+ defer stmt.Close()
+
+ rows, err := db.Query(duplicatesCTE + `SELECT sha1 FROM multipathed`)
+ if err != nil {
+ return nil, err
+ }
defer rows.Close()
- result = []webDuplicateGroup{}
for rows.Next() {
var (
- image webDuplicateImage
- similar []webDuplicateImage
- dhash int64
+ image webDuplicateImage
+ sha1 string
)
- if err = rows.Scan(&image.SHA1, &image.ThumbW, &image.ThumbH,
- &dhash, &image.Occurences); err != nil {
+ if err = rows.Scan(&sha1); err != nil {
return nil, err
}
- if similar, err = getDuplicateSimilar(
- stmt, image.SHA1, dhash); err != nil {
+ if err := stmt.QueryRow(sha1).Scan(&image.SHA1,
+ &image.ThumbW, &image.ThumbH, &image.Occurences); err != nil {
return nil, err
}
- result = append(result, webDuplicateGroup{
- Main: image,
- Similar: similar,
- })
+ result = append(result, []webDuplicateImage{image})
}
return result, rows.Err()
}
@@ -628,8 +646,15 @@ func handleAPIDuplicates(w http.ResponseWriter, r *http.Request) {
return
}
- result, err := getDuplicates()
- if err != nil {
+ var (
+ result [][]webDuplicateImage
+ err error
+ )
+ if result, err = getDuplicates1(result); err != nil {
+ http.Error(w, err.Error(), http.StatusInternalServerError)
+ return
+ }
+ if result, err = getDuplicates2(result); err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
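
The two new CTEs can be tried in isolation. The following is a standalone sketch against a throwaway in-memory database with a guessed minimal schema (the project's real image and node tables certainly carry more columns) and an assumed driver, github.com/mattn/go-sqlite3; it runs the same duplicated and multipathed selections that getDuplicates1 and getDuplicates2 iterate over.

// Standalone sketch: the schema and sample data are invented, and
// github.com/mattn/go-sqlite3 is an assumed driver choice.
package main

import (
	"database/sql"
	"fmt"
	"log"

	_ "github.com/mattn/go-sqlite3"
)

// Same CTE as in the commit above.
const duplicatesCTE = `WITH
	duplicated(dhash, count) AS (
		SELECT dhash, COUNT(*) AS count FROM image
		GROUP BY dhash HAVING count > 1
	),
	multipathed(sha1, count) AS (
		SELECT n.sha1, COUNT(*) AS count FROM node AS n
		JOIN image AS i ON i.sha1 = n.sha1
		WHERE i.dhash IS NULL
			OR i.dhash NOT IN (SELECT dhash FROM duplicated)
		GROUP BY n.sha1 HAVING count > 1
	)
`

func main() {
	db, err := sql.Open("sqlite3", ":memory:")
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	// Guessed minimal schema: "a" and "b" share a dhash,
	// "c" is unique by dhash but reachable through two paths.
	if _, err := db.Exec(`
		CREATE TABLE image (sha1 TEXT PRIMARY KEY, dhash INTEGER);
		CREATE TABLE node (path TEXT, sha1 TEXT);
		INSERT INTO image VALUES ('a', 1), ('b', 1), ('c', 2);
		INSERT INTO node VALUES
			('x/a', 'a'), ('y/b', 'b'), ('x/c', 'c'), ('y/c', 'c');
	`); err != nil {
		log.Fatal(err)
	}

	// First pass (getDuplicates1): dhashes shared by several images.
	rows, err := db.Query(duplicatesCTE + `SELECT dhash FROM duplicated`)
	if err != nil {
		log.Fatal(err)
	}
	for rows.Next() {
		var dhash int64
		if err := rows.Scan(&dhash); err != nil {
			log.Fatal(err)
		}
		fmt.Println("duplicated dhash:", dhash)
	}
	rows.Close()

	// Second pass (getDuplicates2): images missed above that still
	// appear under more than one node.
	rows, err = db.Query(duplicatesCTE + `SELECT sha1 FROM multipathed`)
	if err != nil {
		log.Fatal(err)
	}
	for rows.Next() {
		var sha1 string
		if err := rows.Scan(&sha1); err != nil {
			log.Fatal(err)
		}
		fmt.Println("multipathed sha1:", sha1)
	}
	rows.Close()
}

With the sample rows, the first query reports the dhash shared by images "a" and "b", and the second reports image "c", which is unique by dhash but stored under two paths.
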
diff --git a/public/gallery.js b/public/gallery.js
index 009eb1e..19d63bb 100644
--- a/public/gallery.js
+++ b/public/gallery.js
@@ -336,12 +336,9 @@ let DuplicatesList = {
if (DuplicatesModel.entries.length == 0)
return "No duplicates"
- return m('.duplicates', {}, DuplicatesModel.entries.map(entry =>
- m('.row', [
- m(DuplicatesThumbnail, {info: entry.main}),
- entry.similar.map(entry =>
- m(DuplicatesThumbnail, {info: entry})),
- ]),
+ return m('.duplicates', {}, DuplicatesModel.entries.map(group =>
+ m('.row', group.map(entry =>
+ m(DuplicatesThumbnail, {info: entry}))),
))
},
}