ketr.photos/scanner/scanner.c
James Ketrenos 94776ca233 Advanced filtering
Signed-off-by: James Ketrenos <james_gitlab@ketrenos.com>
2020-01-09 21:37:00 -08:00

569 lines
14 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#include <stdio.h>
#include <stdlib.h>
#include <dirent.h>
#include <sys/types.h>
#include <string.h>
#include <math.h>
#include <sqlite3.h>
/* Default minimum neighbour count for the DBSCAN density check; main() uses
 * it when no MIN_PTS argument is given. May be predefined at compile time. */
#ifndef MIN_PTS
#define MIN_PTS 20
#endif
/* Default neighbourhood radius (the DBSCAN "eps"); main() uses it when no
 * MAX_DISTANCE argument is given. May be predefined at compile time. */
#ifndef MAX_DISTANCE
#define MAX_DISTANCE 0.354667L
#endif
/* Label that DBSCAN() assigns to each face. */
typedef enum {
/* Not visited by DBSCAN() yet; also the state of a zero-initialised Face. */
UNDEFINED = 0,
/* Has at least minPts neighbours within eps; seeds or extends a cluster. */
CORE = 1,
/* Belongs to a cluster as the neighbour of a core face, but has fewer than
 * minPts neighbours of its own. */
EDGE = 2,
/* Visited with fewer than minPts neighbours and not (so far) claimed by a
 * cluster; DBSCAN() may later relabel it EDGE. */
NOISE = 3
} ClusterTypes;
/* One detected face: its descriptor, database attributes and clustering state. */
typedef struct Face {
/* 128 values parsed from the face's descriptor file by readFaceDescriptor(). */
long double descriptor[128];
/* Cluster number assigned by DBSCAN() (numbering starts at 1); 0 when the
 * face has not been placed in a cluster. */
long int clusterId;
/* Id taken from the descriptor file name; matched against faces.id in
 * parseFaceIdRow(). */
long faceId;
/* faces.photoId value filled in by parseFaceIdRow(). */
long photoId;
/* faces.faceConfidence value filled in by parseFaceIdRow(); faces at or
 * below 0.9 are ignored by RangeQuery() and DBSCAN(). */
double confidence;
/* DBSCAN label; see ClusterTypes. */
ClusterTypes clusterType;
/* Array allocated in main() with one slot per face: distances[j] is the
 * distance from this face to ppFaces[j]. 0.0 means "self or not computed",
 * and RangeQuery() skips such slots. */
double *distances;
} Face;
/* Node of a singly linked list of faces, used for neighbour lists built by
 * RangeQuery() and released with freeChain(). The node does not own pFace. */
typedef struct FaceLink {
/* Next node in the chain, or NULL at the end. */
struct FaceLink *pNext;
/* Face referenced by this node. */
Face *pFace;
} FaceLink;
/* Scratch buffer into which readFaceDescriptor() loads a descriptor file
 * before tokenising it in place. */
char fileBuf[5000];
/* Scratch buffer main() uses to build the path of each face-data directory. */
char pathBuf[1028];
/*
 * Parse a 128-value face descriptor from the text file at `path`.
 *
 * The file is loaded into the shared fileBuf scratch buffer and tokenised in
 * place. Everything before the first digit or sign is skipped; after that,
 * values are expected to be separated by ',', ' ' or newlines and closed by
 * ']'. Each value is converted with sscanf("%Lf").
 *
 *   pFace  destination; descriptor[] and faceId are written
 *   id     value stored in pFace->faceId on success
 *   path   file to read
 *
 * Returns pFace on success, or NULL when the file cannot be opened or does
 * not yield 128 numeric values. On failure descriptor[] may be partly
 * overwritten and faceId is left untouched.
 */
Face *readFaceDescriptor(Face *pFace, long id, char *path) {
    FILE *f = fopen(path, "r");
    if (!f) {
        return NULL;
    }

    /* Read at most size-1 bytes so the terminator below always fits. */
    size_t s = fread(fileBuf, 1, sizeof(fileBuf) - 1, f);
    fclose(f);
    fileBuf[s] = 0;

    /* Skip any preamble up to the first character that can start a number. */
    char *p = fileBuf;
    while (*p && *p != '-' && *p != '+' && (*p < '0' || *p > '9')) {
        p++;
    }

    int i;
    for (i = 0; i < 128; i++) {
        /* Swallow separator runs (", ", ",\n", ...) so they do not turn into
         * empty tokens that would be counted as values. */
        while (*p == ',' || *p == ' ' || *p == '\n' || *p == '\r' || *p == '\t') {
            p++;
        }

        char *start = p;
        while (*p && *p != ',' && *p != ']' && *p != ' ' && *p != '\n') {
            p++;
        }
        if (!*p) {
            /* Ran out of data before the token was terminated. */
            break;
        }
        *p++ = 0;

        if (sscanf(start, "%Lf", &pFace->descriptor[i]) != 1) {
            /* Not a number: treat the file as malformed. */
            break;
        }
    }

    if (i != 128) {
        return NULL;
    }

    pFace->faceId = id;
    return pFace;
}
long double euclideanDistance(long double *a, long double *b) {
long double sum = 0.0L;
for (int i = 0; i < 128; i++) {
long double delta = a[i] - b[i];
sum += delta * delta;
}
return sqrtl(sum);
}
/* https://en.wikipedia.org/wiki/DBSCAN */
#if 0
DBSCAN(DB, distFunc, eps, minPts) {
C = 0 /* Cluster counter */
for each point P in database DB {
if label(P) ≠ undefined then continue /* Previously processed in inner loop */
Neighbors N = RangeQuery(DB, distFunc, P, eps) /* Find neighbors */
if |N| < minPts then { /* Density check */
label(P) = Noise /* Label as Noise */
continue
}
C = C + 1 /* next cluster label */
label(P) = C /* Label initial point */
Seed set S = N \ {P} /* Neighbors to expand */
for each point Q in S { /* Process every seed point */
if label(Q) = Noise then label(Q) = C /* Change Noise to border point */
if label(Q) ≠ undefined then continue /* Previously processed */
label(Q) = C /* Label neighbor */
Neighbors N = RangeQuery(DB, distFunc, Q, eps) /* Find neighbors */
if |N| ≥ minPts then { /* Density check */
S = S ∪ N /* Add new neighbors to seed set */
}
}
}
}
RangeQuery(DB, distFunc, Q, eps) {
Neighbors = empty list
for each point P in database DB { /* Scan all points in the database */
if distFunc(Q, P) ≤ eps then { /* Compute distance and check epsilon */
Neighbors = Neighbors ∪ {P} /* Add to result */
}
}
return Neighbors
}
#endif
/*
 * Collect the neighbours of pQ: every face in ppFaces whose confidence is
 * above 0.9, whose faceId differs from pQ's, and whose precomputed distance
 * pQ->distances[i] lies in (0, eps]. A stored distance of 0.0 is treated as
 * "no distance available" and never matches.
 *
 *   ppFaces    array of faceCount faces; index i must correspond to
 *              pQ->distances[i]
 *   faceCount  number of entries in ppFaces
 *   pQ         face whose neighbourhood is queried
 *   eps        maximum distance for a face to count as a neighbour
 *
 * Returns a newly allocated chain (most recently found face first), or NULL
 * when there are no neighbours. The caller releases it with freeChain().
 * If a node cannot be allocated the process reports the error and exits.
 */
FaceLink *RangeQuery(Face **ppFaces, long int faceCount, Face *pQ, double eps) {
    FaceLink *pNeighbors = NULL;

    for (long int i = 0; i < faceCount; i++) {
        Face *pFace = ppFaces[i];

        if (pFace->confidence <= 0.9) {
            continue;
        }
        if (pFace->faceId == pQ->faceId) {
            continue;
        }
        if (!(pQ->distances[i] > 0.0 && pQ->distances[i] <= eps)) {
            continue;
        }

        FaceLink *pLink = malloc(sizeof(*pLink));
        if (!pLink) {
            /* A silently shortened neighbour list would corrupt the
             * clustering result, so stop instead of dereferencing NULL. */
            fprintf(stderr, "Unable to allocate storage for neighbor list.\n");
            exit(EXIT_FAILURE);
        }

        /* Prepend to the chain. */
        pLink->pFace = pFace;
        pLink->pNext = pNeighbors;
        pNeighbors = pLink;
    }

    return pNeighbors;
}
/*
 * Release every node of a FaceLink chain. The faces the nodes point at are
 * not freed. Passing NULL is a no-op.
 */
void freeChain(FaceLink *pLink) {
    FaceLink *following;

    for (; pLink != NULL; pLink = following) {
        /* Remember the successor before the node goes away. */
        following = pLink->pNext;
        free(pLink);
    }
}
/*
 * Count the nodes in a FaceLink chain. Returns 0 for a NULL chain.
 */
long int chainLength(FaceLink *pLink) {
    long int nodes = 0;

    for (FaceLink *cursor = pLink; cursor != NULL; cursor = cursor->pNext) {
        ++nodes;
    }

    return nodes;
}
/* Largest neighbour count DBSCAN() has seen so far; each new maximum is
 * reported on stderr. */
long int maxPts = 0;
/*
 * Cluster faces with DBSCAN using the distances precomputed in each
 * Face::distances array.
 *
 * Faces with confidence <= 0.9 are ignored entirely. Neighbour lists come
 * from RangeQuery(), which excludes the queried face itself, so minPts is
 * compared against the number of *other* faces within eps.
 *
 *   ppFaces    array of faceCount faces; clusterId / clusterType are updated
 *   faceCount  number of entries in ppFaces
 *   eps        neighbourhood radius
 *   minPts     minimum neighbour count for a face to be a core point
 *
 * Returns the number of clusters created; cluster ids run from 1 to that
 * value. Also updates the global maxPts and logs each new maximum to stderr.
 */
long int DBSCAN(Face **ppFaces, long int faceCount, double eps, int minPts) {
    long int C = 0;

    for (long int i = 0; i < faceCount; i++) {
        Face *pFace = ppFaces[i];

        if (pFace->confidence <= 0.9) {
            continue;
        }
        if (pFace->clusterType != UNDEFINED) {
            /* Already labelled while expanding an earlier cluster. */
            continue;
        }

        FaceLink *pNeighbors = RangeQuery(ppFaces, faceCount, pFace, eps);
        long neighborCount = chainLength(pNeighbors);
        if (neighborCount > maxPts) {
            maxPts = neighborCount;
            fprintf(stderr, "New max with id %ld: %ld\n", pFace->faceId, maxPts);
        }

        if (neighborCount < minPts) {
            /* Too sparse to seed a cluster; may still become EDGE later. */
            pFace->clusterType = NOISE;
            freeChain(pNeighbors);
            continue;
        }

        C++;
        pFace->clusterId = C;
        pFace->clusterType = CORE;

        /* Track the end of the seed chain so newly found neighbours can be
         * appended without re-walking the whole chain each time. */
        FaceLink *pTail = pNeighbors;
        while (pTail && pTail->pNext) {
            pTail = pTail->pNext;
        }

        /* The chain grows while it is being walked: neighbours of newly
         * found core faces are appended and visited in turn. */
        for (FaceLink *pLink = pNeighbors; pLink; pLink = pLink->pNext) {
            Face *pQ = pLink->pFace;

            if (pQ->faceId == pFace->faceId) {
                continue;
            }

            if (pQ->clusterType == NOISE) {
                /* Previously rejected as noise: absorb as a border face. */
                pQ->clusterId = C;
                pQ->clusterType = EDGE;
            }
            if (pQ->clusterType != UNDEFINED) {
                continue;
            }

            pQ->clusterId = C;
            pQ->clusterType = EDGE;

            FaceLink *pSubNeighbors = RangeQuery(ppFaces, faceCount, pQ, eps);
            long subCount = chainLength(pSubNeighbors);
            if (subCount < minPts) {
                freeChain(pSubNeighbors);
                continue;
            }

            if (subCount > maxPts) {
                maxPts = subCount;
                /* The count belongs to pQ, so report pQ's id. */
                fprintf(stderr, "New max with id %ld: %ld\n", pQ->faceId, maxPts);
            }

            pQ->clusterType = CORE;

            /* Hand ownership of the sub-chain to the seed chain and move the
             * tail to its last node. */
            pTail->pNext = pSubNeighbors;
            while (pTail->pNext) {
                pTail = pTail->pNext;
            }
        }

        freeChain(pNeighbors);
    }

    return C;
}
/* Context handed to parseFaceIdRow() through sqlite3_exec(). */
typedef struct {
/* Faces to be updated from the query result. */
Face **ppFaces;
/* Number of entries in ppFaces. */
long int count;
} FaceCallbackData;
int parseFaceIdRow(void *data, int argc, char **argv, char **column) {
FaceCallbackData *map = data;
long int faceId = strtol(argv[0] ? argv[0] : "0", NULL, 10);
long int photoId = strtol(argv[1] ? argv[1] : "0", NULL, 10);
double confidence = strtod(argv[2] ? argv[2] : "0.0", NULL);
for (long int i = 0; i < map->count; i++) {
if (map->ppFaces[i]->faceId == faceId) {
map->ppFaces[i]->photoId = photoId;
map->ppFaces[i]->confidence = confidence;
break;
}
}
return 0;
}
/*
* 1. Count how many entries there are
* 2. Allocate storage to hold all entries
* 3. Read all entries into flat array
* 4. Allocate MxM matrix and pre-calculate distances
* 5. Perform DBSCAN across MxM matrix to cluster
*/
int main(int argc, char *argv[]) {
long maxId = 0;
long i;
long entries = 0;
long int minPts = MIN_PTS;
long double maxDistance = MAX_DISTANCE;
if (argc == 1) {
fprintf(stderr, "usage: scanner PATH MAX_DISTANCE MIN_PTS\n");
return -1;
}
if (argc > 2) {
sscanf(argv[2], "%Lf", &maxDistance);
}
if (argc > 3) {
sscanf(argv[3], "%ld", &minPts);
}
fprintf(stderr, "\nmaxDistance : %Lf\nminPts : %ld\n", maxDistance, minPts);
for (i = 0; i < 100; i++) {
sprintf(pathBuf, "%s/face-data/%ld", argv[1], i);
DIR *faceDir = opendir(pathBuf);
if (!faceDir) {
continue;
}
struct dirent *ent;
while ((ent = readdir(faceDir)) != NULL) {
if (strstr(ent->d_name, ".json") == NULL) {
continue;
}
entries++;
}
closedir(faceDir);
}
Face **ppFaces = malloc(sizeof(Face *) * entries);
if (!ppFaces) {
fprintf(stderr, "Unable to allocate storage face descriptors.");
return -1;
}
for (i = 0; i < entries; i++) {
ppFaces[i] = malloc(sizeof(Face));
memset(ppFaces[i], 0, sizeof(Face));
}
for (i = 0; i < entries; i++) {
ppFaces[i]->distances = malloc(sizeof(*ppFaces[i]->distances) * entries);
if (!ppFaces[i]->distances) {
fprintf(stderr, "Unable to allocate storage for distance dictionary.");
return -1;
}
memset(ppFaces[i]->distances, 0, sizeof(*ppFaces[i]->distances) * entries);
}
long int processed = 0;
int last = 0;
for (i = 0; i < 100; i++) {
sprintf(pathBuf, "%s/face-data/%ld", argv[1], i);
DIR *faceDir = opendir(pathBuf);
// fprintf(stderr, "Reading %s...\n", pathBuf);
if (!faceDir) {
fprintf(stderr, "Can not open %s\n", pathBuf);
continue;
}
struct dirent *ent;
while (processed < entries && (ent = readdir(faceDir)) != NULL) {
if (strstr(ent->d_name, ".json") == NULL) {
continue;
}
long id = 0;
char *p = ent->d_name;
while (*p && *p != '-') {
id *= 10;
id += *p - '0';
p++;
}
char path[1028*2];
sprintf(path, "%s/%s", pathBuf, ent->d_name);
maxId = maxId > id ? maxId : id;
if (!readFaceDescriptor(ppFaces[processed], id, path)) {
fprintf(stderr, "Unable to read %s.\n", path);
continue;
}
processed++;
if (processed % 1000 == 0) {
int perc = 100 * processed / (entries * entries);
if (perc != last) {
fprintf(stderr, "\rRead %d%% of descriptors.", perc);
last = perc;
}
}
}
closedir(faceDir);
}
fprintf(stderr, "\nRead %ld face descriptors...\n", entries);
/* Allocate storage for all distances */
sqlite3 *db;
int rc = sqlite3_open("db/photos.db", &db);
if (rc != SQLITE_OK) {
fprintf(stderr, "Cannot open database: %s\n", sqlite3_errmsg(db));
sqlite3_close(db);
return 1;
}
fprintf(stderr, "DB opened.");
char *err_msg = NULL;
FaceCallbackData data = {
ppFaces: ppFaces,
count: entries
};
rc = sqlite3_exec(db, "SELECT id,photoId,faceConfidence FROM faces", parseFaceIdRow, &data, &err_msg);
if (rc != SQLITE_OK) {
fprintf(stderr, "SQL error: %s\n", err_msg);
sqlite3_free(err_msg);
sqlite3_close(db);
return 1;
}
fprintf(stderr, "Face data loaded from DB\n");
processed = 0;
long double total = 0.0;
for (long i = 0; i < entries; i++) {
Face *pLink = ppFaces[i];
for (long j = 0; j < entries; j++) {
Face *pTarget = ppFaces[j];
processed++;
if (processed % 1000 == 0) {
int perc = 100 * processed / (entries * entries);
if (perc != last) {
fprintf(stderr, "\rComputed %d%% complete.", perc);
last = perc;
}
}
if (pLink->confidence <= 0.9 || pTarget->confidence <= 0.9) {
pLink->distances[i] = 0.0;
pTarget->distances[j] = 0.0;
continue;
}
if (i == j) {
pLink->distances[i] = 0.0;
pTarget->distances[j] = 0.0;
continue;
}
if (pLink->distances[j] != 0.0) {
continue;
}
pLink->distances[j] =
pTarget->distances[i] = euclideanDistance(pLink->descriptor, pTarget->descriptor);
total += pLink->distances[j];
}
}
fprintf(stderr, "\nAverage distance: %Lf\n", 1. * total / (entries * entries));
long int clusters = DBSCAN(ppFaces, entries, maxDistance, minPts);
long int undefined = 0, outlier = 0, core = 0, reachable = 0;
for (i = 0; i < entries; i++) {
switch (ppFaces[i]->clusterType) {
case NOISE:
outlier++;
break;
case UNDEFINED:
undefined++;
break;
case CORE:
core++;
break;
case EDGE:
reachable++;
break;
}
}
fprintf(stderr, "%ld clusters identified!\n", clusters);
fprintf(stderr, "%ld NOISE\n", outlier);
fprintf(stderr, "%ld UNDEFINED\n", undefined);
fprintf(stderr, "%ld CORE\n", core);
fprintf(stderr, "%ld EDGE\n", reachable);
fprintf(stdout, "<script>\nvar clusters = [\n");
for (long i = 1; i <= clusters; i++) {
long nodes = 0;
fprintf(stdout, "/* %ld. */ [", i);
for (long int j = 0; j < entries; j++) {
if (ppFaces[j]->clusterId == i) {
if (nodes == 0) {
fprintf(stdout, "[%ld,%ld]", ppFaces[j]->faceId, ppFaces[j]->photoId);
} else {
fprintf(stdout, ",[%ld,%ld]", ppFaces[j]->faceId, ppFaces[j]->photoId);
}
nodes++;
}
}
if (i < clusters) {
fprintf(stdout, "],\n");
} else {
fprintf(stdout, "]\n");
}
}
fprintf(stdout, "];\n</script>\n");
char *sql =
"DELETE FROM facedistances;"
"BEGIN TRANSACTION;";
rc = sqlite3_exec(db, sql, 0, 0, &err_msg);
if (rc != SQLITE_OK ) {
fprintf(stderr, "SQL error: %s\n", err_msg);
sqlite3_free(err_msg);
sqlite3_close(db);
return 1;
}
fprintf(stderr, "facedistances deleted and transaction started.\n");
for (long i = 0; i < entries; i++) {
memset(ppFaces[i]->distances, 0, sizeof(*ppFaces[i]->distances) * entries);
}
char sqlBuf[1024];
processed = 0;
last = 0;
for (long i = 0; i < entries; i++) {
Face *pLink = ppFaces[i];
for (long j = 0; j < entries; j++) {
Face *pTarget = ppFaces[j];
processed++;
if (processed % 1000 == 0) {
int perc = 100 * processed / (entries * entries);
if (perc != last) {
fprintf(stderr, "\rComputed %d%% complete.", perc);
last = perc;
}
}
if (i == j) {
pLink->distances[i] = 0.0;
pTarget->distances[j] = 0.0;
continue;
}
if (pLink->distances[j] != 0.0) {
continue;
}
pLink->distances[j] =
pTarget->distances[i] = euclideanDistance(pLink->descriptor, pTarget->descriptor);
if (pLink->distances[j] < 0.5) {
sprintf(sqlBuf, "INSERT INTO facedistances (face1Id,face2Id,distance) VALUES (%ld,%ld,%lf);",
((pLink->faceId < pTarget->faceId) ? pLink->faceId : pTarget->faceId),
((pLink->faceId < pTarget->faceId) ? pTarget->faceId : pLink->faceId),
pLink->distances[j]);
rc = sqlite3_exec(db, sqlBuf, 0, 0, &err_msg);
if (rc != SQLITE_OK ) {
fprintf(stderr, "SQL error: %s\n", err_msg);
sqlite3_free(err_msg);
sqlite3_close(db);
return 1;
}
}
}
}
fprintf(stderr, "\n");
sprintf(sqlBuf, "UPDATE faces SET lastComparedId=%ld;", maxId);
rc = sqlite3_exec(db, "COMMIT;", 0, 0, &err_msg);
if (rc != SQLITE_OK ) {
fprintf(stderr, "SQL error: %s\n", err_msg);
sqlite3_free(err_msg);
sqlite3_close(db);
return 1;
}
sqlite3_close(db);
return 0;
}