ketr.photos/scanner/scanner.c
James Ketrenos 1b32b1ea7c Limit faces to only those with confidence > 0.9
Signed-off-by: James Ketrenos <james_gitlab@ketrenos.com>
2020-01-08 21:33:09 -08:00

474 lines
12 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#include <stdio.h>
#include <stdlib.h>
#include <dirent.h>
#include <sys/types.h>
#include <string.h>
#include <math.h>
#include <sqlite3.h>
typedef enum {
UNDEFINED = 0,
CORE = 1,
EDGE = 2,
NOISE = 3
} ClusterTypes;
typedef struct Face {
long double descriptor[128];
long int clusterId;
long faceId;
ClusterTypes clusterType;
double *distances;
} Face;
typedef struct FaceLink {
struct FaceLink *pNext;
Face *pFace;
} FaceLink;
char fileBuf[5000];
char pathBuf[1028];
Face *readFaceDescriptor(Face *pFace, long id, char *path) {
FILE *f;
f = fopen(path, "r");
if (!f) {
return NULL;
}
size_t s = fread(fileBuf, 1, sizeof(fileBuf), f);
fclose(f);
char *p = fileBuf;
fileBuf[s] = 0;
while (*p && *p != '-' && *p != '+' && (*p < '0' || *p > '9')) {
p++;
}
int i = 0;
for (i = 0; i < 128; i++) {
char *start = p;
while (*p && *p != ',' && *p != ']' && *p != ' ' && *p != '\n') {
p++;
}
if (!*p) {
break;
}
*p++ = 0;
sscanf(start, "%Lf", &pFace->descriptor[i]);
}
if (i != 128) {
return NULL;
}
pFace->faceId = id;
return pFace;
}
long double euclideanDistance(long double *a, long double *b) {
long double sum = 0.0L;
for (int i = 0; i < 128; i++) {
long double delta = a[i] - b[i];
sum += delta * delta;
}
return sqrtl(sum);
}
/* https://en.wikipedia.org/wiki/DBSCAN */
#if 0
DBSCAN(DB, distFunc, eps, minPts) {
C = 0 /* Cluster counter */
for each point P in database DB {
if label(P) ≠ undefined then continue /* Previously processed in inner loop */
Neighbors N = RangeQuery(DB, distFunc, P, eps) /* Find neighbors */
if |N| < minPts then { /* Density check */
label(P) = Noise /* Label as Noise */
continue
}
C = C + 1 /* next cluster label */
label(P) = C /* Label initial point */
Seed set S = N \ {P} /* Neighbors to expand */
for each point Q in S { /* Process every seed point */
if label(Q) = Noise then label(Q) = C /* Change Noise to border point */
if label(Q) ≠ undefined then continue /* Previously processed */
label(Q) = C /* Label neighbor */
Neighbors N = RangeQuery(DB, distFunc, Q, eps) /* Find neighbors */
if |N| ≥ minPts then { /* Density check */
S = S N /* Add new neighbors to seed set */
}
}
}
}
RangeQuery(DB, distFunc, Q, eps) {
Neighbors = empty list
for each point P in database DB { /* Scan all points in the database */
if distFunc(Q, P) ≤ eps then { /* Compute distance and check epsilon */
Neighbors = Neighbors {P} /* Add to result */
}
}
return Neighbors
}
#endif
FaceLink *RangeQuery(Face **ppFaces, long int faceCount, Face *pQ, double eps) {
FaceLink *pNeighbors = NULL;
for (long int i = 0; i < faceCount; i++) {
Face *pFace = ppFaces[i];
if (pFace->faceId == pQ->faceId) {
continue;
}
if (pQ->distances[i] <= eps) {
FaceLink *pLink = malloc(sizeof(*pLink));
memset(pLink, 0, sizeof(*pLink));
pLink->pFace = pFace;
pLink->pNext = pNeighbors;
pNeighbors = pLink;
}
}
return pNeighbors;
}
void freeChain(FaceLink *pLink) {
while (pLink) {
FaceLink *tmp = pLink->pNext;
free(pLink);
pLink = tmp;
}
}
long int chainLength(FaceLink *pLink) {
long int count = 0;
while (pLink) {
count++;
pLink = pLink->pNext;
}
return count;
}
long int maxPts = 0;
long int DBSCAN(Face **ppFaces, long int faceCount, double eps, int minPts) {
long int C = 0;
for (long int i = 0; i < faceCount; i++) {
Face *pFace = ppFaces[i];
if (pFace->clusterType != UNDEFINED) {
continue;
}
FaceLink *pNeighbors = RangeQuery(ppFaces, faceCount, pFace, eps);
long neighborCount = chainLength(pNeighbors);
if (neighborCount > maxPts) {
maxPts = neighborCount;
fprintf(stderr, "New max with id %ld: %ld\n", pFace->faceId, maxPts);
}
if (neighborCount < minPts) {
pFace->clusterType = NOISE;
freeChain(pNeighbors);
continue;
}
//printf("%ld has %ld neighbors.\n", pFace->faceId, neighborCount);
C++;
pFace->clusterId = C;
pFace->clusterType = CORE;
FaceLink *pLink = pNeighbors;
while (pLink) {
Face *pQ = pLink->pFace;
if (pQ->faceId == pFace->faceId) {
pLink = pLink->pNext;
continue;
}
if (pQ->clusterType == NOISE) {
pQ->clusterId = C;
pQ->clusterType = EDGE;
}
if (pQ->clusterType != UNDEFINED) {
pLink = pLink->pNext;
continue;
}
pQ->clusterId = C;
pQ->clusterType = EDGE;
FaceLink *pSubNeighbors = RangeQuery(ppFaces, faceCount, pQ, eps);
neighborCount = chainLength(pSubNeighbors);
if (neighborCount >= minPts) {
if (neighborCount > maxPts) {
maxPts = neighborCount;
fprintf(stderr, "New max with id %ld: %ld\n", pFace->faceId, maxPts);
}
pQ->clusterType = CORE;
/* Append these neighbors to the end of the chain */
FaceLink *pTmp = pLink;
while (pTmp->pNext) {
pTmp = pTmp->pNext;
}
pTmp->pNext = pSubNeighbors;
} else {
freeChain(pSubNeighbors);
}
pLink = pLink->pNext;
}
freeChain(pNeighbors);
}
return C;
}
/*
* 1. Count how many entries there are
* 2. Allocate storage to hold all entries
* 3. Read all entries into flat array
* 4. Allocate MxM matrix and pre-calculate distances
* 5. Perform DBSCAN across MxM matrix to cluster
*/
int main(int argc, char *argv[]) {
long maxId = 0;
long i;
long entries = 0;
for (i = 0; i < 100; i++) {
sprintf(pathBuf, "%s/face-data/%ld", argv[1], i);
DIR *faceDir = opendir(pathBuf);
if (!faceDir) {
continue;
}
struct dirent *ent;
while ((ent = readdir(faceDir)) != NULL) {
if (strstr(ent->d_name, ".json") == NULL) {
continue;
}
entries++;
}
closedir(faceDir);
}
Face **ppFaces = malloc(sizeof(Face *) * entries);
if (!ppFaces) {
fprintf(stderr, "Unable to allocate storage face descriptors.");
return -1;
}
for (i = 0; i < entries; i++) {
ppFaces[i] = malloc(sizeof(Face));
memset(ppFaces[i], 0, sizeof(Face));
}
for (i = 0; i < entries; i++) {
ppFaces[i]->distances = malloc(sizeof(*ppFaces[i]->distances) * entries);
if (!ppFaces[i]->distances) {
fprintf(stderr, "Unable to allocate storage for distance dictionary.");
return -1;
}
memset(ppFaces[i]->distances, 0, sizeof(*ppFaces[i]->distances) * entries);
}
long int count = 0;
for (i = 0; i < 100; i++) {
sprintf(pathBuf, "%s/face-data/%ld", argv[1], i);
DIR *faceDir = opendir(pathBuf);
// fprintf(stderr, "Reading %s...\n", pathBuf);
if (!faceDir) {
fprintf(stderr, "Can not open %s\n", pathBuf);
continue;
}
struct dirent *ent;
while (count < entries && (ent = readdir(faceDir)) != NULL) {
if (strstr(ent->d_name, ".json") == NULL) {
continue;
}
long id = 0;
char *p = ent->d_name;
while (*p && *p != '-') {
id *= 10;
id += *p - '0';
p++;
}
char path[1028*2];
sprintf(path, "%s/%s", pathBuf, ent->d_name);
maxId = maxId > id ? maxId : id;
if (!readFaceDescriptor(ppFaces[count], id, path)) {
fprintf(stderr, "Unable to read %s.\n", path);
continue;
}
count++;
if (count % 1000 == 0) {
fprintf(stderr, "Read %ld%% of descriptors.\n", (100 * count / entries));
}
}
closedir(faceDir);
}
fprintf(stderr, "Read %ld face descriptors...\n", entries);
long double total = 0.0;
long int processed = 0;
for (long i = 0; i < entries; i++) {
Face *pLink = ppFaces[i];
for (long j = 0; j < entries; j++) {
Face *pTarget = ppFaces[j];
processed++;
if (processed % 1000 == 0) {
fprintf(stderr, "Computed %ld%% complete.\n", (100 * processed / (entries * entries)));
}
if (i == j) {
pLink->distances[i] = 0.0;
pTarget->distances[j] = 0.0;
continue;
}
if (pLink->distances[j] != 0.0) {
continue;
}
pLink->distances[j] =
pTarget->distances[i] = euclideanDistance(pLink->descriptor, pTarget->descriptor);
total += pLink->distances[j];
}
}
fprintf(stderr, "Average distance: %Lf\n", 1. * total / (entries * entries));
long int clusters = DBSCAN(ppFaces, entries, 0.44L, 2);
long int undefined = 0, outlier = 0, core = 0, reachable = 0;
for (i = 0; i < entries; i++) {
switch (ppFaces[i]->clusterType) {
case NOISE:
outlier++;
break;
case UNDEFINED:
undefined++;
break;
case CORE:
core++;
break;
case EDGE:
reachable++;
break;
}
}
fprintf(stderr, "%ld clusters identified!\n", clusters);
fprintf(stderr, "%ld NOISE\n", outlier);
fprintf(stderr, "%ld UNDEFINED\n", undefined);
fprintf(stderr, "%ld CORE\n", core);
fprintf(stderr, "%ld EDGE\n", reachable);
fprintf(stdout, "<script>\nvar clusters = [\n");
for (long i = 1; i <= clusters; i++) {
long nodes = 0;
fprintf(stdout, "/* %ld. */ [", i);
for (long int j = 0; j < entries; j++) {
if (ppFaces[j]->clusterId == i) {
if (nodes == 0) {
fprintf(stdout, "%ld", ppFaces[j]->faceId);
} else {
fprintf(stdout, ",%ld", ppFaces[j]->faceId);
}
nodes++;
}
}
if (i < clusters) {
fprintf(stdout, "],\n");
} else {
fprintf(stdout, "]\n");
}
}
fprintf(stdout, "];\n</script>\n");
/* Allocate storage for all distances */
sqlite3 *db;
int rc = sqlite3_open("db/photos.db", &db);
if (rc != SQLITE_OK) {
fprintf(stderr, "Cannot open database: %s\n", sqlite3_errmsg(db));
sqlite3_close(db);
return 1;
}
fprintf(stderr, "DB opened.");
char *err_msg = NULL;
char *sql =
"DELETE FROM facedistances;"
"BEGIN TRANSACTION;";
rc = sqlite3_exec(db, sql, 0, 0, &err_msg);
if (rc != SQLITE_OK ) {
fprintf(stderr, "SQL error: %s\n", err_msg);
sqlite3_free(err_msg);
sqlite3_close(db);
return 1;
}
fprintf(stderr, "facedistances deleted and transaction started.\n");
for (long i = 0; i < entries; i++) {
memset(ppFaces[i]->distances, 0, sizeof(*ppFaces[i]->distances) * entries);
}
char sqlBuf[1024];
processed = 0;
for (long i = 0; i < entries; i++) {
Face *pLink = ppFaces[i];
for (long j = 0; j < entries; j++) {
Face *pTarget = ppFaces[j];
processed++;
if (processed % 1000 == 0) {
fprintf(stderr, "Computed %ld%% complete.\n", (100 * processed / (entries * entries)));
}
if (i == j) {
pLink->distances[i] = 0.0;
pTarget->distances[j] = 0.0;
continue;
}
if (pLink->distances[j] != 0.0) {
continue;
}
pLink->distances[j] =
pTarget->distances[i] = euclideanDistance(pLink->descriptor, pTarget->descriptor);
if (pLink->distances[j] < 0.5) {
sprintf(sqlBuf, "INSERT INTO facedistances (face1Id,face2Id,distance) VALUES (%ld,%ld,%lf);",
((pLink->faceId < pTarget->faceId) ? pLink->faceId : pTarget->faceId),
((pLink->faceId < pTarget->faceId) ? pTarget->faceId : pLink->faceId),
pLink->distances[j]);
rc = sqlite3_exec(db, sqlBuf, 0, 0, &err_msg);
if (rc != SQLITE_OK ) {
fprintf(stderr, "SQL error: %s\n", err_msg);
sqlite3_free(err_msg);
sqlite3_close(db);
return 1;
}
}
}
}
sprintf(sqlBuf, "UPDATE faces SET lastComparedId=%ld;", maxId);
rc = sqlite3_exec(db, "COMMIT;", 0, 0, &err_msg);
if (rc != SQLITE_OK ) {
fprintf(stderr, "SQL error: %s\n", err_msg);
sqlite3_free(err_msg);
sqlite3_close(db);
return 1;
}
sqlite3_close(db);
return 0;
}