569 lines
14 KiB
C
569 lines
14 KiB
C
#include <stdio.h>
|
||
#include <stdlib.h>
|
||
#include <dirent.h>
|
||
#include <sys/types.h>
|
||
#include <string.h>
|
||
#include <math.h>
|
||
#include <sqlite3.h>
|
||
|
||
#ifndef MIN_PTS
|
||
#define MIN_PTS 20
|
||
#endif
|
||
|
||
#ifndef MAX_DISTANCE
|
||
#define MAX_DISTANCE 0.354667L
|
||
#endif
|
||
|
||
typedef enum {
|
||
UNDEFINED = 0,
|
||
CORE = 1,
|
||
EDGE = 2,
|
||
NOISE = 3
|
||
} ClusterTypes;
|
||
|
||
typedef struct Face {
|
||
long double descriptor[128];
|
||
long int clusterId;
|
||
long faceId;
|
||
long photoId;
|
||
double confidence;
|
||
ClusterTypes clusterType;
|
||
double *distances;
|
||
} Face;
|
||
|
||
typedef struct FaceLink {
|
||
struct FaceLink *pNext;
|
||
Face *pFace;
|
||
} FaceLink;
|
||
|
||
char fileBuf[5000];
|
||
char pathBuf[1028];
|
||
|
||
Face *readFaceDescriptor(Face *pFace, long id, char *path) {
|
||
FILE *f;
|
||
f = fopen(path, "r");
|
||
if (!f) {
|
||
return NULL;
|
||
}
|
||
size_t s = fread(fileBuf, 1, sizeof(fileBuf), f);
|
||
fclose(f);
|
||
|
||
char *p = fileBuf;
|
||
fileBuf[s] = 0;
|
||
while (*p && *p != '-' && *p != '+' && (*p < '0' || *p > '9')) {
|
||
p++;
|
||
}
|
||
int i = 0;
|
||
for (i = 0; i < 128; i++) {
|
||
char *start = p;
|
||
while (*p && *p != ',' && *p != ']' && *p != ' ' && *p != '\n') {
|
||
p++;
|
||
}
|
||
if (!*p) {
|
||
break;
|
||
}
|
||
*p++ = 0;
|
||
sscanf(start, "%Lf", &pFace->descriptor[i]);
|
||
}
|
||
|
||
if (i != 128) {
|
||
return NULL;
|
||
}
|
||
|
||
pFace->faceId = id;
|
||
|
||
return pFace;
|
||
}
|
||
|
||
long double euclideanDistance(long double *a, long double *b) {
|
||
long double sum = 0.0L;
|
||
for (int i = 0; i < 128; i++) {
|
||
long double delta = a[i] - b[i];
|
||
sum += delta * delta;
|
||
}
|
||
return sqrtl(sum);
|
||
}
|
||
|
||
/* https://en.wikipedia.org/wiki/DBSCAN */
|
||
#if 0
|
||
DBSCAN(DB, distFunc, eps, minPts) {
|
||
C = 0 /* Cluster counter */
|
||
for each point P in database DB {
|
||
if label(P) ≠ undefined then continue /* Previously processed in inner loop */
|
||
Neighbors N = RangeQuery(DB, distFunc, P, eps) /* Find neighbors */
|
||
if |N| < minPts then { /* Density check */
|
||
label(P) = Noise /* Label as Noise */
|
||
continue
|
||
}
|
||
C = C + 1 /* next cluster label */
|
||
label(P) = C /* Label initial point */
|
||
Seed set S = N \ {P} /* Neighbors to expand */
|
||
for each point Q in S { /* Process every seed point */
|
||
if label(Q) = Noise then label(Q) = C /* Change Noise to border point */
|
||
if label(Q) ≠ undefined then continue /* Previously processed */
|
||
label(Q) = C /* Label neighbor */
|
||
Neighbors N = RangeQuery(DB, distFunc, Q, eps) /* Find neighbors */
|
||
if |N| ≥ minPts then { /* Density check */
|
||
S = S ∪ N /* Add new neighbors to seed set */
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
RangeQuery(DB, distFunc, Q, eps) {
|
||
Neighbors = empty list
|
||
for each point P in database DB { /* Scan all points in the database */
|
||
if distFunc(Q, P) ≤ eps then { /* Compute distance and check epsilon */
|
||
Neighbors = Neighbors ∪ {P} /* Add to result */
|
||
}
|
||
}
|
||
return Neighbors
|
||
}
|
||
#endif
|
||
|
||
FaceLink *RangeQuery(Face **ppFaces, long int faceCount, Face *pQ, double eps) {
|
||
FaceLink *pNeighbors = NULL;
|
||
for (long int i = 0; i < faceCount; i++) {
|
||
Face *pFace = ppFaces[i];
|
||
if (pFace->confidence <= 0.9) {
|
||
continue;
|
||
}
|
||
|
||
if (pFace->faceId == pQ->faceId) {
|
||
continue;
|
||
}
|
||
|
||
if (pQ->distances[i] > 0.0 && pQ->distances[i] <= eps) {
|
||
FaceLink *pLink = malloc(sizeof(*pLink));
|
||
memset(pLink, 0, sizeof(*pLink));
|
||
pLink->pFace = pFace;
|
||
pLink->pNext = pNeighbors;
|
||
pNeighbors = pLink;
|
||
}
|
||
}
|
||
return pNeighbors;
|
||
}
|
||
|
||
void freeChain(FaceLink *pLink) {
|
||
while (pLink) {
|
||
FaceLink *tmp = pLink->pNext;
|
||
free(pLink);
|
||
pLink = tmp;
|
||
}
|
||
}
|
||
|
||
long int chainLength(FaceLink *pLink) {
|
||
long int count = 0;
|
||
while (pLink) {
|
||
count++;
|
||
pLink = pLink->pNext;
|
||
}
|
||
return count;
|
||
}
|
||
|
||
long int maxPts = 0;
|
||
|
||
long int DBSCAN(Face **ppFaces, long int faceCount, double eps, int minPts) {
|
||
long int C = 0;
|
||
for (long int i = 0; i < faceCount; i++) {
|
||
Face *pFace = ppFaces[i];
|
||
if (pFace->confidence <= 0.9) {
|
||
continue;
|
||
}
|
||
|
||
if (pFace->clusterType != UNDEFINED) {
|
||
continue;
|
||
}
|
||
|
||
FaceLink *pNeighbors = RangeQuery(ppFaces, faceCount, pFace, eps);
|
||
long neighborCount = chainLength(pNeighbors);
|
||
if (neighborCount > maxPts) {
|
||
maxPts = neighborCount;
|
||
fprintf(stderr, "New max with id %ld: %ld\n", pFace->faceId, maxPts);
|
||
}
|
||
if (neighborCount < minPts) {
|
||
pFace->clusterType = NOISE;
|
||
freeChain(pNeighbors);
|
||
continue;
|
||
}
|
||
|
||
//printf("%ld has %ld neighbors.\n", pFace->faceId, neighborCount);
|
||
|
||
C++;
|
||
|
||
pFace->clusterId = C;
|
||
pFace->clusterType = CORE;
|
||
|
||
FaceLink *pLink = pNeighbors;
|
||
while (pLink) {
|
||
Face *pQ = pLink->pFace;
|
||
|
||
if (pQ->faceId == pFace->faceId) {
|
||
pLink = pLink->pNext;
|
||
continue;
|
||
}
|
||
if (pQ->clusterType == NOISE) {
|
||
pQ->clusterId = C;
|
||
pQ->clusterType = EDGE;
|
||
}
|
||
if (pQ->clusterType != UNDEFINED) {
|
||
pLink = pLink->pNext;
|
||
continue;
|
||
}
|
||
|
||
pQ->clusterId = C;
|
||
pQ->clusterType = EDGE;
|
||
|
||
FaceLink *pSubNeighbors = RangeQuery(ppFaces, faceCount, pQ, eps);
|
||
neighborCount = chainLength(pSubNeighbors);
|
||
if (neighborCount >= minPts) {
|
||
if (neighborCount > maxPts) {
|
||
maxPts = neighborCount;
|
||
fprintf(stderr, "New max with id %ld: %ld\n", pFace->faceId, maxPts);
|
||
}
|
||
pQ->clusterType = CORE;
|
||
/* Append these neighbors to the end of the chain */
|
||
FaceLink *pTmp = pLink;
|
||
while (pTmp->pNext) {
|
||
pTmp = pTmp->pNext;
|
||
}
|
||
pTmp->pNext = pSubNeighbors;
|
||
} else {
|
||
freeChain(pSubNeighbors);
|
||
}
|
||
|
||
pLink = pLink->pNext;
|
||
}
|
||
freeChain(pNeighbors);
|
||
}
|
||
|
||
return C;
|
||
}
|
||
|
||
typedef struct {
|
||
Face **ppFaces;
|
||
long int count;
|
||
} FaceCallbackData;
|
||
|
||
int parseFaceIdRow(void *data, int argc, char **argv, char **column) {
|
||
FaceCallbackData *map = data;
|
||
long int faceId = strtol(argv[0] ? argv[0] : "0", NULL, 10);
|
||
long int photoId = strtol(argv[1] ? argv[1] : "0", NULL, 10);
|
||
double confidence = strtod(argv[2] ? argv[2] : "0.0", NULL);
|
||
for (long int i = 0; i < map->count; i++) {
|
||
if (map->ppFaces[i]->faceId == faceId) {
|
||
map->ppFaces[i]->photoId = photoId;
|
||
map->ppFaces[i]->confidence = confidence;
|
||
break;
|
||
}
|
||
}
|
||
return 0;
|
||
}
|
||
|
||
|
||
/*
|
||
* 1. Count how many entries there are
|
||
* 2. Allocate storage to hold all entries
|
||
* 3. Read all entries into flat array
|
||
* 4. Allocate MxM matrix and pre-calculate distances
|
||
* 5. Perform DBSCAN across MxM matrix to cluster
|
||
*/
|
||
int main(int argc, char *argv[]) {
|
||
long maxId = 0;
|
||
long i;
|
||
long entries = 0;
|
||
long int minPts = MIN_PTS;
|
||
long double maxDistance = MAX_DISTANCE;
|
||
|
||
if (argc == 1) {
|
||
fprintf(stderr, "usage: scanner PATH MAX_DISTANCE MIN_PTS\n");
|
||
return -1;
|
||
}
|
||
|
||
if (argc > 2) {
|
||
sscanf(argv[2], "%Lf", &maxDistance);
|
||
}
|
||
|
||
if (argc > 3) {
|
||
sscanf(argv[3], "%ld", &minPts);
|
||
}
|
||
|
||
fprintf(stderr, "\nmaxDistance : %Lf\nminPts : %ld\n", maxDistance, minPts);
|
||
|
||
for (i = 0; i < 100; i++) {
|
||
sprintf(pathBuf, "%s/face-data/%ld", argv[1], i);
|
||
DIR *faceDir = opendir(pathBuf);
|
||
if (!faceDir) {
|
||
continue;
|
||
}
|
||
|
||
struct dirent *ent;
|
||
while ((ent = readdir(faceDir)) != NULL) {
|
||
if (strstr(ent->d_name, ".json") == NULL) {
|
||
continue;
|
||
}
|
||
entries++;
|
||
}
|
||
closedir(faceDir);
|
||
}
|
||
|
||
Face **ppFaces = malloc(sizeof(Face *) * entries);
|
||
if (!ppFaces) {
|
||
fprintf(stderr, "Unable to allocate storage face descriptors.");
|
||
return -1;
|
||
}
|
||
for (i = 0; i < entries; i++) {
|
||
ppFaces[i] = malloc(sizeof(Face));
|
||
memset(ppFaces[i], 0, sizeof(Face));
|
||
}
|
||
|
||
for (i = 0; i < entries; i++) {
|
||
ppFaces[i]->distances = malloc(sizeof(*ppFaces[i]->distances) * entries);
|
||
if (!ppFaces[i]->distances) {
|
||
fprintf(stderr, "Unable to allocate storage for distance dictionary.");
|
||
return -1;
|
||
}
|
||
memset(ppFaces[i]->distances, 0, sizeof(*ppFaces[i]->distances) * entries);
|
||
}
|
||
|
||
long int processed = 0;
|
||
int last = 0;
|
||
for (i = 0; i < 100; i++) {
|
||
sprintf(pathBuf, "%s/face-data/%ld", argv[1], i);
|
||
DIR *faceDir = opendir(pathBuf);
|
||
// fprintf(stderr, "Reading %s...\n", pathBuf);
|
||
if (!faceDir) {
|
||
fprintf(stderr, "Can not open %s\n", pathBuf);
|
||
continue;
|
||
}
|
||
|
||
struct dirent *ent;
|
||
while (processed < entries && (ent = readdir(faceDir)) != NULL) {
|
||
if (strstr(ent->d_name, ".json") == NULL) {
|
||
continue;
|
||
}
|
||
long id = 0;
|
||
char *p = ent->d_name;
|
||
while (*p && *p != '-') {
|
||
id *= 10;
|
||
id += *p - '0';
|
||
p++;
|
||
}
|
||
char path[1028*2];
|
||
sprintf(path, "%s/%s", pathBuf, ent->d_name);
|
||
maxId = maxId > id ? maxId : id;
|
||
if (!readFaceDescriptor(ppFaces[processed], id, path)) {
|
||
fprintf(stderr, "Unable to read %s.\n", path);
|
||
continue;
|
||
}
|
||
processed++;
|
||
if (processed % 1000 == 0) {
|
||
int perc = 100 * processed / (entries * entries);
|
||
if (perc != last) {
|
||
fprintf(stderr, "\rRead %d%% of descriptors.", perc);
|
||
last = perc;
|
||
}
|
||
}
|
||
}
|
||
closedir(faceDir);
|
||
}
|
||
|
||
fprintf(stderr, "\nRead %ld face descriptors...\n", entries);
|
||
|
||
/* Allocate storage for all distances */
|
||
sqlite3 *db;
|
||
|
||
int rc = sqlite3_open("db/photos.db", &db);
|
||
if (rc != SQLITE_OK) {
|
||
fprintf(stderr, "Cannot open database: %s\n", sqlite3_errmsg(db));
|
||
sqlite3_close(db);
|
||
return 1;
|
||
}
|
||
|
||
fprintf(stderr, "DB opened.");
|
||
|
||
char *err_msg = NULL;
|
||
FaceCallbackData data = {
|
||
ppFaces: ppFaces,
|
||
count: entries
|
||
};
|
||
|
||
rc = sqlite3_exec(db, "SELECT id,photoId,faceConfidence FROM faces", parseFaceIdRow, &data, &err_msg);
|
||
if (rc != SQLITE_OK) {
|
||
fprintf(stderr, "SQL error: %s\n", err_msg);
|
||
sqlite3_free(err_msg);
|
||
sqlite3_close(db);
|
||
return 1;
|
||
}
|
||
|
||
fprintf(stderr, "Face data loaded from DB\n");
|
||
|
||
processed = 0;
|
||
long double total = 0.0;
|
||
for (long i = 0; i < entries; i++) {
|
||
Face *pLink = ppFaces[i];
|
||
for (long j = 0; j < entries; j++) {
|
||
Face *pTarget = ppFaces[j];
|
||
processed++;
|
||
if (processed % 1000 == 0) {
|
||
int perc = 100 * processed / (entries * entries);
|
||
if (perc != last) {
|
||
fprintf(stderr, "\rComputed %d%% complete.", perc);
|
||
last = perc;
|
||
}
|
||
}
|
||
|
||
if (pLink->confidence <= 0.9 || pTarget->confidence <= 0.9) {
|
||
pLink->distances[i] = 0.0;
|
||
pTarget->distances[j] = 0.0;
|
||
continue;
|
||
}
|
||
|
||
if (i == j) {
|
||
pLink->distances[i] = 0.0;
|
||
pTarget->distances[j] = 0.0;
|
||
continue;
|
||
}
|
||
|
||
if (pLink->distances[j] != 0.0) {
|
||
continue;
|
||
}
|
||
|
||
pLink->distances[j] =
|
||
pTarget->distances[i] = euclideanDistance(pLink->descriptor, pTarget->descriptor);
|
||
|
||
total += pLink->distances[j];
|
||
}
|
||
}
|
||
|
||
fprintf(stderr, "\nAverage distance: %Lf\n", 1. * total / (entries * entries));
|
||
|
||
long int clusters = DBSCAN(ppFaces, entries, maxDistance, minPts);
|
||
long int undefined = 0, outlier = 0, core = 0, reachable = 0;
|
||
for (i = 0; i < entries; i++) {
|
||
switch (ppFaces[i]->clusterType) {
|
||
case NOISE:
|
||
outlier++;
|
||
break;
|
||
case UNDEFINED:
|
||
undefined++;
|
||
break;
|
||
case CORE:
|
||
core++;
|
||
break;
|
||
case EDGE:
|
||
reachable++;
|
||
break;
|
||
}
|
||
}
|
||
|
||
fprintf(stderr, "%ld clusters identified!\n", clusters);
|
||
fprintf(stderr, "%ld NOISE\n", outlier);
|
||
fprintf(stderr, "%ld UNDEFINED\n", undefined);
|
||
fprintf(stderr, "%ld CORE\n", core);
|
||
fprintf(stderr, "%ld EDGE\n", reachable);
|
||
|
||
fprintf(stdout, "<script>\nvar clusters = [\n");
|
||
for (long i = 1; i <= clusters; i++) {
|
||
long nodes = 0;
|
||
fprintf(stdout, "/* %ld. */ [", i);
|
||
for (long int j = 0; j < entries; j++) {
|
||
if (ppFaces[j]->clusterId == i) {
|
||
if (nodes == 0) {
|
||
fprintf(stdout, "[%ld,%ld]", ppFaces[j]->faceId, ppFaces[j]->photoId);
|
||
} else {
|
||
fprintf(stdout, ",[%ld,%ld]", ppFaces[j]->faceId, ppFaces[j]->photoId);
|
||
}
|
||
nodes++;
|
||
}
|
||
}
|
||
if (i < clusters) {
|
||
fprintf(stdout, "],\n");
|
||
} else {
|
||
fprintf(stdout, "]\n");
|
||
}
|
||
}
|
||
fprintf(stdout, "];\n</script>\n");
|
||
|
||
|
||
char *sql =
|
||
"DELETE FROM facedistances;"
|
||
"BEGIN TRANSACTION;";
|
||
|
||
rc = sqlite3_exec(db, sql, 0, 0, &err_msg);
|
||
if (rc != SQLITE_OK ) {
|
||
fprintf(stderr, "SQL error: %s\n", err_msg);
|
||
sqlite3_free(err_msg);
|
||
sqlite3_close(db);
|
||
return 1;
|
||
}
|
||
|
||
fprintf(stderr, "facedistances deleted and transaction started.\n");
|
||
for (long i = 0; i < entries; i++) {
|
||
memset(ppFaces[i]->distances, 0, sizeof(*ppFaces[i]->distances) * entries);
|
||
}
|
||
|
||
char sqlBuf[1024];
|
||
|
||
processed = 0;
|
||
last = 0;
|
||
for (long i = 0; i < entries; i++) {
|
||
Face *pLink = ppFaces[i];
|
||
for (long j = 0; j < entries; j++) {
|
||
Face *pTarget = ppFaces[j];
|
||
|
||
processed++;
|
||
if (processed % 1000 == 0) {
|
||
int perc = 100 * processed / (entries * entries);
|
||
if (perc != last) {
|
||
fprintf(stderr, "\rComputed %d%% complete.", perc);
|
||
last = perc;
|
||
}
|
||
}
|
||
|
||
if (i == j) {
|
||
pLink->distances[i] = 0.0;
|
||
pTarget->distances[j] = 0.0;
|
||
continue;
|
||
}
|
||
|
||
if (pLink->distances[j] != 0.0) {
|
||
continue;
|
||
}
|
||
|
||
pLink->distances[j] =
|
||
pTarget->distances[i] = euclideanDistance(pLink->descriptor, pTarget->descriptor);
|
||
|
||
if (pLink->distances[j] < 0.5) {
|
||
sprintf(sqlBuf, "INSERT INTO facedistances (face1Id,face2Id,distance) VALUES (%ld,%ld,%lf);",
|
||
((pLink->faceId < pTarget->faceId) ? pLink->faceId : pTarget->faceId),
|
||
((pLink->faceId < pTarget->faceId) ? pTarget->faceId : pLink->faceId),
|
||
pLink->distances[j]);
|
||
rc = sqlite3_exec(db, sqlBuf, 0, 0, &err_msg);
|
||
if (rc != SQLITE_OK ) {
|
||
fprintf(stderr, "SQL error: %s\n", err_msg);
|
||
sqlite3_free(err_msg);
|
||
sqlite3_close(db);
|
||
return 1;
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
fprintf(stderr, "\n");
|
||
|
||
sprintf(sqlBuf, "UPDATE faces SET lastComparedId=%ld;", maxId);
|
||
|
||
rc = sqlite3_exec(db, "COMMIT;", 0, 0, &err_msg);
|
||
if (rc != SQLITE_OK ) {
|
||
fprintf(stderr, "SQL error: %s\n", err_msg);
|
||
sqlite3_free(err_msg);
|
||
sqlite3_close(db);
|
||
return 1;
|
||
}
|
||
|
||
sqlite3_close(db);
|
||
|
||
return 0;
|
||
}
|