451 lines
12 KiB
C
451 lines
12 KiB
C
#include <stdio.h>
|
||
#include <stdlib.h>
|
||
#include <dirent.h>
|
||
#include <sys/types.h>
|
||
#include <string.h>
|
||
#include <math.h>
|
||
#include <sqlite3.h>
|
||
|
||
typedef enum {
|
||
UNDEFINED = 0,
|
||
CORE = 1,
|
||
EDGE = 2,
|
||
NOISE = 3
|
||
} ClusterTypes;
|
||
|
||
typedef struct Face {
|
||
long faceId;
|
||
long double descriptor[128];
|
||
long int clusterId;
|
||
ClusterTypes clusterType;
|
||
double *distances;
|
||
} Face;
|
||
|
||
typedef struct FaceLink {
|
||
struct FaceLink *pNext;
|
||
Face *pFace;
|
||
} FaceLink;
|
||
|
||
char fileBuf[5000];
|
||
char pathBuf[1028];
|
||
|
||
Face *readFaceDescriptor(Face *pFace, long id, char *path) {
|
||
FILE *f;
|
||
f = fopen(path, "r");
|
||
if (!f) {
|
||
return NULL;
|
||
}
|
||
size_t s = fread(fileBuf, 1, sizeof(fileBuf), f);
|
||
fclose(f);
|
||
|
||
char *p = fileBuf;
|
||
fileBuf[s] = 0;
|
||
while (*p && *p != '-' && *p != '+' && (*p < '0' || *p > '9')) {
|
||
p++;
|
||
}
|
||
int i = 0;
|
||
for (i = 0; i < 128; i++) {
|
||
char *start = p;
|
||
while (*p && *p != ',' && *p != ']' && *p != ' ' && *p != '\n') {
|
||
p++;
|
||
}
|
||
if (!*p) {
|
||
break;
|
||
}
|
||
*p++ = 0;
|
||
sscanf(start, "%Lf", &pFace->descriptor[i]);
|
||
}
|
||
|
||
if (i != 128) {
|
||
return NULL;
|
||
}
|
||
|
||
pFace->faceId = id;
|
||
|
||
return pFace;
|
||
}
|
||
|
||
long double euclideanDistance(long double *a, long double *b) {
|
||
long double sum = 0.0L;
|
||
for (int i = 0; i < 128; i++) {
|
||
long double delta = a[i] - b[i];
|
||
sum += delta * delta;
|
||
}
|
||
return sqrtl(sum);
|
||
}
|
||
|
||
/* https://en.wikipedia.org/wiki/DBSCAN */
|
||
#if 0
|
||
DBSCAN(DB, distFunc, eps, minPts) {
|
||
C = 0 /* Cluster counter */
|
||
for each point P in database DB {
|
||
if label(P) ≠ undefined then continue /* Previously processed in inner loop */
|
||
Neighbors N = RangeQuery(DB, distFunc, P, eps) /* Find neighbors */
|
||
if |N| < minPts then { /* Density check */
|
||
label(P) = Noise /* Label as Noise */
|
||
continue
|
||
}
|
||
C = C + 1 /* next cluster label */
|
||
label(P) = C /* Label initial point */
|
||
Seed set S = N \ {P} /* Neighbors to expand */
|
||
for each point Q in S { /* Process every seed point */
|
||
if label(Q) = Noise then label(Q) = C /* Change Noise to border point */
|
||
if label(Q) ≠ undefined then continue /* Previously processed */
|
||
label(Q) = C /* Label neighbor */
|
||
Neighbors N = RangeQuery(DB, distFunc, Q, eps) /* Find neighbors */
|
||
if |N| ≥ minPts then { /* Density check */
|
||
S = S ∪ N /* Add new neighbors to seed set */
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
RangeQuery(DB, distFunc, Q, eps) {
|
||
Neighbors = empty list
|
||
for each point P in database DB { /* Scan all points in the database */
|
||
if distFunc(Q, P) ≤ eps then { /* Compute distance and check epsilon */
|
||
Neighbors = Neighbors ∪ {P} /* Add to result */
|
||
}
|
||
}
|
||
return Neighbors
|
||
}
|
||
#endif
|
||
|
||
FaceLink *RangeQuery(Face *pFaces, long int faceCount, Face *pQ, double eps) {
|
||
FaceLink *pNeighbors = NULL;
|
||
for (long int i = 0; i < faceCount; i++) {
|
||
Face *pFace = &pFaces[i];
|
||
if (pFace->faceId == pQ->faceId) {
|
||
continue;
|
||
}
|
||
|
||
if (pQ->distances[i] <= eps) {
|
||
FaceLink *pLink = malloc(sizeof(*pLink));
|
||
memset(pLink, 0, sizeof(*pLink));
|
||
pLink->pFace = pFace;
|
||
pLink->pNext = pNeighbors;
|
||
pNeighbors = pLink;
|
||
}
|
||
}
|
||
return pNeighbors;
|
||
}
|
||
|
||
void freeChain(FaceLink *pLink) {
|
||
while (pLink) {
|
||
FaceLink *tmp = pLink->pNext;
|
||
free(pLink);
|
||
pLink = tmp;
|
||
}
|
||
}
|
||
|
||
long int chainLength(FaceLink *pLink) {
|
||
long int count = 0;
|
||
while (pLink) {
|
||
count++;
|
||
pLink = pLink->pNext;
|
||
}
|
||
return count;
|
||
}
|
||
|
||
long int DBSCAN(Face *faces, long int faceCount, double eps, int minPts) {
|
||
long int C = 0;
|
||
for (long int i = 0; i < faceCount; i++) {
|
||
Face *pFace = &faces[i];
|
||
if (pFace->clusterType != UNDEFINED) {
|
||
continue;
|
||
}
|
||
|
||
FaceLink *pNeighbors = RangeQuery(faces, faceCount, pFace, eps);
|
||
long neighborCount = chainLength(pNeighbors);
|
||
if (neighborCount < minPts) {
|
||
pFace->clusterType = NOISE;
|
||
freeChain(pNeighbors);
|
||
continue;
|
||
}
|
||
|
||
//printf("%ld has %ld neighbors.\n", pFace->faceId, neighborCount);
|
||
|
||
C++;
|
||
|
||
pFace->clusterId = C;
|
||
pFace->clusterType = CORE;
|
||
|
||
FaceLink *pLink = pNeighbors;
|
||
while (pLink) {
|
||
Face *pQ = pLink->pFace;
|
||
|
||
if (pQ->faceId == pFace->faceId) {
|
||
pLink = pLink->pNext;
|
||
continue;
|
||
}
|
||
if (pQ->clusterType == NOISE) {
|
||
pQ->clusterId = C;
|
||
pQ->clusterType = EDGE;
|
||
}
|
||
if (pQ->clusterType != UNDEFINED) {
|
||
pLink = pLink->pNext;
|
||
continue;
|
||
}
|
||
|
||
pQ->clusterId = C;
|
||
pQ->clusterType = EDGE;
|
||
|
||
FaceLink *pSubNeighbors = RangeQuery(faces, faceCount, pQ, eps);
|
||
neighborCount = chainLength(pSubNeighbors);
|
||
if (neighborCount >= minPts) {
|
||
pQ->clusterType = CORE;
|
||
/* Append these neighbors to the end of the chain */
|
||
FaceLink *pTmp = pLink;
|
||
while (pTmp->pNext) {
|
||
pTmp = pTmp->pNext;
|
||
}
|
||
pTmp->pNext = pSubNeighbors;
|
||
} else {
|
||
freeChain(pSubNeighbors);
|
||
}
|
||
|
||
pLink = pLink->pNext;
|
||
}
|
||
freeChain(pNeighbors);
|
||
}
|
||
|
||
return C;
|
||
}
|
||
|
||
/*
|
||
* 1. Count how many entries there are
|
||
* 2. Allocate storage to hold all entries
|
||
* 3. Read all entries into flat array
|
||
* 4. Allocate MxM matrix and pre-calculate distances
|
||
* 5. Perform DBSCAN across MxM matrix to cluster
|
||
*/
|
||
int main(int argc, char *argv[]) {
|
||
long maxId = 0;
|
||
long i;
|
||
long entries = 0;
|
||
|
||
for (i = 0; i < 100; i++) {
|
||
sprintf(pathBuf, "%s/face-data/%ld", argv[1], i);
|
||
DIR *faceDir = opendir(pathBuf);
|
||
if (!faceDir) {
|
||
continue;
|
||
}
|
||
|
||
struct dirent *ent;
|
||
while ((ent = readdir(faceDir)) != NULL) {
|
||
if (strstr(ent->d_name, ".json") == NULL) {
|
||
continue;
|
||
}
|
||
entries++;
|
||
}
|
||
closedir(faceDir);
|
||
}
|
||
|
||
Face *pFaces = malloc(sizeof(Face) * entries);
|
||
if (!pFaces) {
|
||
fprintf(stderr, "Unable to allocate storage face descriptors.");
|
||
return -1;
|
||
}
|
||
memset(pFaces, 0, sizeof(Face) * entries);
|
||
for (i = 0; i < entries; i++) {
|
||
pFaces[i].distances = malloc(sizeof(*pFaces[i].distances) * entries);
|
||
if (!pFaces[i].distances) {
|
||
fprintf(stderr, "Unable to allocate storage for distance dictionary.");
|
||
return -1;
|
||
}
|
||
memset(pFaces[i].distances, 0, sizeof(*pFaces[i].distances) * entries);
|
||
}
|
||
|
||
long int count = 0;
|
||
for (i = 0; i < 100; i++) {
|
||
sprintf(pathBuf, "%s/face-data/%ld", argv[1], i);
|
||
DIR *faceDir = opendir(pathBuf);
|
||
// fprintf(stderr, "Reading %s...\n", pathBuf);
|
||
if (!faceDir) {
|
||
fprintf(stderr, "Can not open %s\n", pathBuf);
|
||
continue;
|
||
}
|
||
|
||
struct dirent *ent;
|
||
while (count < entries && (ent = readdir(faceDir)) != NULL) {
|
||
if (strstr(ent->d_name, ".json") == NULL) {
|
||
continue;
|
||
}
|
||
long id = 0;
|
||
char *p = ent->d_name;
|
||
while (*p && *p != '-') {
|
||
id *= 10;
|
||
id += *p - '0';
|
||
p++;
|
||
}
|
||
char path[1028*2];
|
||
sprintf(path, "%s/%s", pathBuf, ent->d_name);
|
||
maxId = maxId > id ? maxId : id;
|
||
if (!readFaceDescriptor(&pFaces[count], id, path)) {
|
||
fprintf(stderr, "Unable to read %s.\n", path);
|
||
continue;
|
||
}
|
||
count++;
|
||
if (count % 1000 == 0) {
|
||
fprintf(stderr, "...read %ld...\n", count);
|
||
}
|
||
}
|
||
closedir(faceDir);
|
||
}
|
||
|
||
fprintf(stderr, "Read %ld face descriptors...\n", entries);
|
||
|
||
long double total = 0.0;
|
||
for (long i = 0; i < entries; i++) {
|
||
Face *pLink = &pFaces[i];
|
||
for (long j = 0; j < entries; j++) {
|
||
Face *pTarget = &pFaces[j];
|
||
if (i == j) {
|
||
pLink->distances[i] = 0.0;
|
||
pTarget->distances[j] = 0.0;
|
||
continue;
|
||
}
|
||
|
||
if (pLink->distances[j] != 0.0) {
|
||
continue;
|
||
}
|
||
|
||
pLink->distances[j] =
|
||
pTarget->distances[i] = euclideanDistance(pLink->descriptor, pTarget->descriptor);
|
||
|
||
total += pLink->distances[j];
|
||
}
|
||
}
|
||
|
||
fprintf(stderr, "Average distance: %Lf\n", 1. * total / (entries * entries));
|
||
|
||
long int clusters = DBSCAN(pFaces, entries, 0.44L, 2);
|
||
long int undefined = 0, outlier = 0, core = 0, reachable = 0;
|
||
for (i = 0; i < entries; i++) {
|
||
switch (pFaces[i].clusterType) {
|
||
case NOISE:
|
||
outlier++;
|
||
break;
|
||
case UNDEFINED:
|
||
undefined++;
|
||
break;
|
||
case CORE:
|
||
core++;
|
||
break;
|
||
case EDGE:
|
||
reachable++;
|
||
break;
|
||
}
|
||
}
|
||
|
||
fprintf(stderr, "%ld clusters identified!\n", clusters);
|
||
fprintf(stderr, "%ld NOISE\n", outlier);
|
||
fprintf(stderr, "%ld UNDEFINED\n", undefined);
|
||
fprintf(stderr, "%ld CORE\n", core);
|
||
fprintf(stderr, "%ld EDGE\n", reachable);
|
||
|
||
fprintf(stdout, "<script>\nvar clusters = [\n");
|
||
for (long i = 1; i <= clusters; i++) {
|
||
long nodes = 0;
|
||
fprintf(stdout, "/* %ld. */ [", i);
|
||
for (long int j = 0; j < entries; j++) {
|
||
if (pFaces[j].clusterId == i) {
|
||
if (nodes == 0) {
|
||
fprintf(stdout, "%ld", pFaces[j].faceId);
|
||
} else {
|
||
fprintf(stdout, ",%ld", pFaces[j].faceId);
|
||
}
|
||
nodes++;
|
||
}
|
||
}
|
||
if (i < clusters) {
|
||
fprintf(stdout, "],\n");
|
||
} else {
|
||
fprintf(stdout, "]\n");
|
||
}
|
||
}
|
||
fprintf(stdout, "];\n</script>\n");
|
||
|
||
/* Allocate storage for all distances */
|
||
sqlite3 *db;
|
||
|
||
int rc = sqlite3_open("db/photos.db", &db);
|
||
if (rc != SQLITE_OK) {
|
||
fprintf(stderr, "Cannot open database: %s\n", sqlite3_errmsg(db));
|
||
sqlite3_close(db);
|
||
return 1;
|
||
}
|
||
|
||
fprintf(stderr, "DB opened.");
|
||
|
||
char *err_msg = NULL;
|
||
char *sql =
|
||
"DELETE FROM facedistances;"
|
||
"BEGIN TRANSACTION;";
|
||
|
||
rc = sqlite3_exec(db, sql, 0, 0, &err_msg);
|
||
if (rc != SQLITE_OK ) {
|
||
fprintf(stderr, "SQL error: %s\n", err_msg);
|
||
sqlite3_free(err_msg);
|
||
sqlite3_close(db);
|
||
return 1;
|
||
}
|
||
|
||
fprintf(stderr, "facedistances deleted and transaction started.\n");
|
||
|
||
char sqlBuf[1024];
|
||
int sourceIndex = 0, lines = 0;
|
||
for (long i = 0; i < entries; i++) {
|
||
Face *pLink = &pFaces[i];
|
||
int targetIndex = 0;
|
||
for (long j = 0; j < entries; j++) {
|
||
Face *pTarget = &pFaces[j];
|
||
if (i == j) {
|
||
pLink->distances[i] = 0.0;
|
||
pTarget->distances[j] = 0.0;
|
||
continue;
|
||
}
|
||
|
||
if (pLink->distances[j] != 0.0) {
|
||
// continue;
|
||
}
|
||
|
||
pLink->distances[j] =
|
||
pTarget->distances[i] = euclideanDistance(pLink->descriptor, pTarget->descriptor);
|
||
|
||
if (pLink->distances[j] < 0.5) {
|
||
sprintf(sqlBuf, "INSERT INTO facedistances (face1Id,face2Id,distance) VALUES (%ld,%ld,%lf);",
|
||
((pLink->faceId < pTarget->faceId) ? pLink->faceId : pTarget->faceId),
|
||
((pLink->faceId < pTarget->faceId) ? pTarget->faceId : pLink->faceId),
|
||
pLink->distances[j]);
|
||
rc = sqlite3_exec(db, sqlBuf, 0, 0, &err_msg);
|
||
if (rc != SQLITE_OK ) {
|
||
fprintf(stderr, "SQL error: %s\n", err_msg);
|
||
sqlite3_free(err_msg);
|
||
sqlite3_close(db);
|
||
return 1;
|
||
}
|
||
lines++;
|
||
if (lines % 1000 == 0) {
|
||
fprintf(stderr, "...output %d DB lines (%0.2f complete)...\n", lines,
|
||
(float)(1. * sourceIndex / (1. * entries)));
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
sprintf(sqlBuf, "UPDATE faces SET lastComparedId=%ld;", maxId);
|
||
|
||
rc = sqlite3_exec(db, "COMMIT;", 0, 0, &err_msg);
|
||
if (rc != SQLITE_OK ) {
|
||
fprintf(stderr, "SQL error: %s\n", err_msg);
|
||
sqlite3_free(err_msg);
|
||
sqlite3_close(db);
|
||
return 1;
|
||
}
|
||
|
||
sqlite3_close(db);
|
||
|
||
return 0;
|
||
}
|