#include #include #include #include #include #include #include #ifndef MIN_PTS #define MIN_PTS 20 #endif #ifndef MAX_DISTANCE #define MAX_DISTANCE 0.354667L #endif typedef enum { UNDEFINED = 0, CORE = 1, EDGE = 2, NOISE = 3 } ClusterTypes; typedef struct Face { long double descriptor[128]; long int clusterId; long faceId; ClusterTypes clusterType; double *distances; } Face; typedef struct FaceLink { struct FaceLink *pNext; Face *pFace; } FaceLink; char fileBuf[5000]; char pathBuf[1028]; Face *readFaceDescriptor(Face *pFace, long id, char *path) { FILE *f; f = fopen(path, "r"); if (!f) { return NULL; } size_t s = fread(fileBuf, 1, sizeof(fileBuf), f); fclose(f); char *p = fileBuf; fileBuf[s] = 0; while (*p && *p != '-' && *p != '+' && (*p < '0' || *p > '9')) { p++; } int i = 0; for (i = 0; i < 128; i++) { char *start = p; while (*p && *p != ',' && *p != ']' && *p != ' ' && *p != '\n') { p++; } if (!*p) { break; } *p++ = 0; sscanf(start, "%Lf", &pFace->descriptor[i]); } if (i != 128) { return NULL; } pFace->faceId = id; return pFace; } long double euclideanDistance(long double *a, long double *b) { long double sum = 0.0L; for (int i = 0; i < 128; i++) { long double delta = a[i] - b[i]; sum += delta * delta; } return sqrtl(sum); } /* https://en.wikipedia.org/wiki/DBSCAN */ #if 0 DBSCAN(DB, distFunc, eps, minPts) { C = 0 /* Cluster counter */ for each point P in database DB { if label(P) ≠ undefined then continue /* Previously processed in inner loop */ Neighbors N = RangeQuery(DB, distFunc, P, eps) /* Find neighbors */ if |N| < minPts then { /* Density check */ label(P) = Noise /* Label as Noise */ continue } C = C + 1 /* next cluster label */ label(P) = C /* Label initial point */ Seed set S = N \ {P} /* Neighbors to expand */ for each point Q in S { /* Process every seed point */ if label(Q) = Noise then label(Q) = C /* Change Noise to border point */ if label(Q) ≠ undefined then continue /* Previously processed */ label(Q) = C /* Label neighbor */ Neighbors N = RangeQuery(DB, distFunc, Q, eps) /* Find neighbors */ if |N| ≥ minPts then { /* Density check */ S = S ∪ N /* Add new neighbors to seed set */ } } } } RangeQuery(DB, distFunc, Q, eps) { Neighbors = empty list for each point P in database DB { /* Scan all points in the database */ if distFunc(Q, P) ≤ eps then { /* Compute distance and check epsilon */ Neighbors = Neighbors ∪ {P} /* Add to result */ } } return Neighbors } #endif FaceLink *RangeQuery(Face **ppFaces, long int faceCount, Face *pQ, double eps) { FaceLink *pNeighbors = NULL; for (long int i = 0; i < faceCount; i++) { Face *pFace = ppFaces[i]; if (pFace->faceId == pQ->faceId) { continue; } if (pQ->distances[i] <= eps) { FaceLink *pLink = malloc(sizeof(*pLink)); memset(pLink, 0, sizeof(*pLink)); pLink->pFace = pFace; pLink->pNext = pNeighbors; pNeighbors = pLink; } } return pNeighbors; } void freeChain(FaceLink *pLink) { while (pLink) { FaceLink *tmp = pLink->pNext; free(pLink); pLink = tmp; } } long int chainLength(FaceLink *pLink) { long int count = 0; while (pLink) { count++; pLink = pLink->pNext; } return count; } long int maxPts = 0; long int DBSCAN(Face **ppFaces, long int faceCount, double eps, int minPts) { long int C = 0; for (long int i = 0; i < faceCount; i++) { Face *pFace = ppFaces[i]; if (pFace->clusterType != UNDEFINED) { continue; } FaceLink *pNeighbors = RangeQuery(ppFaces, faceCount, pFace, eps); long neighborCount = chainLength(pNeighbors); if (neighborCount > maxPts) { maxPts = neighborCount; fprintf(stderr, "New max with id %ld: %ld\n", pFace->faceId, maxPts); } if (neighborCount < minPts) { pFace->clusterType = NOISE; freeChain(pNeighbors); continue; } //printf("%ld has %ld neighbors.\n", pFace->faceId, neighborCount); C++; pFace->clusterId = C; pFace->clusterType = CORE; FaceLink *pLink = pNeighbors; while (pLink) { Face *pQ = pLink->pFace; if (pQ->faceId == pFace->faceId) { pLink = pLink->pNext; continue; } if (pQ->clusterType == NOISE) { pQ->clusterId = C; pQ->clusterType = EDGE; } if (pQ->clusterType != UNDEFINED) { pLink = pLink->pNext; continue; } pQ->clusterId = C; pQ->clusterType = EDGE; FaceLink *pSubNeighbors = RangeQuery(ppFaces, faceCount, pQ, eps); neighborCount = chainLength(pSubNeighbors); if (neighborCount >= minPts) { if (neighborCount > maxPts) { maxPts = neighborCount; fprintf(stderr, "New max with id %ld: %ld\n", pFace->faceId, maxPts); } pQ->clusterType = CORE; /* Append these neighbors to the end of the chain */ FaceLink *pTmp = pLink; while (pTmp->pNext) { pTmp = pTmp->pNext; } pTmp->pNext = pSubNeighbors; } else { freeChain(pSubNeighbors); } pLink = pLink->pNext; } freeChain(pNeighbors); } return C; } /* * 1. Count how many entries there are * 2. Allocate storage to hold all entries * 3. Read all entries into flat array * 4. Allocate MxM matrix and pre-calculate distances * 5. Perform DBSCAN across MxM matrix to cluster */ int main(int argc, char *argv[]) { long maxId = 0; long i; long entries = 0; for (i = 0; i < 100; i++) { sprintf(pathBuf, "%s/face-data/%ld", argv[1], i); DIR *faceDir = opendir(pathBuf); if (!faceDir) { continue; } struct dirent *ent; while ((ent = readdir(faceDir)) != NULL) { if (strstr(ent->d_name, ".json") == NULL) { continue; } entries++; } closedir(faceDir); } Face **ppFaces = malloc(sizeof(Face *) * entries); if (!ppFaces) { fprintf(stderr, "Unable to allocate storage face descriptors."); return -1; } for (i = 0; i < entries; i++) { ppFaces[i] = malloc(sizeof(Face)); memset(ppFaces[i], 0, sizeof(Face)); } for (i = 0; i < entries; i++) { ppFaces[i]->distances = malloc(sizeof(*ppFaces[i]->distances) * entries); if (!ppFaces[i]->distances) { fprintf(stderr, "Unable to allocate storage for distance dictionary."); return -1; } memset(ppFaces[i]->distances, 0, sizeof(*ppFaces[i]->distances) * entries); } long int processed = 0; int last = 0; for (i = 0; i < 100; i++) { sprintf(pathBuf, "%s/face-data/%ld", argv[1], i); DIR *faceDir = opendir(pathBuf); // fprintf(stderr, "Reading %s...\n", pathBuf); if (!faceDir) { fprintf(stderr, "Can not open %s\n", pathBuf); continue; } struct dirent *ent; while (processed < entries && (ent = readdir(faceDir)) != NULL) { if (strstr(ent->d_name, ".json") == NULL) { continue; } long id = 0; char *p = ent->d_name; while (*p && *p != '-') { id *= 10; id += *p - '0'; p++; } char path[1028*2]; sprintf(path, "%s/%s", pathBuf, ent->d_name); maxId = maxId > id ? maxId : id; if (!readFaceDescriptor(ppFaces[processed], id, path)) { fprintf(stderr, "Unable to read %s.\n", path); continue; } processed++; if (processed % 1000 == 0) { int perc = 100 * processed / (entries * entries); if (perc != last) { fprintf(stderr, "Read %d%% of descriptors.\n", perc); last = perc; } } } closedir(faceDir); } fprintf(stderr, "Read %ld face descriptors...\n", entries); processed = 0; long double total = 0.0; for (long i = 0; i < entries; i++) { Face *pLink = ppFaces[i]; for (long j = 0; j < entries; j++) { Face *pTarget = ppFaces[j]; processed++; if (processed % 1000 == 0) { int perc = 100 * processed / (entries * entries); if (perc != last) { fprintf(stderr, "Computed %d%% complete.\n", perc); last = perc; } } if (i == j) { pLink->distances[i] = 0.0; pTarget->distances[j] = 0.0; continue; } if (pLink->distances[j] != 0.0) { continue; } pLink->distances[j] = pTarget->distances[i] = euclideanDistance(pLink->descriptor, pTarget->descriptor); total += pLink->distances[j]; } } fprintf(stderr, "Average distance: %Lf\n", 1. * total / (entries * entries)); long int clusters = DBSCAN(ppFaces, entries, MAX_DISTANCE, MIN_PTS); long int undefined = 0, outlier = 0, core = 0, reachable = 0; for (i = 0; i < entries; i++) { switch (ppFaces[i]->clusterType) { case NOISE: outlier++; break; case UNDEFINED: undefined++; break; case CORE: core++; break; case EDGE: reachable++; break; } } fprintf(stderr, "%ld clusters identified!\n", clusters); fprintf(stderr, "%ld NOISE\n", outlier); fprintf(stderr, "%ld UNDEFINED\n", undefined); fprintf(stderr, "%ld CORE\n", core); fprintf(stderr, "%ld EDGE\n", reachable); fprintf(stdout, "\n"); /* Allocate storage for all distances */ sqlite3 *db; int rc = sqlite3_open("db/photos.db", &db); if (rc != SQLITE_OK) { fprintf(stderr, "Cannot open database: %s\n", sqlite3_errmsg(db)); sqlite3_close(db); return 1; } fprintf(stderr, "DB opened."); char *err_msg = NULL; char *sql = "DELETE FROM facedistances;" "BEGIN TRANSACTION;"; rc = sqlite3_exec(db, sql, 0, 0, &err_msg); if (rc != SQLITE_OK ) { fprintf(stderr, "SQL error: %s\n", err_msg); sqlite3_free(err_msg); sqlite3_close(db); return 1; } fprintf(stderr, "facedistances deleted and transaction started.\n"); for (long i = 0; i < entries; i++) { memset(ppFaces[i]->distances, 0, sizeof(*ppFaces[i]->distances) * entries); } char sqlBuf[1024]; processed = 0; last = 0; for (long i = 0; i < entries; i++) { Face *pLink = ppFaces[i]; for (long j = 0; j < entries; j++) { Face *pTarget = ppFaces[j]; processed++; if (processed % 1000 == 0) { int perc = 100 * processed / (entries * entries); if (perc != last) { fprintf(stderr, "Computed %d%% complete.\n", perc); last = perc; } } if (i == j) { pLink->distances[i] = 0.0; pTarget->distances[j] = 0.0; continue; } if (pLink->distances[j] != 0.0) { continue; } pLink->distances[j] = pTarget->distances[i] = euclideanDistance(pLink->descriptor, pTarget->descriptor); if (pLink->distances[j] < 0.5) { sprintf(sqlBuf, "INSERT INTO facedistances (face1Id,face2Id,distance) VALUES (%ld,%ld,%lf);", ((pLink->faceId < pTarget->faceId) ? pLink->faceId : pTarget->faceId), ((pLink->faceId < pTarget->faceId) ? pTarget->faceId : pLink->faceId), pLink->distances[j]); rc = sqlite3_exec(db, sqlBuf, 0, 0, &err_msg); if (rc != SQLITE_OK ) { fprintf(stderr, "SQL error: %s\n", err_msg); sqlite3_free(err_msg); sqlite3_close(db); return 1; } } } } sprintf(sqlBuf, "UPDATE faces SET lastComparedId=%ld;", maxId); rc = sqlite3_exec(db, "COMMIT;", 0, 0, &err_msg); if (rc != SQLITE_OK ) { fprintf(stderr, "SQL error: %s\n", err_msg); sqlite3_free(err_msg); sqlite3_close(db); return 1; } sqlite3_close(db); return 0; }