/*
 * Patch summary (commentary only; it sits ahead of the "diff --git" header
 * and belongs to no hunk).
 *
 * Purpose: after the initial DBSCAN pass over all faces, re-run DBSCAN with a
 * slightly tighter eps inside every cluster that holds 100 or more CORE/EDGE
 * faces, so that oversized clusters can be split.
 *
 * RangeQuery()
 *   - Gains a trailing parameter `clusterToBreak`.
 *   - The per-candidate skip test changes from `profileDistance <= 0.5` to
 *     "clusterToBreak > 0 and the candidate's clusterId differs from it".
 *     A non-positive value (main passes -1) disables the filter.
 *
 * DBSCAN()
 *   - Gains the same parameter, applies the same skip test to seed faces and
 *     forwards the value to both RangeQuery() calls.
 *   - `C` moves from a local to a file-scope `long int`, so cluster ids keep
 *     increasing across calls instead of restarting.
 *   - A new local `count` is incremented together with `C`; the function now
 *     returns `count` (clusters created by this call) instead of `C`.
 *   - Two commented-out threshold-shrinking loops and one commented-out
 *     printf are deleted.
 *
 * New helpers
 *   - getClusterCounts(stats, ppFaces, entries): for each face whose
 *     clusterType is CORE or EDGE, increments stats[clusterId - 1].
 *   - getClusterCount(ppFaces, entries, clusterId): returns how many faces
 *     carry that clusterId with a clusterType other than NOISE or UNDEFINED.
 *
 * main()
 *   - Logs how many faces were dropped as too close to profile photos.
 *   - Calls DBSCAN(..., -1). When at least one cluster exists it builds a
 *     zeroed per-cluster histogram and, for each cluster i + 1 with
 *     stats[i] >= 100: marks that cluster's faces UNDEFINED, calls DBSCAN
 *     with maxDistance * 0.99L restricted to that cluster, logs the result,
 *     then sets every face still carrying clusterId i + 1 back to CORE.
 *   - Adds the accumulated split count to `clusters` once the loop is done.
 *   - In the final tally, NOISE and UNDEFINED faces get clusterId 0.
 *
 * NOTE(review): items to confirm against the full file; the hunks show only
 * part of DBSCAN() and main().
 *   1. The "%ld. %ld" loop reports ids `c + clusters`. DBSCAN() does `C++`
 *      before `pFace->clusterId = C`, and `clusters` is not updated until the
 *      split loop ends, so the ids created by a split presumably start at
 *      clusters + delta + 1. The printed ids look off by one and ignore
 *      `delta`. TODO confirm.
 *   2. The malloc() result is handed to memset() without a NULL check.
 *   3. The sizeof quotient in the "Dropped" message is a size_t but is
 *      formatted with %ld.
 *   4. getClusterCounts() indexes stats[clusterId - 1] with no range check and
 *      compares an `int` index against `long int entries`. `int split` and
 *      the `int clusterId` parameter also receive `long int` values.
 *   5. After a split, faces still in cluster i + 1 are all set to CORE. That
 *      appears to include faces the restricted run marked NOISE and faces
 *      that were EDGE beforehand. Verify this is intended.
 *   6. In a restricted run, a face that has already been given a new id no
 *      longer passes the clusterId test, so it is left out of later neighbor
 *      counts. Verify this is intended.
 *   7. `float threshold = eps;` stays as context, but its only visible uses
 *      were inside the deleted commented-out loops.
 */
diff --git a/scanner/scanner.c b/scanner/scanner.c index b803251..5d1e523 100644 --- a/scanner/scanner.c +++ b/scanner/scanner.c @@ -96,11 +96,11 @@ RangeQuery(DB, distFunc, Q, eps) { } #endif -FaceLink *RangeQuery(Face **ppFaces, long int faceCount, Face *pQ, float eps) { +FaceLink *RangeQuery(Face **ppFaces, long int faceCount, Face *pQ, float eps, long int clusterToBreak) { FaceLink *pNeighbors = NULL; for (long int i = 0; i < faceCount; i++) { Face *pFace = ppFaces[i]; - if (pFace->profileDistance <= 0.5) { + if (clusterToBreak > 0 && pFace->clusterId != clusterToBreak) { continue; } @@ -137,11 +137,14 @@ long int chainLength(FaceLink *pLink) { return count; } -long int DBSCAN(Face **ppFaces, long int faceCount, float eps, int minPts) { - long int C = 0; +long int C = 0; + +long int DBSCAN(Face **ppFaces, long int faceCount, float eps, int minPts, long int clusterToBreak) { + int count = 0; + for (long int i = 0; i < faceCount; i++) { Face *pFace = ppFaces[i]; - if (pFace->profileDistance <= 0.5) { + if (clusterToBreak > 0 && pFace->clusterId != clusterToBreak) { continue; } @@ -150,26 +153,16 @@ long int DBSCAN(Face **ppFaces, long int faceCount, float eps, int minPts) { } float threshold = eps; - FaceLink *pNeighbors = RangeQuery(ppFaces, faceCount, pFace, eps); + FaceLink *pNeighbors = RangeQuery(ppFaces, faceCount, pFace, eps, clusterToBreak); long neighborCount = chainLength(pNeighbors); -/* - while (neighborCount > minPts * 5) { - threshold *= 0.9; - freeChain(pNeighbors); - pNeighbors = RangeQuery(ppFaces, faceCount, pFace, threshold); - neighborCount = chainLength(pNeighbors); - fprintf(stderr, "\rWith eps of %f, %ld has %ld neighbors.", threshold, pFace->faceId, neighborCount); - } -*/ - if (neighborCount < minPts) { pFace->clusterType = NOISE; freeChain(pNeighbors); continue; } - //printf("%ld has %ld neighbors.\n", pFace->faceId, neighborCount); C++; + count++; pFace->clusterId = C; pFace->clusterType = CORE; @@ -193,18 +186,9 @@ long int DBSCAN(Face 
**ppFaces, long int faceCount, float eps, int minPts) { pQ->clusterId = C; pQ->clusterType = EDGE; - FaceLink *pSubNeighbors = RangeQuery(ppFaces, faceCount, pQ, eps); + FaceLink *pSubNeighbors = RangeQuery(ppFaces, faceCount, pQ, eps, clusterToBreak); neighborCount = chainLength(pSubNeighbors); -/* - threshold = eps; - while (neighborCount > minPts * 1.25) { - threshold *= 0.9; - freeChain(pSubNeighbors); - pSubNeighbors = RangeQuery(ppFaces, faceCount, pQ, threshold); - neighborCount = chainLength(pSubNeighbors); - fprintf(stderr, "\rWith eps of %f, %ld has %ld neighbors.", threshold, pQ->faceId, neighborCount); - } -*/ + if (neighborCount >= minPts) { pQ->clusterType = CORE; /* Append these neighbors to the end of the chain */ @@ -222,7 +206,7 @@ long int DBSCAN(Face **ppFaces, long int faceCount, float eps, int minPts) { freeChain(pNeighbors); } - return C; + return count; } typedef struct { @@ -297,6 +281,27 @@ int parseFaceIdRow(void *data, int argc, char **argv, char **column) { return 0; } +void getClusterCounts(int *stats, Face **ppFaces, long int entries) { + for (int i = 0; i < entries; i++) { + if (ppFaces[i]->clusterType != CORE && ppFaces[i]->clusterType != EDGE) { + continue; + } + stats[ppFaces[i]->clusterId - 1]++; + } +} + +long int getClusterCount(Face **ppFaces, long int entries, int clusterId) { + long int count = 0; + for (long int i = 0; i < entries; i++) { + if (ppFaces[i]->clusterId == clusterId && + ppFaces[i]->clusterType != NOISE && + ppFaces[i]->clusterType != UNDEFINED) { + count++; + } + } + return count; +} + /* * 1. Count how many entries there are * 2. 
Allocate storage to hold all entries @@ -421,6 +426,7 @@ int main(int argc, char *argv[]) { ppFaces[dst++] = pFace; } } + fprintf(stderr, "Dropped %ld faces as too close to profile photos (set of %ld).\n", (entries - dst), (sizeof(profileDescriptors) / sizeof(profileDescriptors[0]))); entries = dst; @@ -466,14 +472,55 @@ int main(int argc, char *argv[]) { fprintf(stderr, "Calculating clusters: MAX_DISTANCE(%f) MIN_PTS(%ld)\n", maxDistance, minPts); - long int clusters = DBSCAN(ppFaces, entries, maxDistance, minPts); + long int clusters = DBSCAN(ppFaces, entries, maxDistance, minPts, -1); + fprintf(stderr, "\n%ld clusters identified before size-split.\n", clusters); + + if (clusters > 0) { + int *stats = malloc(sizeof(int) * clusters), delta = 0; + memset(stats, 0, sizeof(int) * clusters); + getClusterCounts(stats, ppFaces, entries); + + for (int i = 0; i < clusters; i++) { + if (stats[i] < 100) { + continue; + } + + for (int j = 0; j < entries; j++) { + Face *pFace = ppFaces[j]; + if (pFace->clusterId == i + 1) { + pFace->clusterType = UNDEFINED; + } + } + + int split = DBSCAN(ppFaces, entries, maxDistance * 0.99L, minPts, i + 1); + fprintf(stderr, "Cluster %d had %d units. Split into %d clusters.\n", i + 1, stats[i], split); + for (int c = 0; c < split; c++) { + fprintf(stderr, "%ld. 
%ld\n", c + clusters, getClusterCount(ppFaces, entries, c + clusters)); + } + + for (int j = 0; j < entries; j++) { + Face *pFace = ppFaces[j]; + if (pFace->clusterId == i + 1) { + pFace->clusterType = CORE; + } + } + + delta += split; + } + + clusters += delta; + free(stats); + } + long int undefined = 0, outlier = 0, core = 0, reachable = 0; for (i = 0; i < entries; i++) { switch (ppFaces[i]->clusterType) { case NOISE: + ppFaces[i]->clusterId = 0; outlier++; break; case UNDEFINED: + ppFaces[i]->clusterId = 0; undefined++; break; case CORE: @@ -485,7 +532,7 @@ int main(int argc, char *argv[]) { } } - fprintf(stderr, "\n%ld clusters identified!\n", clusters); + fprintf(stderr, "\n%ld clusters being written:\n", clusters); fprintf(stderr, "%ld NOISE\n", outlier); fprintf(stderr, "%ld UNDEFINED\n", undefined); fprintf(stderr, "%ld CORE\n", core); @@ -494,10 +541,10 @@ int main(int argc, char *argv[]) { fprintf(stdout, "\n");