Clustering scanner

Signed-off-by: James Ketrenos <james_gitlab@ketrenos.com>
This commit is contained in:
James Ketrenos 2020-01-07 23:17:24 -08:00
parent f28293c3f1
commit 0ed48fd7f7
4 changed files with 371 additions and 88 deletions

34
clusters-pre Normal file
View File

@ -0,0 +1,34 @@
<html>
<script>'<base href="BASEPATH">';</script>
<script>
/*
 * Once the document has been parsed, render every entry of the global
 * `clusters` array: a "Cluster N" label followed by one thumbnail tile per
 * face id in that cluster. Everything is appended directly to <body>.
 */
document.addEventListener("DOMContentLoaded", (event) => {
  /* Create a .face tile whose background is the face's original image,
   * stored under face-data/<id modulo 100>/. */
  const appendFaceTile = (id) => {
    const tile = document.createElement("div");
    const bucket = id % 100;
    tile.classList.add("face");
    tile.style.backgroundImage = "url(face-data/" + bucket + "/" + id + "-original.png)";
    document.body.appendChild(tile);
  };

  /* Emit the 1-based cluster label, then the cluster's tiles. */
  const appendCluster = (cluster, index) => {
    const label = document.createElement("div");
    label.textContent = "Cluster " + (index + 1);
    document.body.appendChild(label);
    cluster.forEach((id) => appendFaceTile(id));
  };

  clusters.forEach(appendCluster);
});
</script>
<style>
body {
margin: 0;
padding: 0;
}
.face {
width: 128px;
height: 128px;
background-size: contain;
background-position: center center;
display: inline-block;
border: 1px solid black;
margin: 0.5em;
}
</style>

View File

@ -6,4 +6,7 @@ ifeq ("$(wildcard /usr/include/sqlite3.h)","")
endif endif
scanner: scanner.c scanner: scanner.c
gcc -o scanner scanner.c -lm -lsqlite3 gcc -o scanner -g scanner.c -lm -lsqlite3
clean:
rm scanner

View File

@ -6,24 +6,33 @@
#include <math.h> #include <math.h>
#include <sqlite3.h> #include <sqlite3.h>
typedef enum {
UNDEFINED = 0,
CORE = 1,
EDGE = 2,
NOISE = 3
} ClusterTypes;
typedef struct Face { typedef struct Face {
long faceId;
long double descriptor[128]; long double descriptor[128];
int faceId; long int clusterId;
ClusterTypes clusterType;
long double *distances; long double *distances;
struct Face *next;
struct Face *prev;
} Face; } Face;
typedef struct FaceLink {
struct FaceLink *pNext;
Face *pFace;
} FaceLink;
char fileBuf[5000]; char fileBuf[5000];
char pathBuf[1028]; char pathBuf[1028];
Face *readFaceDescriptor(int id, char *path) { Face *readFaceDescriptor(Face *pFace, long id, char *path) {
FILE *f; FILE *f;
Face *pFace = (Face *)malloc(sizeof(Face));
memset(pFace, 0, sizeof(Face));
f = fopen(path, "r"); f = fopen(path, "r");
if (!f) { if (!f) {
free(pFace);
return NULL; return NULL;
} }
size_t s = fread(fileBuf, 1, sizeof(fileBuf), f); size_t s = fread(fileBuf, 1, sizeof(fileBuf), f);
@ -34,7 +43,8 @@ Face *readFaceDescriptor(int id, char *path) {
while (*p && *p != '-' && *p != '+' && (*p < '0' || *p > '9')) { while (*p && *p != '-' && *p != '+' && (*p < '0' || *p > '9')) {
p++; p++;
} }
for (int i = 0; i < 128; i++) { int i = 0;
for (i = 0; i < 128; i++) {
char *start = p; char *start = p;
while (*p && *p != ',' && *p != ']' && *p != ' ' && *p != '\n') { while (*p && *p != ',' && *p != ']' && *p != ' ' && *p != '\n') {
p++; p++;
@ -46,14 +56,17 @@ Face *readFaceDescriptor(int id, char *path) {
sscanf(start, "%Lf", &pFace->descriptor[i]); sscanf(start, "%Lf", &pFace->descriptor[i]);
} }
if (i != 128) {
return NULL;
}
pFace->faceId = id; pFace->faceId = id;
pFace->next = pFace->prev = NULL;
return pFace; return pFace;
} }
long double euclideanDistance(long double *a, long double *b) { long double euclideanDistance(long double *a, long double *b) {
long double sum = 0.0; long double sum = 0.0L;
for (int i = 0; i < 128; i++) { for (int i = 0; i < 128; i++) {
long double delta = a[i] - b[i]; long double delta = a[i] - b[i];
sum += delta * delta; sum += delta * delta;
@ -61,16 +74,193 @@ long double euclideanDistance(long double *a, long double *b) {
return sqrtl(sum); return sqrtl(sum);
} }
int main(int argc, char *argv[]) { /* https://en.wikipedia.org/wiki/DBSCAN */
int maxId = 0; #if 0
int len = 0; DBSCAN(DB, distFunc, eps, minPts) {
int i; C = 0 /* Cluster counter */
Face *pChain = NULL; for each point P in database DB {
for (i = 0; i < 100; i++) { if label(P) != undefined then continue /* Previously processed in inner loop */
sprintf(pathBuf, "%s/face-data/%d", argv[1], i); Neighbors N = RangeQuery(DB, distFunc, P, eps) /* Find neighbors */
DIR *faceDir = opendir(pathBuf); if |N| < minPts then { /* Density check */
fprintf(stderr, "Reading %s...\n", pathBuf); label(P) = Noise /* Label as Noise */
continue
}
C = C + 1 /* next cluster label */
label(P) = C /* Label initial point */
Seed set S = N \ {P} /* Neighbors to expand */
for each point Q in S { /* Process every seed point */
if label(Q) = Noise then label(Q) = C /* Change Noise to border point */
if label(Q) != undefined then continue /* Previously processed */
label(Q) = C /* Label neighbor */
Neighbors N = RangeQuery(DB, distFunc, Q, eps) /* Find neighbors */
if |N| >= minPts then { /* Density check */
S = S union N /* Add new neighbors to seed set */
}
}
}
}
RangeQuery(DB, distFunc, Q, eps) {
Neighbors = empty list
for each point P in database DB { /* Scan all points in the database */
if distFunc(Q, P) <= eps then { /* Compute distance and check epsilon */
Neighbors = Neighbors union {P} /* Add to result */
}
}
return Neighbors
}
#endif
/*
 * RangeQuery - collect the faces that lie within eps of pQ.
 *
 * pFaces    array of faceCount faces to scan
 * faceCount number of entries in pFaces
 * pQ        query face; pQ->distances[i] is read as the distance from pQ to
 *           pFaces[i], so it must hold at least faceCount entries
 * eps       inclusive distance threshold
 *
 * Entries whose faceId equals pQ->faceId are skipped, so the query face is
 * never part of its own result. Each match is pushed onto the head of a
 * singly linked FaceLink chain, which therefore ends up in reverse array
 * order.
 *
 * Returns the head of the chain, or NULL when nothing is within range. The
 * caller owns the chain and releases it with freeChain().
 *
 * If a node cannot be allocated the process reports the failure on stderr
 * and exits; returning a shortened chain would silently change the
 * clustering result.
 */
FaceLink *RangeQuery(Face *pFaces, long int faceCount, Face *pQ, double eps) {
    FaceLink *pNeighbors = NULL;

    for (long int i = 0; i < faceCount; i++) {
        Face *pFace = &pFaces[i];

        if (pFace->faceId == pQ->faceId) {
            continue;
        }

        if (pQ->distances[i] <= eps) {
            FaceLink *pLink = malloc(sizeof(*pLink));
            if (!pLink) {
                fprintf(stderr, "Unable to allocate storage for neighbor link.\n");
                exit(EXIT_FAILURE);
            }
            /* Both members are assigned, so no separate zero-fill is needed. */
            pLink->pFace = pFace;
            pLink->pNext = pNeighbors;
            pNeighbors = pLink;
        }
    }

    return pNeighbors;
}
/*
 * freeChain - release every node of a FaceLink chain.
 *
 * Only the link nodes are freed; the Face each node points at is left
 * untouched. Passing an empty (null) chain does nothing.
 */
void freeChain(FaceLink *pLink) {
    for (FaceLink *pFollowing; pLink; pLink = pFollowing) {
        /* Read the successor before the node holding it is released. */
        pFollowing = pLink->pNext;
        free(pLink);
    }
}
/*
 * chainLength - count the nodes in a FaceLink chain.
 *
 * Follows pNext from pLink until the chain ends and returns how many nodes
 * were visited; an empty (null) chain yields 0.
 */
long int chainLength(FaceLink *pLink) {
    long int total = 0;

    for (FaceLink *pNode = pLink; pNode; pNode = pNode->pNext) {
        total++;
    }

    return total;
}
/*
 * DBSCAN - density-based clustering over a pre-computed distance table.
 *
 * faces     array of faceCount faces; clusterType is expected to be UNDEFINED
 *           on entry and each face's distances[] must already be filled in
 * faceCount number of entries in faces
 * eps       neighborhood radius handed to RangeQuery()
 * minPts    minimum neighbor count for a face to be treated as dense
 *
 * On return every face visited has clusterType set to CORE, EDGE or NOISE,
 * and clustered faces carry a clusterId in 1..C. Returns C, the number of
 * clusters created.
 *
 * Note that RangeQuery() leaves the query face out of its result, so the
 * counts compared against minPts do not include the face itself.
 */
long int DBSCAN(Face *faces, long int faceCount, double eps, int minPts) {
    long int C = 0; /* Cluster counter; doubles as the newest cluster id */

    for (long int i = 0; i < faceCount; i++) {
        Face *pFace = &faces[i];
        if (pFace->clusterType != UNDEFINED) {
            continue; /* Labelled while an earlier cluster was expanded */
        }

        FaceLink *pNeighbors = RangeQuery(faces, faceCount, pFace, eps);
        if (chainLength(pNeighbors) < minPts) {
            /* Too sparse to seed a cluster; may still become EDGE later. */
            pFace->clusterType = NOISE;
            freeChain(pNeighbors);
            continue;
        }

        C++;
        pFace->clusterId = C;
        pFace->clusterType = CORE;

        /*
         * pNeighbors is the seed set. It grows at its end while it is being
         * walked, so pTail remembers a node at or near the end: each append
         * resumes from there instead of re-walking the whole chain.
         */
        FaceLink *pTail = pNeighbors;
        for (FaceLink *pLink = pNeighbors; pLink; pLink = pLink->pNext) {
            Face *pQ = pLink->pFace;

            if (pQ->faceId == pFace->faceId) {
                continue;
            }

            if (pQ->clusterType == NOISE) {
                /* Previously rejected as noise: adopt it as a border point. */
                pQ->clusterId = C;
                pQ->clusterType = EDGE;
                continue;
            }

            if (pQ->clusterType != UNDEFINED) {
                continue; /* Already belongs to a cluster */
            }

            pQ->clusterId = C;
            pQ->clusterType = EDGE;

            FaceLink *pSubNeighbors = RangeQuery(faces, faceCount, pQ, eps);
            if (chainLength(pSubNeighbors) >= minPts) {
                pQ->clusterType = CORE;
                /* Append these neighbors to the end of the chain */
                while (pTail->pNext) {
                    pTail = pTail->pNext;
                }
                pTail->pNext = pSubNeighbors;
            } else {
                freeChain(pSubNeighbors);
            }
        }

        /* Frees the original neighbors and every chain appended to them. */
        freeChain(pNeighbors);
    }

    return C;
}
/*
* 1. Count how many entries there are
* 2. Allocate storage to hold all entries
* 3. Read all entries into flat array
* 4. Allocate MxM matrix and pre-calculate distances
* 5. Perform DBSCAN across MxM matrix to cluster
*/
int main(int argc, char *argv[]) {
long maxId = 0;
long i;
long entries = 0;
for (i = 0; i < 100; i++) {
sprintf(pathBuf, "%s/face-data/%ld", argv[1], i);
DIR *faceDir = opendir(pathBuf);
if (!faceDir) {
continue;
}
struct dirent *ent;
while ((ent = readdir(faceDir)) != NULL) {
if (strstr(ent->d_name, ".json") == NULL) {
continue;
}
entries++;
}
closedir(faceDir);
}
Face *pFaces = malloc(sizeof(Face) * entries);
if (!pFaces) {
fprintf(stderr, "Unable to allocate storage face descriptors.");
return -1;
}
memset(pFaces, 0, sizeof(Face) * entries);
for (i = 0; i < entries; i++) {
pFaces[i].distances = malloc(sizeof(*pFaces[i].distances) * entries);
if (!pFaces[i].distances) {
fprintf(stderr, "Unable to allocate storage for distance dictionary.");
return -1;
}
memset(pFaces[i].distances, 0, sizeof(*pFaces[i].distances) * entries);
}
entries = 0;
for (i = 0; i < 100; i++) {
sprintf(pathBuf, "%s/face-data/%ld", argv[1], i);
DIR *faceDir = opendir(pathBuf);
// fprintf(stderr, "Reading %s...\n", pathBuf);
if (!faceDir) { if (!faceDir) {
fprintf(stderr, "Can not open %s\n", pathBuf); fprintf(stderr, "Can not open %s\n", pathBuf);
continue; continue;
@ -81,7 +271,7 @@ int main(int argc, char *argv[]) {
if (strstr(ent->d_name, ".json") == NULL) { if (strstr(ent->d_name, ".json") == NULL) {
continue; continue;
} }
int id = 0; long id = 0;
char *p = ent->d_name; char *p = ent->d_name;
while (*p && *p != '-') { while (*p && *p != '-') {
id *= 10; id *= 10;
@ -91,31 +281,92 @@ int main(int argc, char *argv[]) {
char path[1028*2]; char path[1028*2];
sprintf(path, "%s/%s", pathBuf, ent->d_name); sprintf(path, "%s/%s", pathBuf, ent->d_name);
maxId = maxId > id ? maxId : id; maxId = maxId > id ? maxId : id;
Face *pFace = readFaceDescriptor(id, path); if (!readFaceDescriptor(&pFaces[entries], id, path)) {
if (!pFace) { fprintf(stderr, "Unable to read %s.\n", path);
continue; continue;
} }
len++; entries++;
if (len % 1000 == 0) { if (entries % 1000 == 0) {
fprintf(stderr, "...read %d...\n", len); fprintf(stderr, "...read %ld...\n", entries);
} }
if (pChain) {
pFace->next = pChain;
}
pChain = pFace;
} }
closedir(faceDir); closedir(faceDir);
} }
fprintf(stderr, "Read %d face descriptors...\n", len); fprintf(stderr, "Read %ld face descriptors...\n", entries);
/* Allocate storage for all distances */ long double total = 0.0;
Face *pLink = pChain; for (long i = 0; i < entries; i++) {
while (pLink) { Face *pLink = &pFaces[i];
pLink->distances = (long double *)malloc(sizeof(long double) * len); for (long j = 0; j < entries; j++) {
pLink = pLink->next; Face *pTarget = &pFaces[j];
if (i == j) {
pLink->distances[i] = 0.0L;
pTarget->distances[j] = 0.0L;
continue;
} }
if (pLink->distances[j] != 0.0L) {
continue;
}
pLink->distances[j] =
pTarget->distances[i] = euclideanDistance(pLink->descriptor, pTarget->descriptor);
total += pLink->distances[j];
}
}
fprintf(stderr, "Average distance: %Lf\n", (long double)1. * total / (entries * entries));
long int clusters = DBSCAN(pFaces, entries, 0.44L, 2);
long int undefined = 0, outlier = 0, core = 0, reachable = 0;
for (i = 0; i < entries; i++) {
switch (pFaces[i].clusterType) {
case NOISE:
outlier++;
break;
case UNDEFINED:
undefined++;
break;
case CORE:
core++;
break;
case EDGE:
reachable++;
break;
}
}
fprintf(stderr, "%ld clusters identified!\n", clusters);
fprintf(stderr, "%ld NOISE\n", outlier);
fprintf(stderr, "%ld UNDEFINED\n", undefined);
fprintf(stderr, "%ld CORE\n", core);
fprintf(stderr, "%ld EDGE\n", reachable);
fprintf(stdout, "<script>\nvar clusters = [\n");
for (long i = 1; i <= clusters; i++) {
long nodes = 0;
fprintf(stdout, "/* %ld. */ [", i);
for (long int j = 0; j < entries; j++) {
if (pFaces[j].clusterId == i) {
if (nodes == 0) {
fprintf(stdout, "%ld", pFaces[j].faceId);
} else {
fprintf(stdout, ",%ld", pFaces[j].faceId);
}
nodes++;
}
}
if (i < clusters) {
fprintf(stdout, "],\n");
} else {
fprintf(stdout, "]\n");
}
}
fprintf(stdout, "];\n</script>\n");
/* Allocate storage for all distances */
sqlite3 *db; sqlite3 *db;
int rc = sqlite3_open("db/photos.db", &db); int rc = sqlite3_open("db/photos.db", &db);
@ -143,24 +394,30 @@ int main(int argc, char *argv[]) {
fprintf(stderr, "facedistances deleted and transaction started.\n"); fprintf(stderr, "facedistances deleted and transaction started.\n");
char sqlBuf[1024]; char sqlBuf[1024];
pLink = pChain;
int sourceIndex = 0, lines = 0; int sourceIndex = 0, lines = 0;
while (pLink) { for (long i = 0; i < entries; i++) {
Face *pLink = &pFaces[i];
int targetIndex = 0; int targetIndex = 0;
Face *pTarget = pChain; for (long j = 0; j < entries; j++) {
while (pTarget) { Face *pTarget = &pFaces[j];
if (targetIndex == sourceIndex) { if (i == j) {
pLink->distances[targetIndex] = 0.0; pLink->distances[i] = 0.0L;
pTarget->distances[sourceIndex] = 0.0; pTarget->distances[j] = 0.0L;
} else { continue;
if (pLink->distances[targetIndex] == 0.0) { }
pLink->distances[targetIndex] =
pTarget->distances[sourceIndex] = euclideanDistance(pLink->descriptor, pTarget->descriptor); if (pLink->distances[j] != 0.0L) {
if (pLink->distances[targetIndex] < 0.5) { // continue;
sprintf(sqlBuf, "INSERT INTO facedistances (face1Id,face2Id,distance) VALUES (%d,%d,%Lf);", }
pLink->distances[j] =
pTarget->distances[i] = euclideanDistance(pLink->descriptor, pTarget->descriptor);
if (pLink->distances[j] < 0.5) {
sprintf(sqlBuf, "INSERT INTO facedistances (face1Id,face2Id,distance) VALUES (%ld,%ld,%Lf);",
((pLink->faceId < pTarget->faceId) ? pLink->faceId : pTarget->faceId), ((pLink->faceId < pTarget->faceId) ? pLink->faceId : pTarget->faceId),
((pLink->faceId < pTarget->faceId) ? pTarget->faceId : pLink->faceId), ((pLink->faceId < pTarget->faceId) ? pTarget->faceId : pLink->faceId),
pLink->distances[targetIndex]); pLink->distances[j]);
rc = sqlite3_exec(db, sqlBuf, 0, 0, &err_msg); rc = sqlite3_exec(db, sqlBuf, 0, 0, &err_msg);
if (rc != SQLITE_OK ) { if (rc != SQLITE_OK ) {
fprintf(stderr, "SQL error: %s\n", err_msg); fprintf(stderr, "SQL error: %s\n", err_msg);
@ -170,19 +427,14 @@ int main(int argc, char *argv[]) {
} }
lines++; lines++;
if (lines % 1000 == 0) { if (lines % 1000 == 0) {
fprintf(stderr, "...output %d DB lines (%0.2f complete)...\n", lines, (float)(1. * sourceIndex / (1. * len))); fprintf(stderr, "...output %d DB lines (%0.2f complete)...\n", lines,
(float)(1. * sourceIndex / (1. * entries)));
} }
} }
} }
} }
pTarget = pTarget->next;
targetIndex++;
}
pLink = pLink->next;
sourceIndex++;
}
sprintf(sqlBuf, "UPDATE faces SET lastComparedId=%d;", maxId); sprintf(sqlBuf, "UPDATE faces SET lastComparedId=%ld;", maxId);
rc = sqlite3_exec(db, "COMMIT;", 0, 0, &err_msg); rc = sqlite3_exec(db, "COMMIT;", 0, 0, &err_msg);
if (rc != SQLITE_OK ) { if (rc != SQLITE_OK ) {

View File

@ -55,7 +55,7 @@ function alignFromLandmarks(image, landmarks) {
ctx.rotate(rotation); ctx.rotate(rotation);
ctx.scale(scale, scale); ctx.scale(scale, scale);
ctx.drawImage(image, 0, 0); ctx.drawImage(image, 0, 0);
/*
ctx.strokeStyle = "red"; ctx.strokeStyle = "red";
ctx.strokeWidth = "1"; ctx.strokeWidth = "1";
ctx.beginPath(); ctx.beginPath();
@ -67,7 +67,7 @@ function alignFromLandmarks(image, landmarks) {
} }
}); });
ctx.stroke(); ctx.stroke();
*/
return canvas; return canvas;
} }
@ -140,20 +140,6 @@ require("./db/photos").then(function(db) {
const file = photo.path + photo.filename; const file = photo.path + photo.filename;
return canvas.loadImage(picturesPath + file).then(async (image) => { return canvas.loadImage(picturesPath + file).then(async (image) => {
const detectors = await faceapi.detectAllFaces(image,
new faceapi.SsdMobilenetv1Options({
minConfidence: 0.8
})
).withFaceLandmarks();
detectors.forEach(async (detector) => {
const canvas = alignFromLandmarks(image, detector.landmarks);
const descriptor = await faceapi.computeFaceDescriptor(canvas);
const data = [];
/* Confert from sparse object to dense array */
for (let i = 0; i < 128; i++) {
data.push(descriptor[i]);
}
const detectors = [ { const detectors = [ {
detection: { detection: {
_box: { _box: {
@ -172,7 +158,6 @@ require("./db/photos").then(function(db) {
/* This is a file */ /* This is a file */
console.log(`Loading ${file}...`); console.log(`Loading ${file}...`);
id = undefined; id = undefined;
loader = canvas.loadImage(picturesPath + file).then(async (image) => { loader = canvas.loadImage(picturesPath + file).then(async (image) => {
const detectors = await faceapi.detectAllFaces(image, const detectors = await faceapi.detectAllFaces(image,
new faceapi.SsdMobilenetv1Options({ new faceapi.SsdMobilenetv1Options({
@ -183,10 +168,19 @@ require("./db/photos").then(function(db) {
detectors.forEach(async (detector) => { detectors.forEach(async (detector) => {
const canvas = alignFromLandmarks(image, detector.landmarks); const canvas = alignFromLandmarks(image, detector.landmarks);
const descriptor = await faceapi.computeFaceDescriptor(canvas); const descriptor = await faceapi.computeFaceDescriptor(canvas);
fs.writeFileSync("rotation.png", canvas.toBuffer("image/png", {
quality: 0.95,
chromaSubsampling: false
}));
process.exit(-1);
// .withFaceDescriptors();
const data = []; const data = [];
/* Confert from sparse object to dense array */ /* Confert from sparse object to dense array */
for (let i = 0; i < 128; i++) { for (let i = 0; i < 128; i++) {
data.push(descriptor[i]); data.push(detector.descriptor[i]);
} }
detector.descriptor = data; detector.descriptor = data;
}); });