Clustering scanner

Signed-off-by: James Ketrenos <james_gitlab@ketrenos.com>
This commit is contained in:
James Ketrenos 2020-01-07 23:17:24 -08:00
parent f28293c3f1
commit 0ed48fd7f7
4 changed files with 371 additions and 88 deletions

34
clusters-pre Normal file
View File

@ -0,0 +1,34 @@
<html>
<script>'<base href="BASEPATH">';</script>
<script>
/*
 * Once the DOM is ready, render every cluster as a "Cluster N" heading
 * followed by one thumbnail tile per face id in that cluster.
 * `clusters` is not defined in this file; presumably it comes from the
 * scanner's generated `var clusters = [...]` script -- confirm.
 */
document.addEventListener("DOMContentLoaded", function () {
  /* Append a tile with class "face" showing the image for one face id. */
  function appendFaceTile(faceId) {
    var tile = document.createElement("div");
    /* The image URL places each face under the sub-directory (id % 100). */
    var bucket = faceId % 100;
    tile.classList.add("face");
    tile.style.backgroundImage = "url(face-data/" + bucket + "/" + faceId + "-original.png)";
    document.body.appendChild(tile);
  }

  clusters.forEach(function (members, position) {
    var heading = document.createElement("div");
    /* Headings are 1-based while array positions are 0-based. */
    heading.textContent = "Cluster " + (position + 1);
    document.body.appendChild(heading);
    members.forEach(function (faceId) {
      appendFaceTile(faceId);
    });
  });
});
</script>
<style>
/* Remove the default page margin and padding. */
body {
margin: 0;
padding: 0;
}
/* One face thumbnail tile: a fixed 128x128 box whose background image is
 * scaled to fit inside the box and centered; tiles flow inline. */
.face {
width: 128px;
height: 128px;
background-size: contain;
background-position: center center;
display: inline-block;
border: 1px solid black;
margin: 0.5em;
}
</style>

View File

@ -6,4 +6,7 @@ ifeq ("$(wildcard /usr/include/sqlite3.h)","")
endif
scanner: scanner.c
gcc -o scanner scanner.c -lm -lsqlite3
gcc -o scanner -g scanner.c -lm -lsqlite3
# Remove the built binary. -f keeps `make clean` from failing when the
# binary has not been built yet; .PHONY keeps a file named "clean" from
# masking the target.
.PHONY: clean
clean:
	rm -f scanner

View File

@ -6,35 +6,45 @@
#include <math.h>
#include <sqlite3.h>
/*
 * DBSCAN label attached to every face.
 *
 * UNDEFINED must stay 0: the face array is zero-filled before clustering,
 * and a zeroed label is what DBSCAN treats as "not visited yet".
 */
typedef enum {
  UNDEFINED = 0, /* not examined by DBSCAN yet */
  CORE,          /* 1: in a cluster and has at least minPts neighbors */
  EDGE,          /* 2: in a cluster but has fewer than minPts neighbors */
  NOISE          /* 3: had fewer than minPts neighbors when examined */
} ClusterTypes;
typedef struct Face {
long faceId;
long double descriptor[128];
int faceId;
long int clusterId;
ClusterTypes clusterType;
long double *distances;
struct Face *next;
struct Face *prev;
} Face;
/*
 * Node of a singly linked list of faces. The node only points at a Face;
 * freeChain() releases the nodes and never the faces themselves.
 */
typedef struct FaceLink FaceLink;
struct FaceLink {
  FaceLink *pNext; /* next node, or NULL at the end of the chain */
  Face *pFace;     /* face referenced by this node */
};
char fileBuf[5000];
char pathBuf[1028];
Face *readFaceDescriptor(int id, char *path) {
Face *readFaceDescriptor(Face *pFace, long id, char *path) {
FILE *f;
Face *pFace = (Face *)malloc(sizeof(Face));
memset(pFace, 0, sizeof(Face));
f = fopen(path, "r");
if (!f) {
free(pFace);
return NULL;
}
size_t s = fread(fileBuf, 1, sizeof(fileBuf), f);
fclose(f);
char *p = fileBuf;
fileBuf[s] = 0;
while (*p && *p != '-' && *p != '+' && (*p < '0' || *p > '9')) {
p++;
}
for (int i = 0; i < 128; i++) {
int i = 0;
for (i = 0; i < 128; i++) {
char *start = p;
while (*p && *p != ',' && *p != ']' && *p != ' ' && *p != '\n') {
p++;
@ -46,14 +56,17 @@ Face *readFaceDescriptor(int id, char *path) {
sscanf(start, "%Lf", &pFace->descriptor[i]);
}
if (i != 128) {
return NULL;
}
pFace->faceId = id;
pFace->next = pFace->prev = NULL;
return pFace;
}
long double euclideanDistance(long double *a, long double *b) {
long double sum = 0.0;
long double sum = 0.0L;
for (int i = 0; i < 128; i++) {
long double delta = a[i] - b[i];
sum += delta * delta;
@ -61,16 +74,193 @@ long double euclideanDistance(long double *a, long double *b) {
return sqrtl(sum);
}
int main(int argc, char *argv[]) {
int maxId = 0;
int len = 0;
int i;
Face *pChain = NULL;
for (i = 0; i < 100; i++) {
sprintf(pathBuf, "%s/face-data/%d", argv[1], i);
DIR *faceDir = opendir(pathBuf);
fprintf(stderr, "Reading %s...\n", pathBuf);
/* https://en.wikipedia.org/wiki/DBSCAN */
#if 0
DBSCAN(DB, distFunc, eps, minPts) {
C = 0 /* Cluster counter */
for each point P in database DB {
if label(P) != undefined then continue /* Previously processed in inner loop */
Neighbors N = RangeQuery(DB, distFunc, P, eps) /* Find neighbors */
if |N| < minPts then { /* Density check */
label(P) = Noise /* Label as Noise */
continue
}
C = C + 1 /* next cluster label */
label(P) = C /* Label initial point */
Seed set S = N \ {P} /* Neighbors to expand */
for each point Q in S { /* Process every seed point */
if label(Q) = Noise then label(Q) = C /* Change Noise to border point */
if label(Q) != undefined then continue /* Previously processed */
label(Q) = C /* Label neighbor */
Neighbors N = RangeQuery(DB, distFunc, Q, eps) /* Find neighbors */
if |N| >= minPts then { /* Density check */
S = S union N /* Add new neighbors to seed set */
}
}
}
}
RangeQuery(DB, distFunc, Q, eps) {
Neighbors = empty list
for each point P in database DB { /* Scan all points in the database */
if distFunc(Q, P) <= eps then { /* Compute distance and check epsilon */
Neighbors = Neighbors union {P} /* Add to result */
}
}
return Neighbors
}
#endif
/*
 * Build the list of faces whose pre-computed distance to pQ is <= eps.
 *
 * pFaces    - array of all faces
 * faceCount - number of entries in pFaces
 * pQ        - query face; pQ->distances[i] must hold the distance from pQ
 *             to pFaces[i] for every i in [0, faceCount)
 * eps       - inclusive distance threshold
 *
 * Faces with the same faceId as pQ (i.e. pQ itself) are never included.
 * New nodes are pushed at the head, so the result lists matches in reverse
 * array order. Returns NULL when nothing matches. The caller owns the
 * returned chain and releases it with freeChain().
 *
 * On allocation failure the process is terminated with an error message
 * rather than dereferencing a NULL pointer.
 */
FaceLink *RangeQuery(Face *pFaces, long int faceCount, Face *pQ, double eps) {
  FaceLink *pNeighbors = NULL;

  for (long int i = 0; i < faceCount; i++) {
    Face *pFace = &pFaces[i];

    if (pFace->faceId == pQ->faceId) {
      continue;
    }

    if (pQ->distances[i] <= eps) {
      FaceLink *pLink = malloc(sizeof(*pLink));
      if (!pLink) {
        fprintf(stderr, "Unable to allocate storage for neighbor link.\n");
        exit(EXIT_FAILURE);
      }
      /* Both members are assigned explicitly, so no zero-fill is needed. */
      pLink->pFace = pFace;
      pLink->pNext = pNeighbors;
      pNeighbors = pLink;
    }
  }

  return pNeighbors;
}
/*
 * Release every node of a FaceLink chain. The faces the nodes point at are
 * left untouched. A NULL chain is accepted and does nothing.
 */
void freeChain(FaceLink *pLink) {
  for (FaceLink *pFollowing; pLink != NULL; pLink = pFollowing) {
    /* Read the successor before the node is released. */
    pFollowing = pLink->pNext;
    free(pLink);
  }
}
/*
 * Count the nodes in a FaceLink chain. Returns 0 for a NULL chain.
 */
long int chainLength(FaceLink *pLink) {
  long int length = 0;

  for (FaceLink *pNode = pLink; pNode != NULL; pNode = pNode->pNext) {
    length++;
  }

  return length;
}
/*
 * Label every face with a DBSCAN cluster id and type.
 *
 * faces     - array of faces; each clusterType must start as UNDEFINED and
 *             each distances[] array must already be filled in
 * faceCount - number of entries in faces
 * eps       - neighborhood radius handed to RangeQuery()
 * minPts    - minimum number of neighbors for a face to count as dense
 *
 * Returns the number of clusters created; cluster ids run from 1 to that
 * number.
 *
 * Neighbor counts come from RangeQuery(), which leaves out the query face
 * itself, so minPts is compared against the number of *other* faces within
 * eps.
 *
 * The seed chain grows while it is being walked. A tail pointer is kept so
 * each append costs only the length of the appended chain instead of a walk
 * over everything still pending.
 */
long int DBSCAN(Face *faces, long int faceCount, double eps, int minPts) {
  long int C = 0; /* cluster counter */

  for (long int i = 0; i < faceCount; i++) {
    Face *pFace = &faces[i];

    /* Already labeled, either as noise or while expanding a cluster. */
    if (pFace->clusterType != UNDEFINED) {
      continue;
    }

    FaceLink *pNeighbors = RangeQuery(faces, faceCount, pFace, eps);
    if (chainLength(pNeighbors) < minPts) {
      /* Not dense enough to start a cluster; may later become an EDGE. */
      pFace->clusterType = NOISE;
      freeChain(pNeighbors);
      continue;
    }

    /* pFace starts a new cluster. */
    C++;
    pFace->clusterId = C;
    pFace->clusterType = CORE;

    /* Find the end of the seed chain once; it is kept current below. */
    FaceLink *pTail = pNeighbors;
    while (pTail && pTail->pNext) {
      pTail = pTail->pNext;
    }

    for (FaceLink *pLink = pNeighbors; pLink; pLink = pLink->pNext) {
      Face *pQ = pLink->pFace;

      /* Appended chains can contain the starting face again. */
      if (pQ->faceId == pFace->faceId) {
        continue;
      }

      /* Noise reachable from a dense face becomes an edge of this cluster. */
      if (pQ->clusterType == NOISE) {
        pQ->clusterId = C;
        pQ->clusterType = EDGE;
      }

      /* Anything labeled by now (including the case above) is finished. */
      if (pQ->clusterType != UNDEFINED) {
        continue;
      }

      pQ->clusterId = C;
      pQ->clusterType = EDGE;

      FaceLink *pSubNeighbors = RangeQuery(faces, faceCount, pQ, eps);
      if (chainLength(pSubNeighbors) >= minPts) {
        /* pQ is dense too: promote it and queue its neighbors. */
        pQ->clusterType = CORE;
        if (pSubNeighbors) {
          pTail->pNext = pSubNeighbors;
          while (pTail->pNext) {
            pTail = pTail->pNext;
          }
        }
      } else {
        freeChain(pSubNeighbors);
      }
    }

    /* Frees the original neighbors and every chain appended to them. */
    freeChain(pNeighbors);
  }

  return C;
}
/*
* 1. Count how many entries there are
* 2. Allocate storage to hold all entries
* 3. Read all entries into flat array
* 4. Allocate MxM matrix and pre-calculate distances
* 5. Perform DBSCAN across MxM matrix to cluster
*/
int main(int argc, char *argv[]) {
long maxId = 0;
long i;
long entries = 0;
for (i = 0; i < 100; i++) {
sprintf(pathBuf, "%s/face-data/%ld", argv[1], i);
DIR *faceDir = opendir(pathBuf);
if (!faceDir) {
continue;
}
struct dirent *ent;
while ((ent = readdir(faceDir)) != NULL) {
if (strstr(ent->d_name, ".json") == NULL) {
continue;
}
entries++;
}
closedir(faceDir);
}
Face *pFaces = malloc(sizeof(Face) * entries);
if (!pFaces) {
fprintf(stderr, "Unable to allocate storage face descriptors.");
return -1;
}
memset(pFaces, 0, sizeof(Face) * entries);
for (i = 0; i < entries; i++) {
pFaces[i].distances = malloc(sizeof(*pFaces[i].distances) * entries);
if (!pFaces[i].distances) {
fprintf(stderr, "Unable to allocate storage for distance dictionary.");
return -1;
}
memset(pFaces[i].distances, 0, sizeof(*pFaces[i].distances) * entries);
}
entries = 0;
for (i = 0; i < 100; i++) {
sprintf(pathBuf, "%s/face-data/%ld", argv[1], i);
DIR *faceDir = opendir(pathBuf);
// fprintf(stderr, "Reading %s...\n", pathBuf);
if (!faceDir) {
fprintf(stderr, "Can not open %s\n", pathBuf);
continue;
@ -81,7 +271,7 @@ int main(int argc, char *argv[]) {
if (strstr(ent->d_name, ".json") == NULL) {
continue;
}
int id = 0;
long id = 0;
char *p = ent->d_name;
while (*p && *p != '-') {
id *= 10;
@ -91,31 +281,92 @@ int main(int argc, char *argv[]) {
char path[1028*2];
sprintf(path, "%s/%s", pathBuf, ent->d_name);
maxId = maxId > id ? maxId : id;
Face *pFace = readFaceDescriptor(id, path);
if (!pFace) {
if (!readFaceDescriptor(&pFaces[entries], id, path)) {
fprintf(stderr, "Unable to read %s.\n", path);
continue;
}
len++;
if (len % 1000 == 0) {
fprintf(stderr, "...read %d...\n", len);
entries++;
if (entries % 1000 == 0) {
fprintf(stderr, "...read %ld...\n", entries);
}
if (pChain) {
pFace->next = pChain;
}
pChain = pFace;
}
closedir(faceDir);
}
fprintf(stderr, "Read %d face descriptors...\n", len);
fprintf(stderr, "Read %ld face descriptors...\n", entries);
/* Allocate storage for all distances */
Face *pLink = pChain;
while (pLink) {
pLink->distances = (long double *)malloc(sizeof(long double) * len);
pLink = pLink->next;
long double total = 0.0;
for (long i = 0; i < entries; i++) {
Face *pLink = &pFaces[i];
for (long j = 0; j < entries; j++) {
Face *pTarget = &pFaces[j];
if (i == j) {
pLink->distances[i] = 0.0L;
pTarget->distances[j] = 0.0L;
continue;
}
if (pLink->distances[j] != 0.0L) {
continue;
}
pLink->distances[j] =
pTarget->distances[i] = euclideanDistance(pLink->descriptor, pTarget->descriptor);
total += pLink->distances[j];
}
}
fprintf(stderr, "Average distance: %Lf\n", (long double)1. * total / (entries * entries));
long int clusters = DBSCAN(pFaces, entries, 0.44L, 2);
long int undefined = 0, outlier = 0, core = 0, reachable = 0;
for (i = 0; i < entries; i++) {
switch (pFaces[i].clusterType) {
case NOISE:
outlier++;
break;
case UNDEFINED:
undefined++;
break;
case CORE:
core++;
break;
case EDGE:
reachable++;
break;
}
}
fprintf(stderr, "%ld clusters identified!\n", clusters);
fprintf(stderr, "%ld NOISE\n", outlier);
fprintf(stderr, "%ld UNDEFINED\n", undefined);
fprintf(stderr, "%ld CORE\n", core);
fprintf(stderr, "%ld EDGE\n", reachable);
fprintf(stdout, "<script>\nvar clusters = [\n");
for (long i = 1; i <= clusters; i++) {
long nodes = 0;
fprintf(stdout, "/* %ld. */ [", i);
for (long int j = 0; j < entries; j++) {
if (pFaces[j].clusterId == i) {
if (nodes == 0) {
fprintf(stdout, "%ld", pFaces[j].faceId);
} else {
fprintf(stdout, ",%ld", pFaces[j].faceId);
}
nodes++;
}
}
if (i < clusters) {
fprintf(stdout, "],\n");
} else {
fprintf(stdout, "]\n");
}
}
fprintf(stdout, "];\n</script>\n");
/* Allocate storage for all distances */
sqlite3 *db;
int rc = sqlite3_open("db/photos.db", &db);
@ -143,46 +394,47 @@ int main(int argc, char *argv[]) {
fprintf(stderr, "facedistances deleted and transaction started.\n");
char sqlBuf[1024];
pLink = pChain;
int sourceIndex = 0, lines = 0;
while (pLink) {
for (long i = 0; i < entries; i++) {
Face *pLink = &pFaces[i];
int targetIndex = 0;
Face *pTarget = pChain;
while (pTarget) {
if (targetIndex == sourceIndex) {
pLink->distances[targetIndex] = 0.0;
pTarget->distances[sourceIndex] = 0.0;
} else {
if (pLink->distances[targetIndex] == 0.0) {
pLink->distances[targetIndex] =
pTarget->distances[sourceIndex] = euclideanDistance(pLink->descriptor, pTarget->descriptor);
if (pLink->distances[targetIndex] < 0.5) {
sprintf(sqlBuf, "INSERT INTO facedistances (face1Id,face2Id,distance) VALUES (%d,%d,%Lf);",
((pLink->faceId < pTarget->faceId) ? pLink->faceId : pTarget->faceId),
((pLink->faceId < pTarget->faceId) ? pTarget->faceId : pLink->faceId),
pLink->distances[targetIndex]);
rc = sqlite3_exec(db, sqlBuf, 0, 0, &err_msg);
if (rc != SQLITE_OK ) {
fprintf(stderr, "SQL error: %s\n", err_msg);
sqlite3_free(err_msg);
sqlite3_close(db);
return 1;
}
lines++;
if (lines % 1000 == 0) {
fprintf(stderr, "...output %d DB lines (%0.2f complete)...\n", lines, (float)(1. * sourceIndex / (1. * len)));
}
}
for (long j = 0; j < entries; j++) {
Face *pTarget = &pFaces[j];
if (i == j) {
pLink->distances[i] = 0.0L;
pTarget->distances[j] = 0.0L;
continue;
}
if (pLink->distances[j] != 0.0L) {
// continue;
}
pLink->distances[j] =
pTarget->distances[i] = euclideanDistance(pLink->descriptor, pTarget->descriptor);
if (pLink->distances[j] < 0.5) {
sprintf(sqlBuf, "INSERT INTO facedistances (face1Id,face2Id,distance) VALUES (%ld,%ld,%Lf);",
((pLink->faceId < pTarget->faceId) ? pLink->faceId : pTarget->faceId),
((pLink->faceId < pTarget->faceId) ? pTarget->faceId : pLink->faceId),
pLink->distances[j]);
rc = sqlite3_exec(db, sqlBuf, 0, 0, &err_msg);
if (rc != SQLITE_OK ) {
fprintf(stderr, "SQL error: %s\n", err_msg);
sqlite3_free(err_msg);
sqlite3_close(db);
return 1;
}
lines++;
if (lines % 1000 == 0) {
fprintf(stderr, "...output %d DB lines (%0.2f complete)...\n", lines,
(float)(1. * sourceIndex / (1. * entries)));
}
}
pTarget = pTarget->next;
targetIndex++;
}
pLink = pLink->next;
sourceIndex++;
}
sprintf(sqlBuf, "UPDATE faces SET lastComparedId=%d;", maxId);
sprintf(sqlBuf, "UPDATE faces SET lastComparedId=%ld;", maxId);
rc = sqlite3_exec(db, "COMMIT;", 0, 0, &err_msg);
if (rc != SQLITE_OK ) {

View File

@ -55,7 +55,7 @@ function alignFromLandmarks(image, landmarks) {
ctx.rotate(rotation);
ctx.scale(scale, scale);
ctx.drawImage(image, 0, 0);
/*
ctx.strokeStyle = "red";
ctx.strokeWidth = "1";
ctx.beginPath();
@ -67,7 +67,7 @@ function alignFromLandmarks(image, landmarks) {
}
});
ctx.stroke();
*/
return canvas;
}
@ -140,20 +140,6 @@ require("./db/photos").then(function(db) {
const file = photo.path + photo.filename;
return canvas.loadImage(picturesPath + file).then(async (image) => {
const detectors = await faceapi.detectAllFaces(image,
new faceapi.SsdMobilenetv1Options({
minConfidence: 0.8
})
).withFaceLandmarks();
detectors.forEach(async (detector) => {
const canvas = alignFromLandmarks(image, detector.landmarks);
const descriptor = await faceapi.computeFaceDescriptor(canvas);
const data = [];
/* Convert from sparse object to dense array */
for (let i = 0; i < 128; i++) {
data.push(descriptor[i]);
}
const detectors = [ {
detection: {
_box: {
@ -172,7 +158,6 @@ require("./db/photos").then(function(db) {
/* This is a file */
console.log(`Loading ${file}...`);
id = undefined;
loader = canvas.loadImage(picturesPath + file).then(async (image) => {
const detectors = await faceapi.detectAllFaces(image,
new faceapi.SsdMobilenetv1Options({
@ -183,10 +168,19 @@ require("./db/photos").then(function(db) {
detectors.forEach(async (detector) => {
const canvas = alignFromLandmarks(image, detector.landmarks);
const descriptor = await faceapi.computeFaceDescriptor(canvas);
fs.writeFileSync("rotation.png", canvas.toBuffer("image/png", {
quality: 0.95,
chromaSubsampling: false
}));
process.exit(-1);
// .withFaceDescriptors();
const data = [];
/* Convert from sparse object to dense array */
for (let i = 0; i < 128; i++) {
data.push(descriptor[i]);
data.push(detector.descriptor[i]);
}
detector.descriptor = data;
});