import sys
import json
import os
import piexif
import sqlite3
from sqlite3 import Error
from PIL import Image
import numpy as np
import functools

from ketrface.util import *
from ketrface.dbscan import *
from ketrface.db import *
from ketrface.config import *

config = read_config()
html_path = merge_config_path(config['path'], 'frontend')
pictures_path = merge_config_path(config['path'], config['picturesPath'])
faces_path = merge_config_path(config['path'], config['facesPath'])
db_path = merge_config_path(config['path'], config["db"]["photos"]["host"])
html_base = config['basePath']

# TODO
# Switch to using DBSCAN
#
# Thoughts for determining number of clusters to try and target...
#
# Augment DBSCAN to rule out identity matching for the same face
# appearing more than once in a photo
#
# NOTE: This means twins or reflections won't both identify in the
# same photo -- those faces would then identify as a second face pairing,
# which could merge with a cluster, but can not be used to match

def gen_html(identities):
    # NOTE: the HTML markup below was reconstructed; the tag structure is
    # representative and the class names are illustrative placeholders.
    for identity in identities:
        print('<div class="identity">')
        print(f'<div class="title">Identity {identity["id"]} has {len(identity["faces"])}</div>')
        print('<div class="faces">')
        for face in identity['faces']:
            faceId = face['id']
            photoId = face['photoId']
            distance = "{:0.4f}".format(face['distance'])
            confidence = "{:0.3f}".format(face['confidence'])
            focus = int(face['focus'])
            label = face['cluster']
            if not isinstance(label, str):
                label = f'Cluster ({face["cluster"]["id"]})'
            print('<div class="face">')
            # Face thumbnails are sharded into sub-directories by faceId % 100
            path = f'{html_base}/faces/{"{:02d}".format(faceId % 100)}'
            # The <img> filename pattern is a guess; only the path prefix
            # survives in the original
            print(f'<img src="{path}/{faceId}.jpg">')
            print(f'<div class="label">{label}: {distance}</div>')
            print(f'<div class="info">{faceId} {photoId} {confidence} {focus}</div>')
            print('</div>')
        print('</div>')
        print('</div>')
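# A sketch of the same-photo constraint described in the TODO at the top of
# this file: a hypothetical predicate (not wired into DBSCAN yet) that the
# clustering pass could consult before treating two faces as candidates for
# the same identity. The name and shape here are illustrative, not an
# existing ketrface API.
def could_be_same_identity(face_a, face_b):
    # Two detections in one photo cannot be the same physical person
    # (modulo twins/reflections, per the NOTE above), so refuse to pair them
    return face_a['photoId'] != face_b['photoId']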
def update_cluster_averages(identities):
    for identity in identities:
        average = []
        for face in identity['faces']:
            if len(average) == 0:
                average = face['descriptors']
            else:
                average = np.add(average, face['descriptors'])
        average = np.divide(average, len(identity['faces']))
        identity['descriptors'] = average
    return identities

def load_faces(db_path = db_path):
    print(f'Connecting to database: {db_path}')
    conn = create_connection(db_path)
    faces = []
    with conn:
        print('Querying faces')
        cur = conn.cursor()
        res = cur.execute('''
            SELECT faces.id, facedescriptors.descriptors, faces.faceConfidence,
                   faces.photoId, faces.focus
            FROM faces
            JOIN facedescriptors ON (faces.descriptorId = facedescriptors.id)
            WHERE faces.identityId IS NULL AND faces.faceConfidence > 0.99
        ''')
        for row in res.fetchall():
            id, descriptors, confidence, photoId, focus = row
            if focus is None:
                focus = 100 # Assume full focus if focus not set
            face = {
                'id': id,
                'type': 'face',
                'confidence': confidence,
                'distance': 0,
                'photoId': photoId,
                'descriptors': np.frombuffer(descriptors),
                'cluster': Undefined,
                'focus': focus
            }
            # Each face starts as its own singleton cluster
            face['faces'] = [ face ]
            faces.append(face)
    return faces

def update_distances(identities, prune = False):
    removed = 0
    for identity in identities:
        # Iterate over a copy, since pruning removes faces from the live list
        for face in list(identity['faces']):
            average = identity['descriptors']
            distance = findCosineDistance(average, face['descriptors'])
            if prune and distance > MAX_EPOCH_DISTANCE:
                # Remove this face's contribution from the running average:
                # scale back up by the old count, subtract, divide by the new
                average = np.dot(average, len(identity['faces']))
                average = np.subtract(average, face['descriptors'])
                face['cluster'] = Undefined
                face['distance'] = 0
                identity['faces'].remove(face)
                identity['descriptors'] = np.divide(average, len(identity['faces']))
                removed += 1
            else:
                face['distance'] = distance
    return removed

def sort_identities(identities):
    identities.sort(reverse = True, key = lambda x: len(x['faces']))
    for identity in identities:
        identity['faces'].sort(reverse = False, key = lambda x: x['distance'])

# Comparator ordering faces by cluster, then confidence (currently unused;
# usable via functools.cmp_to_key)
def cluster_sort(A, B):
    diff = A['cluster'] - B['cluster']
    if diff > 0:
        return 1
    elif diff < 0:
        return -1
    diff = A['confidence'] - B['confidence']
    if diff > 0:
        return 1
    elif diff < 0:
        return -1
    return 0

def build_straglers(faces):
    noise = []
    undefined = []
    for face in faces:
        if face['cluster'] == Noise:
            noise.append(face)
        elif face['cluster'] == Undefined:
            undefined.append(face)
    return noise + undefined

print('Loading faces from database')
faces = load_faces()
print(f'{len(faces)} faces loaded')

print('Scanning for clusters')
identities = DBSCAN(faces) # process_faces(faces)
print(f'{len(identities)} clusters grouped')

MAX_CLUSTER_DISTANCE = 0.15 # Used to merge clusters
MAX_EPOCH_DISTANCE = 0.14 # Used to prune outliers

# Compute average center for all clusters
identities = update_cluster_averages(identities)

removed = -1
epoch = 1
# Filter each cluster, removing any face that is > MAX_EPOCH_DISTANCE
# from the average center point of the cluster
while removed != 0:
    print(f'Epoch {epoch}...')
    epoch += 1
    removed = update_distances(identities, prune = True)
    if removed > 0:
        print(f'Excluded {removed} faces this epoch')

print(f'{len(identities)} identities seeded.')

# Cluster the clusters...
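# For reference: findCosineDistance comes from the ketrface.util star import
# at the top of the file. It is assumed to compute the standard cosine
# distance over descriptor vectors -- a minimal sketch of that metric:
#
#   def cosine_distance(a, b):
#       a, b = np.asarray(a), np.asarray(b)
#       return 1 - np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
#
# A distance of 0 means identical direction, so MAX_EPOCH_DISTANCE (0.14)
# and MAX_CLUSTER_DISTANCE (0.15) demand nearly-parallel descriptors.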
print('Reducing clusters via DBSCAN')
reduced = DBSCAN(identities, eps = MAX_CLUSTER_DISTANCE, minPts = 2)
if len(reduced) == 0:
    reduced = identities

# For each cluster, merge the lists of faces referenced in the cluster's
# "faces" field, which points to clusters (and not actual faces)
for cluster in reduced:
    merged = []
    for identity in cluster['faces']:
        merged = merged + identity['faces']
    cluster['faces'] = merged

# Create a set containing those faces which have not been bound
# to an identity, to recluster them in isolation from the rest of
# the faces
straglers = build_straglers(faces)
reduced = reduced + DBSCAN(straglers)

# Build a final cluster with all remaining uncategorized faces
# (currently disabled)
if False:
    remaining_cluster = {
        'id': len(reduced) + 1,
        'distance': 0,
        'descriptors': [],
        'cluster': Undefined,
        'faces': []
    }
    straglers = build_straglers(faces)
    for face in straglers:
        face['cluster'] = remaining_cluster
        remaining_cluster['faces'].append(face)
    reduced.append(remaining_cluster)

# Give all merged identity lists a unique ID
for id, identity in enumerate(reduced):
    identity['id'] = id
    for face in identity['faces']:
        face['cluster'] = identity

reduced = update_cluster_averages(reduced)
update_distances(reduced)
sort_identities(reduced)

# This generates a set of differences between clusters and makes
# a recommendation to merge clusters (outside of DBSCAN)
#
# Worth testing on a larger data set
for i, A in enumerate(reduced):
    for k, B in enumerate(reduced):
        if k <= i: # Skip self and already-compared pairs
            continue
        distance = findCosineDistance(A['descriptors'], B['descriptors'])
        if distance < MAX_CLUSTER_DISTANCE:
            distance = "{:0.4f}".format(distance)
            print(f'{A["id"]} to {B["id"]} = {distance}: MERGE')

print('Writing to "auto-clusters.html"')
redirect_on(os.path.join(html_path, 'auto-clusters.html'))
gen_html(reduced)
redirect_off()

def create_identity(conn, identity):
    """
    Create a new identity in the identities table
    :param conn:
    :param identity:
    :return: identity id
    """
    sql = ''' INSERT INTO identities(descriptors,displayName)
              VALUES(?,?) '''
    cur = conn.cursor()
    cur.execute(sql, (
        # Store as raw bytes so load_faces() can read the descriptors back
        # with np.frombuffer; sqlite3 cannot bind a numpy array directly
        np.array(identity['descriptors']).tobytes(),
        f'cluster-{identity["id"]}'
    ))
    conn.commit()
    return cur.lastrowid

def update_face_identity(conn, faceId, identityId = None):
    """
    Update the identity associated with this face
    :param conn:
    :param faceId:
    :param identityId:
    :return: None
    """
    sql = ''' UPDATE faces SET identityId=? WHERE id=? '''
    cur = conn.cursor()
    cur.execute(sql, (identityId, faceId))
    conn.commit()
    return None

print(f'Connecting to database: {db_path}')
conn = create_connection(db_path)
with conn:
    for identity in reduced:
        id = create_identity(conn, identity)
        for face in identity['faces']:
            update_face_identity(conn, face['id'], id)
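# A quick sanity check after a run (uses the stock sqlite3 CLI; table and
# column names match the queries above, and 'photos.db' is a placeholder
# for whatever db_path resolves to):
#
#   sqlite3 photos.db "SELECT identityId, COUNT(*) FROM faces
#                      WHERE identityId IS NOT NULL
#                      GROUP BY identityId ORDER BY COUNT(*) DESC"
#
# Large counts at the top should correspond to the biggest clusters in
# auto-clusters.html.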