From 7077862382ea148ab7158c622fec126dcc178a2b Mon Sep 17 00:00:00 2001 From: James Ketrenos Date: Fri, 28 Sep 2018 12:26:57 -0700 Subject: [PATCH] Prevent duplicate hash entries in hash transaction --- server/scanner.js | 101 +++++++++++++++++++++++++++------------------- 1 file changed, 60 insertions(+), 41 deletions(-) diff --git a/server/scanner.js b/server/scanner.js index c6e26a6..3bcb34d 100755 --- a/server/scanner.js +++ b/server/scanner.js @@ -212,55 +212,74 @@ function processBlock(items) { lastMessage = moment(); } }); -}, { - concurrency: 5 + }, { + concurrency: 10 }).then(function() { let toProcess = processing.length, lastMessage = moment(); - /* Needs to be one at a time in case there are multiple HASH collisions */ - return photoDB.sequelize.transaction(function(transaction) { - return Promise.mapSeries(processing, function(asset) { - return photoDB.sequelize.query("SELECT photohashes.*,photos.filename,albums.path FROM photohashes " + - "LEFT JOIN photos ON (photos.id=photohashes.photoId) " + - "LEFT JOIN albums ON (albums.id=photos.albumId) " + - "WHERE hash=:hash OR photoId=:id", { - replacements: asset, - type: photoDB.sequelize.QueryTypes.SELECT - }).then(function(results) { - let query; - if (results.length == 0) { - query = "INSERT INTO photohashes (hash,photoId) VALUES(:hash,:id)"; - } else if (results[0].hash != asset.hash) { - query = "UPDATE photohashes SET hash=:hash WHERE photoId=:id)"; - } else if (results[0].photoId != asset.id) { - console.log("Duplicate asset: " + - "'" + asset.album.path + asset.filename + "' is a copy of " + - "'" + results[0].path + results[0].filename + "'"); - duplicates.push(asset); - return; - } - - /* Even if the hash doesn't need to be updated, the entry needs to be scanned */ - needsProcessing.push(asset); - - if (!query) { - return; - } - - return photoDB.sequelize.query(query, { + /* Needs to be one at a time in case there are multiple HASH collisions. 
To speed + * up commits to the DB we will batch these into 100 record transactions where HASH + * collisions are done via DB query *AND* in-memory table lookup in the current batch */ + let batchSize = 100, batches = []; + while (processing.length) { + batches.push(processing.splice(0, batchSize)); + } + return Promise.mapSeries(batches, function(batch) { + return photoDB.sequelize.transaction(function(transaction) { + return Promise.mapSeries(batch, function(asset, index) { + return photoDB.sequelize.query("SELECT photohashes.*,photos.filename,albums.path FROM photohashes " + + "LEFT JOIN photos ON (photos.id=photohashes.photoId) " + + "LEFT JOIN albums ON (albums.id=photos.albumId) " + + "WHERE hash=:hash OR photoId=:id", { replacements: asset, - transaction: transaction + type: photoDB.sequelize.QueryTypes.SELECT + }).then(function(results) { + let query; + + /* If this asset exists in this transaction block, push it into the results */ + for (let i = 0; i < index; i++) { + if (batch[i].hash == asset.hash) { + results.push(batch[i]); + } + } + + if (results.length == 0) { + query = "INSERT INTO photohashes (hash,photoId) VALUES(:hash,:id)"; + } else if (results[0].hash != asset.hash) { + query = "UPDATE photohashes SET hash=:hash WHERE photoId=:id)"; + } else if (results[0].photoId != asset.id) { + console.log("Duplicate asset: " + + "'" + asset.album.path + asset.filename + "' is a copy of " + + "'" + results[0].path + results[0].filename + "'"); + duplicates.push(asset); + return; + } + + /* Even if the hash doesn't need to be updated, the entry needs to be scanned */ + needsProcessing.push(asset); + + if (!query) { + return; + } + + return photoDB.sequelize.query(query, { + replacements: asset, + transaction: transaction + }); + }).then(function() { + toProcess--; + if (moment().add(-5, 'seconds') > lastMessage) { + console.log("Hash items to be checked: " + toProcess); + lastMessage = moment(); + } }); - }).then(function() { - toProcess--; - if
(moment().add(-5, 'seconds') > lastMessage) { - console.log("Hash items to be checked: " + toProcess); - lastMessage = moment(); - } }); }); + }).catch(function(error) { + console.log("Error commiting HASH transactions"); + throw error; }); }).then(function() { - let toProcess = processing.length, lastMessage = moment(); + let toProcess = needsProcessing.length, lastMessage = moment(); console.log(needsProcessing.length + " assets need to have metadata extracted"); return Promise.map(needsProcessing, function(asset) { var path = asset.album.path,