ketr.photos/server/scanner.js
James Ketrenos 8b89ce7507 Switch to using transactions
Signed-off-by: James Ketrenos <james_git@ketrenos.com>
2018-09-28 11:55:48 -07:00

742 lines
24 KiB
JavaScript
Executable File

"use strict";
const Promise = require("bluebird"),
fs = require("fs"),
config = require("config"),
moment = require("moment"),
crypto = require("crypto");
let photoDB = null;
const picturesPath = config.get("picturesPath").replace(/\/$/, "") + "/";
let processQueue = [], triedClean = [], lastScan;
//const rawExtension = /\.(nef|orf)$/i, extensions = [ "jpg", "jpeg", "png", "gif", "nef", "orf" ];
const rawExtension = /\.nef$/i, extensions = [ "jpg", "jpeg", "png", "gif", "nef" ];
/**
 * Compare the modification times of two files in the same directory and move
 * the newer one into that directory's "corrupt" folder (via moveCorrupt).
 * When fileA is not strictly newer than fileB, fileB is the one moved.
 *
 * Fire-and-forget: nothing is returned and failures are only logged, so a
 * problem with one pair of files never interrupts the caller.
 *
 * @param {string} path  Directory holding both files; it is concatenated
 *                       directly with the file names, so it must end in "/".
 * @param {string} fileA First file name, relative to path.
 * @param {string} fileB Second file name, relative to path.
 */
function removeNewerFile(path, fileA, fileB) {
  fs.stat(path + fileA, function(errA, statsA) {
    if (errA) {
      console.warn("Unable to stat " + path + fileA + ": ", errA);
      return;
    }
    fs.stat(path + fileB, function(errB, statsB) {
      if (errB) {
        console.warn("Unable to stat " + path + fileB + ": ", errB);
        return;
      }
      const newer = (statsA.mtime > statsB.mtime) ? fileA : fileB;
      console.log("Removing file by moving to 'corrupt':" + newer);
      /* moveCorrupt returns a promise; without a handler a failed move would
       * surface as an unhandled rejection. */
      moveCorrupt(path, newer).catch(function(error) {
        console.error("Unable to move " + path + newer + " to 'corrupt': ", error);
      });
    });
  });
}
/* True while processBlock() is working through a batch; processBlock() checks
 * it so that only one batch is processed at a time. */
let processRunning = false;
const { spawn } = require('child_process');
const sharp = require("sharp"), exif = require("exif-reader");
/**
 * Promise wrapper around fs.stat() for a path inside the picture library.
 *
 * The argument may be relative to picturesPath or already begin with it;
 * in the latter case the leading picturesPath.length characters are dropped
 * before picturesPath is prepended again.
 *
 * @param {string} _path Path to inspect.
 * @returns {Promise} Resolves with the fs.Stats object, rejects with the
 *                    error reported by fs.stat().
 */
const stat = function (_path) {
  const root = picturesPath.replace(/\/$/, "");
  const relative = (_path.indexOf(root) == 0)
    ? _path.substring(picturesPath.length)
    : _path;
  const target = picturesPath + relative;

  return new Promise(function (resolve, reject) {
    fs.stat(target, function (error, stats) {
      if (error) {
        reject(error);
      } else {
        resolve(stats);
      }
    });
  });
};
/**
 * Create a directory (and any missing ancestors) below picturesPath.
 *
 * The argument may be relative to picturesPath or already start with it.
 * Each path component is checked with stat() in order, and created with
 * fs.mkdir() only when stat() reports ENOENT.
 *
 * @param {string} _path Directory to create.
 * @returns {Promise} Settles once every component has been checked/created;
 *                    rejects on any stat/mkdir error other than the ones
 *                    tolerated below.
 */
const mkdir = function (_path) {
  const relative = (_path.indexOf(picturesPath) == 0)
    ? _path.substring(picturesPath.length)
    : _path;
  const parts = relative.split("/");
  let path;

  /* The first element stands for the library root itself. */
  parts.unshift(picturesPath);

  return Promise.mapSeries(parts, function (part) {
    if (!path) {
      path = picturesPath.replace(/\/$/, "");
    } else {
      path += "/" + part;
    }

    /* Capture the value for the asynchronous callbacks below. */
    const current = path;

    return stat(current).catch(function (error) {
      if (error.code != "ENOENT") {
        throw error;
      }

      return new Promise(function (resolve, reject) {
        fs.mkdir(current, function (error) {
          /* EEXIST means something else created the directory between the
           * stat() above and this call; the directory exists, which is all
           * the caller asked for. */
          if (error && error.code != "EEXIST") {
            return reject(error);
          }
          return resolve();
        });
      });
    });
  });
};
/**
 * Report whether a path can be stat'ed.
 *
 * @param {string} path Path handed to stat().
 * @returns {Promise<boolean>} true when stat() succeeds, false when it
 *                             fails for any reason; never rejects.
 */
const exists = function(path) {
  const found = function() {
    return true;
  };
  const missing = function() {
    return false;
  };

  return stat(path).then(found).catch(missing);
};
/**
 * Convert a RAW file to JPG with the external "ufraw-batch" tool, then move
 * the RAW original into the "raw/" sub-directory next to it.
 *
 * If the JPG already exists the conversion is skipped. If ufraw-batch exits
 * with a non-zero code or a signal, the RAW file is moved to "corrupt/" and
 * the returned promise rejects.
 *
 * @param {string} path Album directory relative to picturesPath (ends in "/").
 * @param {string} file RAW file name inside that directory.
 * @returns {Promise} Resolves once the file is converted and the RAW moved
 *                    (or the conversion was skipped); rejects on failure.
 */
function convertRawToJpg(path, file) {
  console.log("Converting " + path + file);

  const dir = picturesPath + path,
    jpg = dir + file.replace(rawExtension, ".jpg");

  /* Return the chain itself. Wrapping it in another `new Promise` whose
   * resolve/reject are never called would leave the caller waiting forever. */
  return exists(jpg).then(function(exist) {
    if (exist) {
      console.log("Skipping already converted file: " + file);
      return;
    }

    return new Promise(function(resolve, reject) {
      const ufraw = spawn("ufraw-batch", [
        "--silent",
        "--wb=camera",
        "--rotate=camera",
        "--out-type=jpg",
        "--compression=90",
        "--exif",
        "--overwrite",
        "--output", jpg,
        dir + file
      ]);

      const stderr = [];
      let failedToStart = false;

      ufraw.stderr.on('data', function(data) {
        stderr.push(data);
      });

      /* Emitted when the process could not be spawned (e.g. ufraw-batch is
       * not installed). The RAW file itself is not at fault, so it is left
       * where it is. */
      ufraw.on('error', function(error) {
        failedToStart = true;
        console.error("Unable to run ufraw-batch for " + dir + file + ": ", error);
        reject(error);
      });

      ufraw.on('exit', function(code, signal) {
        if (failedToStart) {
          return;
        }

        if (signal || code != 0) {
          const message = "UFRAW for " + dir + file + " returned an error: " + code + "\n" + signal + "\n" + stderr.join("\n") + "\n";
          console.error(message);
          return moveCorrupt(dir, file).then(function() {
            console.warn("ufraw failed");
            return reject(new Error(message));
          }).catch(function(error) {
            console.warn("moveCorrupt failed");
            return reject(error);
          });
        }

        return mkdir(dir + "raw").then(function() {
          fs.rename(dir + file, dir + "raw/" + file, function(err) {
            if (err) {
              console.error("Unable to move RAW file: " + dir + file);
              return reject(err);
            }
            return resolve();
          });
        }).catch(function(error) {
          console.warn("mkdir failed");
          return reject(error);
        });
      });
    });
  });
}
/**
 * Move a file into the "corrupt/" sub-directory of its own directory,
 * creating that sub-directory first when needed.
 *
 * @param {string} path Directory of the file, either relative to
 *                      picturesPath or already prefixed with it (ends in "/").
 * @param {string} file File name inside that directory.
 * @returns {Promise} Resolves after the rename; rejects if the directory
 *                    cannot be created or the rename fails.
 */
function moveCorrupt(path, file) {
  const dir = (path.indexOf(picturesPath) != 0) ? picturesPath + path : path;

  console.warn("Moving corrupt file '" + file + "' to " + dir + "corrupt");

  const renameIntoCorrupt = function() {
    return new Promise(function(resolve, reject) {
      fs.rename(dir + file, dir + "corrupt/" + file, function(err) {
        if (!err) {
          resolve();
          return;
        }
        console.error("Unable to move corrupt file: " + dir + file);
        reject(err);
      });
    });
  };

  return mkdir(dir + "corrupt").then(renameIntoCorrupt);
}
/**
 * Drain the processing queue: hash every queued asset, record/compare the
 * hashes in the DB, extract metadata and build thumbnails, mark duplicates,
 * and finally mark assets that vanished from disk as deleted.
 *
 * Only one batch runs at a time. When a batch is already running the items
 * are queued and the function re-arms itself with a one second timer.
 *
 * Stages (run in order):
 *  1. computeHash() for each asset (5 at a time).
 *  2. Inside one DB transaction, one asset at a time: insert or update the
 *     photohashes row, or flag the asset as a duplicate of an existing hash.
 *  3. For each non-duplicate: convert RAW to JPG when needed, read image
 *     metadata, derive the "taken" date, write thumbs/ and thumbs/scaled/,
 *     and update the photos row.
 *  4. Flag duplicates in the photos table.
 *  5. Mark photos not scanned since lastScan and missing on disk as deleted.
 *
 * @param {Array} [items] Asset objects to append to the queue. Each asset is
 *        read for: id, filename, duplicate, album.path, album.id, stats.mtime.
 * @returns {Promise|undefined} Promise for the whole batch, or undefined when
 *        a batch is already running and the call was only queued.
 */
function processBlock(items) {
  if (items) {
    processQueue = processQueue.concat(items);
  }

  if (processRunning) {
    /* Invoke once per second to check if there are items to process */
    setTimeout(processBlock, 1000);
    return;
  }

  let processing = processQueue.splice(0), needsProcessing = [], duplicates = [];

  processRunning = true;

  /* Ascending mtime, i.e. the oldest files are processed first.
   * NOTE(review): the comment that used to sit here said "newest first";
   * the comparator has always sorted oldest first. Confirm which is wanted. */
  processing.sort(function(a, b) {
    return a.stats.mtime - b.stats.mtime;
  });

  let toProcess = processing.length, lastMessage = moment();
  return Promise.map(processing, function(asset) {
    return computeHash(picturesPath + asset.album.path + asset.filename).then(function(hash) {
      asset.hash = hash;
    }).then(function() {
      toProcess--;
      if (moment().add(-5, 'seconds') > lastMessage) {
        console.log("Hash items to be processed: " + toProcess);
        lastMessage = moment();
      }
    });
  }, {
    concurrency: 5
  }).then(function() {
    let toProcess = processing.length, lastMessage = moment();
    /* Needs to be one at a time in case there are multiple HASH collisions */
    return photoDB.sequelize.transaction(function(transaction) {
      return Promise.mapSeries(processing, function(asset) {
        return photoDB.sequelize.query("SELECT photohashes.*,photos.filename,albums.path FROM photohashes " +
          "LEFT JOIN photos ON (photos.id=photohashes.photoId) " +
          "LEFT JOIN albums ON (albums.id=photos.albumId) " +
          "WHERE hash=:hash OR photoId=:id", {
          replacements: asset,
          type: photoDB.sequelize.QueryTypes.SELECT,
          /* Read inside the transaction so hashes inserted earlier in this
           * same batch are visible to later assets. */
          transaction: transaction
        }).then(function(results) {
          let query;

          if (results.length == 0) {
            query = "INSERT INTO photohashes (hash,photoId) VALUES(:hash,:id)";
          } else if (results[0].hash != asset.hash) {
            query = "UPDATE photohashes SET hash=:hash WHERE photoId=:id";
          } else if (results[0].photoId != asset.id) {
            console.log("Duplicate asset: " +
              "'" + asset.album.path + asset.filename + "' is a copy of " +
              "'" + results[0].path + results[0].filename + "'");
            duplicates.push(asset);
            return;
          }

          /* Even if the hash doesn't need to be updated, the entry needs to be scanned */
          needsProcessing.push(asset);

          if (!query) {
            return;
          }

          return photoDB.sequelize.query(query, {
            replacements: asset,
            transaction: transaction
          });
        }).then(function() {
          toProcess--;
          if (moment().add(-5, 'seconds') > lastMessage) {
            console.log("Hash items to be checked: " + toProcess);
            lastMessage = moment();
          }
        });
      });
    });
  }).then(function() {
    /* Progress counts only the assets this stage actually works on. */
    let toProcess = needsProcessing.length, lastMessage = moment();
    console.log(needsProcessing.length + " assets need to have metadata extracted");
    return Promise.map(needsProcessing, function(asset) {
      var path = asset.album.path,
        file = asset.filename,
        created = asset.stats.mtime;

      let tmp = Promise.resolve(file);
      /* If this is a Nikon RAW file, convert it to JPG and move to /raw dir */
      if (rawExtension.exec(file)) {
        tmp = exists(picturesPath + path + file.replace(rawExtension, ".jpg")).then(function(exist) {
          if (exist) {
            return file.replace(rawExtension, ".jpg"); /* We converted from NEF/ORF => JPG */
          }

          return mkdir(picturesPath + path + "raw").then(function() {
            return convertRawToJpg(path, file);
          }).then(function() {
            return file.replace(rawExtension, ".jpg"); /* We converted from NEF/ORF => JPG */
          });
        });
      }

      return tmp.then(function(file) {
        var src = picturesPath + path + file,
          image = sharp(src);

        return image.limitInputPixels(1073741824).metadata().then(function(metadata) {
          if (metadata.exif) {
            metadata.exif = exif(metadata.exif);
            delete metadata.exif.thumbnail;
            delete metadata.exif.image;
            /* Replace binary EXIF values with a short placeholder so the
             * structure can be logged as JSON below. */
            for (var key in metadata.exif.exif) {
              if (Buffer.isBuffer(metadata.exif.exif[key])) {
                metadata.exif.exif[key] = "Buffer[" + metadata.exif.exif[key].length + "]";
              }
            }
          }

          asset.width = metadata.width;
          asset.height = metadata.height;
          asset.added = moment().format();

          if (metadata.exif && metadata.exif.exif && metadata.exif.exif.DateTimeOriginal && !isNaN(metadata.exif.exif.DateTimeOriginal.valueOf())) {
            asset.taken = moment(metadata.exif.exif.DateTimeOriginal).format();
            asset.modified = moment(metadata.exif.exif.DateTimeOriginal).format();

            if (asset.taken == "Invalid date" || asset.taken.replace(/T.*/, "") == "1899-11-30") {
              console.log("Invalid EXIF date information: ", JSON.stringify(metadata.exif.exif));
              asset.taken = asset.modified = moment(created).format();
            }
          } else {
            /* Attempt to infer the datestamp from the filename */
            let date = moment(created).format();

            let match = file.match(/WhatsApp Image (20[0-9][0-9]-[0-9][0-9]-[0-9][0-9]) at (.*).(jpeg|jpg)/);
            if (match) {
              date = moment((match[1]+" "+match[2]), "YYYY-MM-DD h.mm.ss a").format();
              if (date == "Invalid date") {
                date = moment(created).format();
              }
            } else {
              match = file.match(/(20[0-9][0-9]-?[0-9][0-9]-?[0-9][0-9])[_\-]?([0-9]{6})?/);
              if (match) {
                if (match[2]) { /* Stamp had time in it */
                  date = moment((match[1]+""+match[2]).replace(/-/g, ""), "YYYYMMDDHHmmss").format();
                } else {
                  date = moment(match[1].replace(/-/g, ""), "YYYYMMDD").format();
                }
                if (date == "Invalid date") {
                  date = moment(created).format();
                }
              } else {
                date = moment(created).format();
              }
            }
            asset.taken = asset.modified = date;
          }

          let dst = picturesPath + path + "thumbs/" + file;

          return exists(dst).then(function(exist) {
            if (exist) {
              return;
            }

            return image.resize(256, 256).toFile(dst).catch(function(error) {
              console.error("Error resizing image: " + dst, error);
              throw error;
            });
          }).then(function() {
            let dst = picturesPath + path + "thumbs/scaled/" + file;
            return exists(dst).then(function(exist) {
              if (exist) {
                return;
              }

              return image.resize(Math.min(1024, metadata.width)).toFile(dst).catch(function(error) {
                console.error("Error resizing image: " + dst, error);
                throw error;
              });
            });
          }).then(function() {
            return photoDB.sequelize.query("UPDATE photos SET " +
              "added=:added,modified=:modified,taken=:taken,width=:width,height=:height,scanned=CURRENT_TIMESTAMP " +
              "WHERE id=:id", {
              replacements: asset,
            });
          }).then(function() {
            toProcess--;
            if (moment().add(-5, 'seconds') > lastMessage) {
              console.log("Items to be processed: " + toProcess);
              lastMessage = moment();
            }
          });
        }).catch(function(error) {
          console.error("Error reading image " + src + ": ", error);
          return moveCorrupt(path, file).then(function() {
            /* If the original file was not a NEF/ORF, we are done... */
            if (!rawExtension.exec(asset.filename)) {
              return;
            }

            /* ... otherwise, attempt to re-convert the NEF/ORF->JPG and then resize again */
            if (triedClean.indexOf(path + file) != -1) {
              /* Move the NEF/ORF to /corrupt as well so it isn't re-checked again and again... */
              // return moveCorrupt(path, asset.filename);
              console.error("Already attempted to convert NEF/ORF to JPG: " + path + file);
              return;
            }

            console.warn("Adding " + path + file + " back onto processing queue.");
            triedClean.push(path + file);
            /* Queue the asset object itself: every stage above reads
             * asset.stats, asset.album, asset.id, ... from queue entries. */
            processBlock([ asset ]);
          });
        });
      }).catch(function(error) {
        /* One bad asset must not abort the rest of the batch. */
        console.warn("Continuing file processing.", error);
      });
    }, {
      concurrency: 1
    });
  }).then(function() {
    console.log("Completed processing queue. Marking " + duplicates.length + " duplicates.");
    let dups = [];
    duplicates.forEach(function(asset) {
      /* If not already marked as a duplicate, mark it. */
      if (!asset.duplicate) {
        dups.push(asset.id);
      }
    });

    if (dups.length == 0) {
      return;
    }

    return photoDB.sequelize.query("UPDATE photos SET duplicate=1,scanned=CURRENT_TIMESTAMP WHERE id IN (:dups)", {
      replacements: {
        dups: dups
      }
    });
  }).then(function() {
    console.log("Looking for removed assets");
    return photoDB.sequelize.query("SELECT photos.scanned,photos.id,photos.filename,albums.path FROM photos " +
      "LEFT JOIN albums ON (albums.id=photos.albumId) " +
      "WHERE photos.deleted=0 AND DATETIME(photos.scanned)<DATETIME(:lastScan)", {
      replacements: {
        lastScan: lastScan
      },
      type: photoDB.sequelize.QueryTypes.SELECT
    }).then(function(results) {
      let deleted = [];
      console.log("Checking " + results.length + " assets to see if they are on disk.");
      return Promise.map(results, function(asset) {
        return exists(asset.path + asset.filename).then(function(exist) {
          if (!exist) {
            console.log(asset.path + asset.filename + " no longer exists on disk. Marking as deleted.");
            deleted.push(asset.id);
          }
        });
      }).then(function() {
        /* Nothing vanished: skip the statements instead of sending an
         * empty IN () list to the database. */
        if (deleted.length == 0) {
          console.log(deleted.length + " assets deleted.");
          return;
        }

        return photoDB.sequelize.query("UPDATE photos SET deleted=1,scanned=CURRENT_TIMESTAMP WHERE id IN (:deleted)", {
          replacements: {
            deleted: deleted
          }
        }).then(function() {
          return photoDB.sequelize.query("DELETE FROM photohashes WHERE photoId IN (:deleted)", {
            replacements: {
              deleted: deleted
            }
          });
        }).then(function() {
          console.log(deleted.length + " assets deleted.");
        });
      });
    });
  }).finally(function() {
    /* Always release the flag, even when a stage failed; otherwise every
     * later call would only queue items and nothing would run again. */
    processRunning = false;
  });
}
/**
 * Recursively scan a directory for albums (directories) and assets (files
 * whose extension is listed in `extensions`).
 *
 * Directories named "raw", "thumbs", ".git" and "corrupt" are skipped. A JPG
 * that has a RAW sibling of the same base name is skipped, because the RAW
 * entry is recorded under the JPG name instead. For every album that
 * directly contains assets, the "thumbs/scaled" directory is created.
 *
 * @param {Object|null} parent Album object of the parent directory, or null
 *                             for the library root.
 * @param {string} path        Absolute directory path starting with
 *                             picturesPath and ending in "/".
 * @returns {Promise<Array>} Resolves with [ albums, assets ]; albums[0] is
 *                           the album for `path` itself, followed by the
 *                           albums of its descendants.
 */
function scanDir(parent, path) {
  /* The backslash has to be doubled inside a string literal, otherwise the
   * pattern starts with "." (any character) instead of a literal dot. */
  let re = new RegExp("\\.(" + extensions.join("|") + ")$", "i"),
    album = {
      path: path.slice(picturesPath.length), /* path already ends in '/' */
      name: path.replace(/\/$/, "").replace(/.*\//, "").replace(/_/g, " "),
      parent: parent,
      allAssetCount: 0,
      allAlbumCount: 0
    }, albums = [ album ], assets = [];

  return new Promise(function(resolve, reject) {
    fs.readdir(path, function(error, files) {
      if (error) {
        console.warn("Could not readdir: " + path);
        return resolve([]);
      }

      /* Remove 'thumbs' and 'raw' directories from being processed */
      files = files.filter(function(file) {
        for (var i = 0; i < files.length; i++) {
          /* If this file has an original NEF/ORF on the system, don't add the JPG to the DB */
          if (rawExtension.exec(files[i]) && file == files[i].replace(rawExtension, ".jpg")) {
            return false;
          }

          /* If the same name also exists in all upper case (eg. JPG vs jpg),
           * don't add this entry, and move whichever of the two files is
           * newer to 'corrupt'. */
          if (file != files[i] && file.toUpperCase() == files[i]) {
            removeNewerFile(path, file, files[i]);
            console.log("Duplicate file in " + path + ": ", file, files[i]);
            return false;
          }
        }

        return file != "raw" && file != "thumbs" && file != ".git" && file != "corrupt";
      });

      return resolve(files);
    });
  }).then(function(files) {
    return Promise.map(files, function(file) {
      let filepath = path + file;
      return stat(filepath).then(function(stats) {
        if (stats.isDirectory()) {
          filepath += "/";
          return scanDir(album, filepath).spread(function(_albums, _assets) {
            album.allAssetCount += _assets.length;
            album.allAlbumCount += _albums.length + 1;
            albums = albums.concat(_albums);
            assets = assets.concat(_assets);
          }).catch(function(error) {
            console.warn("Could not scanDir " + filepath + ": " + error);
          });
        }

        /* Check file extensions */
        if (!re.exec(file)) {
          return;
        }

        album.hasAssets = true;

        assets.push({
          filename: file.replace(rawExtension, ".jpg"), /* We will be converting from NEF/ORF => JPG */
          name: file.replace(/\.[^.]*$/, ""),
          stats: {
            mtime: stats.mtime
          },
          album: album
        });
      });
    });
  }).then(function() {
    return Promise.map(albums, function(album) {
      if (album.hasAssets) {
        return mkdir(album.path + "thumbs/scaled");
      }
    });
  }).then(function() {
    return [ albums, assets ];
  });
}
/**
 * Look up the DB row for an album by path and parent, inserting it when it
 * does not exist yet. Sets album.parentId and album.id as a side effect.
 *
 * @param {Object} transaction Transaction used for the INSERT.
 * @param {Object} album       Album with path, name and parent (an album
 *                             object whose id is already set, or falsy for
 *                             the top level).
 * @returns {Promise} Resolves with the album's id.
 * @throws {string} When the parent album has no id yet.
 */
function findOrCreateDBAlbum(transaction, album) {
  let parentClause;

  if (album.parent) {
    if (!album.parent.id) {
      const error = "Albums in array in non ancestral order!";
      console.error(error);
      throw error;
    }
    album.parentId = album.parent.id;
    parentClause = "parentId=:parentId";
  } else {
    parentClause = "parentId IS NULL";
    album.parentId = null;
  }

  const lookup = "SELECT id FROM albums WHERE path=:path AND " + parentClause;

  /* Insert the album and hand back the id of the new row. */
  const insertAlbum = function() {
    return photoDB.sequelize.query("INSERT INTO albums (path,parentId,name) VALUES(:path,:parentId,:name)", {
      replacements: album,
      transaction: transaction
    }).spread(function(results, metadata) {
      return metadata.lastID;
    });
  };

  return photoDB.sequelize.query(lookup, {
    replacements: album,
    type: photoDB.sequelize.QueryTypes.SELECT
  }).then(function(rows) {
    if (rows.length != 0) {
      return rows[0].id;
    }

    if (!album.parent) {
      console.warn("Creating top level album: " + picturesPath);
    }
    return insertAlbum();
  }).then(function(id) {
    album.id = id;
    return id;
  });
}
/**
 * Look up the photos row for an asset by album and file name, inserting it
 * when it does not exist yet. Sets asset.albumId, asset.id and asset.scanned
 * as a side effect; scanned is null for a freshly inserted row.
 *
 * @param {Object} transaction Transaction used for the INSERT.
 * @param {Object} asset       Asset with filename, name and an album whose
 *                             id is already set.
 * @returns {Promise} Resolves with the same asset object.
 * @throws {Error} When the asset has no album or the album has no id.
 */
function findOrUpdateDBAsset(transaction, asset) {
  /* `scanned` must be part of the SELECT: it is read from the result row
   * below and callers compare it with the file's mtime. */
  let query = "SELECT id,scanned FROM photos WHERE albumId=:albumId AND filename=:filename";
  if (!asset.album || !asset.album.id) {
    let error = "Asset being processed without an album";
    console.error(error);
    throw new Error(error);
  }

  asset.albumId = asset.album.id;

  return photoDB.sequelize.query(query, {
    replacements: asset,
    type: photoDB.sequelize.QueryTypes.SELECT
  }).then(function(results) {
    if (results.length == 0) {
      return photoDB.sequelize.query("INSERT INTO photos " +
        "(albumId,filename,name) " +
        "VALUES(:albumId,:filename,:name)", {
        replacements: asset,
        transaction: transaction
      }).spread(function(results, metadata) {
        return [ metadata.lastID, null ];
      });
    } else {
      return [ results[0].id, results[0].scanned ];
    }
  }).spread(function(id, scanned) {
    asset.id = id;
    asset.scanned = scanned;
    return asset;
  });
}
/**
 * Compute the SHA-256 digest of a file's contents.
 *
 * @param {string} filepath Path of the file to read.
 * @returns {Promise<string>} Resolves with the digest as a lower-case hex
 *                            string; rejects with the stream error when the
 *                            file cannot be opened or read.
 */
function computeHash(filepath) {
  return new Promise(function(resolve, reject) {
    const input = fs.createReadStream(filepath),
      hash = crypto.createHash("sha256");

    /* Without an 'error' listener a missing or unreadable file would raise
     * an uncaught exception instead of rejecting this promise. */
    input.on("error", reject);

    input.on("data", function(chunk) {
      hash.update(chunk);
    });

    /* 'end' fires once all data has been consumed; the stream closes itself. */
    input.on("end", function() {
      resolve(hash.digest("hex"));
    });
  });
}
module.exports = {
scan: function (db) {
photoDB = db;
/* 1. Scan for all assets which will be managed by the system. readdir
* 2. Check if entry in DB. Check mod-time in DB vs. stats from #1
* - For albums
* - For assets
* 3. If not in DB, or mod-time changed, queue for HASH CHECK
*
* HASH CHECK
* 1. Compute HASH
* 2. Check for HASH in photohash -- skip?
* 3. Check for and create thumbs/FILE thumbs/scaled/FILE
* 4. If necessary, create JPG from RAW
* 5. Update last-scanned date in DB for entry
* 6. Look up all DB entries with last-scanned date < NOW -- purge from DB (they were
* removed on disk)? Also purge from the HASH table.
*/
let initialized = Date.now();
let now = Date.now();
let needsProcessing = [];
lastScan = new Date();
return scanDir(null, picturesPath).spread(function(albums, assets) {
console.log("Found " + assets.length + " assets in " + albums.length + " albums after " +
((Date.now() - now) / 1000) + "s");
/* One at a time, in series, as the album[] array has parents first, then descendants.
* Operating in parallel could result in a child being searched for prior to the parent */
now = Date.now();
let toProcess = albums.length, lastMessage = moment();
return photoDB.sequelize.transaction(function(transaction) {
return Promise.mapSeries(albums, function(album) {
return findOrCreateDBAlbum(transaction, album).then(function() {
toProcess--;
if (moment().add(-5, 'seconds') > lastMessage) {
console.log("Albums to be created in DB: " + toProcess);
lastMessage = moment();
}
});
});
}).then(function() {
console.log("Processed " + albums.length + " album DB entries in " +
((Date.now() - now) / 1000) + "s");
now = Date.now();
let processed = 0, start = Date.now(), last = 0, updateScanned = [];
return photoDB.sequelize.transaction(function(transaction) {
return Promise.map(assets, function(asset) {
return findOrUpdateDBAsset(transaction, asset).then(function(asset) {
if (asset.scanned < asset.stats.mtime) {
needsProcessing.push(asset);
} else {
updateScanned.push(asset.id);
}
});
}).then(function(asset) {
processed++;
let elapsed = Date.now() - start;
if (elapsed < 5000) {
return asset;
}
let remaining = assets.length - processed;
console.log(remaining + " assets remaining to have DB entries updated. ETA " +
Math.ceil((elapsed / 1000) * remaining / (processed - last)) + "s");
last = processed;
start = Date.now();
});
}, {
concurrency: 5
}).then(function() {
if (updateScanned.length) {
return photoDB.sequelize.query("UPDATE photos SET scanned=CURRENT_TIMESTAMP WHERE id IN (:ids)", {
replacements: {
ids: updateScanned
}
}).then(function() {
console.log("Updated scan date of " + updateScanned.length + " assets");
updateScanned = [];
});
}
}).then(function() {
console.log(needsProcessing.length + " assets need HASH computed");
processBlock(needsProcessing);
needsProcessing = [];
}).then(function() {
console.log("Scanned " + assets.length + " asset DB entries in " +
((Date.now() - now) / 1000) + "s");
assets = [];
});
});
}).then(function() {
console.log("Total time to initialize DB and all scans: " + ((Date.now() - initialized) / 1000) + "s");
});
}
};