[ATTACHMENTS][EVENT] Add onHashFile event, which can be used to deduplicate files

Currently, we simply hash the contents of the file with sha256, but in the future we could use something smarter,
such as a method that detects visual feature similarity between images.
This commit is contained in:
Hugo Sales 2021-04-27 20:53:59 +00:00
parent 60a9085e56
commit 34059a8d3d
Signed by: someonewithpc
GPG Key ID: 7D0C7EAFC9D835A0
2 changed files with 20 additions and 7 deletions

View File

@ -29,6 +29,7 @@ use App\Core\GSFile;
use function App\Core\I18n\_m; use function App\Core\I18n\_m;
use App\Core\Modules\Component; use App\Core\Modules\Component;
use App\Core\Security; use App\Core\Security;
use App\Entity\Attachment;
use App\Entity\AttachmentToNote; use App\Entity\AttachmentToNote;
use App\Entity\Note; use App\Entity\Note;
use App\Util\Common; use App\Util\Common;
@ -124,7 +125,7 @@ END;
$matched_urls = []; $matched_urls = [];
preg_match_all(self::URL_REGEX, $content, $matched_urls, PREG_SET_ORDER); preg_match_all(self::URL_REGEX, $content, $matched_urls, PREG_SET_ORDER);
foreach ($matched_urls as $match) { foreach ($matched_urls as $match) {
$processed_attachments[] = GSFile::validateAndStoreURL($url); $processed_attachments[] = GSFile::validateAndStoreURL($match[0]);
} }
DB::persist($note); DB::persist($note);
@ -138,4 +139,15 @@ END;
DB::flush(); DB::flush();
} }
} }
/**
 * Handler for the HashFile event: produce an identifier for a file on disk.
 *
 * The identifier is the digest of the file's contents, computed by
 * hash_file() with the algorithm named by Attachment::FILEHASH_ALGO.
 * A smarter scheme (e.g. one that compares images by visual content)
 * could replace this in the future in order to deduplicate files.
 *
 * @param string      $filename path of the file to hash
 * @param null|string $out_hash passed by reference; receives the computed digest
 *
 * Returns Event::stop (presumably this tells the dispatcher not to run
 * further handlers for this event; confirm against the Event class).
 */
public static function onHashFile(string $filename, ?string &$out_hash)
{
    $digest   = hash_file(Attachment::FILEHASH_ALGO, $filename);
    $out_hash = $digest;

    return Event::stop;
}
} }

View File

@ -43,9 +43,9 @@ class GSFile
bool $is_local = true, bool $is_local = true,
int $actor_id = null): Attachment int $actor_id = null): Attachment
{ {
Event::handle('HashFile', [$file->getPathname(), &$hash]);
// The following properly gets the mimetype with `file` or other // The following properly gets the mimetype with `file` or other
// available methods, so should be safe // available methods, so should be safe
$hash = hash_file(Attachment::FILEHASH_ALGO, $sfile->getPathname());
$mimetype = $sfile->getMimeType(); $mimetype = $sfile->getMimeType();
Event::handle('AttachmentValidation', [&$sfile, &$mimetype]); Event::handle('AttachmentValidation', [&$sfile, &$mimetype]);
$attachment = Attachment::create([ $attachment = Attachment::create([
@ -70,16 +70,17 @@ class GSFile
public static function validateAndStoreURL(string $url): Attachment public static function validateAndStoreURL(string $url): Attachment
{ {
if (Common::isValidHttpUrl($url)) { if (Common::isValidHttpUrl($url)) {
HTTPClient::head($url); $head = HTTPClient::head($url);
$headers = $head->getHeaders(); $headers = $head->getHeaders();
$headers = array_change_key_case($headers, CASE_LOWER); $headers = array_change_key_case($headers, CASE_LOWER);
$attachment = Attachment::create([ $attachment = Attachment::create([
'remote_url' => $match[0], 'remote_url' => $url,
'remote_url_hash' => hash('sha256', $match[0]), 'remote_url_hash' => hash(Attachment::URLHASH_ALGO, $url),
'mimetype' => $headers['content-type'], 'mimetype' => $headers['content-type'][0],
'is_local' => false,
]); ]);
DB::persist($attachment); DB::persist($attachment);
Event::handle('AttachmentStoreNew', [&$at]); Event::handle('AttachmentStoreNew', [&$attachment]);
return $attachment; return $attachment;
} else { } else {
throw new \InvalidArgumentException(); throw new \InvalidArgumentException();