From 4cd152f6403900a773dd17858cccb0639c670824 Mon Sep 17 00:00:00 2001 From: Hugo Sales Date: Tue, 27 Apr 2021 20:56:50 +0000 Subject: [PATCH] [Embed] Review and port v2 code --- plugins/Embed/Embed.php | 220 ++++++++++++----------- plugins/Embed/EmbedHelper.php | 191 -------------------- plugins/Embed/Entity/AttachmentEmbed.php | 105 +---------- 3 files changed, 125 insertions(+), 391 deletions(-) delete mode 100644 plugins/Embed/EmbedHelper.php diff --git a/plugins/Embed/Embed.php b/plugins/Embed/Embed.php index d956c1ef91..1700d5dac3 100644 --- a/plugins/Embed/Embed.php +++ b/plugins/Embed/Embed.php @@ -39,15 +39,20 @@ namespace Plugin\Embed; use App\Core\Cache; use App\Core\DB\DB; use App\Core\Event; +use App\Core\GSFile; use App\Core\HTTPClient; use App\Core\Log; use App\Core\Modules\Plugin; use App\Core\Router\RouteLoader; use App\Core\Router\Router; +use App\Core\Security; use App\Entity\Attachment; +use App\Entity\AttachmentThumbnail; +use App\Util\Common; use App\Util\Exception\DuplicateFoundException; use App\Util\Exception\NotFoundException; -use Plugin\Embed\Entity\FileEmbed; +use App\Util\TemporaryFile; +use Embed\Embed as LibEmbed; use Symfony\Component\HttpFoundation\Request; /** @@ -65,8 +70,7 @@ class Embed extends Plugin */ public $domain_allowlist = [ // hostname => service provider - '^i\d*\.ytimg\.com$' => 'YouTube', - '^i\d*\.vimeocdn\.com$' => 'Vimeo', + '.*' => '', // Default to allowing any host ]; /** @@ -86,67 +90,6 @@ class Embed extends Plugin return Event::next; } - /** - * This event executes when GNU social encounters a remote URL we then decide - * to interrogate for metadata. Embed gloms onto it to see if we have an - * oEmbed endpoint or image to try to represent in the post. - * - * @param $url string the remote URL we're looking at - * @param $dom DOMDocument the document we're getting metadata from - * @param $metadata stdClass class representing the metadata - * - * @return bool true if successful, the exception object if it isn't. - */ - public function onGetRemoteUrlMetadataFromDom(string $url, DOMDocument $dom, stdClass &$metadata) - { - try { - common_log(LOG_INFO, "Trying to find Embed data for {$url} with 'oscarotero/Embed'"); - $info = self::create($url); - - $metadata->version = '1.0'; // Yes. - $metadata->provider_name = $info->authorName; - $metadata->title = $info->title; - $metadata->html = common_purify($info->description); - $metadata->type = $info->type; - $metadata->url = $info->url; - $metadata->thumbnail_height = $info->imageHeight; - $metadata->thumbnail_width = $info->imageWidth; - - if (substr($info->image, 0, 4) === 'data') { - // Inline image - $imgData = base64_decode(substr($info->image, stripos($info->image, 'base64,') + 7)); - list($filename) = $this->validateAndWriteImage($imgData); - // Use a file URI for images, as file_embed can't store a filename - $metadata->thumbnail_url = 'file://' . File_thumbnail::path($filename); - } else { - $metadata->thumbnail_url = $info->image; - } - } catch (Exception $e) { - common_log(LOG_INFO, "Failed to find Embed data for {$url} with 'oscarotero/Embed'" . - ', got exception: ' . get_class($e)); - } - - if (isset($metadata->thumbnail_url)) { - // sometimes sites serve the path, not the full URL, for images - // let's "be liberal in what you accept from others"! - // add protocol and host if the thumbnail_url starts with / - if ($metadata->thumbnail_url[0] == '/') { - $thumbnail_url_parsed = parse_url($metadata->url); - $metadata->thumbnail_url = "{$thumbnail_url_parsed['scheme']}://" . - "{$thumbnail_url_parsed['host']}$metadata->thumbnail_url"; - } - - // some wordpress opengraph implementations sometimes return a white blank image - // no need for us to save that! - if ($metadata->thumbnail_url == 'https://s0.wp.com/i/blank.jpg') { - $metadata->thumbnail_url = null; - } - - // FIXME: this is also true of locally-installed wordpress so we should watch out for that. - } - return true; - } - /** * Insert oembed and opengraph tags in all HTML head elements */ @@ -195,20 +138,16 @@ class Embed extends Plugin if (!is_null($attachment->getRemoteUrl()) || (!is_null($mimetype = $attachment->getMimetype()) && (('text/html' === substr($mimetype, 0, 9) || 'application/xhtml+xml' === substr($mimetype, 0, 21))))) { try { - $embed_data = EmbedHelper::getEmbed($attachment->getRemoteUrl()); - dd($embed_data); - if ($embed_data === false) { - throw new Exception("Did not get Embed data from URL {$attachment->url}"); - } - $attachment->setTitle($embed_data['title']); + $embed_data = $this->getEmbed($attachment->getRemoteUrl(), $attachment); + $embed_data['attachment_id'] = $attachment->getId(); + DB::persist(Entity\AttachmentEmbed::create($embed_data)); + DB::flush(); } catch (Exception $e) { Log::warning($e); - return true; + return Event::next; } - - FileEmbed::saveNew($embed_data, $attachment->getId()); } - return true; + return Event::next; } /** @@ -297,7 +236,7 @@ END, ['embed' => $embed, 'thumbnail' => $thumbnail, 'attributes' => $attributes] */ protected function checkAllowlist(string $url) { - if (!$this->check_allowlist) { + if ($this->check_allowlist ?? false) { return false; // indicates "no check made" } @@ -333,7 +272,7 @@ END, ['embed' => $embed, 'thumbnail' => $thumbnail, 'attributes' => $attributes] $headers = $head->getHeaders(); $headers = array_change_key_case($headers, CASE_LOWER); } - return $headers['content-length'] ?? false; + return $headers['content-length'][0] ?? false; } catch (Exception $e) { Loog::error($e); return false; @@ -361,7 +300,7 @@ END, ['embed' => $embed, 'thumbnail' => $thumbnail, 'attributes' => $attributes] $headers = $head->getHeaders(); $headers = array_change_key_case($headers, CASE_LOWER); } - return !empty($headers['content-type']) && GSFile::mimetypeMajor($headers['content-type']) === 'image'; + return !empty($headers['content-type']) && GSFile::mimetypeMajor($headers['content-type'][0]) === 'image'; } catch (Exception $e) { Loog::error($e); return false; @@ -380,34 +319,32 @@ END, ['embed' => $embed, 'thumbnail' => $thumbnail, 'attributes' => $attributes] $file = new TemporaryFile(); $file->write($imgData); - if (array_key_exists('content-disposition', $headers) && preg_match('/^.+; filename="(.+?)"$/', $headers['content-disposition'], $matches) === 1) { - $original_name = $matches[1]; - } + $mimetype = $headers['content-type'][0]; + Event::handle('AttachmentValidation', [&$file, &$mimetype]); - $mimetype = $headers['content-type']; - Event::handle('AttachmentValidation', [$file, &$mimetype]); - - $hash = hash_file(Attachment::FILEHASH_ALGO, $file->getPathname()); + Event::handle('HashFile', [$file->getPathname(), &$hash]); $filename = Common::config('attachments', 'dir') . "embed/{$hash}"; $file->commit($filename); unset($file); - return [$filename, $width, $height, $original_name, $mimetype]; + if (array_key_exists('content-disposition', $headers) && preg_match('/^.+; filename="(.+?)"$/', $headers['content-disposition'][0], $matches) === 1) { + $original_name = $matches[1]; + } + + $info = getimagesize($filename); + $width = $info[0]; + $height = $info[1]; + + return [$filename, $width, $height, $original_name ?? null, $mimetype]; } /** - * Function to create and store a thumbnail representation of a remote image - * - * @param $thumbnail FileThumbnail object containing the file thumbnail - * - * @return bool true if it succeeded, the exception if it fails, or false if it - * is limited by system limits (ie the file is too large.) + * Create and store a thumbnail representation of a remote image */ - protected function storeRemoteThumbnail(Attachment $attachment): bool + protected function storeRemoteThumbnail(Attachment $attachment): array | bool { - $path = $attachment->getPath(); - if (file_exists($path)) { - throw new AlreadyFulfilledException(_m('A thumbnail seems to already exist for remote file with id=={id}', ['id' => $attachment->id])); + if ($attachment->haveFilename() && file_exists($attachment->getPath())) { + throw new AlreadyFulfilledException(_m('A thumbnail seems to already exist for remote file with id=={id}', ['id' => $attachment->getId()])); } $url = $attachment->getRemoteUrl(); @@ -430,8 +367,7 @@ END, ['embed' => $embed, 'thumbnail' => $thumbnail, 'attributes' => $attributes] $file_size = $this->getRemoteFileSize($url, $headers); $max_size = Common::config('attachments', 'file_quota'); if (($file_size != false) && ($file_size > $max_size)) { - Log::debug("Wanted to store remote thumbnail of size {$file_size} but the upload limit is {$max_size} so we aborted."); - return false; + throw new \Exception("Wanted to store remote thumbnail of size {$file_size} but the upload limit is {$max_size} so we aborted."); } } else { return false; @@ -442,9 +378,9 @@ END, ['embed' => $embed, 'thumbnail' => $thumbnail, 'attributes' => $attributes] } // First we download the file to memory and test whether it's actually an image file - Log::debug("Downloading remote thumbnail for file id=={$attachment->id} with thumbnail URL: {$url}"); + Log::debug('Downloading remote thumbnail for file id==' . $attachment->getId() . " with thumbnail URL: {$url}"); try { - $imgData = HTTPClient::get($url); + $imgData = HTTPClient::get($url)->getContent(); if (isset($imgData)) { [$filename, $width, $height, $original_name, $mimetype] = $this->validateAndWriteImage($imgData, $url, $headers); } else { @@ -457,10 +393,92 @@ END, ['embed' => $embed, 'thumbnail' => $thumbnail, 'attributes' => $attributes] } } - DB::persist(AttachmentThumbnail::create(['attachment_id' => $attachment->id, 'width' => $width, 'height' => $height])); + DB::persist(AttachmentThumbnail::create(['attachment_id' => $attachment->getId(), 'width' => $width, 'height' => $height])); $attachment->setFilename($filename); DB::flush(); - return true; + return [$filename, $width, $height, $original_name, $mimetype]; + } + + /** + * Perform an oEmbed or OpenGraph lookup for the given $url. + * + * Some known hosts are allowlisted with API endpoints where we + * know they exist but autodiscovery data isn't available. + * + * Throws exceptions on failure. + * + * @param string $url + * + * @throws EmbedHelper_BadHtmlException + * @throws HTTP_Request2_Exception + * + * @return object + */ + public function getEmbed(string $url, Attachment $attachment): array + { + Log::info('Checking for remote URL metadata for ' . $url); + + try { + Log::info("Trying to find Embed data for {$url} with 'oscarotero/Embed'"); + $embed = new LibEmbed(); + $info = $embed->get($url); + $metadata['title'] = $info->title; + $metadata['html'] = Security::sanitize($info->description); + $metadata['url'] = $info->url; + $metadata['author_name'] = $info->authorName; + $metadata['author_url'] = $info->authorUrl; + $metadata['provider_name'] = $info->providerName; + $metadata['provider_url'] = $info->providerUrl; + + if (!is_null($info->image)) { + if (substr($info->image, 0, 4) === 'data') { + // Inline image + $imgData = base64_decode(substr($info->image, stripos($info->image, 'base64,') + 7)); + [$filename, $width, $height, $original_name, $mimetype] = $this->validateAndWriteImage($imgData); + } else { + $attachment->setRemoteUrl((string) $info->image); + [$filename, $width, $height, $original_name, $mimetype] = $this->storeRemoteThumbnail($attachment); + } + $metadata['width'] = $height; + $metadata['height'] = $width; + $metadata['mimetype'] = $mimetype; + } + } catch (Exception $e) { + Log::info("Failed to find Embed data for {$url} with 'oscarotero/Embed', got exception: " . get_class($e)); + } + + $metadata = self::normalize($metadata); + $attachment->setTitle($metadata['title']); + return $metadata; + } + + /** + * Normalize fetched info. + */ + public static function normalize(array $data): array + { + if (isset($metadata['url'])) { + // sometimes sites serve the path, not the full URL, for images + // let's "be liberal in what you accept from others"! + // add protocol and host if the thumbnail_url starts with / + if ($metadata['url'][0] == '/') { + $thumbnail_url_parsed = parse_url($metadata['url']); + $metadata['url'] = "{$thumbnail_url_parsed['scheme']}://{$thumbnail_url_parsed['host']}{$metadata['url']}"; + } + + // Some wordpress opengraph implementations sometimes return a white blank image + // no need for us to save that! + if ($metadata['url'] == 'https://s0.wp.com/i/blank.jpg') { + $metadata['url'] = null; + } + + if (!isset($data['width'])) { + $data['width'] = Common::config('thumbnail', 'width'); + $data['height'] = Common::config('thumbnail', 'height'); + } + } + + return $data; } } diff --git a/plugins/Embed/EmbedHelper.php b/plugins/Embed/EmbedHelper.php deleted file mode 100644 index 6b576075c4..0000000000 --- a/plugins/Embed/EmbedHelper.php +++ /dev/null @@ -1,191 +0,0 @@ -. -// }}} - -/** - * OembedPlugin implementation for GNU social - * - * @package GNUsocial - * - * @author Mikael Nordfeldth - * @author hannes - * @author Diogo Cordeiro - * @author Hugo Sales - * @copyright 2019, 2021 Free Software Foundation, Inc http://www.fsf.org - * @license https://www.gnu.org/licenses/agpl.html GNU AGPL v3 or later - */ - -namespace Plugin\Embed; - -use App\Core\Event; -use App\Core\HTTPClient; -use App\Core\Log; - -/** - * Utility class to wrap basic embed lookups. - * - * Denylisted hosts will use an alternate lookup method. - * Allowlisted hosts will use known embed API endpoints. - * - * Sites that provide discovery links will use them directly; a bug - * in use of discovery links with query strings is worked around. - * - * Others will fall back to oohembed (unless disabled). - * The API endpoint can be configured or disabled through config - * as 'oohembed'/'endpoint'. - * - * @copyright 2019, 2021 Free Software Foundation, Inc http://www.fsf.org - * @license https://www.gnu.org/licenses/agpl.html GNU AGPL v3 or later - */ -class EmbedHelper -{ - /** - * Perform or fake an oEmbed lookup for the given $url. - * - * Some known hosts are allowlisted with API endpoints where we - * know they exist but autodiscovery data isn't available. - * - * A few hosts are denylisted due to known problems with oohembed, - * in which case we'll look up the info another way and return - * equivalent data. - * - * Throws exceptions on failure. - * - * @param string $url - * - * @throws EmbedHelper_BadHtmlException - * @throws HTTP_Request2_Exception - * - * @return object - */ - public static function getEmbed(string $url) - { - Log::info('Checking for remote URL metadata for ' . $url); - - $metadata = new \stdClass(); - - if (Event::handle('GetRemoteUrlMetadata', [$url, &$metadata])) { - // If that event didn't return anything, try downloading the body and parse it - - $response = HTTPClient::get($url); - $body = $response->getBody(); - - // DOMDocument::loadHTML may throw warnings on unrecognized elements, - // and notices on unrecognized namespaces. - $old = error_reporting(error_reporting() & ~(E_WARNING | E_NOTICE)); - - // DOMDocument assumes ISO-8859-1 per HTML spec - // use UTF-8 if we find any evidence of that encoding - $utf8_evidence = false; - $unicode_check_dom = new DOMDocument(); - $ok = $unicode_check_dom->loadHTML($body); - if (!$ok) { - throw new EmbedHelper_BadHtmlException(); - } - $metaNodes = $unicode_check_dom->getElementsByTagName('meta'); - foreach ($metaNodes as $metaNode) { - // case in-sensitive since Content-type and utf-8 can be written in many ways - if (stristr($metaNode->getAttribute('http-equiv'), 'content-type') - && stristr($metaNode->getAttribute('content'), 'utf-8')) { - $utf8_evidence = true; - break; - } elseif (stristr($metaNode->getAttribute('charset'), 'utf-8')) { - $utf8_evidence = true; - break; - } - } - unset($unicode_check_dom); - - // The Content-Type HTTP response header overrides encoding metatags in DOM - if (stristr($response->getHeader('Content-Type'), 'utf-8')) { - $utf8_evidence = true; - } - - // add utf-8 encoding prolog if we have reason to believe this is utf-8 content - // DOMDocument('1.0', 'UTF-8') does not work! - $utf8_tag = $utf8_evidence ? '' : ''; - - $dom = new DOMDocument(); - $ok = $dom->loadHTML($utf8_tag . $body); - unset($body); // storing the DOM in memory is enough... - error_reporting($old); - - if (!$ok) { - throw new EmbedHelper_BadHtmlException(); - } - - Event::handle('GetRemoteUrlMetadataFromDom', [$url, $dom, &$metadata]); - } - - return self::normalize($metadata); - } - - /** - * Normalize oEmbed format. - * - * @param stdClass $data - * - * @throws Exception - * - * @return object - */ - public static function normalize(stdClass $data) - { - if (empty($data->type)) { - throw new Exception('Invalid oEmbed data: no type field.'); - } - if ($data->type == 'image') { - // YFrog does this. - $data->type = 'photo'; - } - - if (isset($data->thumbnail_url)) { - if (!isset($data->thumbnail_width)) { - // !?!?! - $data->thumbnail_width = Common::config('thumbnail', 'width'); - $data->thumbnail_height = Common::config('thumbnail', 'height'); - } - } - - return $data; - } -} - -class EmbedHelper_Exception extends \Exception -{ - public function __construct($message = '', $code = 0, $previous = null) - { - parent::__construct($message, $code, $previous); - } -} - -class EmbedHelper_BadHtmlException extends EmbedHelper_Exception -{ - public function __construct($previous = null) - { - return parent::__construct('Bad HTML in discovery data.', 0, $previous); - } -} - -class EmbedHelper_DiscoveryException extends EmbedHelper_Exception -{ - public function __construct($previous = null) - { - return parent::__construct('No oEmbed discovery data.', 0, $previous); - } -} diff --git a/plugins/Embed/Entity/AttachmentEmbed.php b/plugins/Embed/Entity/AttachmentEmbed.php index 401fdce227..2d254e48d4 100644 --- a/plugins/Embed/Entity/AttachmentEmbed.php +++ b/plugins/Embed/Entity/AttachmentEmbed.php @@ -35,13 +35,15 @@ namespace Plugin\Embed\Entity; use App\Core\Entity; /** - * Table Definition for file_embed + * Table Definition for attachment_embed * - * @copyright 2019 Free Software Foundation, Inc http://www.fsf.org + * @author Hugo Sales + * @copyright 2019, 2021 Free Software Foundation, Inc http://www.fsf.org * @license https://www.gnu.org/licenses/agpl.html GNU AGPL v3 or later */ class AttachmentEmbed extends Entity { + // {{{ Autocode public $attachment_id; // int(4) primary_key not_null public $version; // varchar(20) public $type; // varchar(20) @@ -56,6 +58,7 @@ class AttachmentEmbed extends Entity public $author_url; // varchar(191) not 255 because utf8mb4 takes more space public $url; // varchar(191) not 255 because utf8mb4 takes more space public $modified; // timestamp() not_null default_CURRENT_TIMESTAMP + // }}} Autocode public static function schemaDef() { @@ -63,8 +66,6 @@ class AttachmentEmbed extends Entity 'name' => 'attachment_embed', 'fields' => [ 'attachment_id' => ['type' => 'int', 'not null' => true, 'description' => 'oEmbed for that URL/file'], - 'version' => ['type' => 'varchar', 'length' => 20, 'description' => 'oEmbed spec. version'], - 'type' => ['type' => 'varchar', 'length' => 20, 'description' => 'oEmbed type: photo, video, link, rich'], 'mimetype' => ['type' => 'varchar', 'length' => 50, 'description' => 'mime type of resource'], 'provider' => ['type' => 'text', 'description' => 'name of this oEmbed provider'], 'provider_url' => ['type' => 'text', 'description' => 'URL of this oEmbed provider'], @@ -79,102 +80,8 @@ class AttachmentEmbed extends Entity ], 'primary key' => ['attachment_id'], 'foreign keys' => [ - 'file_embed_file_id_fkey' => ['file', ['file_id' => 'id']], + 'attachment_embed_attachment_id_fkey' => ['attachment', ['attachment_id' => 'id']], ], ]; } - - /** - * Fetch an entry by using a File's id - */ - public static function getByFile(File $file) - { - $fo = new File_embed(); - $fo->file_id = $file->id; - if (!$fo->find(true)) { - throw new NoResultException($fo); - } - return $fo; - } - - public function getUrl() - { - return $this->url; - } - - /** - * Save embedding info for a new file. - * - * @param object $data Services_oEmbed_Object_* - * @param int $file_id - */ - public static function saveNew($data, $file_id) - { - $file_embed = new File_embed; - $file_embed->file_id = $file_id; - if (!isset($data->version)) { - common_debug('Embed: data->version undefined in variable $data: ' . var_export($data, true)); - } - $file_embed->version = $data->version; - $file_embed->type = $data->type; - if (!empty($data->provider)) { - $file_embed->provider = $data->provider; - } - if (!empty($data->provider_name)) { - $file_embed->provider = $data->provider_name; - } - if (!empty($data->provider_url)) { - $file_embed->provider_url = $data->provider_url; - } - if (!empty($data->width)) { - $file_embed->width = (int) ($data->width); - } - if (!empty($data->height)) { - $file_embed->height = (int) ($data->height); - } - if (!empty($data->html)) { - $file_embed->html = $data->html; - } - if (!empty($data->title)) { - $file_embed->title = $data->title; - } - if (!empty($data->author_name)) { - $file_embed->author_name = $data->author_name; - } - if (!empty($data->author_url)) { - $file_embed->author_url = $data->author_url; - } - if (!empty($data->url)) { - $file_embed->url = $data->url; - $given_url = File_redirection::_canonUrl($file_embed->url); - if (!empty($given_url)) { - try { - $file = File::getByUrl($given_url); - $file_embed->mimetype = $file->mimetype; - } catch (NoResultException $e) { - // File_redirection::where argument 'discover' is false to avoid loops - $redir = File_redirection::where($given_url, false); - if (!empty($redir->file_id)) { - $file_id = $redir->file_id; - } - } - } - } - $result = $file_embed->insert(); - if ($result === false) { - throw new ServerException('Failed to insert File_embed data into database!'); - } - if (!empty($data->thumbnail_url) || ($data->type == 'photo')) { - $ft = File_thumbnail::getKV('file_id', $file_id); - if ($ft instanceof File_thumbnail) { - common_log( - LOG_WARNING, - "Strangely, a File_thumbnail object exists for new file {$file_id}", - __FILE__ - ); - } else { - File_thumbnail::saveNew($data, $file_id); - } - } - } }