From 76c8139054b0f261967b2eca7cf5b188fa6b18ea Mon Sep 17 00:00:00 2001 From: hannes Date: Tue, 26 Jan 2016 01:05:53 +0000 Subject: [PATCH 1/7] not pretty, but gives us better oembed data for wordpress and facebook --- plugins/Oembed/OembedPlugin.php | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/plugins/Oembed/OembedPlugin.php b/plugins/Oembed/OembedPlugin.php index 23ee6148ea..db94224d2d 100644 --- a/plugins/Oembed/OembedPlugin.php +++ b/plugins/Oembed/OembedPlugin.php @@ -46,6 +46,27 @@ class OembedPlugin extends Plugin 'maxheight' => common_config('thumbnail', 'height'), ); $metadata = oEmbedHelper::getOembedFrom($api, $url, $params); + + // Facebook just gives us javascript in its oembed html, + // so use the content of the title element instead + if(strpos($url,'https://www.facebook.com/') === 0) { + $metadata->html = $dom->getElementsByTagName('title')->item(0)->nodeValue; + } + + // Wordpress sometimes also just gives us javascript, use og:description if it is available + $xpath = new DomXpath($dom); + $generatorNode = $xpath->query('//meta[@name="generator"][1]')->item(0); + if ($generatorNode instanceof DomElement) { + // when wordpress only gives us javascript, the html stripped from tags + // is the same as the title, so this helps us to identify this (common) case + if(strpos($generatorNode->getAttribute('content'),'WordPress') === 0 + && trim(strip_tags($metadata->html)) == trim($metadata->title)) { + $propertyNode = $xpath->query('//meta[@property="og:description"][1]')->item(0); + if ($propertyNode instanceof DomElement) { + $metadata->html = $propertyNode->getAttribute('content'); + } + } + } } catch (Exception $e) { common_log(LOG_INFO, 'Could not find an oEmbed endpoint using link headers, trying OpenGraph from HTML.'); // Just ignore it! 
@@ -319,4 +340,4 @@ class OembedPlugin extends Plugin _m('Plugin for using and representing Oembed data.')); return true; } -} +} \ No newline at end of file From 473f893d040899d13fc936cd79096ec14fe1261a Mon Sep 17 00:00:00 2001 From: hannes Date: Tue, 26 Jan 2016 01:07:44 +0000 Subject: [PATCH 2/7] detab --- plugins/Oembed/OembedPlugin.php | 34 ++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/plugins/Oembed/OembedPlugin.php b/plugins/Oembed/OembedPlugin.php index db94224d2d..24b9591516 100644 --- a/plugins/Oembed/OembedPlugin.php +++ b/plugins/Oembed/OembedPlugin.php @@ -47,26 +47,26 @@ class OembedPlugin extends Plugin ); $metadata = oEmbedHelper::getOembedFrom($api, $url, $params); - // Facebook just gives us javascript in its oembed html, - // so use the content of the title element instead + // Facebook just gives us javascript in its oembed html, + // so use the content of the title element instead if(strpos($url,'https://www.facebook.com/') === 0) { - $metadata->html = $dom->getElementsByTagName('title')->item(0)->nodeValue; + $metadata->html = $dom->getElementsByTagName('title')->item(0)->nodeValue; } - // Wordpress sometimes also just gives us javascript, use og:description if it is available - $xpath = new DomXpath($dom); - $generatorNode = $xpath->query('//meta[@name="generator"][1]')->item(0); - if ($generatorNode instanceof DomElement) { - // when wordpress only gives us javascript, the html stripped from tags - // is the same as the title, so this helps us to identify this (common) case - if(strpos($generatorNode->getAttribute('content'),'WordPress') === 0 - && trim(strip_tags($metadata->html)) == trim($metadata->title)) { - $propertyNode = $xpath->query('//meta[@property="og:description"][1]')->item(0); - if ($propertyNode instanceof DomElement) { - $metadata->html = $propertyNode->getAttribute('content'); - } - } - } + // Wordpress sometimes also just gives us javascript, use og:description if it is 
available + $xpath = new DomXpath($dom); + $generatorNode = $xpath->query('//meta[@name="generator"][1]')->item(0); + if ($generatorNode instanceof DomElement) { + // when wordpress only gives us javascript, the html stripped from tags + // is the same as the title, so this helps us to identify this (common) case + if(strpos($generatorNode->getAttribute('content'),'WordPress') === 0 + && trim(strip_tags($metadata->html)) == trim($metadata->title)) { + $propertyNode = $xpath->query('//meta[@property="og:description"][1]')->item(0); + if ($propertyNode instanceof DomElement) { + $metadata->html = $propertyNode->getAttribute('content'); + } + } + } } catch (Exception $e) { common_log(LOG_INFO, 'Could not find an oEmbed endpoint using link headers, trying OpenGraph from HTML.'); // Just ignore it! From 884aeb4d2ebab267e6f056b88969765dd3e32ada Mon Sep 17 00:00:00 2001 From: hannes Date: Tue, 26 Jan 2016 01:10:15 +0000 Subject: [PATCH 3/7] common_purify() doesn't remove wordpress' and facebook's javascript properly, maybe better to keep the data intact, and do strip_tags or something similar when using the data --- plugins/Oembed/lib/oembedhelper.php | 5 ----- 1 file changed, 5 deletions(-) diff --git a/plugins/Oembed/lib/oembedhelper.php b/plugins/Oembed/lib/oembedhelper.php index b0527b7529..5d84d68d8c 100644 --- a/plugins/Oembed/lib/oembedhelper.php +++ b/plugins/Oembed/lib/oembedhelper.php @@ -161,11 +161,6 @@ class oEmbedHelper $oembed_data = HTTPClient::quickGetJson($api, $params); - // purify html - if(isset($oembed_data->html)) { - $oembed_data->html = common_purify($oembed_data->html); - } - return $oembed_data; } From b8d1e1f4a60c0838ed0e0c7ebc5cd784288350a2 Mon Sep 17 00:00:00 2001 From: hannes Date: Tue, 26 Jan 2016 11:28:24 +0000 Subject: [PATCH 4/7] silence errors on these xpath queries --- plugins/Oembed/OembedPlugin.php | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/plugins/Oembed/OembedPlugin.php b/plugins/Oembed/OembedPlugin.php 
index 24b9591516..f8b78e08d6 100644 --- a/plugins/Oembed/OembedPlugin.php +++ b/plugins/Oembed/OembedPlugin.php @@ -50,18 +50,18 @@ class OembedPlugin extends Plugin // Facebook just gives us javascript in its oembed html, // so use the content of the title element instead if(strpos($url,'https://www.facebook.com/') === 0) { - $metadata->html = $dom->getElementsByTagName('title')->item(0)->nodeValue; + $metadata->html = @$dom->getElementsByTagName('title')->item(0)->nodeValue; } // Wordpress sometimes also just gives us javascript, use og:description if it is available $xpath = new DomXpath($dom); - $generatorNode = $xpath->query('//meta[@name="generator"][1]')->item(0); + $generatorNode = @$xpath->query('//meta[@name="generator"][1]')->item(0); if ($generatorNode instanceof DomElement) { // when wordpress only gives us javascript, the html stripped from tags // is the same as the title, so this helps us to identify this (common) case if(strpos($generatorNode->getAttribute('content'),'WordPress') === 0 && trim(strip_tags($metadata->html)) == trim($metadata->title)) { - $propertyNode = $xpath->query('//meta[@property="og:description"][1]')->item(0); + $propertyNode = @$xpath->query('//meta[@property="og:description"][1]')->item(0); if ($propertyNode instanceof DomElement) { $metadata->html = $propertyNode->getAttribute('content'); } From aa76e5863f6a00e3ee642326c3d7d11dd89875ed Mon Sep 17 00:00:00 2001 From: hannes Date: Tue, 26 Jan 2016 13:37:52 +0000 Subject: [PATCH 5/7] don't mess up charsets in oembed/og! 
check for utf-8 in http header and meta tags, and add prolog when loading html with DOMDocument() --- plugins/Oembed/lib/oembedhelper.php | 43 +++++++++++++++++++++++++++-- 1 file changed, 40 insertions(+), 3 deletions(-) diff --git a/plugins/Oembed/lib/oembedhelper.php b/plugins/Oembed/lib/oembedhelper.php index 5d84d68d8c..6f514983f3 100644 --- a/plugins/Oembed/lib/oembedhelper.php +++ b/plugins/Oembed/lib/oembedhelper.php @@ -74,20 +74,57 @@ class oEmbedHelper if (Event::handle('GetRemoteUrlMetadata', array($url, &$metadata))) { // If that event didn't return anything, try downloading the body and parse it - $body = HTTPClient::quickGet($url); + + // don't use quickGet since we want to check Content-Type header for utf-8 + $client = new HTTPClient(); + $response = $client->get($url); + if (!$response->isOk()) { + // TRANS: Exception. %s is the URL we tried to GET. + throw new Exception(sprintf(_m('Could not GET URL %s.'), $url), $response->getStatus()); + } + $body = $response->getBody(); // DOMDocument::loadHTML may throw warnings on unrecognized elements, // and notices on unrecognized namespaces. 
$old = error_reporting(error_reporting() & ~(E_WARNING | E_NOTICE)); + + // DOMDocument assumes ISO-8859-1 per HTML spec + // use UTF-8 if we find any evidence of that encoding + $utf8_evidence = false; + $unicode_check_dom = new DOMDocument(); + $ok = $unicode_check_dom->loadHTML($body); + if (!$ok) throw new oEmbedHelper_BadHtmlException(); + $metaNodes = $unicode_check_dom->getElementsByTagName('meta'); + foreach($metaNodes as $metaNode) { + // case in-sensitive since Content-type and utf-8 can be written in many ways + if(stristr($metaNode->getAttribute('http-equiv'),'content-type') + && stristr($metaNode->getAttribute('content'),'utf-8')) { + $utf8_evidence = true; + break; + } elseif(stristr($metaNode->getAttribute('charset'),'utf-8')) { + $utf8_evidence = true; + break; + } + } + unset($unicode_check_dom); + + // The Content-Type HTTP response header overrides encoding metatags in DOM + if(stristr($response->getHeader('Content-Type'),'utf-8')) { + $utf8_evidence = true; + } + + // add utf-8 encoding prolog if we have reason to believe this is utf-8 content + $utf8_tag = $utf8_evidence ? '<?xml encoding="utf-8" ?>' : ''; + $dom = new DOMDocument(); - $ok = $dom->loadHTML($body); + $ok = $dom->loadHTML($utf8_tag.$body); unset($body); // storing the DOM in memory is enough... error_reporting($old); if (!$ok) { throw new oEmbedHelper_BadHtmlException(); } - + Event::handle('GetRemoteUrlMetadataFromDom', array($url, $dom, &$metadata)); } From 06e325d61bd57783af944ebc6d328820399fc9d4 Mon Sep 17 00:00:00 2001 From: hannes Date: Thu, 28 Jan 2016 15:19:29 +0000 Subject: [PATCH 6/7] fixes two issues when the oembed thumbnail is blank --- plugins/Oembed/OembedPlugin.php | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/plugins/Oembed/OembedPlugin.php b/plugins/Oembed/OembedPlugin.php index f8b78e08d6..0253018c02 100644 --- a/plugins/Oembed/OembedPlugin.php +++ b/plugins/Oembed/OembedPlugin.php @@ -72,6 +72,21 @@ class OembedPlugin extends Plugin // Just ignore it! 
$metadata = OpenGraphHelper::ogFromHtml($dom); } + + // sometimes sites serve the path, not the full URL, for images + // let's "be liberal in what you accept from others"! + // add protocol and host if the thumbnail_url starts with / + if(substr($metadata->thumbnail_url,0,1) == '/') { + $thumbnail_url_parsed = parse_url($metadata->url); + $metadata->thumbnail_url = $thumbnail_url_parsed['scheme']."://".$thumbnail_url_parsed['host'].$metadata->thumbnail_url; + } + + // some wordpress opengraph implementations sometimes return a white blank image + // no need for us to save that! + if($metadata->thumbnail_url == 'https://s0.wp.com/i/blank.jpg') { + unset($metadata->thumbnail_url); + } + } public function onEndShowHeadElements(Action $action) From 05439831e717cb20b3c76010354381ff50555f1d Mon Sep 17 00:00:00 2001 From: hannes Date: Thu, 28 Jan 2016 15:32:11 +0000 Subject: [PATCH 7/7] add comment that DOMDocument('1.0', 'UTF-8') does not work --- plugins/Oembed/lib/oembedhelper.php | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/plugins/Oembed/lib/oembedhelper.php b/plugins/Oembed/lib/oembedhelper.php index 6f514983f3..cb1c56e755 100644 --- a/plugins/Oembed/lib/oembedhelper.php +++ b/plugins/Oembed/lib/oembedhelper.php @@ -113,7 +113,8 @@ class oEmbedHelper $utf8_evidence = true; } - // add utf-8 encoding prolog if we have reason to believe this is utf-8 content + // add utf-8 encoding prolog if we have reason to believe this is utf-8 content + // DOMDocument('1.0', 'UTF-8') does not work! $utf8_tag = $utf8_evidence ? '<?xml encoding="utf-8" ?>' : ''; $dom = new DOMDocument();