639 lines
21 KiB
PHP
639 lines
21 KiB
PHP
<?php
/**
 * Phergie
 *
 * PHP version 5
 *
 * LICENSE
 *
 * This source file is subject to the new BSD license that is bundled
 * with this package in the file LICENSE.
 * It is also available through the world-wide-web at this URL:
 * http://phergie.org/license
 *
 * @category  Phergie
 * @package   Phergie_Plugin_Url
 * @author    Phergie Development Team <team@phergie.org>
 * @copyright 2008-2010 Phergie Development Team (http://phergie.org)
 * @license   http://phergie.org/license New BSD License
 * @link      http://pear.phergie.org/package/Phergie_Plugin_Url
 */

/**
 * Monitors incoming messages for instances of URLs and responds with messages
 * containing relevant information about detected URLs.
 *
 * Has a utility method accessible via
 * $this->getPlugin('Url')->getTitle('http://foo..').
 *
 * @category Phergie
 * @package  Phergie_Plugin_Url
 * @author   Phergie Development Team <team@phergie.org>
 * @license  http://phergie.org/license New BSD License
 * @link     http://pear.phergie.org/package/Phergie_Plugin_Url
 * @uses     Phergie_Plugin_Encoding pear.phergie.org
 * @uses     Phergie_Plugin_Http pear.phergie.org
 * @uses     Phergie_Plugin_Tld pear.phergie.org
 */
class Phergie_Plugin_Url extends Phergie_Plugin_Abstract
{
    /**
     * Outer format applied to every line sent back to the channel.
     *
     * Supports the variables %message% (a single rendered link response, or
     * all of them joined with "; " when links are merged) and %nick%.
     *
     * @var string
     */
    protected $baseFormat = '%message%';

    /**
     * Format of the response generated for each individual link.
     *
     * Supports the variables %nick%, %title% and %link%.
     *
     * @var string
     */
    protected $messageFormat = '[ %link% ] %title%';

    /**
     * Flag indicating whether a single response should be sent for a single
     * message containing multiple links
     *
     * @var bool
     */
    protected $mergeLinks = true;

    /**
     * Max length of the fetched URL title
     *
     * Loaded from the "url.title_length" setting; no method of this class
     * currently applies it to the titles it returns.
     *
     * @var int
     */
    protected $titleLength = 40;

    /**
     * Url cache to prevent spamming, especially with multiple bots on the
     * same channel. Indexed by event source, then by URL checksum; the value
     * is the timestamp at which the URL was last handled.
     *
     * @var array
     */
    protected $urlCache = array();

    /**
     * Same structure as $urlCache, but keyed by the checksum of the
     * shortened URL.
     *
     * @var array
     */
    protected $shortCache = array();

    /**
     * Time in seconds to store the cached entries
     *
     * Setting it to 0 or below disables the cache expiration
     *
     * @var int
     */
    protected $expire = 1800;

    /**
     * Number of entries to keep in the cache at one time per channel
     *
     * Setting it to 0 or below disables the cache limit
     *
     * @var int
     */
    protected $limit = 10;

    /**
     * Flag that determines if the plugin will fall back to using an HTTP
     * stream when a URL using SSL is detected and OpenSSL support isn't
     * available in the PHP installation in use
     *
     * @var bool
     */
    protected $sslFallback = true;

    /**
     * Flag that is set to true by the custom error handler if an HTTP error
     * code has been received
     *
     * Not referenced by any method of this class.
     *
     * @var boolean
     */
    protected $errorStatus = false;

    /**
     * Error message companion of $errorStatus.
     *
     * Not referenced by any method of this class.
     *
     * @var string|null
     */
    protected $errorMessage = null;

    /**
     * Flag indicating whether or not to display error messages as the title
     * if a link posted encounters an error
     *
     * Loaded from the "url.show_errors" setting; no method of this class
     * currently consults it.
     *
     * @var boolean
     */
    protected $showErrors = true;

    /**
     * Flag indicating whether to detect schemeless URLS (i.e. "example.com")
     *
     * @var boolean
     */
    protected $detectSchemeless = false;

    /**
     * Shortener object
     *
     * @var Phergie_Plugin_Url_Shorten_Abstract
     */
    protected $shortener;

    /**
     * Array of renderers, indexed by spl_object_hash()
     *
     * @var array
     */
    protected $renderers = array();

    /**
     * Checks for dependencies, instantiates the configured URL shortener and
     * copies the "url.*" settings into the matching class properties.
     *
     * @return void
     */
    public function onLoad()
    {
        $plugins = $this->plugins;
        $plugins->getPlugin('Encoding');
        $http = $plugins->getPlugin('Http');
        $plugins->getPlugin('Tld');

        // make the shortener configurable
        $shortener = $this->getConfig('url.shortener', 'Trim');
        $shortener = "Phergie_Plugin_Url_Shorten_{$shortener}";
        $this->shortener = new $shortener($http);

        if (!$this->shortener instanceof Phergie_Plugin_Url_Shorten_Abstract) {
            $this->fail("Declared shortener class {$shortener} is not of proper ancestry");
        }

        // load config (a bit ugly, but focusing on porting):
        $settings = array(
            'detect_schemeless' => 'detectSchemeless',
            'base_format' => 'baseFormat',
            'message_format' => 'messageFormat',
            'merge_links' => 'mergeLinks',
            'title_length' => 'titleLength',
            'show_errors' => 'showErrors',
            'expire' => 'expire',
        );
        foreach ($settings as $config => $local) {
            // The same key must be used for the existence check and the
            // read; the value used to be fetched from "uri.*" by mistake.
            $key = "url.{$config}";
            if (isset($this->config[$key])) {
                $this->$local = $this->config[$key];
            }
        }
    }

    /**
     * Checks an incoming message for the presence of a URL and, if one is
     * found, responds with its title if it is an HTML document and the
     * shortened equivalent of its original URL if it meets length requirements.
     *
     * @todo Update this to pull configuration settings from $this->config
     *       rather than caching them as class properties
     * @return void
     */
    public function onPrivmsg()
    {
        $this->handleMsg();
    }

    /**
     * Checks an incoming action for the presence of a URL and, if one is
     * found, responds with its title if it is an HTML document and the
     * shortened equivalent of its original URL if it meets length requirements.
     *
     * @todo Update this to pull configuration settings from $this->config
     *       rather than caching them as class properties
     * @return void
     */
    public function onAction()
    {
        $this->handleMsg();
    }

    /**
     * Handles message events and responds with url titles.
     *
     * Every URL found in the event text is first offered to the registered
     * renderers; URLs no renderer claims are shortened, checked against the
     * cache and answered with their title.
     *
     * @return void
     */
    protected function handleMsg()
    {
        $event = $this->getEvent();
        $source = $event->getSource();
        $user = $event->getNick();

        $responses = array();
        $urls = $this->findUrls($event->getArgument(1));

        foreach ($urls as $parsed) {
            $url = $parsed['glued'];

            // allow out-of-class renderers to handle this URL
            foreach ($this->renderers as $renderer) {
                if ($renderer->renderUrl($parsed) === true) {
                    // renderers should return true if they've fully
                    // rendered the passed URL (they're responsible
                    // for their own output)
                    $this->debug('Handled by renderer: ' . get_class($renderer));
                    continue 2;
                }
            }

            // Convert url
            $shortenedUrl = $this->shortener->shorten($url);
            if (!$shortenedUrl) {
                $this->debug('Invalid Url: Unable to shorten. (' . $url . ')');
                $shortenedUrl = $url;
            }

            // Prevent spamfest
            if ($this->checkUrlCache($url, $shortenedUrl)) {
                $this->debug('Invalid Url: URL is in the cache. (' . $url . ')');
                continue;
            }

            $title = $this->getTitle($url);
            if (!empty($title)) {
                $responses[] = str_replace(
                    array('%title%', '%link%', '%nick%'),
                    array($title, $shortenedUrl, $user),
                    $this->messageFormat
                );
            }

            // Update cache
            $this->updateUrlCache($url, $shortenedUrl);
        }

        if (count($responses) === 0) {
            return;
        }

        // Merged mode collapses everything into one line; otherwise each
        // link gets a line of its own (previously the merged text was sent
        // once per link).
        if ($this->mergeLinks) {
            $responses = array(implode('; ', $responses));
        }

        foreach ($responses as $response) {
            $message = str_replace(
                array('%message%', '%nick%'),
                array($response, $user),
                $this->baseFormat
            );
            $this->doPrivmsg($source, $message);
        }
    }

    /**
     * Detect URLs in a given string.
     *
     * @param string $message the string to detect urls in
     *
     * @return array the array of urls found; each entry holds the parsed
     *               URL components plus a 'glued' key with the rebuilt URL
     *               (and a 'tld' key for non-IP hosts)
     */
    public function findUrls($message)
    {
        $pattern = '#'.($this->detectSchemeless ? '' : 'https?://').'(?:([0-9]{1,3}(?:\.[0-9]{1,3}){3})(?![^/]) | ('
            .($this->detectSchemeless ? '(?<!http:/|https:/)[@/\\\]' : '').')?(?:(?:[a-z0-9_-]+\.?)+\.[a-z0-9]{1,6}))[^\s]*#xis';
        $urls = array();

        // URL Match
        if (!preg_match_all($pattern, $message, $matches, PREG_SET_ORDER)) {
            return $urls;
        }

        foreach ($matches as $m) {
            $url = trim(rtrim($m[0], ', ].?!;'));
            $ip = !empty($m[1]) ? $m[1] : '';

            // Check to see if the URL was from an email address, is a directory, etc
            if (!empty($m[2])) {
                $this->debug('Invalid Url: URL is either an email or a directory path. (' . $url . ')');
                continue;
            }

            // Parse the given URL
            if (!$parsed = $this->parseUrl($url)) {
                $this->debug('Invalid Url: Could not parse the URL. (' . $url . ')');
                continue;
            }

            // Check to see if the given IP/Host is valid
            if ($ip !== '' && !$this->checkValidIP($ip)) {
                $this->debug('Invalid Url: ' . $ip . ' is not a valid IP address. (' . $url . ')');
                continue;
            }

            // Process TLD if it's not an IP
            if ($ip === '') {
                // Get the TLD from the host (the host may be missing when
                // parse_url() could not isolate one)
                $host = isset($parsed['host']) ? $parsed['host'] : '';
                $pos = strrpos($host, '.');
                $parsed['tld'] = ($pos !== false ? substr($host, ($pos+1)) : '');

                // Check to see if the URL has a valid TLD
                if ($this->plugins->tld->getTld($parsed['tld']) === false) {
                    $this->debug('Invalid Url: ' . $parsed['tld'] . ' is not a supported TLD. (' . $url . ')');
                    continue;
                }
            }

            // Check to see if the URL is to a secured site or not and handle it accordingly
            if ($parsed['scheme'] == 'https' && !extension_loaded('openssl')) {
                if (!$this->sslFallback) {
                    $this->debug('Invalid Url: HTTPS is an invalid scheme, OpenSSL isn\'t available. (' . $url . ')');
                    continue;
                }
                $parsed['scheme'] = 'http';
            }

            if (!in_array($parsed['scheme'], array('http', 'https'), true)) {
                $this->debug('Invalid Url: ' . $parsed['scheme'] . ' is not a supported scheme. (' . $url . ')');
                continue;
            }

            $urls[] = $parsed + array('glued' => $this->glueUrl($parsed));
        }

        return $urls;
    }

    /**
     * Checks a given URL (+shortened) against the cache to verify if they were
     * previously posted on the channel.
     *
     * @param string $url          The URL to check against
     * @param string $shortenedUrl The shortened URL to check against
     *
     * @return bool true if either form is cached (and, when expiration is
     *              enabled, has not expired yet)
     */
    protected function checkUrlCache($url, $shortenedUrl)
    {
        $source = $this->getEvent()->getSource();

        // The caches are keyed by a HEX CRC32 checksum of the normalized
        // URL rather than by the URL itself.
        $urlKey = $this->getUrlChecksum($url);
        $shortKey = $this->getUrlChecksum($shortenedUrl);

        $urlSeen = isset($this->urlCache[$source][$urlKey])
            ? $this->urlCache[$source][$urlKey]
            : null;
        $shortSeen = isset($this->shortCache[$source][$shortKey])
            ? $this->shortCache[$source][$shortKey]
            : null;

        $expire = $this->expire;
        $this->debug("Cache expire: {$expire}");

        if ($expire > 0) {
            // Expiration enabled: a hit only counts while it is still fresh.
            // A missing entry (null) counts as timestamp 0 and is never fresh.
            $now = time();
            return ($urlSeen + $expire) > $now || ($shortSeen + $expire) > $now;
        }

        // Expiration disabled: any cached entry is a hit.
        return $urlSeen !== null || $shortSeen !== null;
    }

    /**
     * Adds the given URL (+shortened) to the cache of the current event
     * source and drops the oldest entry of a cache that grew past the limit.
     *
     * @param string $url          The URL to add to the cache
     * @param string $shortenedUrl The shortened to add to the cache
     *
     * @return void
     */
    protected function updateUrlCache($url, $shortenedUrl)
    {
        $source = $this->getEvent()->getSource();
        $time = time();

        $this->urlCache[$source][$this->getUrlChecksum($url)] = $time;
        $this->evictOldestCacheEntry($this->urlCache[$source]);

        $this->shortCache[$source][$this->getUrlChecksum($shortenedUrl)] = $time;
        $this->evictOldestCacheEntry($this->shortCache[$source]);
    }

    /**
     * Removes the entry with the oldest timestamp from a per-source cache
     * when the cache limit is enabled and has been exceeded.
     *
     * The entry is removed with unset() instead of array_shift(): checksums
     * made only of digits are stored as integer keys, and array_shift()
     * renumbers integer keys, which would make the surviving entries
     * unreachable.
     *
     * @param array &$entries checksum => timestamp map, modified in place
     *
     * @return void
     */
    protected function evictOldestCacheEntry(array &$entries)
    {
        if ($this->limit > 0 && count($entries) > $this->limit) {
            asort($entries, SORT_NUMERIC);
            reset($entries);
            unset($entries[key($entries)]);
        }
    }

    /**
     * Transliterates a UTF-8 string into corresponding ASCII characters and
     * truncates and appends an ellipsis to the string if it exceeds a given
     * length.
     *
     * @param string $str  String to decode
     * @param int    $trim Maximum string length, optional
     *
     * @return string
     */
    protected function decode($str, $trim = null)
    {
        $out = $this->plugins->encoding->transliterate($str);
        if ($trim > 0) {
            $out = substr($out, 0, $trim) . (strlen($out) > $trim ? '...' : '');
        }
        return $out;
    }

    /**
     * Takes a url, reduces it to its base form (see glueUrl()), lowercases,
     * URL-decodes and transliterates it, strips whitespace and returns the
     * hex CRC32 checksum of the result.
     *
     * @param string $url url to checksum
     *
     * @return string the hex checksum of the cleaned url
     */
    protected function getUrlChecksum($url)
    {
        $checksum = strtolower(urldecode($this->glueUrl($url, true)));
        $checksum = preg_replace('#\s#', '', $this->plugins->encoding->transliterate($checksum));
        return dechex(crc32($checksum));
    }

    /**
     * Parses a given URI and processes the output to remove redundant
     * or missing values.
     *
     * An array argument is assumed to be already parsed and is returned
     * untouched. A string without a scheme is treated as an HTTP URL.
     *
     * @param string|array $url the url to parse
     *
     * @return array|false the url components, or false when parse_url()
     *                     rejects the URL as malformed
     */
    protected function parseUrl($url)
    {
        if (is_array($url)) {
            return $url;
        }

        $url = trim(ltrim($url, ' /@\\'));
        if (!preg_match('&^(?:([a-z][-+.a-z0-9]*):)&xis', $url)) {
            $url = 'http://' . $url;
        }

        $parsed = parse_url($url);
        if (!is_array($parsed)) {
            // parse_url() returns false on seriously malformed URLs; report
            // that to the caller instead of building an array on top of it.
            return false;
        }

        if (!isset($parsed['scheme'])) {
            $parsed['scheme'] = 'http';
        }
        $parsed['scheme'] = strtolower($parsed['scheme']);

        // Without a host, the first path segment is taken to be the host.
        if (isset($parsed['path']) && !isset($parsed['host'])) {
            $host = $parsed['path'];
            $path = '';
            if (strpos($parsed['path'], '/') !== false) {
                list($host, $path) = array_pad(explode('/', $parsed['path'], 2), 2, null);
            }
            $parsed['host'] = $host;
            $parsed['path'] = $path;
        }

        return $parsed;
    }

    /**
     * Parses a given URI and then glues it back together in the proper format.
     * If base is set, then it chops off the scheme, user and pass and fragment
     * information (as well as a leading "www." and a lone "/" path) to return
     * a more unique base URI.
     *
     * @param string|array $uri  uri to rebuild, or its already parsed components
     * @param bool         $base set to true to only return the base components
     *
     * @return string the rebuilt uri; the input is returned unchanged when
     *                it cannot be parsed
     */
    protected function glueUrl($uri, $base = false)
    {
        $parsed = is_array($uri) ? $uri : $this->parseUrl($uri);
        if (!is_array($parsed)) {
            return $uri;
        }

        // Callers may hand in arrays without a scheme, so never read the
        // key unguarded.
        $scheme = !empty($parsed['scheme']) ? $parsed['scheme'] : '';
        $glued = '';

        if (!$base) {
            if ($scheme !== '') {
                $glued .= $scheme . ':' . ((strtolower($scheme) == 'mailto') ? '' : '//');
            }
            if (!empty($parsed['user'])) {
                $glued .= $parsed['user']
                    . (!empty($parsed['pass']) ? ':' . $parsed['pass'] : '')
                    . '@';
            }
        }

        $host = !empty($parsed['host']) ? $parsed['host'] : '';
        if ($base && $host !== '') {
            $host = trim($host);
            if (substr($host, 0, 4) == 'www.') {
                $host = substr($host, 4);
            }
        }
        $glued .= (!empty($host) ? $host : '');

        // The default port of the scheme is redundant and left out.
        $port = !empty($parsed['port']) ? $parsed['port'] : '';
        if ($port !== ''
            && (($scheme == 'http' && $port == 80)
            || ($scheme == 'https' && $port == 443))
        ) {
            $port = '';
        }
        $glued .= ($port !== '' ? ':' . $port : '');

        if (!empty($parsed['path']) && (!$base || $parsed['path'] != '/')) {
            $glued .= (substr($parsed['path'], 0, 1) == '/')
                ? $parsed['path']
                : ('/' . $parsed['path']);
        }

        $glued .= (!empty($parsed['query']) ? '?' . $parsed['query'] : '');
        if (!$base) {
            $glued .= (!empty($parsed['fragment']) ? '#' . $parsed['fragment'] : '');
        }

        return $glued;
    }

    /**
     * Checks the given string to see if its a valid IP4 address
     *
     * The address must survive an ip2long()/long2ip() round trip unchanged.
     *
     * @param string $ip the ip to validate
     *
     * @return bool
     */
    protected function checkValidIP($ip)
    {
        $long = ip2long($ip);

        // ip2long() yields false for invalid input; do not feed that to
        // long2ip().
        return $long !== false && long2ip($long) === $ip;
    }

    /**
     * Returns the title of the given page
     *
     * A HEAD request determines the content type. For (X)HTML documents the
     * page is fetched and the content of its <title> element is returned;
     * for anything else the content type itself is returned. When neither
     * yields a value, the HTTP status text (on error) or "No Title" is
     * returned.
     *
     * @param string $url url to the page
     *
     * @return string title
     */
    public function getTitle($url)
    {
        $http = $this->plugins->getPlugin('Http');
        $options = array(
            'timeout' => 3.5,
            'user_agent' => 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080201 Firefox/2.0.0.12'
        );

        $title = '';
        $response = $http->head($url, array(), $options);
        $header = $response->getHeaders('Content-Type');
        $contentType = is_string($header) ? $header : '';

        // The "+" of "xhtml+xml" has to be escaped, otherwise it acts as a
        // quantifier and real XHTML content types never match. Media types
        // are case-insensitive.
        $isHtml = preg_match(
            '#^\s*(text/x?html|application/xhtml\+xml)\s*(?:;.*)?$#i',
            $contentType
        );

        if (!$isHtml) {
            $title = $contentType;
        } else {
            $response = $http->get($url, array(), $options);
            $content = $response->getContent();
            if (preg_match('#<title[^>]*>(.*?)</title>#is', $content, $match)) {
                $title = preg_replace('/[\s\v]+/', ' ', trim($match[1]));
            }
        }

        if ($title !== '') {
            $encoding = $this->plugins->getPlugin('Encoding');
            $title = $encoding->decodeEntities($title);
        }

        if (empty($title)) {
            if ($response->isError()) {
                $title = $response->getCodeAsString();
            } else {
                $title = 'No Title';
            }
        }

        return $title;
    }

    /**
     * Output a debug message
     *
     * @param string $msg the message to output
     *
     * @return void
     */
    protected function debug($msg)
    {
        echo "(DEBUG:Url) $msg\n";
    }

    /**
     * Add a renderer to the stack
     *
     * Renderers are keyed by spl_object_hash(), so registering the same
     * object twice keeps a single entry. handleMsg() calls renderUrl() on
     * every registered renderer.
     *
     * @param object $obj the renderer to add
     *
     * @return void
     */
    public function registerRenderer($obj)
    {
        $this->renderers[spl_object_hash($obj)] = $obj;
    }
}
|