gnu-social/plugins/Irc/extlib/phergie/Phergie/Plugin/Encoding.php

183 lines
5.8 KiB
PHP
Raw Normal View History

2010-07-26 00:04:12 +01:00
<?php
/**
* Phergie
*
* PHP version 5
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.
* It is also available through the world-wide-web at this URL:
* http://phergie.org/license
*
* @category Phergie
* @package Phergie_Plugin_Encoding
* @author Phergie Development Team <team@phergie.org>
* @copyright 2008-2010 Phergie Development Team (http://phergie.org)
* @license http://phergie.org/license New BSD License
* @link http://pear.phergie.org/package/Phergie_Plugin_Encoding
*/
/**
 * Handles decoding markup entities and converting text between character
 * encodings.
 *
 * Entity decoding is done in a single pass over the input so that text which
 * only becomes an entity after decoding (for example "&amp;#65;") is never
 * decoded a second time.
 *
 * @category Phergie
 * @package  Phergie_Plugin_Encoding
 * @author   Phergie Development Team <team@phergie.org>
 * @license  http://phergie.org/license New BSD License
 * @link     http://pear.phergie.org/package/Phergie_Plugin_Encoding
 */
class Phergie_Plugin_Encoding extends Phergie_Plugin_Abstract
{
    /**
     * Lookup table of named entities that take precedence over
     * html_entity_decode().
     *
     * Keys are the literal lower-case entity, values are Unicode code points
     * (as defined by the HTML 4.01 entity sets). A few entries intentionally
     * map to a plain ASCII approximation: &apos; (39), &circ; (94),
     * &minus; (45) and &tilde; (126).
     *
     * @var array
     * @link http://us.php.net/manual/en/function.get-html-translation-table.php#73409
     * @link http://us.php.net/manual/en/function.get-html-translation-table.php#73410
     */
    protected static $entities = array(
        '&alpha;'    => 945,
        '&apos;'     => 39,
        '&beta;'     => 946,
        '&bull;'     => 8226,
        '&chi;'      => 967,
        '&circ;'     => 94,
        '&delta;'    => 948,
        '&epsilon;'  => 949,
        '&eta;'      => 951,
        '&fnof;'     => 402,
        '&gamma;'    => 947,
        '&iota;'     => 953,
        '&kappa;'    => 954,
        '&lambda;'   => 955,
        '&ldquo;'    => 8220,
        '&lsaquo;'   => 8249,
        '&lsquo;'    => 8216,
        '&mdash;'    => 8212,
        '&minus;'    => 45,
        '&mu;'       => 956,
        '&ndash;'    => 8211,
        '&nu;'       => 957,
        '&oelig;'    => 339,
        '&omega;'    => 969,
        '&omicron;'  => 959,
        '&phi;'      => 966,
        '&pi;'       => 960,
        '&piv;'      => 982,
        '&psi;'      => 968,
        '&rdquo;'    => 8221,
        '&rho;'      => 961,
        '&rsaquo;'   => 8250,
        '&rsquo;'    => 8217,
        '&scaron;'   => 353,
        '&sigma;'    => 963,
        '&sigmaf;'   => 962,
        '&tau;'      => 964,
        '&theta;'    => 952,
        '&thetasym;' => 977,
        '&tilde;'    => 126,
        '&trade;'    => 8482,
        '&upsih;'    => 978,
        '&upsilon;'  => 965,
        '&xi;'       => 958,
        '&yuml;'     => 255,
        '&zeta;'     => 950,
    );

    /**
     * Decodes markup entities in a given string.
     *
     * Each "&name;", "&#123;" or "&#x7B;" token is decoded exactly once.
     * Tokens that cannot be decoded are left in the string unchanged.
     *
     * @param string $string  String containing markup entities
     * @param string $charset Optional character set name to use in decoding
     *        entities, defaults to UTF-8
     *
     * @return string String with markup entities decoded
     */
    public function decodeEntities($string, $charset = 'UTF-8')
    {
        $string = (string) $string;

        // The single capture group wraps the whole token, so the result
        // alternates: literal text, entity, literal text, entity, ...
        $pieces = preg_split(
            '/(&(?:#[0-9]+|#x[0-9a-f]+|[a-z][a-z0-9]*);)/i',
            $string,
            -1,
            PREG_SPLIT_DELIM_CAPTURE
        );
        if ($pieces === false) {
            // PCRE failure (e.g. backtrack limit): return the input untouched
            return $string;
        }

        for ($i = 1, $count = count($pieces); $i < $count; $i += 2) {
            $pieces[$i] = $this->decodeEntity($pieces[$i], $charset);
        }

        return implode('', $pieces);
    }

    /**
     * Decodes a single, complete entity token.
     *
     * @param string $entity  Entity including the leading "&" and trailing ";"
     * @param string $charset Character set of the resulting text
     *
     * @return string Decoded character(s), or the entity itself if it could
     *         not be decoded
     */
    protected function decodeEntity($entity, $charset)
    {
        if ($entity[1] === '#') {
            return $this->decodeNumericEntity($entity, $charset);
        }

        // The lookup table wins over html_entity_decode() for exact matches
        $decoded = $this->lookupEntity($entity, $charset);
        if ($decoded !== null) {
            return $decoded;
        }

        $decoded = html_entity_decode($entity, ENT_QUOTES, $charset);
        if ($decoded !== $entity) {
            return $decoded;
        }

        // Last resort: tolerate odd capitalisation such as "&APOS;"
        $decoded = $this->lookupEntity(strtolower($entity), $charset);

        return ($decoded === null) ? $entity : $decoded;
    }

    /**
     * Decodes a single numeric (decimal or hexadecimal) entity token.
     *
     * html_entity_decode() is tried first; code points it refuses to decode
     * fall back to codeToUtf8().
     *
     * @param string $entity  Entity such as "&#8217;" or "&#x2019;"
     * @param string $charset Character set of the resulting text
     *
     * @return string Decoded character, or the entity itself if the code
     *         point is not valid or cannot be expressed in $charset
     */
    protected function decodeNumericEntity($entity, $charset)
    {
        $decoded = html_entity_decode($entity, ENT_QUOTES, $charset);
        if ($decoded !== $entity) {
            return $decoded;
        }

        $isHex = ($entity[2] === 'x' || $entity[2] === 'X');
        $digits = ltrim(substr($entity, $isHex ? 3 : 2, -1), '0');
        if ($digits === '') {
            $digits = '0';
        }

        // U+10FFFF is 6 hex / 7 decimal digits; anything longer is invalid
        // and is rejected before it can overflow the integer conversion
        if (strlen($digits) > ($isHex ? 6 : 7)) {
            return $entity;
        }

        $code = $isHex ? hexdec($digits) : (int) $digits;
        $character = $this->codeToUtf8($code);
        if ($character === '') {
            return $entity;
        }

        $character = $this->convertFromUtf8($character, $charset);

        return ($character === null) ? $entity : $character;
    }

    /**
     * Resolves an entity through the lookup table.
     *
     * @param string $entity  Entity used verbatim as the table key
     * @param string $charset Character set of the resulting text
     *
     * @return string|null Decoded character, or null if the entity is not in
     *         the table or cannot be converted to $charset
     */
    protected function lookupEntity($entity, $charset)
    {
        if (!isset(self::$entities[$entity])) {
            return null;
        }

        return $this->convertFromUtf8(
            $this->codeToUtf8(self::$entities[$entity]),
            $charset
        );
    }

    /**
     * Converts UTF-8 text to another character set.
     *
     * @param string $text    UTF-8 encoded text
     * @param string $charset Target character set name
     *
     * @return string|null Converted text, or null if the conversion is not
     *         available (no iconv) or failed
     */
    protected function convertFromUtf8($text, $charset)
    {
        if (strcasecmp($charset, 'UTF-8') === 0
            || strcasecmp($charset, 'UTF8') === 0
        ) {
            return $text;
        }

        if (!function_exists('iconv')) {
            return null;
        }

        $converted = iconv('UTF-8', $charset . '//TRANSLIT', $text);

        return ($converted === false) ? null : $converted;
    }

    /**
     * Converts a given unicode to its UTF-8 equivalent.
     *
     * @param int $code Code to convert
     *
     * @return string Character corresponding to code, or an empty string if
     *         the code is not a valid Unicode scalar value (negative, a
     *         UTF-16 surrogate, or above U+10FFFF)
     */
    public function codeToUtf8($code)
    {
        $code = (int) $code;

        if ($code < 0
            || $code > 0x10FFFF
            || ($code >= 0xD800 && $code <= 0xDFFF)
        ) {
            return '';
        }

        // 1 byte, 7 bits
        if ($code < 0x80) {
            return chr($code);
        }

        // 2 bytes, 11 bits
        if ($code < 0x800) {
            return chr(0xC0 | ($code >> 6)) .
                chr(0x80 | ($code & 0x3F));
        }

        // 3 bytes, 16 bits
        if ($code < 0x10000) {
            return chr(0xE0 | ($code >> 12)) .
                chr(0x80 | (($code >> 6) & 0x3F)) .
                chr(0x80 | ($code & 0x3F));
        }

        // 4 bytes, 21 bits
        return chr(0xF0 | ($code >> 18)) .
            chr(0x80 | (($code >> 12) & 0x3F)) .
            chr(0x80 | (($code >> 6) & 0x3F)) .
            chr(0x80 | ($code & 0x3F));
    }

    /**
     * Transliterates characters in a given string where possible.
     *
     * Uses the PECL translit extension if loaded, otherwise iconv, otherwise
     * a pure-PHP fallback that only strips diacritics from characters that
     * have an accented HTML entity name. The fallback does not change the
     * character set; characters it cannot simplify remain as named entities.
     *
     * @param string $string      String containing characters to
     *        transliterate
     * @param string $charsetFrom Optional character set of the string,
     *        defaults to UTF-8
     * @param string $charsetTo   Optional character set to which the string
     *        should be converted, defaults to ISO-8859-1
     *
     * @return string String with characters transliterated or the original
     *         string if transliteration was not possible
     */
    public function transliterate($string, $charsetFrom = 'UTF-8', $charsetTo = 'ISO-8859-1')
    {
        // @link http://pecl.php.net/package/translit
        if (function_exists('transliterate')) {
            return transliterate(
                $string,
                array('han_transliterate', 'diacritical_remove'),
                $charsetFrom,
                $charsetTo
            );
        }

        if (function_exists('iconv')) {
            $converted = iconv($charsetFrom, $charsetTo . '//TRANSLIT', $string);

            // iconv() returns false on failure; honour the documented
            // contract and hand back the original string instead
            return ($converted === false) ? $string : $converted;
        }

        // @link http://stackoverflow.com/questions/1284535/php-transliteration/1285491#1285491
        $encoded = htmlentities($string, ENT_COMPAT, $charsetFrom);
        if ($encoded === '') {
            // Empty input, or input that is invalid in $charsetFrom
            return $string;
        }

        $stripped = preg_replace(
            '~&([a-z]{1,2})(acute|cedil|circ|grave|lig|orn|ring|slash|th|tilde|uml);~i',
            '$1',
            $encoded
        );
        if ($stripped === null) {
            return $string;
        }

        // Undo the escaping htmlentities() applied to plain markup
        // characters (&, <, >, ") so ordinary text is not returned mangled
        return htmlspecialchars_decode($stripped, ENT_COMPAT);
    }
}