gnu-social/extlib/HTMLPurifier/HTMLPurifier/EntityParser.php
Evan Prodromou 81ea0f8117 Add HTMLPurifier to extlib
HTMLPurifier defangs arbitrary submitted HTML. We're using it in the
OStatus plugin, but it may be valuable for other parts of the codebase
(I think OEmbed might benefit, for example).
2010-02-20 11:35:01 -05:00

145 lines
4.6 KiB
PHP

<?php
// if want to implement error collecting here, we'll need to use some sort
// of global data (probably trigger_error) because it's impossible to pass
// $config or $context to the callback functions.
/**
* Handles referencing and derefencing character entities
*/
class HTMLPurifier_EntityParser
{
/**
* Reference to entity lookup table.
*/
protected $_entity_lookup;
/**
* Callback regex string for parsing entities.
*/
protected $_substituteEntitiesRegex =
'/&(?:[#]x([a-fA-F0-9]+)|[#]0*(\d+)|([A-Za-z_:][A-Za-z0-9.\-_:]*));?/';
// 1. hex 2. dec 3. string (XML style)
/**
* Decimal to parsed string conversion table for special entities.
*/
protected $_special_dec2str =
array(
34 => '"',
38 => '&',
39 => "'",
60 => '<',
62 => '>'
);
/**
* Stripped entity names to decimal conversion table for special entities.
*/
protected $_special_ent2dec =
array(
'quot' => 34,
'amp' => 38,
'lt' => 60,
'gt' => 62
);
/**
* Substitutes non-special entities with their parsed equivalents. Since
* running this whenever you have parsed character is t3h 5uck, we run
* it before everything else.
*
* @param $string String to have non-special entities parsed.
* @returns Parsed string.
*/
public function substituteNonSpecialEntities($string) {
// it will try to detect missing semicolons, but don't rely on it
return preg_replace_callback(
$this->_substituteEntitiesRegex,
array($this, 'nonSpecialEntityCallback'),
$string
);
}
/**
* Callback function for substituteNonSpecialEntities() that does the work.
*
* @param $matches PCRE matches array, with 0 the entire match, and
* either index 1, 2 or 3 set with a hex value, dec value,
* or string (respectively).
* @returns Replacement string.
*/
protected function nonSpecialEntityCallback($matches) {
// replaces all but big five
$entity = $matches[0];
$is_num = (@$matches[0][1] === '#');
if ($is_num) {
$is_hex = (@$entity[2] === 'x');
$code = $is_hex ? hexdec($matches[1]) : (int) $matches[2];
// abort for special characters
if (isset($this->_special_dec2str[$code])) return $entity;
return HTMLPurifier_Encoder::unichr($code);
} else {
if (isset($this->_special_ent2dec[$matches[3]])) return $entity;
if (!$this->_entity_lookup) {
$this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
}
if (isset($this->_entity_lookup->table[$matches[3]])) {
return $this->_entity_lookup->table[$matches[3]];
} else {
return $entity;
}
}
}
/**
* Substitutes only special entities with their parsed equivalents.
*
* @notice We try to avoid calling this function because otherwise, it
* would have to be called a lot (for every parsed section).
*
* @param $string String to have non-special entities parsed.
* @returns Parsed string.
*/
public function substituteSpecialEntities($string) {
return preg_replace_callback(
$this->_substituteEntitiesRegex,
array($this, 'specialEntityCallback'),
$string);
}
/**
* Callback function for substituteSpecialEntities() that does the work.
*
* This callback has same syntax as nonSpecialEntityCallback().
*
* @param $matches PCRE-style matches array, with 0 the entire match, and
* either index 1, 2 or 3 set with a hex value, dec value,
* or string (respectively).
* @returns Replacement string.
*/
protected function specialEntityCallback($matches) {
$entity = $matches[0];
$is_num = (@$matches[0][1] === '#');
if ($is_num) {
$is_hex = (@$entity[2] === 'x');
$int = $is_hex ? hexdec($matches[1]) : (int) $matches[2];
return isset($this->_special_dec2str[$int]) ?
$this->_special_dec2str[$int] :
$entity;
} else {
return isset($this->_special_ent2dec[$matches[3]]) ?
$this->_special_ent2dec[$matches[3]] :
$entity;
}
}
}
// vim: et sw=4 sts=4