htmLawed extlib updated

This commit is contained in:
Mikael Nordfeldth 2013-09-24 01:37:38 +02:00
parent f268c3f877
commit f01c478aab
5 changed files with 4155 additions and 3676 deletions

View File

@ -1,9 +1,9 @@
<?php
/*
htmLawed 1.1.8.1, 16 July 2009
htmLawed 1.1.16, 29 August 2013
Copyright Santosh Patnaik
GPL v3 license
Dual licensed with LGPL 3 and GPL 2+
A PHP Labware internal utility; www.bioinformatics.org/phplabware/internal_utilities/htmLawed
See htmLawed_README.txt/htm
@ -51,7 +51,7 @@ foreach(explode(';', str_replace(array(' ', "\t", "\r", "\n"), '', $x)) as $v){
if($x2){$C['schemes'][$x] = array_flip(explode(',', $x2));}
}
if(!isset($C['schemes']['*'])){$C['schemes']['*'] = array('file'=>1, 'http'=>1, 'https'=>1,);}
if(!empty($C['safe']) && empty($C['schemes']['style'])){$C['schemes']['style'] = array('nil'=>1);}
if(!empty($C['safe']) && empty($C['schemes']['style'])){$C['schemes']['style'] = array('!'=>1);}
$C['abs_url'] = isset($C['abs_url']) ? $C['abs_url'] : 0;
if(!isset($C['base_url']) or !preg_match('`^[a-zA-Z\d.+\-]+://[^/]+/(.+?/)?$`', $C['base_url'])){
$C['base_url'] = $C['abs_url'] = 0;
@ -65,6 +65,7 @@ $C['cdata'] = isset($C['cdata']) ? $C['cdata'] : (empty($C['safe']) ? 3 : 0);
$C['clean_ms_char'] = empty($C['clean_ms_char']) ? 0 : $C['clean_ms_char'];
$C['comment'] = isset($C['comment']) ? $C['comment'] : (empty($C['safe']) ? 3 : 0);
$C['css_expression'] = empty($C['css_expression']) ? 0 : 1;
$C['direct_list_nest'] = empty($C['direct_list_nest']) ? 0 : 1;
$C['hexdec_entity'] = isset($C['hexdec_entity']) ? $C['hexdec_entity'] : 1;
$C['hook'] = (!empty($C['hook']) && function_exists($C['hook'])) ? $C['hook'] : 0;
$C['hook_tag'] = (!empty($C['hook_tag']) && function_exists($C['hook_tag'])) ? $C['hook_tag'] : 0;
@ -149,14 +150,15 @@ $cI = array('a'=>1, 'abbr'=>1, 'acronym'=>1, 'address'=>1, 'b'=>1, 'bdo'=>1, 'bi
$cN = array('a'=>array('a'=>1), 'button'=>array('a'=>1, 'button'=>1, 'fieldset'=>1, 'form'=>1, 'iframe'=>1, 'input'=>1, 'label'=>1, 'select'=>1, 'textarea'=>1), 'fieldset'=>array('fieldset'=>1), 'form'=>array('form'=>1), 'label'=>array('label'=>1), 'noscript'=>array('script'=>1), 'pre'=>array('big'=>1, 'font'=>1, 'img'=>1, 'object'=>1, 'script'=>1, 'small'=>1, 'sub'=>1, 'sup'=>1), 'rb'=>array('ruby'=>1), 'rt'=>array('ruby'=>1)); // Illegal
$cN2 = array_keys($cN);
$cR = array('blockquote'=>1, 'dir'=>1, 'dl'=>1, 'form'=>1, 'map'=>1, 'menu'=>1, 'noscript'=>1, 'ol'=>1, 'optgroup'=>1, 'rbc'=>1, 'rtc'=>1, 'ruby'=>1, 'select'=>1, 'table'=>1, 'tbody'=>1, 'tfoot'=>1, 'thead'=>1, 'tr'=>1, 'ul'=>1);
$cS = array('colgroup'=>array('col'=>1), 'dir'=>array('li'), 'dl'=>array('dd'=>1, 'dt'=>1), 'menu'=>array('li'=>1), 'ol'=>array('li'=>1), 'optgroup'=>array('option'=>1), 'option'=>array('#pcdata'=>1), 'rbc'=>array('rb'=>1), 'rp'=>array('#pcdata'=>1), 'rtc'=>array('rt'=>1), 'ruby'=>array('rb'=>1, 'rbc'=>1, 'rp'=>1, 'rt'=>1, 'rtc'=>1), 'select'=>array('optgroup'=>1, 'option'=>1), 'script'=>array('#pcdata'=>1), 'table'=>array('caption'=>1, 'col'=>1, 'colgroup'=>1, 'tfoot'=>1, 'tbody'=>1, 'tr'=>1, 'thead'=>1), 'tbody'=>array('tr'=>1), 'tfoot'=>array('tr'=>1), 'textarea'=>array('#pcdata'=>1), 'thead'=>array('tr'=>1), 'tr'=>array('td'=>1, 'th'=>1), 'ul'=>array('li'=>1)); // Specific - immediate parent-child
$cS = array('colgroup'=>array('col'=>1), 'dir'=>array('li'=>1), 'dl'=>array('dd'=>1, 'dt'=>1), 'menu'=>array('li'=>1), 'ol'=>array('li'=>1), 'optgroup'=>array('option'=>1), 'option'=>array('#pcdata'=>1), 'rbc'=>array('rb'=>1), 'rp'=>array('#pcdata'=>1), 'rtc'=>array('rt'=>1), 'ruby'=>array('rb'=>1, 'rbc'=>1, 'rp'=>1, 'rt'=>1, 'rtc'=>1), 'select'=>array('optgroup'=>1, 'option'=>1), 'script'=>array('#pcdata'=>1), 'table'=>array('caption'=>1, 'col'=>1, 'colgroup'=>1, 'tfoot'=>1, 'tbody'=>1, 'tr'=>1, 'thead'=>1), 'tbody'=>array('tr'=>1), 'tfoot'=>array('tr'=>1), 'textarea'=>array('#pcdata'=>1), 'thead'=>array('tr'=>1), 'tr'=>array('td'=>1, 'th'=>1), 'ul'=>array('li'=>1)); // Specific - immediate parent-child
if($GLOBALS['C']['direct_list_nest']){$cS['ol'] = $cS['ul'] += array('ol'=>1, 'ul'=>1);}
$cO = array('address'=>array('p'=>1), 'applet'=>array('param'=>1), 'blockquote'=>array('script'=>1), 'fieldset'=>array('legend'=>1, '#pcdata'=>1), 'form'=>array('script'=>1), 'map'=>array('area'=>1), 'object'=>array('param'=>1, 'embed'=>1)); // Other
$cT = array('colgroup'=>1, 'dd'=>1, 'dt'=>1, 'li'=>1, 'option'=>1, 'p'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1); // Omitable closing
// block/inline type; ins & del both type; #pcdata: text
$eB = array('address'=>1, 'blockquote'=>1, 'center'=>1, 'del'=>1, 'dir'=>1, 'dl'=>1, 'div'=>1, 'fieldset'=>1, 'form'=>1, 'ins'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'hr'=>1, 'isindex'=>1, 'menu'=>1, 'noscript'=>1, 'ol'=>1, 'p'=>1, 'pre'=>1, 'table'=>1, 'ul'=>1);
$eI = array('#pcdata'=>1, 'a'=>1, 'abbr'=>1, 'acronym'=>1, 'applet'=>1, 'b'=>1, 'bdo'=>1, 'big'=>1, 'br'=>1, 'button'=>1, 'cite'=>1, 'code'=>1, 'del'=>1, 'dfn'=>1, 'em'=>1, 'embed'=>1, 'font'=>1, 'i'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'ins'=>1, 'kbd'=>1, 'label'=>1, 'map'=>1, 'object'=>1, 'param'=>1, 'q'=>1, 'ruby'=>1, 's'=>1, 'samp'=>1, 'select'=>1, 'script'=>1, 'small'=>1, 'span'=>1, 'strike'=>1, 'strong'=>1, 'sub'=>1, 'sup'=>1, 'textarea'=>1, 'tt'=>1, 'u'=>1, 'var'=>1);
$eI = array('#pcdata'=>1, 'a'=>1, 'abbr'=>1, 'acronym'=>1, 'applet'=>1, 'b'=>1, 'bdo'=>1, 'big'=>1, 'br'=>1, 'button'=>1, 'cite'=>1, 'code'=>1, 'del'=>1, 'dfn'=>1, 'em'=>1, 'embed'=>1, 'font'=>1, 'i'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'ins'=>1, 'kbd'=>1, 'label'=>1, 'map'=>1, 'object'=>1, 'q'=>1, 'ruby'=>1, 's'=>1, 'samp'=>1, 'select'=>1, 'script'=>1, 'small'=>1, 'span'=>1, 'strike'=>1, 'strong'=>1, 'sub'=>1, 'sup'=>1, 'textarea'=>1, 'tt'=>1, 'u'=>1, 'var'=>1);
$eN = array('a'=>1, 'big'=>1, 'button'=>1, 'fieldset'=>1, 'font'=>1, 'form'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'label'=>1, 'object'=>1, 'ruby'=>1, 'script'=>1, 'select'=>1, 'small'=>1, 'sub'=>1, 'sup'=>1, 'textarea'=>1); // Exclude from specific ele; $cN values
$eO = array('area'=>1, 'caption'=>1, 'col'=>1, 'colgroup'=>1, 'dd'=>1, 'dt'=>1, 'legend'=>1, 'li'=>1, 'optgroup'=>1, 'option'=>1, 'rb'=>1, 'rbc'=>1, 'rp'=>1, 'rt'=>1, 'rtc'=>1, 'script'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'thead'=>1, 'th'=>1, 'tr'=>1); // Missing in $eB & $eI
$eO = array('area'=>1, 'caption'=>1, 'col'=>1, 'colgroup'=>1, 'dd'=>1, 'dt'=>1, 'legend'=>1, 'li'=>1, 'optgroup'=>1, 'option'=>1, 'param'=>1, 'rb'=>1, 'rbc'=>1, 'rp'=>1, 'rt'=>1, 'rtc'=>1, 'script'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'thead'=>1, 'th'=>1, 'tr'=>1); // Missing in $eB & $eI
$eF = $eB + $eI;
// $in sets allowed child
@ -192,7 +194,10 @@ for($i=-1, $ci=count($t); ++$i<$ci;){
echo '&lt;', $s, $e, $a, '&gt;';
}
if(isset($x[0])){
if($do < 3 or isset($ok['#pcdata'])){echo $x;}
if(strlen(trim($x)) && (($ql && isset($cB[$p])) or (isset($cB[$in]) && !$ql))){
echo '<div>', $x, '</div>';
}
elseif($do < 3 or isset($ok['#pcdata'])){echo $x;}
elseif(strpos($x, "\x02\x04")){
foreach(preg_split('`(\x01\x02[^\x01\x02]+\x02\x01)`', $x, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY) as $v){
echo (substr($v, 0, 2) == "\x01\x02" ? $v : ($do > 4 ? preg_replace('`\S`', '', $v) : ''));
@ -200,7 +205,7 @@ for($i=-1, $ci=count($t); ++$i<$ci;){
}elseif($do > 4){echo preg_replace('`\S`', '', $x);}
}
// get markup
if(!preg_match('`^(/?)([a-zA-Z1-6]+)([^>]*)>(.*)`sm', $t[$i], $r)){$x = $t[$i]; continue;}
if(!preg_match('`^(/?)([a-z1-6]+)([^>]*)>(.*)`sm', $t[$i], $r)){$x = $t[$i]; continue;}
$s = null; $e = null; $a = null; $x = null; list($all, $s, $e, $a, $x) = $r;
// close tag
if($s){
@ -295,20 +300,14 @@ function hl_cmtcd($t){
// comment/CDATA sec handler
$t = $t[0];
global $C;
if($t[3] == '-'){
if(!$C['comment']){return $t;}
if($C['comment'] == 1){return '';}
if(!($v = $C[$n = $t[3] == '-' ? 'comment' : 'cdata'])){return $t;}
if($v == 1){return '';}
if($n == 'comment'){
if(substr(($t = preg_replace('`--+`', '-', substr($t, 4, -3))), -1) != ' '){$t .= ' ';}
$t = $C['comment'] == 2 ? str_replace(array('&', '<', '>'), array('&amp;', '&lt;', '&gt;'), $t) : $t;
$t = "\x01\x02\x04!--$t--\x05\x02\x01";
}else{ // CDATA
if(!$C['cdata']){return $t;}
if($C['cdata'] == 1){return '';}
$t = substr($t, 1, -1);
$t = $C['cdata'] == 2 ? str_replace(array('&', '<', '>'), array('&amp;', '&lt;', '&gt;'), $t) : $t;
$t = "\x01\x01\x04$t\x05\x01\x01";
}
return str_replace(array('&', '<', '>'), array("\x03", "\x04", "\x05"), $t);
else{$t = substr($t, 1, -1);}
$t = $v == 2 ? str_replace(array('&', '<', '>'), array('&amp;', '&lt;', '&gt;'), $t) : $t;
return str_replace(array('&', '<', '>'), array("\x03", "\x04", "\x05"), ($n == 'comment' ? "\x01\x02\x04!--$t--\x05\x02\x01" : "\x01\x01\x04$t\x05\x01\x01"));
// eof
}
@ -334,9 +333,11 @@ global $C;
$b = $a = '';
if($c == null){$c = 'style'; $b = $p[1]; $a = $p[3]; $p = trim($p[2]);}
$c = isset($C['schemes'][$c]) ? $C['schemes'][$c] : $C['schemes']['*'];
if(isset($c['*']) or !strcspn($p, '#?;')){return "{$b}{$p}{$a}";} // All ok, frag, query, param
if(preg_match('`^([a-z\d\-+.&#; ]+?)(:|&#(58|x3a);|%3a|\\\\0{0,4}3a).`i', $p, $m) && !isset($c[strtolower($m[1])])){ // Denied prot
return "{$b}denied:{$p}{$a}";
static $d = 'denied:';
if(isset($c['!']) && substr($p, 0, 7) != $d){$p = "$d$p";}
if(isset($c['*']) or !strcspn($p, '#?;') or (substr($p, 0, 7) == $d)){return "{$b}{$p}{$a}";} // All ok, frag, query, param
if(preg_match('`^([^:?[@!$()*,=/\'\]]+?)(:|&#(58|x3a);|%3a|\\\\0{0,4}3a).`i', $p, $m) && !isset($c[strtolower($m[1])])){ // Denied prot
return "{$b}{$d}{$p}{$a}";
}
if($C['abs_url']){
if($C['abs_url'] == -1 && strpos($p, $C['base_url']) === 0){ // Make url rel
@ -419,10 +420,7 @@ if(!preg_match('`^<(/?)([a-zA-Z][a-zA-Z1-6]*)([^>]*?)\s?>$`m', $t, $m)){
return (($C['keep_bad']%2) ? str_replace(array('<', '>'), array('&lt;', '&gt;'), $t) : '');
}
// attr string
$a = str_replace(array("\xad", "\n", "\r", "\t"), ' ', trim($m[3]));
if(strpos($a, '&') !== false){
str_replace(array('&#xad;', '&#173;', '&shy;'), ' ', $a);
}
$a = str_replace(array("\n", "\r", "\t"), ' ', trim($m[3]));
// tag transform
static $eD = array('applet'=>1, 'center'=>1, 'dir'=>1, 'embed'=>1, 'font'=>1, 'isindex'=>1, 'menu'=>1, 's'=>1, 'strike'=>1, 'u'=>1); // Deprecated
if($C['make_tag_strict'] && isset($eD[$e])){
@ -432,11 +430,11 @@ if($C['make_tag_strict'] && isset($eD[$e])){
// close tag
static $eE = array('area'=>1, 'br'=>1, 'col'=>1, 'embed'=>1, 'hr'=>1, 'img'=>1, 'input'=>1, 'isindex'=>1, 'param'=>1); // Empty ele
if(!empty($m[1])){
return (!isset($eE[$e]) ? "</$e>" : (($C['keep_bad'])%2 ? str_replace(array('<', '>'), array('&lt;', '&gt;'), $t) : ''));
return (!isset($eE[$e]) ? (empty($C['hook_tag']) ? "</$e>" : $C['hook_tag']($e)) : (($C['keep_bad'])%2 ? str_replace(array('<', '>'), array('&lt;', '&gt;'), $t) : ''));
}
// open tag & attr
static $aN = array('abbr'=>array('td'=>1, 'th'=>1), 'accept-charset'=>array('form'=>1), 'accept'=>array('form'=>1, 'input'=>1), 'accesskey'=>array('a'=>1, 'area'=>1, 'button'=>1, 'input'=>1, 'label'=>1, 'legend'=>1, 'textarea'=>1), 'action'=>array('form'=>1), 'align'=>array('caption'=>1, 'embed'=>1, 'applet'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'object'=>1, 'legend'=>1, 'table'=>1, 'hr'=>1, 'div'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'p'=>1, 'col'=>1, 'colgroup'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1), 'alt'=>array('applet'=>1, 'area'=>1, 'img'=>1, 'input'=>1), 'archive'=>array('applet'=>1, 'object'=>1), 'axis'=>array('td'=>1, 'th'=>1), 'bgcolor'=>array('embed'=>1, 'table'=>1, 'tr'=>1, 'td'=>1, 'th'=>1), 'border'=>array('table'=>1, 'img'=>1, 'object'=>1), 'bordercolor'=>array('table'=>1, 'td'=>1, 'tr'=>1), 'cellpadding'=>array('table'=>1), 'cellspacing'=>array('table'=>1), 'char'=>array('col'=>1, 'colgroup'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1), 'charoff'=>array('col'=>1, 'colgroup'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1), 'charset'=>array('a'=>1, 'script'=>1), 'checked'=>array('input'=>1), 'cite'=>array('blockquote'=>1, 'q'=>1, 'del'=>1, 'ins'=>1), 'classid'=>array('object'=>1), 'clear'=>array('br'=>1), 'code'=>array('applet'=>1), 'codebase'=>array('object'=>1, 'applet'=>1), 'codetype'=>array('object'=>1), 'color'=>array('font'=>1), 'cols'=>array('textarea'=>1), 'colspan'=>array('td'=>1, 'th'=>1), 'compact'=>array('dir'=>1, 'dl'=>1, 'menu'=>1, 'ol'=>1, 'ul'=>1), 'coords'=>array('area'=>1, 'a'=>1), 'data'=>array('object'=>1), 'datetime'=>array('del'=>1, 'ins'=>1), 'declare'=>array('object'=>1), 'defer'=>array('script'=>1), 'dir'=>array('bdo'=>1), 'disabled'=>array('button'=>1, 'input'=>1, 'optgroup'=>1, 'option'=>1, 'select'=>1, 'textarea'=>1), 'enctype'=>array('form'=>1), 'face'=>array('font'=>1), 'for'=>array('label'=>1), 'frame'=>array('table'=>1), 'frameborder'=>array('iframe'=>1), 'headers'=>array('td'=>1, 'th'=>1), 'height'=>array('embed'=>1, 'iframe'=>1, 'td'=>1, 'th'=>1, 'img'=>1, 'object'=>1, 'applet'=>1), 'href'=>array('a'=>1, 'area'=>1), 'hreflang'=>array('a'=>1), 'hspace'=>array('applet'=>1, 'img'=>1, 'object'=>1), 'ismap'=>array('img'=>1, 'input'=>1), 'label'=>array('option'=>1, 'optgroup'=>1), 'language'=>array('script'=>1), 'longdesc'=>array('img'=>1, 'iframe'=>1), 'marginheight'=>array('iframe'=>1), 'marginwidth'=>array('iframe'=>1), 'maxlength'=>array('input'=>1), 'method'=>array('form'=>1), 'model'=>array('embed'=>1), 'multiple'=>array('select'=>1), 'name'=>array('button'=>1, 'embed'=>1, 'textarea'=>1, 'applet'=>1, 'select'=>1, 'form'=>1, 'iframe'=>1, 'img'=>1, 'a'=>1, 'input'=>1, 'object'=>1, 'map'=>1, 'param'=>1), 'nohref'=>array('area'=>1), 'noshade'=>array('hr'=>1), 'nowrap'=>array('td'=>1, 'th'=>1), 'object'=>array('applet'=>1), 'onblur'=>array('a'=>1, 'area'=>1, 'button'=>1, 'input'=>1, 'label'=>1, 'select'=>1, 'textarea'=>1), 'onchange'=>array('input'=>1, 'select'=>1, 'textarea'=>1), 'onfocus'=>array('a'=>1, 'area'=>1, 'button'=>1, 'input'=>1, 'label'=>1, 'select'=>1, 'textarea'=>1), 'onreset'=>array('form'=>1), 'onselect'=>array('input'=>1, 'textarea'=>1), 'onsubmit'=>array('form'=>1), 'pluginspage'=>array('embed'=>1), 'pluginurl'=>array('embed'=>1), 'prompt'=>array('isindex'=>1), 'readonly'=>array('textarea'=>1, 'input'=>1), 'rel'=>array('a'=>1), 'rev'=>array('a'=>1), 'rows'=>array('textarea'=>1), 'rowspan'=>array('td'=>1, 'th'=>1), 'rules'=>array('table'=>1), 'scope'=>array('td'=>1, 'th'=>1), 'scrolling'=>array('iframe'=>1), 'selected'=>array('option'=>1), 'shape'=>array('area'=>1, 'a'=>1), 'size'=>array('hr'=>1, 'font'=>1, 'input'=>1, 'select'=>1), 'span'=>array('col'=>1, 'colgroup'=>1), 'src'=>array('embed'=>1, 'script'=>1, 'input'=>1, 'iframe'=>1, 'img'=>1), 'standby'=>array('object'=>1), 'start'=>array('ol'=>1), 'summary'=>array('table'=>1), 'tabindex'=>array('a'=>1, 'area'=>1, 'button'=>1, 'input'=>1, 'object'=>1, 'select'=>1, 'textarea'=>1), 'target'=>array('a'=>1, 'area'=>1, 'form'=>1), 'type'=>array('a'=>1, 'embed'=>1, 'object'=>1, 'param'=>1, 'script'=>1, 'input'=>1, 'li'=>1, 'ol'=>1, 'ul'=>1, 'button'=>1), 'usemap'=>array('img'=>1, 'input'=>1, 'object'=>1), 'valign'=>array('col'=>1, 'colgroup'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1), 'value'=>array('input'=>1, 'option'=>1, 'param'=>1, 'button'=>1, 'li'=>1), 'valuetype'=>array('param'=>1), 'vspace'=>array('applet'=>1, 'img'=>1, 'object'=>1), 'width'=>array('embed'=>1, 'hr'=>1, 'iframe'=>1, 'img'=>1, 'object'=>1, 'table'=>1, 'td'=>1, 'th'=>1, 'applet'=>1, 'col'=>1, 'colgroup'=>1, 'pre'=>1), 'wmode'=>array('embed'=>1), 'xml:space'=>array('pre'=>1, 'script'=>1, 'style'=>1)); // Ele-specific
static $aN = array('abbr'=>array('td'=>1, 'th'=>1), 'accept-charset'=>array('form'=>1), 'accept'=>array('form'=>1, 'input'=>1), 'accesskey'=>array('a'=>1, 'area'=>1, 'button'=>1, 'input'=>1, 'label'=>1, 'legend'=>1, 'textarea'=>1), 'action'=>array('form'=>1), 'align'=>array('caption'=>1, 'embed'=>1, 'applet'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'object'=>1, 'legend'=>1, 'table'=>1, 'hr'=>1, 'div'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'p'=>1, 'col'=>1, 'colgroup'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1), 'alt'=>array('applet'=>1, 'area'=>1, 'img'=>1, 'input'=>1), 'archive'=>array('applet'=>1, 'object'=>1), 'axis'=>array('td'=>1, 'th'=>1), 'bgcolor'=>array('embed'=>1, 'table'=>1, 'tr'=>1, 'td'=>1, 'th'=>1), 'border'=>array('table'=>1, 'img'=>1, 'object'=>1), 'bordercolor'=>array('table'=>1, 'td'=>1, 'tr'=>1), 'cellpadding'=>array('table'=>1), 'cellspacing'=>array('table'=>1), 'char'=>array('col'=>1, 'colgroup'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1), 'charoff'=>array('col'=>1, 'colgroup'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1), 'charset'=>array('a'=>1, 'script'=>1), 'checked'=>array('input'=>1), 'cite'=>array('blockquote'=>1, 'q'=>1, 'del'=>1, 'ins'=>1), 'classid'=>array('object'=>1), 'clear'=>array('br'=>1), 'code'=>array('applet'=>1), 'codebase'=>array('object'=>1, 'applet'=>1), 'codetype'=>array('object'=>1), 'color'=>array('font'=>1), 'cols'=>array('textarea'=>1), 'colspan'=>array('td'=>1, 'th'=>1), 'compact'=>array('dir'=>1, 'dl'=>1, 'menu'=>1, 'ol'=>1, 'ul'=>1), 'coords'=>array('area'=>1, 'a'=>1), 'data'=>array('object'=>1), 'datetime'=>array('del'=>1, 'ins'=>1), 'declare'=>array('object'=>1), 'defer'=>array('script'=>1), 'dir'=>array('bdo'=>1), 'disabled'=>array('button'=>1, 'input'=>1, 'optgroup'=>1, 'option'=>1, 'select'=>1, 'textarea'=>1), 'enctype'=>array('form'=>1), 'face'=>array('font'=>1), 'flashvars'=>array('embed'=>1), 'for'=>array('label'=>1), 'frame'=>array('table'=>1), 'frameborder'=>array('iframe'=>1), 'headers'=>array('td'=>1, 'th'=>1), 'height'=>array('embed'=>1, 'iframe'=>1, 'td'=>1, 'th'=>1, 'img'=>1, 'object'=>1, 'applet'=>1), 'href'=>array('a'=>1, 'area'=>1), 'hreflang'=>array('a'=>1), 'hspace'=>array('applet'=>1, 'img'=>1, 'object'=>1), 'ismap'=>array('img'=>1, 'input'=>1), 'label'=>array('option'=>1, 'optgroup'=>1), 'language'=>array('script'=>1), 'longdesc'=>array('img'=>1, 'iframe'=>1), 'marginheight'=>array('iframe'=>1), 'marginwidth'=>array('iframe'=>1), 'maxlength'=>array('input'=>1), 'method'=>array('form'=>1), 'model'=>array('embed'=>1), 'multiple'=>array('select'=>1), 'name'=>array('button'=>1, 'embed'=>1, 'textarea'=>1, 'applet'=>1, 'select'=>1, 'form'=>1, 'iframe'=>1, 'img'=>1, 'a'=>1, 'input'=>1, 'object'=>1, 'map'=>1, 'param'=>1), 'nohref'=>array('area'=>1), 'noshade'=>array('hr'=>1), 'nowrap'=>array('td'=>1, 'th'=>1), 'object'=>array('applet'=>1), 'onblur'=>array('a'=>1, 'area'=>1, 'button'=>1, 'input'=>1, 'label'=>1, 'select'=>1, 'textarea'=>1), 'onchange'=>array('input'=>1, 'select'=>1, 'textarea'=>1), 'onfocus'=>array('a'=>1, 'area'=>1, 'button'=>1, 'input'=>1, 'label'=>1, 'select'=>1, 'textarea'=>1), 'onreset'=>array('form'=>1), 'onselect'=>array('input'=>1, 'textarea'=>1), 'onsubmit'=>array('form'=>1), 'pluginspage'=>array('embed'=>1), 'pluginurl'=>array('embed'=>1), 'prompt'=>array('isindex'=>1), 'readonly'=>array('textarea'=>1, 'input'=>1), 'rel'=>array('a'=>1), 'rev'=>array('a'=>1), 'rows'=>array('textarea'=>1), 'rowspan'=>array('td'=>1, 'th'=>1), 'rules'=>array('table'=>1), 'scope'=>array('td'=>1, 'th'=>1), 'scrolling'=>array('iframe'=>1), 'selected'=>array('option'=>1), 'shape'=>array('area'=>1, 'a'=>1), 'size'=>array('hr'=>1, 'font'=>1, 'input'=>1, 'select'=>1), 'span'=>array('col'=>1, 'colgroup'=>1), 'src'=>array('embed'=>1, 'script'=>1, 'input'=>1, 'iframe'=>1, 'img'=>1), 'standby'=>array('object'=>1), 'start'=>array('ol'=>1), 'summary'=>array('table'=>1), 'tabindex'=>array('a'=>1, 'area'=>1, 'button'=>1, 'input'=>1, 'object'=>1, 'select'=>1, 'textarea'=>1), 'target'=>array('a'=>1, 'area'=>1, 'form'=>1), 'type'=>array('a'=>1, 'embed'=>1, 'object'=>1, 'param'=>1, 'script'=>1, 'input'=>1, 'li'=>1, 'ol'=>1, 'ul'=>1, 'button'=>1), 'usemap'=>array('img'=>1, 'input'=>1, 'object'=>1), 'valign'=>array('col'=>1, 'colgroup'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1), 'value'=>array('input'=>1, 'option'=>1, 'param'=>1, 'button'=>1, 'li'=>1), 'valuetype'=>array('param'=>1), 'vspace'=>array('applet'=>1, 'img'=>1, 'object'=>1), 'width'=>array('embed'=>1, 'hr'=>1, 'iframe'=>1, 'img'=>1, 'object'=>1, 'table'=>1, 'td'=>1, 'th'=>1, 'applet'=>1, 'col'=>1, 'colgroup'=>1, 'pre'=>1), 'wmode'=>array('embed'=>1), 'xml:space'=>array('pre'=>1, 'script'=>1, 'style'=>1)); // Ele-specific
static $aNE = array('checked'=>1, 'compact'=>1, 'declare'=>1, 'defer'=>1, 'disabled'=>1, 'ismap'=>1, 'multiple'=>1, 'nohref'=>1, 'noresize'=>1, 'noshade'=>1, 'nowrap'=>1, 'readonly'=>1, 'selected'=>1); // Empty
static $aNP = array('action'=>1, 'cite'=>1, 'classid'=>1, 'codebase'=>1, 'data'=>1, 'href'=>1, 'longdesc'=>1, 'model'=>1, 'pluginspage'=>1, 'pluginurl'=>1, 'usemap'=>1); // Need scheme check; excludes style, on* & src
static $aNU = array('class'=>array('param'=>1, 'script'=>1), 'dir'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'iframe'=>1, 'param'=>1, 'script'=>1), 'id'=>array('script'=>1), 'lang'=>array('applet'=>1, 'br'=>1, 'iframe'=>1, 'param'=>1, 'script'=>1), 'xml:lang'=>array('applet'=>1, 'br'=>1, 'iframe'=>1, 'param'=>1, 'script'=>1), 'onclick'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'ondblclick'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onkeydown'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onkeypress'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onkeyup'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onmousedown'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onmousemove'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onmouseout'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onmouseover'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onmouseup'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'style'=>array('param'=>1, 'script'=>1), 'title'=>array('param'=>1, 'script'=>1)); // Univ & exceptions
@ -475,8 +473,8 @@ while(strlen($a)){
$aA[$nm] = '';
}
break; case 2: // Val
if(preg_match('`^"[^"]*"`', $a, $m) or preg_match("`^'[^']*'`", $a, $m) or preg_match("`^\s*[^\s\"']+`", $a, $m)){
$m = $m[0]; $w = 1; $mode = 0; $a = ltrim(substr_replace($a, '', 0, strlen($m)));
if(preg_match('`^((?:"[^"]*")|(?:\'[^\']*\')|(?:\s*[^\s"\']+))(.*)`', $a, $m)){
$a = ltrim($m[2]); $m = $m[1]; $w = 1; $mode = 0;
$aA[$nm] = trim(($m[0] == '"' or $m[0] == '\'') ? substr($m, 1, -1) : $m);
}
break;
@ -493,7 +491,7 @@ global $S;
$rl = isset($S[$e]) ? $S[$e] : array();
$a = array(); $nfr = 0;
foreach($aA as $k=>$v){
if(((isset($C['deny_attribute']['*']) ? isset($C['deny_attribute'][$k]) : !isset($C['deny_attribute'][$k])) or isset($rl[$k])) && ((!isset($rl['n'][$k]) && !isset($rl['n']['*'])) or isset($rl[$k])) && (isset($aN[$k][$e]) or (isset($aNU[$k]) && !isset($aNU[$k][$e])))){
if(((isset($C['deny_attribute']['*']) ? isset($C['deny_attribute'][$k]) : !isset($C['deny_attribute'][$k])) && (isset($aN[$k][$e]) or (isset($aNU[$k]) && !isset($aNU[$k][$e]))) && !isset($rl['n'][$k]) && !isset($rl['n']['*'])) or isset($rl[$k])){
if(isset($aNE[$k])){$v = $k;}
elseif(!empty($lcase) && (($e != 'button' or $e != 'input') or $k == 'type')){ // Rather loose but ?not cause issues
$v = (isset($aNL[($v2 = strtolower($v))])) ? $v2 : $v;
@ -503,9 +501,10 @@ foreach($aA as $k=>$v){
static $sC = array('&#x20;'=>' ', '&#32;'=>' ', '&#x45;'=>'e', '&#69;'=>'e', '&#x65;'=>'e', '&#101;'=>'e', '&#x58;'=>'x', '&#88;'=>'x', '&#x78;'=>'x', '&#120;'=>'x', '&#x50;'=>'p', '&#80;'=>'p', '&#x70;'=>'p', '&#112;'=>'p', '&#x53;'=>'s', '&#83;'=>'s', '&#x73;'=>'s', '&#115;'=>'s', '&#x49;'=>'i', '&#73;'=>'i', '&#x69;'=>'i', '&#105;'=>'i', '&#x4f;'=>'o', '&#79;'=>'o', '&#x6f;'=>'o', '&#111;'=>'o', '&#x4e;'=>'n', '&#78;'=>'n', '&#x6e;'=>'n', '&#110;'=>'n', '&#x55;'=>'u', '&#85;'=>'u', '&#x75;'=>'u', '&#117;'=>'u', '&#x52;'=>'r', '&#82;'=>'r', '&#x72;'=>'r', '&#114;'=>'r', '&#x4c;'=>'l', '&#76;'=>'l', '&#x6c;'=>'l', '&#108;'=>'l', '&#x28;'=>'(', '&#40;'=>'(', '&#x29;'=>')', '&#41;'=>')', '&#x20;'=>':', '&#32;'=>':', '&#x22;'=>'"', '&#34;'=>'"', '&#x27;'=>"'", '&#39;'=>"'", '&#x2f;'=>'/', '&#47;'=>'/', '&#x2a;'=>'*', '&#42;'=>'*', '&#x5c;'=>'\\', '&#92;'=>'\\');
$v = strtr($v, $sC);
}
$v = preg_replace_callback('`(url(?:\()(?: )*(?:\'|"|&(?:quot|apos);)?)(.+)((?:\'|"|&(?:quot|apos);)?(?: )*(?:\)))`iS', 'hl_prot', $v);
$v = preg_replace_callback('`(url(?:\()(?: )*(?:\'|"|&(?:quot|apos);)?)(.+?)((?:\'|"|&(?:quot|apos);)?(?: )*(?:\)))`iS', 'hl_prot', $v);
$v = !$C['css_expression'] ? preg_replace('`expression`i', ' ', preg_replace('`\\\\\S|(/|(%2f))(\*|(%2a))`i', ' ', $v)) : $v;
}elseif(isset($aNP[$k]) or strpos($k, 'src') !== false or $k[0] == 'o'){
$v = str_replace("\xad", ' ', (strpos($v, '&') !== false ? str_replace(array('&#xad;', '&#173;', '&shy;'), ' ', $v) : $v));
$v = hl_prot($v, $k);
if($k == 'href'){ // X-spam
if($C['anti_mail_spam'] && strpos($v, 'mailto:') === 0){
@ -626,7 +625,7 @@ if($e == 'u'){$e = 'span'; return 'text-decoration: underline;';}
static $fs = array('0'=>'xx-small', '1'=>'xx-small', '2'=>'small', '3'=>'medium', '4'=>'large', '5'=>'x-large', '6'=>'xx-large', '7'=>'300%', '-1'=>'smaller', '-2'=>'60%', '+1'=>'larger', '+2'=>'150%', '+3'=>'200%', '+4'=>'300%');
if($e == 'font'){
$a2 = '';
if(preg_match('`face\s*=\s*(\'|")([^=]+?)\\1`i', $a, $m) or preg_match('`face\s*=\s*([^"])(\S+)`i', $a, $m)){
if(preg_match('`face\s*=\s*(\'|")([^=]+?)\\1`i', $a, $m) or preg_match('`face\s*=(\s*)(\S+)`i', $a, $m)){
$a2 .= ' font-family: '. str_replace('"', '\'', trim($m[2])). ';';
}
if(preg_match('`color\s*=\s*(\'|")?(.+?)(\\1|\s|$)`i', $a, $m)){
@ -645,20 +644,24 @@ return '';
function hl_tidy($t, $w, $p){
// Tidy/compact HTM
if(strpos(' pre,script,textarea', "$p,")){return $t;}
$t = str_replace(' </', '</', preg_replace(array('`(<\w[^>]*(?<!/)>)\s+`', '`\s+`', '`(<\w[^>]*(?<!/)>) `'), array(' $1', ' ', '$1'), preg_replace_callback(array('`(<(!\[CDATA\[))(.+?)(\]\]>)`sm', '`(<(!--))(.+?)(-->)`sm', '`(<(pre|script|textarea).*?>)(.+?)(</\2>)`sm'), create_function('$m', 'return $m[1]. str_replace(array("<", ">", "\n", "\r", "\t", " "), array("\x01", "\x02", "\x03", "\x04", "\x05", "\x07"), $m[3]). $m[4];'), $t)));
$t = preg_replace('`\s+`', ' ', preg_replace_callback(array('`(<(!\[CDATA\[))(.+?)(\]\]>)`sm', '`(<(!--))(.+?)(-->)`sm', '`(<(pre|script|textarea)[^>]*?>)(.+?)(</\2>)`sm'), create_function('$m', 'return $m[1]. str_replace(array("<", ">", "\n", "\r", "\t", " "), array("\x01", "\x02", "\x03", "\x04", "\x05", "\x07"), $m[3]). $m[4];'), $t));
if(($w = strtolower($w)) == -1){
return str_replace(array("\x01", "\x02", "\x03", "\x04", "\x05", "\x07"), array('<', '>', "\n", "\r", "\t", ' '), $t);
}
$s = strpos(" $w", 't') ? "\t" : ' ';
$s = preg_match('`\d`', $w, $m) ? str_repeat($s, $m[0]) : str_repeat($s, ($s == "\t" ? 1 : 2));
$n = preg_match('`[ts]([1-9])`', $w, $m) ? $m[1] : 0;
$N = preg_match('`[ts]([1-9])`', $w, $m) ? $m[1] : 0;
$a = array('br'=>1);
$b = array('button'=>1, 'input'=>1, 'option'=>1);
$b = array('button'=>1, 'input'=>1, 'option'=>1, 'param'=>1);
$c = array('caption'=>1, 'dd'=>1, 'dt'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'isindex'=>1, 'label'=>1, 'legend'=>1, 'li'=>1, 'object'=>1, 'p'=>1, 'pre'=>1, 'td'=>1, 'textarea'=>1, 'th'=>1);
$d = array('address'=>1, 'blockquote'=>1, 'center'=>1, 'colgroup'=>1, 'dir'=>1, 'div'=>1, 'dl'=>1, 'fieldset'=>1, 'form'=>1, 'hr'=>1, 'iframe'=>1, 'map'=>1, 'menu'=>1, 'noscript'=>1, 'ol'=>1, 'optgroup'=>1, 'rbc'=>1, 'rtc'=>1, 'ruby'=>1, 'script'=>1, 'select'=>1, 'table'=>1, 'tfoot'=>1, 'thead'=>1, 'tr'=>1, 'ul'=>1);
$d = array('address'=>1, 'blockquote'=>1, 'center'=>1, 'colgroup'=>1, 'dir'=>1, 'div'=>1, 'dl'=>1, 'fieldset'=>1, 'form'=>1, 'hr'=>1, 'iframe'=>1, 'map'=>1, 'menu'=>1, 'noscript'=>1, 'ol'=>1, 'optgroup'=>1, 'rbc'=>1, 'rtc'=>1, 'ruby'=>1, 'script'=>1, 'select'=>1, 'table'=>1, 'tbody'=>1, 'tfoot'=>1, 'thead'=>1, 'tr'=>1, 'ul'=>1);
$T = explode('<', $t);
$X = 1;
while($X){
$n = $N;
$t = $T;
ob_start();
if(isset($d[$p])){echo str_repeat($s, ++$n);}
$t = explode('<', $t);
echo ltrim(array_shift($t));
for($i=-1, $j=count($t); ++$i<$j;){
$r = ''; list($e, $r) = explode('>', $t[$i]);
@ -666,20 +669,25 @@ for($i=-1, $j=count($t); ++$i<$j;){
$y = !$x ? ltrim($e, '/') : ($x > 0 ? substr($e, 0, strcspn($e, ' ')) : 0);
$e = "<$e>";
if(isset($d[$y])){
if(!$x){echo "\n", str_repeat($s, --$n), "$e\n", str_repeat($s, $n);}
if(!$x){
if($n){echo "\n", str_repeat($s, --$n), "$e\n", str_repeat($s, $n);}
else{++$N; ob_end_clean(); continue 2;}
}
else{echo "\n", str_repeat($s, $n), "$e\n", str_repeat($s, ($x != 1 ? ++$n : $n));}
echo ltrim($r); continue;
echo $r; continue;
}
$f = "\n". str_repeat($s, $n);
if(isset($c[$y])){
if(!$x){echo $e, $f, ltrim($r);}
if(!$x){echo $e, $f, $r;}
else{echo $f, $e, $r;}
}elseif(isset($b[$y])){echo $f, $e, $r;
}elseif(isset($a[$y])){echo $e, $f, ltrim($r);
}elseif(!$y){echo $f, $e, $f, ltrim($r);
}elseif(isset($a[$y])){echo $e, $f, $r;
}elseif(!$y){echo $f, $e, $f, $r;
}else{echo $e, $r;}
}
$t = preg_replace('`[\n]\s*?[\n]+`', "\n", ob_get_contents());
$X = 0;
}
$t = str_replace(array("\n ", " \n"), "\n", preg_replace('`[\n]\s*?[\n]+`', "\n", ob_get_contents()));
ob_end_clean();
if(($l = strpos(" $w", 'r') ? (strpos(" $w", 'n') ? "\r\n" : "\r") : 0)){
$t = str_replace("\n", $l, $t);
@ -690,7 +698,7 @@ return str_replace(array("\x01", "\x02", "\x03", "\x04", "\x05", "\x07"), array(
function hl_version(){
// rel
return '1.1.8.1';
return '1.1.16';
// eof
}

View File

@ -1,10 +1,10 @@
<?php
/*
htmLawedTest.php, 16 July 2009
htmLawed 1.1.8.1, 16 July 2009
htmLawedTest.php, 28 May 2013
htmLawed 1.1.16, 29 August 2013
Copyright Santosh Patnaik
GPL v3 license
Dual licensed with LGPL 3 and GPL 2+
A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed
Test htmLawed; user provides text input; input and processed input are shown as highlighted code and rendered HTML; also shown are execution time and peak memory usage
@ -22,7 +22,7 @@ $_sid = 'sid'; // session name; alphanum.
$_slife = 30; // session life in min.
// errors
error_reporting(E_ALL | (defined('E_STRICT') ? E_STRICT : 1));
error_reporting(E_ALL | (defined('E_STRICT') ? E_STRICT : 0));
ini_set('display_errors', $_errs);
// session
@ -44,7 +44,9 @@ if(get_magic_quotes_gpc()){
}
ini_set('magic_quotes_gpc', 0);
}
if(get_magic_quotes_runtime()){
set_magic_quotes_runtime(0);
}
$_POST['enc'] = (isset($_POST['enc']) and preg_match('`^[-\w]+$`', $_POST['enc'])) ? $_POST['enc'] : 'utf-8';
@ -132,7 +134,7 @@ function hexdump($d){
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html lang="en" xml:lang="en">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=<?php echo htmlspecialchars($_POST['enc']); ?>" />
<meta http-equiv="content-type" content="text/html; charset=utf-8" />
<meta name="description" content="htmLawed <?php echo hl_version();?> test page" />
<style type="text/css"><!--/*--><![CDATA[/*><!--*/
a, a.resizer{text-decoration:none;}
@ -142,15 +144,16 @@ body{background-color:#efefef;}
body, button, div, html, input, p{font-size:13px; font-family:'Lucida grande', Verdana, Arial, Helvetica, sans-serif;}
button, input{font-size: 85%;}
div.help{border-top: 1px dotted gray; margin-top: 15px; padding-top: 15px; color:#999999;}
#inputC, #inputD, #inputF, #inputR, #outputD, #outputF, #outputH, #outputR, #settingF{display:block;}
#inputC, #inputD, #inputF, #inputR, #outputD, #outputF, #outputH, #outputR, #settingF, #diff{display:block;}
#inputC, #settingF{background-color:white; border:1px gray solid; padding:3px;}
#inputC li{margin: 0; padding: 0;}
#inputC ul{margin: 0; padding: 0; margin-left: 14px;}
#inputC input{margin: 0; margin-left: 2px; margin-right: 2px; padding: 1px; vertical-align: middle;}
#inputD{overflow:auto; background-color:#ffff99; border:1px #cc9966 solid; padding:3px;}
#inputR{overflow:auto; background-color:#ffffcc; border:1px #ffcc99 solid; padding:3px;}
#inputC, #settingF, #inputD, #inputR, #outputD, #outputR, textarea{font-size:100%; font-family:'Bitstream vera sans mono', 'courier new', 'courier', monospace;}
#inputC, #settingF, #inputD, #inputR, #outputD, #outputR, #diff, textarea{font-size:100%; font-family:'Bitstream vera sans mono', 'courier new', 'courier', monospace;}
#outputD{overflow:auto; background-color: #99ffcc; border:1px #66cc99 solid; padding:3px;}
#diff{overflow:auto; background-color: white; border:1px #dcdcdc solid; padding:3px;}
#outputH{overflow:auto; background-color:white; padding:3px; border:1px #dcdcdc solid;}
#outputR{overflow:auto; background-color: #ccffcc; border:1px #99cc99 solid; padding:3px;}
span.cmtcdata{color: orange;}
@ -259,9 +262,6 @@ function sndUnproc(){
var i = document.getElementById('text');
if(!i){return;}
i = i.value;
i = i.replace(/>/g, '&gt;');
i = i.replace(/</g, '&lt;');
i = i.replace(/"/g, '&quot;');
var w = window.open('htmLawedTest.php?pre=1', 'hlprehtm');
var f = document.createElement('form');
f.enctype = 'application/x-www-form-urlencoded';
@ -269,10 +269,14 @@ function sndUnproc(){
f.acceptCharset = '<?php echo htmlspecialchars($_POST['enc']); ?>';
if(f.style){f.style.display = 'none';}
else{f.visibility = 'hidden';}
f.innerHTML = '<p style="display:none;"><input style="display:none;" type="hidden" name="token" id="token" value="<?php echo $token; ?>" /><input style="display:none;" type="hidden" name="<?php echo htmlspecialchars($_sid); ?>" id="<?php echo htmlspecialchars($_sid); ?>" value="' + readCookie('<?php echo htmlspecialchars($_sid); ?>') + '" /><input style="display:none;" type="hidden" name="inputH" id="inputH" value="'+ i+ '" /></p>';
f.innerHTML = '<p style="display:none;"><input style="display:none;" type="hidden" name="token" id="token" value="<?php echo $token; ?>" /><input style="display:none;" type="hidden" name="<?php echo htmlspecialchars($_sid); ?>" id="<?php echo htmlspecialchars($_sid); ?>" value="' + readCookie('<?php echo htmlspecialchars($_sid); ?>') + '" /></p>';
f.action = 'htmLawedTest.php?pre=1';
f.target = 'hlprehtm';
f.method = 'post';
var t = document.createElement('textarea');
t.name = 'inputH';
t.value = i;
f.appendChild(t);
var b = document.getElementsByTagName('body')[0];
b.appendChild(f);
f.submit();
@ -282,9 +286,6 @@ function sndValidn(id, type){
var i = document.getElementById(id);
if(!i){return;}
i = i.value;
i = i.replace(/>/g, '&gt;');
i = i.replace(/</g, '&lt;');
i = i.replace(/"/g, '&quot;');
var w = window.open('http://validator.w3.org/check', 'validate'+id+type);
var f = document.createElement('form');
f.enctype = 'application/x-www-form-urlencoded';
@ -292,9 +293,13 @@ function sndValidn(id, type){
f.acceptCharset = '<?php echo htmlspecialchars($_POST['enc']); ?>';
if(f.style){f.style.display = 'none';}
else{f.visibility = 'hidden';}
f.innerHTML = '<p style="display:none;"><input style="display:none;" type="hidden" name="fragment" id="fragment" value="'+ i+ '" /><input style="display:none;" type="hidden" name="prefill" id="prefill" value="1" /><input style="display:none;" type="hidden" name="prefill_doctype" id="prefill_doctype" value="'+ type+ '" /><input style="display:none;" type="hidden" name="group" id="group" value="1" /><input type="hidden" name="ss" id="ss" value="1" /></p>';
f.innerHTML = '<p style="display:none;"><input style="display:none;" type="hidden" name="prefill" id="prefill" value="1" /><input style="display:none;" type="hidden" name="prefill_doctype" id="prefill_doctype" value="'+ type+ '" /><input style="display:none;" type="hidden" name="group" id="group" value="1" /><input type="hidden" name="ss" id="ss" value="1" /></p>';
f.action = 'http://validator.w3.org/check';
f.target = 'validate'+id+type;
var t = document.createElement('textarea');
t.name = 'fragment';
t.value = i;
f.appendChild(t);
var b = document.getElementsByTagName('body')[0];
b.appendChild(f);
f.submit();
@ -328,7 +333,7 @@ tRs = {
a.appendChild(document.createTextNode("\u2195"));
a.style.cursor = 'n-resize';
a.className= 'resizer';
a.title = 'click-drag to resize'
a.title = 'click-drag to resize textarea'
tRs.adEv(a, 'mousedown', tRs.initResize);
textareas[i].parentNode.appendChild(a);
}
@ -374,6 +379,58 @@ tRs = {
}
};
tRs.adEv(window, 'load', tRs.adBtn);
// Diff Match and Patch javascript code by Neil Fraser; Apache license 2.0; http://code.google.com/p/google-diff-match-patch/
(function(){function diff_match_patch(){this.Diff_Timeout=1;this.Diff_EditCost=4;this.Match_Threshold=0.5;this.Match_Distance=1E3;this.Patch_DeleteThreshold=0.5;this.Patch_Margin=4;this.Match_MaxBits=32}
diff_match_patch.prototype.diff_main=function(a,b,c,d){"undefined"==typeof d&&(d=0>=this.Diff_Timeout?Number.MAX_VALUE:(new Date).getTime()+1E3*this.Diff_Timeout);if(null==a||null==b)throw Error("Null input. (diff_main)");if(a==b)return a?[[0,a]]:[];"undefined"==typeof c&&(c=!0);var e=c,f=this.diff_commonPrefix(a,b),c=a.substring(0,f),a=a.substring(f),b=b.substring(f),f=this.diff_commonSuffix(a,b),g=a.substring(a.length-f),a=a.substring(0,a.length-f),b=b.substring(0,b.length-f),a=this.diff_compute_(a,
b,e,d);c&&a.unshift([0,c]);g&&a.push([0,g]);this.diff_cleanupMerge(a);return a};
diff_match_patch.prototype.diff_compute_=function(a,b,c,d){if(!a)return[[1,b]];if(!b)return[[-1,a]];var e=a.length>b.length?a:b,f=a.length>b.length?b:a,g=e.indexOf(f);if(-1!=g)return c=[[1,e.substring(0,g)],[0,f],[1,e.substring(g+f.length)]],a.length>b.length&&(c[0][0]=c[2][0]=-1),c;if(1==f.length)return[[-1,a],[1,b]];return(e=this.diff_halfMatch_(a,b))?(f=e[0],a=e[1],g=e[2],b=e[3],e=e[4],f=this.diff_main(f,g,c,d),c=this.diff_main(a,b,c,d),f.concat([[0,e]],c)):c&&100<a.length&&100<b.length?this.diff_lineMode_(a,
b,d):this.diff_bisect_(a,b,d)};
diff_match_patch.prototype.diff_lineMode_=function(a,b,c){var d=this.diff_linesToChars_(a,b),a=d.chars1,b=d.chars2,d=d.lineArray,a=this.diff_main(a,b,!1,c);this.diff_charsToLines_(a,d);this.diff_cleanupSemantic(a);a.push([0,""]);for(var e=d=b=0,f="",g="";b<a.length;){switch(a[b][0]){case 1:e++;g+=a[b][1];break;case -1:d++;f+=a[b][1];break;case 0:if(1<=d&&1<=e){a.splice(b-d-e,d+e);b=b-d-e;d=this.diff_main(f,g,!1,c);for(e=d.length-1;0<=e;e--)a.splice(b,0,d[e]);b+=d.length}d=e=0;g=f=""}b++}a.pop();return a};
diff_match_patch.prototype.diff_bisect_=function(a,b,c){for(var d=a.length,e=b.length,f=Math.ceil((d+e)/2),g=f,h=2*f,j=Array(h),i=Array(h),k=0;k<h;k++)j[k]=-1,i[k]=-1;j[g+1]=0;i[g+1]=0;for(var k=d-e,p=0!=k%2,q=0,s=0,o=0,v=0,u=0;u<f&&!((new Date).getTime()>c);u++){for(var n=-u+q;n<=u-s;n+=2){var l=g+n,m;m=n==-u||n!=u&&j[l-1]<j[l+1]?j[l+1]:j[l-1]+1;for(var r=m-n;m<d&&r<e&&a.charAt(m)==b.charAt(r);)m++,r++;j[l]=m;if(m>d)s+=2;else if(r>e)q+=2;else if(p&&(l=g+k-n,0<=l&&l<h&&-1!=i[l])){var t=d-i[l];if(m>=
t)return this.diff_bisectSplit_(a,b,m,r,c)}}for(n=-u+o;n<=u-v;n+=2){l=g+n;t=n==-u||n!=u&&i[l-1]<i[l+1]?i[l+1]:i[l-1]+1;for(m=t-n;t<d&&m<e&&a.charAt(d-t-1)==b.charAt(e-m-1);)t++,m++;i[l]=t;if(t>d)v+=2;else if(m>e)o+=2;else if(!p&&(l=g+k-n,0<=l&&l<h&&-1!=j[l]&&(m=j[l],r=g+m-l,t=d-t,m>=t)))return this.diff_bisectSplit_(a,b,m,r,c)}}return[[-1,a],[1,b]]};
diff_match_patch.prototype.diff_bisectSplit_=function(a,b,c,d,e){var f=a.substring(0,c),g=b.substring(0,d),a=a.substring(c),b=b.substring(d),f=this.diff_main(f,g,!1,e),e=this.diff_main(a,b,!1,e);return f.concat(e)};
diff_match_patch.prototype.diff_linesToChars_=function(a,b){function c(a){for(var b="",c=0,f=-1,g=d.length;f<a.length-1;){f=a.indexOf("\n",c);-1==f&&(f=a.length-1);var q=a.substring(c,f+1),c=f+1;(e.hasOwnProperty?e.hasOwnProperty(q):void 0!==e[q])?b+=String.fromCharCode(e[q]):(b+=String.fromCharCode(g),e[q]=g,d[g++]=q)}return b}var d=[],e={};d[0]="";var f=c(a),g=c(b);return{chars1:f,chars2:g,lineArray:d}};
diff_match_patch.prototype.diff_charsToLines_=function(a,b){for(var c=0;c<a.length;c++){for(var d=a[c][1],e=[],f=0;f<d.length;f++)e[f]=b[d.charCodeAt(f)];a[c][1]=e.join("")}};diff_match_patch.prototype.diff_commonPrefix=function(a,b){if(!a||!b||a.charAt(0)!=b.charAt(0))return 0;for(var c=0,d=Math.min(a.length,b.length),e=d,f=0;c<e;)a.substring(f,e)==b.substring(f,e)?f=c=e:d=e,e=Math.floor((d-c)/2+c);return e};
diff_match_patch.prototype.diff_commonSuffix=function(a,b){if(!a||!b||a.charAt(a.length-1)!=b.charAt(b.length-1))return 0;for(var c=0,d=Math.min(a.length,b.length),e=d,f=0;c<e;)a.substring(a.length-e,a.length-f)==b.substring(b.length-e,b.length-f)?f=c=e:d=e,e=Math.floor((d-c)/2+c);return e};
diff_match_patch.prototype.diff_commonOverlap_=function(a,b){var c=a.length,d=b.length;if(0==c||0==d)return 0;c>d?a=a.substring(c-d):c<d&&(b=b.substring(0,c));c=Math.min(c,d);if(a==b)return c;for(var d=0,e=1;;){var f=a.substring(c-e),f=b.indexOf(f);if(-1==f)return d;e+=f;if(0==f||a.substring(c-e)==b.substring(0,e))d=e,e++}};
diff_match_patch.prototype.diff_halfMatch_=function(a,b){function c(a,b,c){for(var d=a.substring(c,c+Math.floor(a.length/4)),e=-1,g="",h,j,n,l;-1!=(e=b.indexOf(d,e+1));){var m=f.diff_commonPrefix(a.substring(c),b.substring(e)),r=f.diff_commonSuffix(a.substring(0,c),b.substring(0,e));g.length<r+m&&(g=b.substring(e-r,e)+b.substring(e,e+m),h=a.substring(0,c-r),j=a.substring(c+m),n=b.substring(0,e-r),l=b.substring(e+m))}return 2*g.length>=a.length?[h,j,n,l,g]:null}if(0>=this.Diff_Timeout)return null;
var d=a.length>b.length?a:b,e=a.length>b.length?b:a;if(4>d.length||2*e.length<d.length)return null;var f=this,g=c(d,e,Math.ceil(d.length/4)),d=c(d,e,Math.ceil(d.length/2)),h;if(!g&&!d)return null;h=d?g?g[4].length>d[4].length?g:d:d:g;var j;a.length>b.length?(g=h[0],d=h[1],e=h[2],j=h[3]):(e=h[0],j=h[1],g=h[2],d=h[3]);h=h[4];return[g,d,e,j,h]};
diff_match_patch.prototype.diff_cleanupSemantic=function(a){for(var b=!1,c=[],d=0,e=null,f=0,g=0,h=0,j=0,i=0;f<a.length;)0==a[f][0]?(c[d++]=f,g=j,h=i,i=j=0,e=a[f][1]):(1==a[f][0]?j+=a[f][1].length:i+=a[f][1].length,e&&e.length<=Math.max(g,h)&&e.length<=Math.max(j,i)&&(a.splice(c[d-1],0,[-1,e]),a[c[d-1]+1][0]=1,d--,d--,f=0<d?c[d-1]:-1,i=j=h=g=0,e=null,b=!0)),f++;b&&this.diff_cleanupMerge(a);this.diff_cleanupSemanticLossless(a);for(f=1;f<a.length;){if(-1==a[f-1][0]&&1==a[f][0]){b=a[f-1][1];c=a[f][1];
d=this.diff_commonOverlap_(b,c);e=this.diff_commonOverlap_(c,b);if(d>=e){if(d>=b.length/2||d>=c.length/2)a.splice(f,0,[0,c.substring(0,d)]),a[f-1][1]=b.substring(0,b.length-d),a[f+1][1]=c.substring(d),f++}else if(e>=b.length/2||e>=c.length/2)a.splice(f,0,[0,b.substring(0,e)]),a[f-1][0]=1,a[f-1][1]=c.substring(0,c.length-e),a[f+1][0]=-1,a[f+1][1]=b.substring(e),f++;f++}f++}};
diff_match_patch.prototype.diff_cleanupSemanticLossless=function(a){function b(a,b){if(!a||!b)return 6;var c=a.charAt(a.length-1),d=b.charAt(0),e=c.match(diff_match_patch.nonAlphaNumericRegex_),f=d.match(diff_match_patch.nonAlphaNumericRegex_),g=e&&c.match(diff_match_patch.whitespaceRegex_),h=f&&d.match(diff_match_patch.whitespaceRegex_),c=g&&c.match(diff_match_patch.linebreakRegex_),d=h&&d.match(diff_match_patch.linebreakRegex_),i=c&&a.match(diff_match_patch.blanklineEndRegex_),j=d&&b.match(diff_match_patch.blanklineStartRegex_);
return i||j?5:c||d?4:e&&!g&&h?3:g||h?2:e||f?1:0}for(var c=1;c<a.length-1;){if(0==a[c-1][0]&&0==a[c+1][0]){var d=a[c-1][1],e=a[c][1],f=a[c+1][1],g=this.diff_commonSuffix(d,e);if(g)var h=e.substring(e.length-g),d=d.substring(0,d.length-g),e=h+e.substring(0,e.length-g),f=h+f;for(var g=d,h=e,j=f,i=b(d,e)+b(e,f);e.charAt(0)===f.charAt(0);){var d=d+e.charAt(0),e=e.substring(1)+f.charAt(0),f=f.substring(1),k=b(d,e)+b(e,f);k>=i&&(i=k,g=d,h=e,j=f)}a[c-1][1]!=g&&(g?a[c-1][1]=g:(a.splice(c-1,1),c--),a[c][1]=
h,j?a[c+1][1]=j:(a.splice(c+1,1),c--))}c++}};diff_match_patch.nonAlphaNumericRegex_=/[^a-zA-Z0-9]/;diff_match_patch.whitespaceRegex_=/\s/;diff_match_patch.linebreakRegex_=/[\r\n]/;diff_match_patch.blanklineEndRegex_=/\n\r?\n$/;diff_match_patch.blanklineStartRegex_=/^\r?\n\r?\n/;
diff_match_patch.prototype.diff_cleanupEfficiency=function(a){for(var b=!1,c=[],d=0,e=null,f=0,g=!1,h=!1,j=!1,i=!1;f<a.length;){if(0==a[f][0])a[f][1].length<this.Diff_EditCost&&(j||i)?(c[d++]=f,g=j,h=i,e=a[f][1]):(d=0,e=null),j=i=!1;else if(-1==a[f][0]?i=!0:j=!0,e&&(g&&h&&j&&i||e.length<this.Diff_EditCost/2&&3==g+h+j+i))a.splice(c[d-1],0,[-1,e]),a[c[d-1]+1][0]=1,d--,e=null,g&&h?(j=i=!0,d=0):(d--,f=0<d?c[d-1]:-1,j=i=!1),b=!0;f++}b&&this.diff_cleanupMerge(a)};
diff_match_patch.prototype.diff_cleanupMerge=function(a){a.push([0,""]);for(var b=0,c=0,d=0,e="",f="",g;b<a.length;)switch(a[b][0]){case 1:d++;f+=a[b][1];b++;break;case -1:c++;e+=a[b][1];b++;break;case 0:1<c+d?(0!==c&&0!==d&&(g=this.diff_commonPrefix(f,e),0!==g&&(0<b-c-d&&0==a[b-c-d-1][0]?a[b-c-d-1][1]+=f.substring(0,g):(a.splice(0,0,[0,f.substring(0,g)]),b++),f=f.substring(g),e=e.substring(g)),g=this.diff_commonSuffix(f,e),0!==g&&(a[b][1]=f.substring(f.length-g)+a[b][1],f=f.substring(0,f.length-
g),e=e.substring(0,e.length-g))),0===c?a.splice(b-d,c+d,[1,f]):0===d?a.splice(b-c,c+d,[-1,e]):a.splice(b-c-d,c+d,[-1,e],[1,f]),b=b-c-d+(c?1:0)+(d?1:0)+1):0!==b&&0==a[b-1][0]?(a[b-1][1]+=a[b][1],a.splice(b,1)):b++,c=d=0,f=e=""}""===a[a.length-1][1]&&a.pop();c=!1;for(b=1;b<a.length-1;)0==a[b-1][0]&&0==a[b+1][0]&&(a[b][1].substring(a[b][1].length-a[b-1][1].length)==a[b-1][1]?(a[b][1]=a[b-1][1]+a[b][1].substring(0,a[b][1].length-a[b-1][1].length),a[b+1][1]=a[b-1][1]+a[b+1][1],a.splice(b-1,1),c=!0):a[b][1].substring(0,
a[b+1][1].length)==a[b+1][1]&&(a[b-1][1]+=a[b+1][1],a[b][1]=a[b][1].substring(a[b+1][1].length)+a[b+1][1],a.splice(b+1,1),c=!0)),b++;c&&this.diff_cleanupMerge(a)};diff_match_patch.prototype.diff_xIndex=function(a,b){var c=0,d=0,e=0,f=0,g;for(g=0;g<a.length;g++){1!==a[g][0]&&(c+=a[g][1].length);-1!==a[g][0]&&(d+=a[g][1].length);if(c>b)break;e=c;f=d}return a.length!=g&&-1===a[g][0]?f:f+(b-e)};
diff_match_patch.prototype.diff_prettyHtml=function(a){for(var b=[],c=/&/g,d=/</g,e=/>/g,f=/\n/g,g=0;g<a.length;g++){var h=a[g][0],j=a[g][1],j=j.replace(c,"&amp;").replace(d,"&lt;").replace(e,"&gt;").replace(f,"<span style=\"color: #dcdcdc;\">&not;</span><br>");switch(h){case 1:b[g]='<ins style="background:#ccffcc; text-decoration: none;">'+j+"</ins>";break;case -1:b[g]='<del style="background:#ffffcc; text-decoration: line-through; color: orange;">'+j+"</del>";break;case 0:b[g]="<span>"+j+"</span>"}}return b.join("")};
diff_match_patch.prototype.diff_text1=function(a){for(var b=[],c=0;c<a.length;c++)1!==a[c][0]&&(b[c]=a[c][1]);return b.join("")};diff_match_patch.prototype.diff_text2=function(a){for(var b=[],c=0;c<a.length;c++)-1!==a[c][0]&&(b[c]=a[c][1]);return b.join("")};diff_match_patch.prototype.diff_levenshtein=function(a){for(var b=0,c=0,d=0,e=0;e<a.length;e++){var f=a[e][0],g=a[e][1];switch(f){case 1:c+=g.length;break;case -1:d+=g.length;break;case 0:b+=Math.max(c,d),d=c=0}}return b+=Math.max(c,d)};
diff_match_patch.prototype.diff_toDelta=function(a){for(var b=[],c=0;c<a.length;c++)switch(a[c][0]){case 1:b[c]="+"+encodeURI(a[c][1]);break;case -1:b[c]="-"+a[c][1].length;break;case 0:b[c]="="+a[c][1].length}return b.join("\t").replace(/%20/g," ")};
diff_match_patch.prototype.diff_fromDelta=function(a,b){for(var c=[],d=0,e=0,f=b.split(/\t/g),g=0;g<f.length;g++){var h=f[g].substring(1);switch(f[g].charAt(0)){case "+":try{c[d++]=[1,decodeURI(h)]}catch(j){throw Error("Illegal escape in diff_fromDelta: "+h);}break;case "-":case "=":var i=parseInt(h,10);if(isNaN(i)||0>i)throw Error("Invalid number in diff_fromDelta: "+h);h=a.substring(e,e+=i);"="==f[g].charAt(0)?c[d++]=[0,h]:c[d++]=[-1,h];break;default:if(f[g])throw Error("Invalid diff operation in diff_fromDelta: "+
f[g]);}}if(e!=a.length)throw Error("Delta length ("+e+") does not equal source text length ("+a.length+").");return c};diff_match_patch.prototype.match_main=function(a,b,c){if(null==a||null==b||null==c)throw Error("Null input. (match_main)");c=Math.max(0,Math.min(c,a.length));return a==b?0:a.length?a.substring(c,c+b.length)==b?c:this.match_bitap_(a,b,c):-1};
diff_match_patch.prototype.match_bitap_=function(a,b,c){function d(a,d){var e=a/b.length,g=Math.abs(c-d);return!f.Match_Distance?g?1:e:e+g/f.Match_Distance}if(b.length>this.Match_MaxBits)throw Error("Pattern too long for this browser.");var e=this.match_alphabet_(b),f=this,g=this.Match_Threshold,h=a.indexOf(b,c);-1!=h&&(g=Math.min(d(0,h),g),h=a.lastIndexOf(b,c+b.length),-1!=h&&(g=Math.min(d(0,h),g)));for(var j=1<<b.length-1,h=-1,i,k,p=b.length+a.length,q,s=0;s<b.length;s++){i=0;for(k=p;i<k;)d(s,c+
k)<=g?i=k:p=k,k=Math.floor((p-i)/2+i);p=k;i=Math.max(1,c-k+1);var o=Math.min(c+k,a.length)+b.length;k=Array(o+2);for(k[o+1]=(1<<s)-1;o>=i;o--){var v=e[a.charAt(o-1)];k[o]=0===s?(k[o+1]<<1|1)&v:(k[o+1]<<1|1)&v|(q[o+1]|q[o])<<1|1|q[o+1];if(k[o]&j&&(v=d(s,o-1),v<=g))if(g=v,h=o-1,h>c)i=Math.max(1,2*c-h);else break}if(d(s+1,c)>g)break;q=k}return h};
diff_match_patch.prototype.match_alphabet_=function(a){for(var b={},c=0;c<a.length;c++)b[a.charAt(c)]=0;for(c=0;c<a.length;c++)b[a.charAt(c)]|=1<<a.length-c-1;return b};
diff_match_patch.prototype.patch_addContext_=function(a,b){if(0!=b.length){for(var c=b.substring(a.start2,a.start2+a.length1),d=0;b.indexOf(c)!=b.lastIndexOf(c)&&c.length<this.Match_MaxBits-this.Patch_Margin-this.Patch_Margin;)d+=this.Patch_Margin,c=b.substring(a.start2-d,a.start2+a.length1+d);d+=this.Patch_Margin;(c=b.substring(a.start2-d,a.start2))&&a.diffs.unshift([0,c]);(d=b.substring(a.start2+a.length1,a.start2+a.length1+d))&&a.diffs.push([0,d]);a.start1-=c.length;a.start2-=c.length;a.length1+=
c.length+d.length;a.length2+=c.length+d.length}};
diff_match_patch.prototype.patch_make=function(a,b,c){var d;if("string"==typeof a&&"string"==typeof b&&"undefined"==typeof c)d=a,b=this.diff_main(d,b,!0),2<b.length&&(this.diff_cleanupSemantic(b),this.diff_cleanupEfficiency(b));else if(a&&"object"==typeof a&&"undefined"==typeof b&&"undefined"==typeof c)b=a,d=this.diff_text1(b);else if("string"==typeof a&&b&&"object"==typeof b&&"undefined"==typeof c)d=a;else if("string"==typeof a&&"string"==typeof b&&c&&"object"==typeof c)d=a,b=c;else throw Error("Unknown call format to patch_make.");
if(0===b.length)return[];for(var c=[],a=new diff_match_patch.patch_obj,e=0,f=0,g=0,h=d,j=0;j<b.length;j++){var i=b[j][0],k=b[j][1];if(!e&&0!==i)a.start1=f,a.start2=g;switch(i){case 1:a.diffs[e++]=b[j];a.length2+=k.length;d=d.substring(0,g)+k+d.substring(g);break;case -1:a.length1+=k.length;a.diffs[e++]=b[j];d=d.substring(0,g)+d.substring(g+k.length);break;case 0:k.length<=2*this.Patch_Margin&&e&&b.length!=j+1?(a.diffs[e++]=b[j],a.length1+=k.length,a.length2+=k.length):k.length>=2*this.Patch_Margin&&
e&&(this.patch_addContext_(a,h),c.push(a),a=new diff_match_patch.patch_obj,e=0,h=d,f=g)}1!==i&&(f+=k.length);-1!==i&&(g+=k.length)}e&&(this.patch_addContext_(a,h),c.push(a));return c};diff_match_patch.prototype.patch_deepCopy=function(a){for(var b=[],c=0;c<a.length;c++){var d=a[c],e=new diff_match_patch.patch_obj;e.diffs=[];for(var f=0;f<d.diffs.length;f++)e.diffs[f]=d.diffs[f].slice();e.start1=d.start1;e.start2=d.start2;e.length1=d.length1;e.length2=d.length2;b[c]=e}return b};
diff_match_patch.prototype.patch_apply=function(a,b){if(0==a.length)return[b,[]];var a=this.patch_deepCopy(a),c=this.patch_addPadding(a),b=c+b+c;this.patch_splitMax(a);for(var d=0,e=[],f=0;f<a.length;f++){var g=a[f].start2+d,h=this.diff_text1(a[f].diffs),j,i=-1;if(h.length>this.Match_MaxBits){if(j=this.match_main(b,h.substring(0,this.Match_MaxBits),g),-1!=j&&(i=this.match_main(b,h.substring(h.length-this.Match_MaxBits),g+h.length-this.Match_MaxBits),-1==i||j>=i))j=-1}else j=this.match_main(b,h,g);
if(-1==j)e[f]=!1,d-=a[f].length2-a[f].length1;else if(e[f]=!0,d=j-g,g=-1==i?b.substring(j,j+h.length):b.substring(j,i+this.Match_MaxBits),h==g)b=b.substring(0,j)+this.diff_text2(a[f].diffs)+b.substring(j+h.length);else if(g=this.diff_main(h,g,!1),h.length>this.Match_MaxBits&&this.diff_levenshtein(g)/h.length>this.Patch_DeleteThreshold)e[f]=!1;else{this.diff_cleanupSemanticLossless(g);for(var h=0,k,i=0;i<a[f].diffs.length;i++){var p=a[f].diffs[i];0!==p[0]&&(k=this.diff_xIndex(g,h));1===p[0]?b=b.substring(0,
j+k)+p[1]+b.substring(j+k):-1===p[0]&&(b=b.substring(0,j+k)+b.substring(j+this.diff_xIndex(g,h+p[1].length)));-1!==p[0]&&(h+=p[1].length)}}}b=b.substring(c.length,b.length-c.length);return[b,e]};
diff_match_patch.prototype.patch_addPadding=function(a){for(var b=this.Patch_Margin,c="",d=1;d<=b;d++)c+=String.fromCharCode(d);for(d=0;d<a.length;d++)a[d].start1+=b,a[d].start2+=b;var d=a[0],e=d.diffs;if(0==e.length||0!=e[0][0])e.unshift([0,c]),d.start1-=b,d.start2-=b,d.length1+=b,d.length2+=b;else if(b>e[0][1].length){var f=b-e[0][1].length;e[0][1]=c.substring(e[0][1].length)+e[0][1];d.start1-=f;d.start2-=f;d.length1+=f;d.length2+=f}d=a[a.length-1];e=d.diffs;0==e.length||0!=e[e.length-1][0]?(e.push([0,
c]),d.length1+=b,d.length2+=b):b>e[e.length-1][1].length&&(f=b-e[e.length-1][1].length,e[e.length-1][1]+=c.substring(0,f),d.length1+=f,d.length2+=f);return c};
diff_match_patch.prototype.patch_splitMax=function(a){for(var b=this.Match_MaxBits,c=0;c<a.length;c++)if(!(a[c].length1<=b)){var d=a[c];a.splice(c--,1);for(var e=d.start1,f=d.start2,g="";0!==d.diffs.length;){var h=new diff_match_patch.patch_obj,j=!0;h.start1=e-g.length;h.start2=f-g.length;if(""!==g)h.length1=h.length2=g.length,h.diffs.push([0,g]);for(;0!==d.diffs.length&&h.length1<b-this.Patch_Margin;){var g=d.diffs[0][0],i=d.diffs[0][1];1===g?(h.length2+=i.length,f+=i.length,h.diffs.push(d.diffs.shift()),
j=!1):-1===g&&1==h.diffs.length&&0==h.diffs[0][0]&&i.length>2*b?(h.length1+=i.length,e+=i.length,j=!1,h.diffs.push([g,i]),d.diffs.shift()):(i=i.substring(0,b-h.length1-this.Patch_Margin),h.length1+=i.length,e+=i.length,0===g?(h.length2+=i.length,f+=i.length):j=!1,h.diffs.push([g,i]),i==d.diffs[0][1]?d.diffs.shift():d.diffs[0][1]=d.diffs[0][1].substring(i.length))}g=this.diff_text2(h.diffs);g=g.substring(g.length-this.Patch_Margin);i=this.diff_text1(d.diffs).substring(0,this.Patch_Margin);""!==i&&
(h.length1+=i.length,h.length2+=i.length,0!==h.diffs.length&&0===h.diffs[h.diffs.length-1][0]?h.diffs[h.diffs.length-1][1]+=i:h.diffs.push([0,i]));j||a.splice(++c,0,h)}}};diff_match_patch.prototype.patch_toText=function(a){for(var b=[],c=0;c<a.length;c++)b[c]=a[c];return b.join("")};
diff_match_patch.prototype.patch_fromText=function(a){var b=[];if(!a)return b;for(var a=a.split("\n"),c=0,d=/^@@ -(\d+),?(\d*) \+(\d+),?(\d*) @@$/;c<a.length;){var e=a[c].match(d);if(!e)throw Error("Invalid patch string: "+a[c]);var f=new diff_match_patch.patch_obj;b.push(f);f.start1=parseInt(e[1],10);""===e[2]?(f.start1--,f.length1=1):"0"==e[2]?f.length1=0:(f.start1--,f.length1=parseInt(e[2],10));f.start2=parseInt(e[3],10);""===e[4]?(f.start2--,f.length2=1):"0"==e[4]?f.length2=0:(f.start2--,f.length2=
parseInt(e[4],10));for(c++;c<a.length;){e=a[c].charAt(0);try{var g=decodeURI(a[c].substring(1))}catch(h){throw Error("Illegal escape in patch_fromText: "+g);}if("-"==e)f.diffs.push([-1,g]);else if("+"==e)f.diffs.push([1,g]);else if(" "==e)f.diffs.push([0,g]);else if("@"==e)break;else if(""!==e)throw Error('Invalid patch mode "'+e+'" in: '+g);c++}}return b};diff_match_patch.patch_obj=function(){this.diffs=[];this.start2=this.start1=null;this.length2=this.length1=0};
diff_match_patch.patch_obj.prototype.toString=function(){var a,b;a=0===this.length1?this.start1+",0":1==this.length1?this.start1+1:this.start1+1+","+this.length1;b=0===this.length2?this.start2+",0":1==this.length2?this.start2+1:this.start2+1+","+this.length2;a=["@@ -"+a+" +"+b+" @@\n"];var c;for(b=0;b<this.diffs.length;b++){switch(this.diffs[b][0]){case 1:c="+";break;case -1:c="-";break;case 0:c=" "}a[b+1]=c+encodeURI(this.diffs[b][1])+"\n"}return a.join("").replace(/%20/g," ")};
this.diff_match_patch=diff_match_patch;this.DIFF_DELETE=-1;this.DIFF_INSERT=1;this.DIFF_EQUAL=0;})()
var dmp = new diff_match_patch(); function diffLaunch(){var text1 = document.getElementById('text').value; var text2 = document.getElementById('text2').value; dmp.Diff_Timeout = 0; dmp.Diff_EditCost = 4; var d = dmp.diff_main(text1, text2); var ds = dmp.diff_prettyHtml(d); document.getElementById('diff').innerHTML = ds;
}
//--><!]]></script>
<title>htmLawed (<?php echo hl_version();?>) test</title>
</head>
@ -420,7 +477,7 @@ else{
}
?>
<span style="float:right;" class="help"><span style="font-size: 85%;">Encoding: </span><input type="text" size="8" id="enc" name="enc" style="vertical-align: middle;" value="<?php echo htmlspecialchars($_POST['enc']); ?>" title="IANA-recognized name of the input character-set; can be multiple ;- or space-separated values; may not work in some browsers" /></span>
<span style="float:right;" class="help" title="IANA-recognized name of the input character-set; can be multiple ;- or space-separated values; may not work in some browsers"><span style="font-size: 85%;">Encoding: </span><input type="text" size="8" id="enc" name="enc" style="vertical-align: middle;" value="<?php echo htmlspecialchars($_POST['enc']); ?>" /></span>
</div>
<br style="clear:both;" />
@ -454,6 +511,7 @@ $cfg = array(
'comment'=>array('4', 'nil', 'allow HTML comments', 'nil'),
'css_expression'=>array('2', 'nil', 'allow dynamic expressions in CSS style properties', 'nil'),
'deny_attribute'=>array('1', '0', 'denied attributes', '0', '50', '', 'these'),
'direct_list_nest'=>array('2', 'nil', 'allow direct nesting of a list within another without requiring it to be a list item', 'nil'),
'elements'=>array('', '', 'allowed elements', '50'),
'hexdec_entity'=>array('3', '1', 'convert hexadecimal numeric entities to decimal ones, or vice versa', '0'),
'hook'=>array('', '', 'name of hook function', '25'),
@ -516,23 +574,23 @@ if($do){
}
}
if($cfg['anti_link_spam'] && (!empty($cfg['anti_link_spam11']) or !empty($cfg['anti_link_spam12']))){
if(isset($cfg['anti_link_spam']) && $cfg['anti_link_spam'] && (!empty($cfg['anti_link_spam11']) or !empty($cfg['anti_link_spam12']))){
$cfg['anti_link_spam'] = array($cfg['anti_link_spam11'], $cfg['anti_link_spam12']);
}
unset($cfg['anti_link_spam11'], $cfg['anti_link_spam12']);
if($cfg['anti_mail_spam'] == 1){
if(isset($cfg['anti_mail_spam']) && $cfg['anti_mail_spam'] == 1){
$cfg['anti_mail_spam'] = isset($cfg['anti_mail_spam1'][0]) ? $cfg['anti_mail_spam1'] : 0;
}
unset($cfg['anti_mail_spam11']);
if($cfg['deny_attribute'] == 1){
if(isset($cfg['deny_attribute']) && $cfg['deny_attribute'] == 1){
$cfg['deny_attribute'] = isset($cfg['deny_attribute1'][0]) ? $cfg['deny_attribute1'] : 0;
}
unset($cfg['deny_attribute1']);
if($cfg['tidy'] == 2){
if(isset($cfg['tidy']) && $cfg['tidy'] == 2){
$cfg['tidy'] = isset($cfg['tidy2'][0]) ? $cfg['tidy2'] : 0;
}
unset($cfg['tidy2']);
if($cfg['unique_ids'] == 2){
if(isset($cfg['unique_ids']) && $cfg['unique_ids'] == 2){
$cfg['unique_ids'] = isset($cfg['unique_ids2'][0]) ? $cfg['unique_ids2'] : 1;
}
unset($cfg['unique_ids2']);
@ -540,9 +598,9 @@ if($do){
$cfg['show_setting'] = 'hlcfg';
$st = microtime();
$out = htmLawed($_POST['text'], $cfg, str_replace(array('$', '{'), '', $_POST['spec']));
$out = htmLawed($_POST['text'], $cfg, $_POST['spec']);
$et = microtime();
echo '<br /><a href="htmLawedTest.php" title="[toggle visibility] syntax-highlighted" onclick="javascript:toggle(\'inputR\'); return false;"><span class="notice">Input code &raquo;</span></a> <span class="help" title="tags estimated as half of total &gt; and &lt; chars; values may be inaccurate for non-ASCII text"><small><big>', strlen($_POST['text']), '</big> chars, ~<big>', round((substr_count($_POST['text'], '>') + substr_count($_POST['text'], '<'))/2), '</big> tags</small>&nbsp;</span><div id="inputR" style="display: none;">', format($_POST['text']), '</div><script type="text/javascript">hl(\'inputR\');</script>', (!isset($_POST['text'][$_hlimit]) ? ' <a href="htmLawedTest.php" title="[toggle visibility] hexdump; non-viewable characters like line-returns are shown as dots" onclick="javascript:toggle(\'inputD\'); return false;"><span class="notice">Input binary &raquo;&nbsp;</span></a><div id="inputD" style="display: none;">'. hexdump($_POST['text']). '</div>' : ''), ' <a href="htmLawedTest.php" title="[toggle visibility] finalized internal settings as interpreted by htmLawed; for developers" onclick="javascript:toggle(\'settingF\'); return false;"><span class="notice">Finalized internal settings &raquo;&nbsp;</span></a> <div id="settingF" style="display: none;">', str_replace(array(' ', "\t", ' '), array(' ', '&nbsp; ', '&nbsp; '), nl2br(htmlspecialchars(print_r($GLOBALS['hlcfg']['config'], true)))), '</div><script type="text/javascript">hl(\'settingF\');</script>', '<br /><a href="htmLawedTest.php" title="[toggle visibility] suitable for copy-paste" onclick="javascript:toggle(\'outputF\'); return false;"><span class="notice">Output &raquo;</span></a> <span class="help" title="approx., server-specific value excluding the \'include()\' call"><small>htmLawed processing time <big>', number_format(((substr($et,0,9)) + (substr($et,-10)) - (substr($st,0,9)) - (substr($st,-10))),4), '</big> s</small></span>', (($mem = memory_get_peak_usage()) !== false ? '<span class="help"><small>, peak memory usage <big>'. round(($mem-$pre_mem)/1048576, 2). '</big> <small>MB</small>' : ''), '</small></span><div id="outputF" style="display: block;"><div><textarea id="text2" class="textarea" name="text2" rows="5" cols="100" style="width: 100%;">', htmlspecialchars($out), '</textarea></div><button type="button" onclick="javascript:document.getElementById(\'text2\').focus();document.getElementById(\'text2\').select()" title="select all to copy" style="float:right;">Select all</button>';
echo '<br /><a href="htmLawedTest.php" title="[toggle visibility] syntax-highlighted" onclick="javascript:toggle(\'inputR\'); return false;"><span class="notice">Input code &raquo;</span></a> <span class="help" title="tags estimated as half of total &gt; and &lt; chars; values may be inaccurate for non-ASCII text"><small><big>', strlen($_POST['text']), '</big> chars, ~<big>', ($tag = round((substr_count($_POST['text'], '>') + substr_count($_POST['text'], '<'))/2)), '</big> tag', ($tag > 1 ? 's' : ''), '</small>&nbsp;</span><div id="inputR" style="display: none;">', format($_POST['text']), '</div><script type="text/javascript">hl(\'inputR\');</script>', (!isset($_POST['text'][$_hlimit]) ? ' <a href="htmLawedTest.php" title="[toggle visibility] hexdump; non-viewable characters like line-returns are shown as dots" onclick="javascript:toggle(\'inputD\'); return false;"><span class="notice">Input binary &raquo;&nbsp;</span></a><div id="inputD" style="display: none;">'. hexdump($_POST['text']). '</div>' : ''), ' <a href="htmLawedTest.php" title="[toggle visibility] finalized internal settings as interpreted by htmLawed; for developers" onclick="javascript:toggle(\'settingF\'); return false;"><span class="notice">Finalized internal settings &raquo;&nbsp;</span></a> <div id="settingF" style="display: none;">$config: ', str_replace(array(' ', "\t", ' '), array(' ', '&nbsp; ', '&nbsp; '), nl2br(htmlspecialchars(print_r($GLOBALS['hlcfg']['config'], true)))), '<br />$spec: ', str_replace(array(' ', "\t", ' '), array(' ', '&nbsp; ', '&nbsp; '), nl2br(htmlspecialchars(print_r($GLOBALS['hlcfg']['spec'], true)))), '</div><script type="text/javascript">hl(\'settingF\');</script>', '<br /><a href="htmLawedTest.php" title="[toggle visibility] suitable for copy-paste" onclick="javascript:toggle(\'outputF\'); return false;"><span class="notice">Output &raquo;</span></a> <span class="help" title="approx., server-specific value excluding the \'include()\' call"><small>htmLawed processing time <big>', number_format(((substr($et,0,9)) + (substr($et,-10)) - (substr($st,0,9)) - (substr($st,-10))),4), '</big> s</small></span>', (($mem = memory_get_peak_usage()) !== false ? '<span class="help"><small>, peak memory usage <big>'. round(($mem-$pre_mem)/1048576, 2). '</big> <small>MB</small>' : ''), '</small></span><div id="outputF" style="display: block;"><div><textarea id="text2" class="textarea" name="text2" rows="5" cols="100" style="width: 100%;">', htmlspecialchars($out), '</textarea></div><button type="button" onclick="javascript:document.getElementById(\'text2\').focus();document.getElementById(\'text2\').select()" title="select all to copy" style="float:right;">Select all</button>';
if($_w3c_validate && $validation)
{
?>
@ -552,7 +610,7 @@ if($do){
<?php
}
echo '</div><br /><a href="htmLawedTest.php" title="[toggle visibility] syntax-highlighted" onclick="javascript:toggle(\'outputR\'); return false;"><span class="notice">Output code &raquo;</span></a><div id="outputR" style="display: block;">', format($out), '</div><script type="text/javascript">hl(\'outputR\');</script>', (!isset($_POST['text'][$_hlimit]) ? '<br /><a href="htmLawedTest.php" title="[toggle visibility] hexdump; non-viewable characters like line-returns are shown as dots" onclick="javascript:toggle(\'outputD\'); return false;"><span class="notice">Output binary &raquo;</span></a><div id="outputD" style="display: none;">'. hexdump($out). '</div>' : ''), '<br /><a href="htmLawedTest.php" title="[toggle visibility] XHTML 1 Transitional doctype" onclick="javascript:toggle(\'outputH\'); return false;"><span class="notice">Output rendered &raquo;</span></a><div id="outputH" style="display: block;">', $out, '</div>';
echo '</div><br /><a href="htmLawedTest.php" title="[toggle visibility] syntax-highlighted" onclick="javascript:toggle(\'outputR\'); return false;"><span class="notice">Output code &raquo;</span></a><div id="outputR" style="display: block;">', format($out), '</div><script type="text/javascript">hl(\'outputR\');</script>', (!isset($_POST['text'][$_hlimit]) ? ' <a href="htmLawedTest.php" title="[toggle visibility] hexdump; non-viewable characters like line-returns are shown as dots" onclick="javascript:toggle(\'outputD\'); return false;"><span class="notice">Output binary &raquo;</span></a><div id="outputD" style="display: none;">'. hexdump($out). '</div>' : ''), ' <a href="htmLawedTest.php" title="[toggle visibility] inline output-input diff; might not be perfectly accurate, semantically or otherwise " onclick="javascript:toggle(\'diff\'); diffLaunch(); return false;"><span class="notice">Diff &raquo;</span></a> <div id="diff" style="display: none;"></div><br /><a href="htmLawedTest.php" title="[toggle visibility] XHTML 1 Transitional doctype" onclick="javascript:toggle(\'outputH\'); return false;"><span class="notice">Output rendered &raquo;</span></a><div id="outputH" style="display: block;">', $out, '</div>';
}
else{
?>

View File

@ -64,7 +64,7 @@ span.totop a, span.totop a:visited {color: #6699cc;}
&#160; <span class="toc-item"><a href="#s2.6"><span class="item-no">2.6</span>&#160; Use without modifying old <span class="term">kses()</span>&#160;code</a></span><br />
&#160; <span class="toc-item"><a href="#s2.7"><span class="item-no">2.7</span>&#160; Tolerance for ill-written HTML</a></span><br />
&#160; <span class="toc-item"><a href="#s2.8"><span class="item-no">2.8</span>&#160; Limitations &amp; work-arounds</a></span><br />
&#160; <span class="toc-item"><a href="#s2.9"><span class="item-no">2.9</span>&#160; Examples</a></span><br />
&#160; <span class="toc-item"><a href="#s2.9"><span class="item-no">2.9</span>&#160; Examples of usage</a></span><br />
<span class="toc-item"><a href="#s3"><span class="item-no">3</span>&#160; Details</a></span><br />
&#160; <span class="toc-item"><a href="#s3.1"><span class="item-no">3.1</span>&#160; Invalid/dangerous characters</a></span><br />
&#160; <span class="toc-item"><a href="#s3.2"><span class="item-no">3.2</span>&#160; Character references/entities</a></span><br />
@ -110,10 +110,10 @@ span.totop a, span.totop a:visited {color: #6699cc;}
<div id="body">
<br />
<div class="comment">htmLawed_README.txt, 16 July 2009<br />
htmLawed 1.1.8.1, 16 July 2009 <br />
<div class="comment">htmLawed_README.txt, 29 August 2013<br />
htmLawed 1.1.16, 29 August 2013<br />
Copyright Santosh Patnaik<br />
GPL v3 license<br />
Dual licensed with LGPL 3 and GPL 2+<br />
A PHP Labware internal utility &#45; <a href="http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed">http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed</a>&#160;</div>
<br />
@ -121,9 +121,9 @@ A PHP Labware internal utility &#45; <a href="http://www.bioinformatics.org/phpl
<a name="s1" id="s1"></a><span class="item-no">1</span>&#160; About htmLawed
</h2><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
<br />
&#160; htmLawed is a highly customizable single-file PHP script to make text secure, and standard- and admin policy-compliant for use in the body of HTML 4, XHTML 1 or 1.1, or generic XML documents. It is thus a configurable input (X)HTML filter, processor, purifier, sanitizer, beautifier, etc., and an alternative to the <a href="http://tidy.sourceforge.net">HTMLTidy</a>&#160;application.<br />
&#160; htmLawed is a PHP script to process text with HTML markup to make it more compliant with HTML standards and administrative policies. It works by making HTML well-formed with balanced and properly nested tags, neutralizing code that may be used for cross-site scripting (XSS) attacks, allowing only specified HTML tags and attributes, and so on. Such <em>lawing in</em>&#160;of HTML in text used in (X)HTML or XML documents ensures that it is in accordance with the aesthetics, safety and usability requirements set by administrators.<br />
<br />
&#160; The <em>lawing in</em>&#160;of input text is needed to ensure that HTML code in the text is standard-compliant, does not introduce security vulnerabilities, and does not break the aesthetics, design or layout of web-pages. htmLawed tries to do this by, for example, making HTML well-formed with balanced and properly nested tags, neutralizing code that may be used for cross-site scripting (<span class="term">XSS</span>) attacks, and allowing only specified HTML elements/tags and attributes.<br />
&#160; htmLawed is highly customizable, and fast with low memory usage. Its free and open-source code is in one small file, does not require extensions or libraries, and works in older versions of PHP as well. It is a good alternative to the HTML <a href="http://tidy.sourceforge.net">Tidy</a>&#160;application.<br />
<div class="sub-section"><h3>
<a name="s1.1" id="s1.1"></a><span class="item-no">1.1</span>&#160; Example uses
@ -151,8 +151,8 @@ A PHP Labware internal utility &#45; <a href="http://www.bioinformatics.org/phpl
<br />
&#160; * &#160;<strong>beautify</strong>&#160;or <strong>compact</strong>&#160;HTML &#160;^~`<br />
<br />
&#160; * &#160;<strong>restrict elements</strong>&#160; ^~`<br />
&#160; * &#160;proper closure of empty elements like <span class="term">img</span>&#160; ^`<br />
&#160; * &#160;can <strong>restrict elements</strong>&#160; ^~`<br />
&#160; * &#160;ensures proper closure of empty elements like <span class="term">img</span>&#160; ^`<br />
&#160; * &#160;<strong>transform deprecated elements</strong>&#160;like <span class="term">u</span>&#160; ^~`<br />
&#160; * &#160;HTML <strong>comments</strong>&#160;and <span class="term">CDATA</span>&#160;sections can be permitted &#160;^~`<br />
&#160; * &#160;elements like <span class="term">script</span>, <span class="term">object</span>&#160;and <span class="term">form</span>&#160;can be permitted &#160;~<br />
@ -161,7 +161,7 @@ A PHP Labware internal utility &#45; <a href="http://www.bioinformatics.org/phpl
&#160; * &#160;remove <strong>invalid attributes</strong>&#160; ^`<br />
&#160; * &#160;element and attribute names are <strong>lower-cased</strong>&#160; ^<br />
&#160; * &#160;provide <strong>required attributes</strong>, like <span class="term">alt</span>&#160;for <span class="term">image</span>&#160; ^`<br />
&#160; * &#160;<strong>transform deprecated attributes</strong>&#160; ^~`<br />
&#160; * &#160;<strong>transforms deprecated attributes</strong>&#160; ^~`<br />
&#160; * &#160;attributes <strong>declared only once</strong>&#160; ^`<br />
<br />
&#160; * &#160;<strong>restrict attribute values</strong>, including <strong>element-specifically</strong>&#160; ^~`<br />
@ -170,6 +170,7 @@ A PHP Labware internal utility &#45; <a href="http://www.bioinformatics.org/phpl
&#160; * &#160;ensure <strong>unique</strong>&#160;<span class="term">id</span>&#160;attribute values &#160;^~`<br />
&#160; * &#160;<strong>double-quote</strong>&#160;attribute values &#160;^<br />
&#160; * &#160;lower-case <strong>standard attribute values</strong>&#160;like <span class="term">password</span>&#160; ^`<br />
&#160; * &#160;permit custom, non-standard attributes as well as custom rules for standard attributes &#160;~`<br />
<br />
&#160; * &#160;<strong>attribute-specific URL protocol/scheme restriction</strong>&#160; *~`<br />
&#160; * &#160;disable <strong>dynamic expressions</strong>&#160;in <span class="term">style</span>&#160;values &#160;*~`<br />
@ -180,7 +181,7 @@ A PHP Labware internal utility &#45; <a href="http://www.bioinformatics.org/phpl
<br />
&#160; * &#160;remove <strong>null</strong>&#160;characters &#160;*<br />
&#160; * &#160;neutralize potentially dangerous proprietary Netscape <strong>Javascript entities</strong>&#160; *<br />
&#160; * &#160;replace potentially dangerous <strong>soft-hyphen</strong>&#160;character in attribute values with spaces &#160;*<br />
&#160; * &#160;replace potentially dangerous <strong>soft-hyphen</strong>&#160;character in URL-accepting attribute values with spaces &#160;*<br />
<br />
&#160; * &#160;remove common <strong>invalid characters</strong>&#160;not allowed in HTML or XML &#160;^`<br />
&#160; * &#160;replace <strong>characters from Microsoft applications</strong>&#160;like <span class="term">Word</span>&#160;that are discouraged in HTML or XML &#160;^~`<br />
@ -213,50 +214,74 @@ A PHP Labware internal utility &#45; <a href="http://www.bioinformatics.org/phpl
<a name="s1.3" id="s1.3"></a><span class="item-no">1.3</span>&#160; History
</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
<br />
&#160; htmLawed was developed for use with <span class="term">LabWiki</span>, a wiki software developed at PHP Labware, as a suitable software could not be found. Existing PHP software like <span class="term">Kses</span>&#160;and <span class="term">HTMLPurifier</span>&#160;were deemed inadequate, slow, resource-intensive, or dependent on external applications like <span class="term">HTML Tidy</span>.<br />
&#160; htmLawed was created in 2007 for use with <span class="term">LabWiki</span>, a wiki software developed at PHP Labware, as a suitable software could not be found. Existing PHP software like <span class="term">Kses</span>&#160;and <span class="term">HTMLPurifier</span>&#160;were deemed inadequate, slow, resource-intensive, or dependent on an extension or external application like <span class="term">HTML Tidy</span>. The core logic of htmLawed, that of identifying HTML elements and attributes, was based on the <span class="term">Kses</span>&#160;(version 0.2.2) HTML filter software of Ulf Harnhammar (it can still be used with code that uses <span class="term">Kses</span>; see <a href="#s2.6">section 2.6</a>.).<br />
<br />
&#160; htmLawed started as a modification of Ulf Harnhammar's <span class="term">Kses</span>&#160;(version 0.2.2) software, and is compatible with code that uses <span class="term">Kses</span>; see <a href="#s2.6">section 2.6</a>.<br />
&#160; See <a href="#s4.3">section 4.3</a>&#160;for a detailed log of changes in htmLawed over the years, and <a href="#s4.10">section 4.10</a>&#160;for acknowledgements.<br />
</div>
<div class="sub-section"><h3>
<a name="s1.4" id="s1.4"></a><span class="item-no">1.4</span>&#160; License &amp; copyright
</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
<br />
&#160; htmLawed is free and open-source software licensed under GPL license version <a href="http://www.gnu.org/licenses/gpl-3.0.txt">3</a>, and copyrighted by Santosh Patnaik, MD, PhD.<br />
&#160; htmLawed is free and open-source software dual copyrighted by Santosh Patnaik, MD, PhD, and licensed under LGPL license version <a href="http://www.gnu.org/licenses/lgpl-3.0.txt">3</a>, and GPL license version <a href="http://www.gnu.org/licenses/gpl-2.0.txt">2</a>&#160;(or later).<br />
</div>
<div class="sub-section"><h3>
<a name="s1.5" id="s1.5"></a><span class="item-no">1.5</span>&#160; Terms used here
</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
<br />
&#160; * &#160;<em>administrator</em>&#160;- or admin; person setting up the code to pass input through htmLawed; also, <em>user</em><br />
&#160; In this document, only HTML body-level elements are considered. htmLawed does not have support for head-level elements, <span class="term">body</span>, and the frame-level elements, <span class="term">frameset</span>, <span class="term">frame</span>&#160;and <span class="term">noframes</span>, and these elements are ignored here.<br />
<br />
&#160; * &#160;<em>administrator</em>&#160;- or admin; person setting up the code that utilizes htmLawed; also, <em>user</em><br />
&#160; * &#160;<em>attributes</em>&#160;- name-value pairs like <span class="term">href="http&#58;//x.com"</span>&#160;in opening tags<br />
&#160; * &#160;<em>author</em>&#160;- <em>writer</em><br />
&#160; * &#160;<em>author</em>&#160;- see <em>writer</em><br />
&#160; * &#160;<em>character</em>&#160;- atomic unit of text; internally represented by a numeric <em>code-point</em>&#160;as specified by the <em>encoding</em>&#160;or <em>charset</em>&#160;in use<br />
&#160; * &#160;<em>entity</em>&#160;- markup like <span class="term">&amp;gt;</span>&#160;and <span class="term">&amp;#160;</span>&#160;used to refer to a character<br />
&#160; * &#160;<em>element</em>&#160;- HTML element like <span class="term">a</span>&#160;and <span class="term">img</span><br />
&#160; * &#160;<em>element content</em>&#160;- &#160;content between the opening and closing tags of an element, like <span class="term">click</span>&#160;of <span class="term">&lt;a href="x"&gt;click&lt;/a&gt;</span><br />
&#160; * &#160;<em>element content</em>&#160;- &#160;content between the opening and closing tags of an element, like <span class="term">click</span>&#160;of the <span class="term">&lt;a href="x"&gt;click&lt;/a&gt;</span>&#160;element<br />
&#160; * &#160;<em>HTML</em>&#160;- implies XHTML unless specified otherwise<br />
&#160; * &#160;<em>input</em>&#160;- text string given to htmLawed to process<br />
&#160; * &#160;<em>HTML body</em>&#160;- Complete HTML documents typically have a <em>head</em>&#160;and a <em>body</em>&#160;container. Information in <em>head</em>&#160;specifies title of the document, etc., whereas that in the body informs what is to be displayed on a web-page; it is only the elements for <em>body</em>, except <span class="term">frames</span>, <span class="term">frameset</span>&#160;and <span class="term">noframes</span>&#160;that htmLawed is concerned with<br />
&#160; * &#160;<em>input</em>&#160;- text given to htmLawed to process<br />
&#160; * &#160;<em>processing</em>&#160;- involves filtering, correction, etc., of input<br />
&#160; * &#160;<em>safe</em>&#160;- absence or reduction of certain characters and HTML elements and attributes in the input that can otherwise potentially and circumstantially expose web-site users to security vulnerabilities like cross-site scripting attacks (XSS)<br />
&#160; * &#160;<em>scheme</em>&#160;- URL protocol like <span class="term">http</span>&#160;and <span class="term">ftp</span><br />
&#160; * &#160;<em>specs</em>&#160;- standard specifications<br />
&#160; * &#160;<em>safe</em>&#160;- absence or reduction of certain characters and HTML elements and attributes in HTML of text that can otherwise potentially, and circumstantially, expose text readers to security vulnerabilities like cross-site scripting attacks (XSS)<br />
&#160; * &#160;<em>scheme</em>&#160;- a URL protocol like <span class="term">http</span>&#160;and <span class="term">ftp</span><br />
&#160; * &#160;<em>specifications</em>&#160;- standard specifications, for HTML4, HTML5, Ruby, etc.<br />
&#160; * &#160;<em>style property</em>&#160;- terms like <span class="term">border</span>&#160;and <span class="term">height</span>&#160;for which declarations are made in values for the <span class="term">style</span>&#160;attribute of elements<br />
&#160; * &#160;<em>tag</em>&#160;- markers like <span class="term">&lt;a href="x"&gt;</span>&#160;and <span class="term">&lt;/a&gt;</span>&#160;delineating element content; the opening tag can contain attributes<br />
&#160; * &#160;<em>tag content</em>&#160;- consists of tag markers <span class="term">&lt;</span>&#160;and <span class="term">&gt;</span>, element names like <span class="term">div</span>, and possibly attributes<br />
&#160; * &#160;<em>user</em>&#160;- administrator<br />
&#160; * &#160;<em>writer</em>&#160;- end-user like a blog commenter providing the input that is to be processed; also, <em>author</em><br />
</div>
<div class="sub-section"><h3>
<a name="s1.6" id="s1.6"></a><span class="item-no">1.6</span>&#160; Availability
</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
<br />
&#160; htmLawed can be downloaded for free at its <a href="http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed">website</a>. Besides the <span class="term">htmLawed.php</span>&#160;file, the download has the htmLawed documentation (this document) in plain <a href="htmLawed_README.txt">text</a>&#160;and <a href="htmLawed_README.htm">HTML</a>&#160;formats, a script for <a href="htmLawedTest.php">testing</a>, and a text file for <a href="htmLawed_TESTCASE.txt">test-cases</a>. htmLawed is also available as a PHP class (OOP code) on its website.<br />
</div>
</div>
<div class="section"><h2>
<a name="s2" id="s2"></a><span class="item-no">2</span>&#160; Usage
</h2><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
<br />
&#160; htmLawed should work with PHP 4.3 and higher. Either <span class="term">include()</span>&#160;the <span class="term">htmLawed.php</span>&#160;file or copy-paste the entire code.<br />
&#160; htmLawed works in PHP version 4.4 or higher. Either <span class="term">include()</span>&#160;the <span class="term">htmLawed.php</span>&#160;file, or copy-paste the entire code. To use with PHP 4.3, have the following code included:<br />
<br />
<code class="code">&#160; &#160; if(!function_exists(&#39;ctype_digit&#39;)){</code>
<br />
<code class="code">&#160; &#160; &#160;function ctype_digit($var){</code>
<br />
<code class="code">&#160; &#160; &#160; return ((int) $var == $var);</code>
<br />
<code class="code">&#160; &#160; &#160;}</code>
<br />
<code class="code">&#160; &#160; }</code>
<br />
&#160; To easily <strong>test</strong>&#160;htmLawed using a form-based interface, use the provided <a href="htmLawedTest.php">demo</a>&#160;(<span class="term">htmLawed.php</span>&#160;and <span class="term">htmLawedTest.php</span>&#160;should be in the same directory on the web-server).<br />
<div class="sub-section"><h3>
<a name="s2.1" id="s2.1"></a><span class="item-no">2.1</span>&#160; Simple
@ -268,7 +293,13 @@ A PHP Labware internal utility &#45; <a href="http://www.bioinformatics.org/phpl
<code class="code">&#160; &#160; $processed = htmLawed($text);</code>
<br />
<br />
&#160; <strong>Note</strong>: If input is from a <span class="term">$_GET</span>&#160;or <span class="term">$_POST</span>&#160;value, and <span class="term">magic quotes</span>&#160;are enabled on the PHP setup, run <span class="term">stripslashes()</span>&#160;on the input before passing to htmLawed.<br />
&#160; With the <span class="term">htmLawed class</span>&#160;(<a href="#s1.6">section 1.6</a>), usage is:<br />
<br />
<code class="code">&#160; &#160; $processed = htmLawed&#58;&#58;hl($text);</code>
<br />
<br />
&#160; <strong>Notes</strong>: (1) If input is from a <span class="term">$_GET</span>&#160;or <span class="term">$_POST</span>&#160;value, and <span class="term">magic quotes</span>&#160;are enabled on the PHP setup, run <span class="term">stripslashes()</span>&#160;on the input before passing to htmLawed. (2) htmLawed does not have support for head-level elements, <span class="term">body</span>, and the frame-level elements, <span class="term">frameset</span>, <span class="term">frame</span>&#160;and <span class="term">noframes</span>.<br />
<br />
&#160; By default, htmLawed will process the text allowing all valid HTML elements/tags, secure URL scheme/CSS style properties, etc. It will allow <span class="term">CDATA</span>&#160;sections and HTML comments, balance tags, and ensure proper nesting of elements. Such actions can be configured using two other optional arguments -- <span class="term">$config</span>&#160;and <span class="term">$spec</span>:<br />
<br />
@ -276,9 +307,7 @@ A PHP Labware internal utility &#45; <a href="http://www.bioinformatics.org/phpl
<code class="code">&#160; &#160; $processed = htmLawed($text, $config, $spec);</code>
<br />
<br />
&#160; These extra parameters are detailed below. Some examples are shown in <a href="#s2.9">section 2.9</a>.<br />
<br />
&#160; <strong>Note</strong>: For maximum protection against <span class="term">XSS</span>&#160;and other scripting attacks (e.g., by disallowing Javascript code), consider using the <span class="term">safe</span>&#160;parameter; see <a href="#s3.6">section 3.6</a>.<br />
&#160; The <span class="term">$config</span>&#160;and <span class="term">$spec</span>&#160;arguments are detailed below. Some examples are shown in <a href="#s2.9">section 2.9</a>. For maximum protection against <span class="term">XSS</span>&#160;and other scripting attacks (e.g., by disallowing Javascript code), consider using the <span class="term">safe</span>&#160;parameter; see <a href="#s3.6">section 3.6</a>.<br />
</div>
<div class="sub-section"><h3>
@ -318,13 +347,13 @@ A PHP Labware internal utility &#45; <a href="http://www.bioinformatics.org/phpl
&#160; Anti-link-spam measure; see <a href="#s3.4.7">section 3.4.7</a><br />
<br />
&#160; <span class="term">0</span>&#160;- no measure taken &#160;*<br />
&#160; <span class="term">array("regex1", "regex2")</span>&#160;- will ensure a <span class="term">rel</span>&#160;attribute with <span class="term">nofollow</span>&#160;in its value in case the <span class="term">href</span>&#160;attribute value matches the regular expression pattern <span class="term">regex1</span>, and/or will remove <span class="term">href</span>&#160;if its value matches the regular expression pattern <span class="term">regex2</span>. E.g., <span class="term">array("/./", "/&#58;//\W&#42;(?!(abc\.com|xyz\.org))/")</span>; see <a href="#s3.4.7">section 3.4.7</a>&#160;for more.<br />
&#160; <em>array("regex1", "regex2")</em>&#160;- will ensure a <span class="term">rel</span>&#160;attribute with <span class="term">nofollow</span>&#160;in its value in case the <span class="term">href</span>&#160;attribute value matches the regular expression pattern <span class="term">regex1</span>, and/or will remove <span class="term">href</span>&#160;if its value matches the regular expression pattern <span class="term">regex2</span>. E.g., <span class="term">array("/./", "/&#58;//\W&#42;(?!(abc\.com|xyz\.org))/")</span>; see <a href="#s3.4.7">section 3.4.7</a>&#160;for more.<br />
<br />
&#160; <strong>anti_mail_spam</strong><br />
&#160; Anti-mail-spam measure; see <a href="#s3.4.7">section 3.4.7</a><br />
<br />
&#160; <span class="term">0</span>&#160;- no measure taken &#160;*<br />
&#160; <span class="term">word</span>&#160;- <span class="term">@</span>&#160;in mail address in <span class="term">href</span>&#160;attribute value is replaced with specified <span class="term">word</span><br />
&#160; <em>word</em>&#160;- <span class="term">@</span>&#160;in mail address in <span class="term">href</span>&#160;attribute value is replaced with specified <em>word</em><br />
<br />
&#160; <strong>balance</strong><br />
&#160; Balance tags for well-formedness and proper nesting; see <a href="#s3.3.3">section 3.3.3</a><br />
@ -368,9 +397,15 @@ A PHP Labware internal utility &#45; <a href="http://www.bioinformatics.org/phpl
&#160; Denied HTML attributes; see <a href="#s3.4">section 3.4</a><br />
<br />
&#160; <span class="term">0</span>&#160;- none &#160;*<br />
&#160; <span class="term">string</span>&#160;- dictated by values in <span class="term">string</span><br />
&#160; <em>string</em>&#160;- dictated by values in <em>string</em><br />
&#160; <span class="term">on&#42;</span>&#160;(like <span class="term">onfocus</span>) attributes not allowed - "<br />
<br />
&#160; <strong>direct_nest_list</strong><br />
&#160; Allow direct nesting of a list within another without requiring it to be a list item; see <a href="#s3.3.4">section 3.3.4</a><br />
<br />
&#160; <span class="term">0</span>&#160;- no &#160;*<br />
&#160; <span class="term">1</span>&#160;- yes<br />
<br />
&#160; <strong>elements</strong><br />
&#160; Allowed HTML elements; see <a href="#s3.3">section 3.3</a><br />
<br />
@ -388,13 +423,13 @@ A PHP Labware internal utility &#45; <a href="http://www.bioinformatics.org/phpl
&#160; Name of an optional hook function to alter the input string, <span class="term">$config</span>&#160;or <span class="term">$spec</span>&#160;before htmLawed starts its main work; see <a href="#s3.7">section 3.7</a><br />
<br />
&#160; <span class="term">0</span>&#160;- no hook function &#160;*<br />
&#160; <span class="term">name</span>&#160;- <span class="term">name</span>&#160;is name of the hook function (<span class="term">kses_hook</span>&#160; ^)<br />
&#160; <em>name</em>&#160;- <em>name</em>&#160;is name of the hook function (<span class="term">kses_hook</span>&#160; ^)<br />
<br />
&#160; <strong>hook_tag</strong><br />
&#160; Name of an optional hook function to alter tag content finalized by htmLawed; see <a href="#s3.4.9">section 3.4.9</a><br />
<br />
&#160; <span class="term">0</span>&#160;- no hook function &#160;*<br />
&#160; <span class="term">name</span>&#160;- <span class="term">name</span>&#160;is name of the hook function<br />
&#160; <em>name</em>&#160;- <em>name</em>&#160;is name of the hook function<br />
<br />
&#160; <strong>keep_bad</strong><br />
&#160; Neutralize bad tags by converting <span class="term">&lt;</span>&#160;and <span class="term">&gt;</span>&#160;to entities, or remove them; see <a href="#s3.3.3">section 3.3.3</a><br />
@ -441,11 +476,11 @@ A PHP Labware internal utility &#45; <a href="http://www.bioinformatics.org/phpl
&#160; <span class="term">1</span>&#160;- will auto-adjust other relevant <span class="term">$config</span>&#160;parameters (indicated by <span class="term">"</span>&#160;in this list)<br />
<br />
&#160; <strong>schemes</strong><br />
&#160; Array of attribute-specific, comma-separated, lower-cased list of schemes (protocols) allowed in attributes accepting URLs; <span class="term">&#42;</span>&#160;covers all unspecified attributes; see <a href="#s3.4.3">section 3.4.3</a><br />
&#160; Array of attribute-specific, comma-separated, lower-cased list of schemes (protocols) allowed in attributes accepting URLs (or <span class="term">!</span>&#160;to <em>deny</em>&#160;any URL); <span class="term">&#42;</span>&#160;covers all unspecified attributes; see <a href="#s3.4.3">section 3.4.3</a><br />
<br />
&#160; <span class="term">href&#58; aim, feed, file, ftp, gopher, http, https, irc, mailto, news, nntp, sftp, ssh, telnet; &#42;&#58;file, http, https</span>&#160; *<br />
&#160; <span class="term">&#42;&#58; ftp, gopher, http, https, mailto, news, nntp, telnet</span>&#160; ^<br />
&#160; <span class="term">href&#58; aim, feed, file, ftp, gopher, http, https, irc, mailto, news, nntp, sftp, ssh, telnet; style&#58; nil; &#42;&#58;file, http, https</span>&#160; "<br />
&#160; <span class="term">href&#58; aim, feed, file, ftp, gopher, http, https, irc, mailto, news, nntp, sftp, ssh, telnet; style&#58; !; &#42;&#58;file, http, https</span>&#160; "<br />
<br />
&#160; <strong>show_setting</strong><br />
&#160; Name of a PHP variable to assign the <em>finalized</em>&#160;<span class="term">$config</span>&#160;and <span class="term">$spec</span>&#160;values; see <a href="#s3.8">section 3.8</a><br />
@ -468,7 +503,7 @@ A PHP Labware internal utility &#45; <a href="http://www.bioinformatics.org/phpl
<br />
&#160; <span class="term">0</span>&#160;- no &#160;^<br />
&#160; <span class="term">1</span>&#160;- remove duplicate and/or invalid ones &#160;*<br />
&#160; <span class="term">word</span>&#160;- remove invalid ones and replace duplicate ones with new and unique ones based on the <span class="term">word</span>; the admin-specified <span class="term">word</span>, like <span class="term">my_</span>, should begin with a letter (a-z) and can contain letters, digits, <span class="term">.</span>, <span class="term">_</span>, <span class="term">-</span>, and <span class="term">&#58;</span>.<br />
&#160; <em>word</em>&#160;- remove invalid ones and replace duplicate ones with new and unique ones based on the <em>word</em>; the admin-specified <em>word</em>, like <span class="term">my_</span>, should begin with a letter (a-z) and can contain letters, digits, <span class="term">.</span>, <span class="term">_</span>, <span class="term">-</span>, and <span class="term">&#58;</span>.<br />
<br />
&#160; <strong>valid_xhtml</strong><br />
&#160; Magic parameter to make input the most valid XHTML without needing to specify other relevant <span class="term">$config</span>&#160;parameters; see <a href="#s3.5">section 3.5</a><br />
@ -488,7 +523,7 @@ A PHP Labware internal utility &#45; <a href="http://www.bioinformatics.org/phpl
<a name="s2.3" id="s2.3"></a><span class="item-no">2.3</span>&#160; Extra HTML specifications using the $spec parameter
</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
<br />
&#160; The <span class="term">$spec</span>&#160;argument can be used to disallow an otherwise legal attribute for an element, or to restrict the attribute's values. This can also be helpful as a security measure (e.g., in certain versions of browsers, certain values can cause buffer overflows and denial of service attacks), or in enforcing admin policy compliance. <span class="term">$spec</span>&#160;is specified as a string of text containing one or more <em>rules</em>, with multiple rules separated from each other by a semi-colon (<span class="term">;</span>). E.g.,<br />
&#160; The <span class="term">$spec</span>&#160;argument of htmLawed can be used to disallow an otherwise legal attribute for an element, or to restrict the attribute's values. This can also be helpful as a security measure (e.g., in certain versions of browsers, certain values can cause buffer overflows and denial of service attacks), or in enforcing admin policies. <span class="term">$spec</span>&#160;is specified as a string of text containing one or more <em>rules</em>, with multiple rules separated from each other by a semi-colon (<span class="term">;</span>). E.g.,<br />
<br />
<code class="code">&#160; &#160; $spec = &#39;i=-&#42;; td, tr=style, id, -&#42;; a=id(match="/[a-z][a-z\d.&#58;\-&#96;"]&#42;/i"/minval=2), href(maxlen=100/minlen=34); img=-width,-alt&#39;;</code>
@ -517,7 +552,7 @@ A PHP Labware internal utility &#45; <a href="http://www.bioinformatics.org/phpl
&#160; * &#160;<span class="term">a=-&#42;, href, title</span>&#160;- none except <span class="term">href</span>&#160;and <span class="term">title</span><br />
&#160; * &#160;<span class="term">a=-&#42;, -id, href, title</span>&#160;- none except <span class="term">href</span>&#160;and <span class="term">title</span><br />
<br />
&#160; Rules regarding <strong>attribute values</strong>&#160;are optionally specified inside round brackets after attribute names in slash ('/')-separated <em>parameter = value</em>&#160;pairs. E.g., <span class="term">title(maxlen=30/minlen=5)</span>. None, or one or more of the following parameters may be specified:<br />
&#160; Rules regarding <strong>attribute values</strong>&#160;are optionally specified inside round brackets after attribute names in slash ('/')-separated <em>parameter = value</em>&#160;pairs. E.g., <span class="term">title(maxlen=30/minlen=5)</span>. None or one or more of the following parameters may be specified:<br />
<br />
&#160; * &#160;<span class="term">oneof</span>&#160;- one or more choices separated by <span class="term">|</span>&#160;that the value should match; if only one choice is provided, then the value must match that choice<br />
<br />
@ -541,7 +576,7 @@ A PHP Labware internal utility &#45; <a href="http://www.bioinformatics.org/phpl
&#160; <em>Rule</em>: <span class="term">input=title(), value(maxval=8/default=6)</span><br />
&#160; <em>Output</em>: <span class="term">&lt;input title="WIDTH" value="6" /&gt;&lt;input title="length" value="5" /&gt;</span><br />
<br />
&#160; <em>Rule</em>: <span class="term">input=title(nomatch=$w.d$i), value(match=$em$/default=6em)</span><br />
&#160; <em>Rule</em>: <span class="term">input=title(nomatch=%w.d%i), value(match=%em%/default=6em)</span><br />
&#160; <em>Output</em>: <span class="term">&lt;input value="10em" /&gt;&lt;input title="length" value="6em" /&gt;</span><br />
<br />
&#160; <em>Rule</em>: <span class="term">input=title(oneof=height|depth/default=depth), value(noneof=5|6)</span><br />
@ -549,14 +584,22 @@ A PHP Labware internal utility &#45; <a href="http://www.bioinformatics.org/phpl
<br />
&#160; <strong>Special characters</strong>: The characters <span class="term">;</span>, <span class="term">,</span>, <span class="term">/</span>, <span class="term">(</span>, <span class="term">)</span>, <span class="term">|</span>, <span class="term">~</span>&#160;and space have special meanings in the rules. Words in the rules that use such characters, or the characters themselves, should be <em>escaped</em>&#160;by enclosing in pairs of double-quotes (<span class="term">"</span>). A back-tick (<span class="term">&#96;</span>) can be used to escape a literal <span class="term">"</span>. An example rule illustrating this is <span class="term">input=value(maxlen=30/match="/^\w/"/default="your &#96;"ID&#96;"")</span>.<br />
<br />
&#160; <strong>Note</strong>: To deny an attribute for all elements for which it is legal, <span class="term">$config["deny_attribute"]</span>&#160;(see <a href="#s3.4">section 3.4</a>) can be used instead of <span class="term">$spec</span>. Also, attributes can be allowed element-specifically through <span class="term">$spec</span>&#160;while being denied globally through <span class="term">$config["deny_attribute"]</span>. The <span class="term">hook_tag</span>&#160;parameter (<a href="#s3.4.9">section 3.4.9</a>) can also be used to implement the <span class="term">$spec</span>&#160;functionality.<br />
&#160; <strong>Note</strong>: To deny an attribute for all elements for which it is legal, <span class="term">$config["deny_attribute"]</span>&#160;(see <a href="#s3.4">section 3.4</a>) can be used instead of <span class="term">$spec</span>. Also, attributes can be allowed element-specifically through <span class="term">$spec</span>&#160;while being denied globally through <span class="term">$config["deny_attribute"]</span>. The <span class="term">hook_tag</span>&#160;parameter (<a href="#s3.4.9">section 3.4.9</a>) can also be possibly used to implement a functionality like that achieved using <span class="term">$spec</span>&#160;functionality.<br />
<br />
&#160; <span class="term">$spec</span>&#160;can also be used to permit custom, non-standard attributes as well as custom rules for standard attributes. Thus, the following value of <span class="term">$spec</span>&#160;will permit the custom uses of the standard <span class="term">rel</span>&#160;attribute in <span class="term">input</span>&#160;(not permitted as per standards) and of a non-standard attribute, <span class="term">vFlag</span>, in <span class="term">img</span>.<br />
<br />
<code class="code">&#160; &#160; $spec = &#39;img=vFlag; input=rel&#39;</code>
<br />
<br />
&#160; The attribute names can contain alphabets, colons (:) and hyphens (-), but they must start with an alphabet.<br />
</div>
<div class="sub-section"><h3>
<a name="s2.4" id="s2.4"></a><span class="item-no">2.4</span>&#160; Performance time &amp; memory usage
</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
<br />
&#160; The time and memory used by htmLawed depends on its configuration and the size of the input, and the amount, nestedness and well-formedness of the HTML markup within it. In particular, tag balancing and beautification each can increase the processing time by about a quarter.<br />
&#160; The time and memory consumed during text processing by htmLawed depends on its configuration, the size of the input, and the amount, nestedness and well-formedness of the HTML markup within the input. In particular, tag balancing and beautification each can increase the processing time by about a quarter.<br />
<br />
&#160; The htmLawed <a href="htmLawedTest.php">demo</a>&#160;can be used to evaluate the performance and effects of different types of input and <span class="term">$config</span>.<br />
@ -565,17 +608,21 @@ A PHP Labware internal utility &#45; <a href="http://www.bioinformatics.org/phpl
<a name="s2.5" id="s2.5"></a><span class="item-no">2.5</span>&#160; Some security risks to keep in mind
</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
<br />
&#160; When setting the parameters/arguments (like those to allow certain HTML elements) for use with htmLawed, one should bear in mind that the setting may let through potentially <em>dangerous</em>&#160;HTML code. (This may not be a problem if the authors are trusted.)<br />
<br />
&#160; For example, following increase security risks:<br />
&#160; When setting the parameters/arguments (like those to allow certain HTML elements) for use with htmLawed, one should bear in mind that the setting may let through potentially <em>dangerous</em>&#160;HTML code which is meant to steal user-data, deface a website, render a page non-functional, etc. Unless end-users, either people or software, supplying the content are completely trusted, security issues arising from the degree of HTML usage permitted through htmLawed's setting should be considered. For example, following increase security risks:<br />
<br />
&#160; * &#160;Allowing <span class="term">script</span>, <span class="term">applet</span>, <span class="term">embed</span>, <span class="term">iframe</span>&#160;or <span class="term">object</span>&#160;elements, or certain of their attributes like <span class="term">allowscriptaccess</span><br />
<br />
&#160; * &#160;Allowing HTML comments (some Internet Explorer versions are vulnerable with, e.g., <span class="term">&lt;!--[if gte IE 4]&gt;&lt;script&gt;alert("xss");&lt;/script&gt;&lt;![endif]--&gt;</span><br />
<br />
&#160; * &#160;Allowing dynamic CSS expressions (a feature of the IE browser)<br />
&#160; * &#160;Allowing dynamic CSS expressions (some Internet Explorer versions are vulnerable)<br />
<br />
&#160; <em>Unsafe</em>&#160;HTML can be removed by setting <span class="term">$config</span>&#160;appropriately. E.g., <span class="term">$config["elements"] = "&#42; -script"</span>&#160;(<a href="#s3.3">section 3.3</a>), <span class="term">$config["safe"] = 1</span>&#160;(<a href="#s3.6">section 3.6</a>), etc.<br />
&#160; * &#160;Allowing the <span class="term">style</span>&#160;attribute<br />
<br />
&#160; To remove <em>unsecure</em>&#160;HTML, code-developers using htmLawed must set <span class="term">$config</span>&#160;appropriately. E.g., <span class="term">$config["elements"] = "&#42; -script"</span>&#160;to deny the <span class="term">script</span>&#160;element (<a href="#s3.3">section 3.3</a>), <span class="term">$config["safe"] = 1</span>&#160;to auto-configure ceratin htmLawed parameters for maximizing security (<a href="#s3.6">section 3.6</a>), etc.<br />
<br />
&#160; Permitting the <span class="term">&#42;style&#42;</span>&#160;attribute brings in risks of <em>click-jacking</em>, <em>phishing</em>, web-page overlays, etc., <em>even</em>&#160;when the <span class="term">safe</span>&#160;parameter is enabled (see <a href="#s3.6">section 3.6</a>). Except for URLs and a few other things like CSS dynamic expressions, htmLawed currently does not check every CSS style property. It does provide ways for the code-developer implementing htmLawed to do such checks through htmLawed's <span class="term">$spec</span>&#160;argument, and through the <span class="term">hook_tag</span>&#160;parameter (see <a href="#s3.4.8">section 3.4.8</a>&#160;for more). Disallowing <span class="term">style</span>&#160;completely and relying on CSS classes and stylesheet files is recommended.<br />
<br />
&#160; htmLawed does not check or correct the character <strong>encoding</strong>&#160;of the input it receives. In conjunction with permissive circumstances, such as when the character encoding is left undefined through HTTP headers or HTML <span class="term">meta</span>&#160;tags, this can allow for an exploit (like Google's <em>UTF-7/XSS</em>&#160;vulnerability of the past).<br />
</div>
<div class="sub-section"><h3>
@ -644,7 +691,7 @@ A PHP Labware internal utility &#45; <a href="http://www.bioinformatics.org/phpl
<a name="s2.7" id="s2.7"></a><span class="item-no">2.7</span>&#160; Tolerance for ill-written HTML
</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
<br />
&#160; htmLawed can work with ill-written HTML code in the input. However, HTML that is too ill-written may not be <em>read</em>&#160;as HTML, and be considered mere plain text instead. Following statements indicate the degree of <em>looseness</em>&#160;that htmLawed can work with, and can be provided in instructions to writers:<br />
&#160; htmLawed can work with ill-written HTML code in the input. However, HTML that is too ill-written may not be <em>read</em>&#160;as HTML, and may therefore get identified as mere plain text. Following statements indicate the degree of <em>looseness</em>&#160;that htmLawed can work with, and can be provided in instructions to writers:<br />
<br />
&#160; * &#160;Tags must be flanked by <span class="term">&lt;</span>&#160;and <span class="term">&gt;</span>&#160;with no <span class="term">&gt;</span>&#160;inside -- any needed <span class="term">&gt;</span>&#160;should be put in as <span class="term">&amp;gt;</span>. It is possible for tag content (element name and attributes) to be spread over many lines instead of being on one. A space may be present between the tag content and <span class="term">&gt;</span>, like <span class="term">&lt;div &gt;</span>&#160;and <span class="term">&lt;img / &gt;</span>, but not after the <span class="term">&lt;</span>.<br />
<br />
@ -652,13 +699,13 @@ A PHP Labware internal utility &#45; <a href="http://www.bioinformatics.org/phpl
<br />
&#160; * &#160;Attribute string of elements may be liberally spaced with tabs, line-breaks, etc.<br />
<br />
&#160; * &#160;Attribute values may not be double-quoted, or may be single-quoted.<br />
&#160; * &#160;Attribute values may be single- and not double-quoted.<br />
<br />
&#160; * &#160;Left-padding of numeric entities (like, <span class="term">&amp;#0160;</span>, <span class="term">&amp;x07ff;</span>) with <span class="term">0</span>&#160;is okay as long as the number of characters between between the <span class="term">&amp;</span>&#160;and the <span class="term">;</span>&#160;does not exceed 8. All entities must end with <span class="term">;</span>&#160;though.<br />
<br />
&#160; * &#160;Named character entities must be properly cased. E.g., <span class="term">&amp;Lt;</span>&#160;or <span class="term">&amp;TILDE;</span>&#160;will not be let through without modification.<br />
&#160; * &#160;Named character entities must be properly cased. Thus, <span class="term">&amp;Lt;</span>&#160;or <span class="term">&amp;TILDE;</span>&#160;will not be recognized as entities and will be <em>neutralized</em>.<br />
<br />
&#160; * &#160;HTML comments should not be inside element tags (okay between tags), and should begin with <span class="term">&lt;!--</span>&#160;and end with <span class="term">--&gt;</span>. Characters like <span class="term">&lt;</span>, <span class="term">&gt;</span>, and <span class="term">&amp;</span>&#160;may be allowed inside depending on <span class="term">$config</span>, but any <span class="term">--&gt;</span>&#160;inside should be put in as <span class="term">--&amp;gt;</span>. Any <span class="term">--</span>&#160;inside will be automatically converted to <span class="term">-</span>, and a space will be added before the comment delimiter <span class="term">--&gt;</span>.<br />
&#160; * &#160;HTML comments should not be inside element tags (they can be between tags), and should begin with <span class="term">&lt;!--</span>&#160;and end with <span class="term">--&gt;</span>. Characters like <span class="term">&lt;</span>, <span class="term">&gt;</span>, and <span class="term">&amp;</span>&#160;may be allowed inside depending on <span class="term">$config</span>, but any <span class="term">--&gt;</span>&#160;inside should be put in as <span class="term">--&amp;gt;</span>. Any <span class="term">--</span>&#160;inside will be automatically converted to <span class="term">-</span>, and a space will be added before the comment delimiter <span class="term">--&gt;</span>.<br />
<br />
&#160; * &#160;<span class="term">CDATA</span>&#160;sections should not be inside element tags, and can be in element content only if plain text is allowed for that element. They should begin with <span class="term">&lt;[CDATA[</span>&#160;and end with <span class="term">]]&gt;</span>. Characters like <span class="term">&lt;</span>, <span class="term">&gt;</span>, and <span class="term">&amp;</span>&#160;may be allowed inside depending on <span class="term">$config</span>, but any <span class="term">]]&gt;</span>&#160;inside should be put in as <span class="term">]]&amp;gt;</span>.<br />
<br />
@ -673,22 +720,22 @@ A PHP Labware internal utility &#45; <a href="http://www.bioinformatics.org/phpl
&#160; * &#160;With <span class="term">$config["unique_ids"]</span>&#160;not <span class="term">0</span>&#160;and the <span class="term">id</span>&#160;attribute being permitted, writers should carefully avoid using duplicate or invalid <span class="term">id</span>&#160;values as even though htmLawed will correct/remove the values, the final output may not be the one desired. E.g., when <span class="term">&lt;a id="home"&gt;&lt;/a&gt;&lt;input id="home" /&gt;&lt;label for="home"&gt;&lt;/label&gt;</span>&#160;is processed into<br />
<span class="term">&lt;a id="home"&gt;&lt;/a&gt;&lt;input id="prefix_home" /&gt;&lt;label for="home"&gt;&lt;/label&gt;</span>.<br />
<br />
&#160; * &#160;Note that even if intended HTML is lost in a highly ill-written input, the processed output will be more secure and standard-compliant.<br />
&#160; * &#160;Even if intended HTML is lost from an ill-written input, the processed output will be more secure and standard-compliant.<br />
<br />
&#160; * &#160;For URLs, unless <span class="term">$config["scheme"]</span>&#160;is appropriately set, writers should avoid using escape characters or entities in schemes. E.g., <span class="term">htt&amp;#112;</span>&#160;(which many browsers will read as the harmless <span class="term">http</span>) may be considered bad by htmLawed.<br />
<br />
&#160; * &#160;htmLawed will attempt to put plain text present directly inside <span class="term">blockquote</span>, <span class="term">form</span>, <span class="term">map</span>&#160;and <span class="term">noscript</span>&#160;elements (illegal as per the specs) inside auto-generated <span class="term">div</span>&#160;elements.<br />
&#160; * &#160;htmLawed will attempt to put plain text present directly inside <span class="term">blockquote</span>, <span class="term">form</span>, <span class="term">map</span>&#160;and <span class="term">noscript</span>&#160;elements (illegal as per the specifications) inside auto-generated <span class="term">div</span>&#160;elements.<br />
</div>
<div class="sub-section"><h3>
<a name="s2.8" id="s2.8"></a><span class="item-no">2.8</span>&#160; Limitations &amp; work-arounds
</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
<br />
&#160; htmLawed's main objective is to make the input text <em>more</em>&#160;standard-compliant, secure for web-page readers, and free of HTML elements and attributes considered undesirable by the administrator. Some of its current limitations, regardless of this objective, are noted below along with work-arounds.<br />
&#160; htmLawed's main objective is to make the input text <em>more</em>&#160;standard-compliant, secure for readers, and free of HTML elements and attributes considered undesirable by the administrator. Some of its current limitations, regardless of this objective, are noted below along with work-arounds.<br />
<br />
&#160; It should be borne in mind that no browser application is 100% standard-compliant, and that some of the standard specs (like asking for normalization of white-spacing within <span class="term">textarea</span>&#160;elements) are clearly wrong. Regarding security, note that <em>unsafe</em>&#160;HTML code is not necessarily legally invalid.<br />
&#160; It should be borne in mind that no browser application is 100% standard-compliant, and that some of the standard specifications (like asking for normalization of white-spacing within <span class="term">textarea</span>&#160;elements) are clearly wrong. Regarding security, note that <em>unsafe</em>&#160;HTML code is not legally invalid per se.<br />
<br />
&#160; * &#160;htmLawed is meant for input that goes into the <span class="term">body</span>&#160;of HTML documents. HTML's head-level elements are not supported, nor are the frameset elements <span class="term">frameset</span>, <span class="term">frame</span>&#160;and <span class="term">noframes</span>.<br />
&#160; * &#160;htmLawed is meant for input that goes into the <span class="term">body</span>&#160;of HTML documents. HTML's head-level elements are not supported, nor are the frameset elements <span class="term">frameset</span>, <span class="term">frame</span>&#160;and <span class="term">noframes</span>. Content of the latter elements can, however, be individually filtered through htmLawed.<br />
<br />
&#160; * &#160;It cannot transform the non-standard <span class="term">embed</span>&#160;elements to the standard-compliant <span class="term">object</span>&#160;elements. Yet, it can allow <span class="term">embed</span>&#160;elements if permitted (<span class="term">embed</span>&#160;is widely used and supported). Admins can certainly use the <span class="term">hook_tag</span>&#160;parameter (<a href="#s3.4.9">section 3.4.9</a>) to deploy a custom embed-to-object converter function.<br />
<br />
@ -698,7 +745,7 @@ A PHP Labware internal utility &#45; <a href="http://www.bioinformatics.org/phpl
<br />
&#160; * &#160;By default, htmLawed won't check many attribute values for standard compliance. E.g., <span class="term">width="20m"</span>&#160;with the dimension in non-standard <span class="term">m</span>&#160;is let through. Implementing universal and strict attribute value checks can make htmLawed slow and resource-intensive. Admins should look at the <span class="term">hook_tag</span>&#160;parameter (<a href="#s3.4.9">section 3.4.9</a>) or <span class="term">$spec</span>&#160;to enforce finer checks.<br />
<br />
&#160; * &#160;The attributes, deprecated (which can be transformed too) or not, that it supports are largely those that are in the specs. Only a few of the proprietary attributes are supported.<br />
&#160; * &#160;The attributes, deprecated (which can be transformed too) or not, that it supports are largely those that are in the specifications. Only a few of the proprietary attributes are supported.<br />
<br />
&#160; * &#160;Except for contained URLs and dynamic expressions (also optional), htmLawed does not check CSS style property values. Admins should look at using the <span class="term">hook_tag</span>&#160;parameter (<a href="#s3.4.9">section 3.4.9</a>) or <span class="term">$spec</span>&#160;for finer checks. Perhaps the best option is to disallow <span class="term">style</span>&#160;but allow <span class="term">class</span>&#160;attributes with the right <span class="term">oneof</span>&#160;or <span class="term">match</span>&#160;values for <span class="term">class</span>, and have the various class style properties in <span class="term">.css</span>&#160;CSS stylesheet files.<br />
<br />
@ -710,11 +757,11 @@ A PHP Labware internal utility &#45; <a href="http://www.bioinformatics.org/phpl
<br />
&#160; * &#160;Except for optionally converting absolute or relative URLs to the other type, htmLawed will not alter URLs (e.g., to change the value of query strings or to convert <span class="term">http</span>&#160;to <span class="term">https</span>. Having absolute URLs may be a standard-requirement, e.g., when HTML is embedded in email messages, whereas altering URLs for other purposes is beyond htmLawed's goals. Admins may be able to use a custom hook function to enforce such checks (<span class="term">hook_tag</span>&#160;parameter; see <a href="#s3.4.9">section 3.4.9</a>).<br />
<br />
&#160; * &#160;Pairs of opening and closing tags that do not enclose any content (like <span class="term">&lt;em&gt;&lt;/em&gt;</span>) are not removed. This may be against the standard specs for certain elements (e.g., <span class="term">table</span>). However, presence of such standard-incompliant code will not break the display or layout of content. Admins can also use simple regex-based code to filter out such code.<br />
&#160; * &#160;Pairs of opening and closing tags that do not enclose any content (like <span class="term">&lt;em&gt;&lt;/em&gt;</span>) are not removed. This may be against the standard specifications for certain elements (e.g., <span class="term">table</span>). However, presence of such standard-incompliant code will not break the display or layout of content. Admins can also use simple regex-based code to filter out such code.<br />
<br />
&#160; * &#160;htmLawed does not check for certain element orderings described in the standard specs (e.g., in a <span class="term">table</span>, <span class="term">tbody</span>&#160;is allowed before <span class="term">tfoot</span>). Admins may be able to use a custom hook function to enforce such checks (<span class="term">hook_tag</span>&#160;parameter; see <a href="#s3.4.9">section 3.4.9</a>).<br />
&#160; * &#160;htmLawed does not check for certain element orderings described in the standard specifications (e.g., in a <span class="term">table</span>, <span class="term">tbody</span>&#160;is allowed before <span class="term">tfoot</span>). Admins may be able to use a custom hook function to enforce such checks (<span class="term">hook_tag</span>&#160;parameter; see <a href="#s3.4.9">section 3.4.9</a>).<br />
<br />
&#160; * &#160;htmLawed does not check the number of nested elements. E.g., it will allow two <span class="term">caption</span>&#160;elements in a <span class="term">table</span>&#160;element, illegal as per the specs. Admins may be able to use a custom hook function to enforce such checks (<span class="term">hook_tag</span>&#160;parameter; see <a href="#s3.4.9">section 3.4.9</a>).<br />
&#160; * &#160;htmLawed does not check the number of nested elements. E.g., it will allow two <span class="term">caption</span>&#160;elements in a <span class="term">table</span>&#160;element, illegal as per the specifications. Admins may be able to use a custom hook function to enforce such checks (<span class="term">hook_tag</span>&#160;parameter; see <a href="#s3.4.9">section 3.4.9</a>).<br />
<br />
&#160; * &#160;htmLawed might convert certain entities to actual characters and remove backslashes and CSS comment-markers (<span class="term">/&#42;</span>) in <span class="term">style</span>&#160;attribute values in order to detect malicious HTML like crafted IE-specific dynamic expressions like <span class="term">&amp;#101;xpression...</span>. If this is too harsh, admins can allow CSS expressions through htmLawed core but then use a custom function through the <span class="term">hook_tag</span>&#160;parameter (<a href="#s3.4.9">section 3.4.9</a>) to more specifically identify CSS expressions in the <span class="term">style</span>&#160;attribute values. Also, using <span class="term">$config["style_pass"]</span>, it is possible to have htmLawed pass <span class="term">style</span>&#160;attribute values without even looking at them (<a href="#s3.4.8">section 3.4.8</a>).<br />
<br />
@ -722,14 +769,110 @@ A PHP Labware internal utility &#45; <a href="http://www.bioinformatics.org/phpl
<br />
&#160; * &#160;Because of poor Unicode support in PHP, htmLawed does not remove the <em>high value</em>&#160;HTML-invalid characters with multi-byte code-points. Such characters however are extremely unlikely to be in the input. (see <a href="#s3.1">section 3.1</a>).<br />
<br />
&#160; * &#160;htmLawed does not check or correct the character encoding of the input it receives. In conjunction with permitting circumstances such as when the character encoding is left undefined through HTTP headers or HTML <span class="term">meta</span>&#160;tags, this can permit an exploit (like Google's <em>UTF-7/XSS</em>&#160;vulnerability of the past). Also, htmLawed can mangle input text if it is not well-formed in terms of character encoding. Administrators can consider using code available elsewhere to check well-formedness of input text characters to correct any defect.<br />
<br />
&#160; * &#160;htmLawed is expected to work with input texts in ASCII-compatible single byte encodings such as national variants of ASCII (like ISO-646-DE/German of the ISO 646 standard), extended ASCII variants (like ISO 8859-10/Turkish of the ISO 8859/ISO Latin standard), ISO 8859-based Windows variants (like Windows 1252), EBCDIC, Shift JIS (Japanese), GB-Roman (Chinese), and KS-Roman (Korean). It should also properly handle texts with variable byte encodings like UTF-7 (Unicode) and UTF-8 (Unicode). However, htmLawed may mangle input texts with double byte encodings like UTF-16 (Unicode), JIS X 0208:1997 (Japanese) and K SX 1001:1992 (Korean), or the UTF-32 (Unicode) quadruple byte encoding. If an input text has such an encoding, administrators can use PHP's <a href="http://php.net/manual/en/book.iconv.php">iconv</a>&#160;functions, or some other mean, to convert text to UTF-8 before passing it to htmLawed.<br />
<br />
&#160; * &#160;Like any script using PHP's PCRE regex functions, PHP setup-specific low PCRE limit values can cause htmLawed to at least partially fail with very long input texts.<br />
</div>
<div class="sub-section"><h3>
<a name="s2.9" id="s2.9"></a><span class="item-no">2.9</span>&#160; Examples
<a name="s2.9" id="s2.9"></a><span class="item-no">2.9</span>&#160; Examples of usage
</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
<br />
&#160; <strong>1.</strong>&#160;A blog administrator wants to allow only <span class="term">a</span>, <span class="term">em</span>, <span class="term">strike</span>, <span class="term">strong</span>&#160;and <span class="term">u</span>&#160;in comments, but needs <span class="term">strike</span>&#160;and <span class="term">u</span>&#160;transformed to <span class="term">span</span>&#160;for better XHTML 1-strict compliance, and, he wants the <span class="term">a</span>&#160;links to be to <span class="term">http</span>&#160;or <span class="term">https</span>&#160;resources:<br />
&#160; Safest, allowing only <em>safe</em>&#160;HTML markup --<br />
<br />
<code class="code">&#160; &#160; $config = array(&#39;safe&#39;=&gt;1);</code>
<br />
<code class="code">&#160; &#160; $out = htmLawed($in);</code>
<br />
<br />
&#160; Simplest, allowing all valid HTML markup except <span class="term">javascript&#58;</span>&#160;--<br />
<br />
<code class="code">&#160; &#160; $out = htmLawed($in);</code>
<br />
<br />
&#160; Allowing all valid HTML markup including <span class="term">javascript&#58;</span>&#160;--<br />
<br />
<code class="code">&#160; &#160; $config = array(&#39;schemes&#39;=&gt;&#39;&#42;&#58;&#42;&#39;);</code>
<br />
<code class="code">&#160; &#160; $out = htmLawed($in, $config);</code>
<br />
<br />
&#160; Allowing only <span class="term">safe</span>&#160;HTML and the elements <span class="term">a</span>, <span class="term">em</span>, and <span class="term">strong</span>&#160;--<br />
<br />
<code class="code">&#160; &#160; $config = array(&#39;safe&#39;=&gt;1, &#39;elements&#39;=&gt;&#39;a, em, strong&#39;);</code>
<br />
<code class="code">&#160; &#160; $out = htmLawed($in, $config);</code>
<br />
<br />
&#160; Not allowing elements <span class="term">script</span>&#160;and <span class="term">object</span>&#160;--<br />
<br />
<code class="code">&#160; &#160; $config = array(&#39;elements&#39;=&gt;&#39;&#42; -script -object&#39;);</code>
<br />
<code class="code">&#160; &#160; $out = htmLawed($in, $config);</code>
<br />
<br />
&#160; Not allowing attributes <span class="term">id</span>&#160;and <span class="term">style</span>&#160;--<br />
<br />
<code class="code">&#160; &#160; $config = array(&#39;deny_attribute&#39;=&gt;&#39;id, style&#39;);</code>
<br />
<code class="code">&#160; &#160; $out = htmLawed($in, $config);</code>
<br />
<br />
&#160; Permitting only attributes <span class="term">title</span>&#160;and <span class="term">href</span>&#160;--<br />
<br />
<code class="code">&#160; &#160; $config = array(&#39;deny_attribute&#39;=&gt;&#39;&#42; -title -href&#39;);</code>
<br />
<code class="code">&#160; &#160; $out = htmLawed($in, $config);</code>
<br />
<br />
&#160; Remove bad/disallowed tags altogether instead of converting them to entities --<br />
<br />
<code class="code">&#160; &#160; $config = array(&#39;keep_bad&#39;=&gt;0);</code>
<br />
<code class="code">&#160; &#160; $out = htmLawed($in, $config);</code>
<br />
<br />
&#160; Allowing attribute <span class="term">title</span>&#160;only in <span class="term">a</span>&#160;and not allowing attributes <span class="term">id</span>, <span class="term">style</span>, or scriptable <em>on*</em>&#160;attributes like <span class="term">onclick</span>&#160;--<br />
<br />
<code class="code">&#160; &#160; $config = array(&#39;deny_attribute&#39;=&gt;&#39;title, id, style, on&#42;&#39;);</code>
<br />
<code class="code">&#160; &#160; $spec = &#39;a=title&#39;;</code>
<br />
<code class="code">&#160; &#160; $out = htmLawed($in, $config, $spec);</code>
<br />
<br />
&#160; Allowing a custom attribute, <span class="term">vFlag</span>, in <span class="term">img</span>&#160;and permitting custom use of the standard attribute, <span class="term">rel</span>, in <span class="term">input</span>&#160;--<br />
<br />
<code class="code">&#160; &#160; $spec = &#39;img=vFlag; input=rel&#39;;</code>
<br />
<code class="code">&#160; &#160; $out = htmLawed($in, $config, $spec);</code>
<br />
<br />
&#160; Some case-studies are presented below.<br />
<br />
&#160; <strong>1.</strong>&#160;A blog administrator wants to allow only <span class="term">a</span>, <span class="term">em</span>, <span class="term">strike</span>, <span class="term">strong</span>&#160;and <span class="term">u</span>&#160;in comments, but needs <span class="term">strike</span>&#160;and <span class="term">u</span>&#160;transformed to <span class="term">span</span>&#160;for better XHTML 1-strict compliance, and, he wants the <span class="term">a</span>&#160;links to point only to <span class="term">http</span>&#160;or <span class="term">https</span>&#160;resources:<br />
<br />
<code class="code">&#160; &#160; $processed = htmLawed($in, array(&#39;elements&#39;=&gt;&#39;a, em, strike, strong, u&#39;, &#39;make_tag_strict&#39;=&gt;1, &#39;safe&#39;=&gt;1, &#39;schemes&#39;=&gt;&#39;&#42;&#58;http, https&#39;), &#39;a=href&#39;);</code>
@ -772,14 +915,14 @@ A PHP Labware internal utility &#45; <a href="http://www.bioinformatics.org/phpl
<br />
&#160; The character values are replaced with entities/characters and not character values referred to by the entities/characters to keep this task independent of the character-encoding of input text.<br />
<br />
&#160; The <span class="term">$config["clean_ms_char"]</span>&#160;parameter need not be used if authors do not copy-paste Microsoft-created text or if the input text is not believed to use the <span class="term">Windows 1252</span>&#160;or a similar encoding. Further, the input form and the web-pages displaying it or its content should have the character encoding appropriately marked-up.<br />
&#160; The <span class="term">$config["clean_ms_char"]</span>&#160;parameter should not be used if authors do not copy-paste Microsoft-created text, or if the input text is not believed to use the <span class="term">Windows 1252</span>&#160;(<span class="term">Cp-1252</span>) or a similar encoding like <span class="term">Cp-1251</span>&#160;(otherwise, for example when UTF-8 encoding is in use, Japanese or Korean characters can get mangled). Further, the input form and the web-pages displaying it or its content should have the character encoding appropriately marked-up.<br />
</div>
<div class="sub-section"><h3>
<a name="s3.2" id="s3.2"></a><span class="item-no">3.2</span>&#160; Character references/entities
</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
<br />
&#160; Valid character entities take the form <span class="term">&amp;&#42;;</span>&#160;where <span class="term">&#42;</span>&#160;is <span class="term">#x</span>&#160;followed by a hexadecimal number (hexadecimal numeric entity; like <span class="term">&amp;#xA0;</span>&#160;for non-breaking space), or alphanumeric like <span class="term">gt</span>&#160;(external or named entity; like <span class="term">&amp;nbsp;</span>&#160;for non-breaking space), or <span class="term">#</span>&#160;followed by a number (decimal numeric entity; like <span class="term">&amp;#160;</span>&#160;for non-breaking space). Character entities referring to the soft-hyphen character (the <span class="term">&amp;shy;</span>&#160;or <span class="term">\xad</span>&#160;character; hexadecimal code-point <span class="term">ad</span>&#160;[decimal <span class="term">173</span>]) in attribute values are always replaced with spaces; soft-hyphens in attribute values introduce vulnerabilities in some older versions of the Opera and Mozilla [Firefox] browsers.<br />
&#160; Valid character entities take the form <span class="term">&amp;&#42;;</span>&#160;where <span class="term">&#42;</span>&#160;is <span class="term">#x</span>&#160;followed by a hexadecimal number (hexadecimal numeric entity; like <span class="term">&amp;#xA0;</span>&#160;for non-breaking space), or alphanumeric like <span class="term">gt</span>&#160;(external or named entity; like <span class="term">&amp;nbsp;</span>&#160;for non-breaking space), or <span class="term">#</span>&#160;followed by a number (decimal numeric entity; like <span class="term">&amp;#160;</span>&#160;for non-breaking space). Character entities referring to the soft-hyphen character (the <span class="term">&amp;shy;</span>&#160;or <span class="term">\xad</span>&#160;character; hexadecimal code-point <span class="term">ad</span>&#160;[decimal <span class="term">173</span>]) in URL-accepting attribute values are always replaced with spaces; soft-hyphens in attribute values introduce vulnerabilities in some older versions of the Opera and Mozilla [Firefox] browsers.<br />
<br />
&#160; htmLawed (function <span class="term">hl_ent()</span>):<br />
<br />
@ -1059,7 +1202,7 @@ A PHP Labware internal utility &#45; <a href="http://www.bioinformatics.org/phpl
<br />
&#160; An option like <span class="term">1</span>&#160;is useful, e.g., when a writer previews his submission, whereas one like <span class="term">3</span>&#160;is useful before content is finalized and made available to all.<br />
<br />
&#160; <strong>Note:</strong>&#160;In the example above, unlike <span class="term">&lt;&#42;&gt;</span>, <span class="term">&lt;xml&gt;</span>&#160;gets considered as a tag (even though there is no HTML element named <span class="term">xml</span>). In general, text matching the regular expression pattern <span class="term">&lt;(/?)([a-zA-Z][a-zA-Z1-6]&#42;)([^&gt;]&#42;?)\s?&gt;</span>&#160;is considered a tag (phrase enclosed by the angled brackets <span class="term">&lt;</span>&#160;and <span class="term">&gt;</span>, and starting [with an optional slash preceding] with an alphanumeric word that starts with an alphabet...).<br />
&#160; <strong>Note:</strong>&#160;In the example above, unlike <span class="term">&lt;&#42;&gt;</span>, <span class="term">&lt;xml&gt;</span>&#160;gets considered as a tag (even though there is no HTML element named <span class="term">xml</span>). Thus, the <span class="term">keep_bad</span>&#160;parameter's value affects <span class="term">&lt;xml&gt;</span>&#160;but not <span class="term">&lt;&#42;&gt;</span>. In general, text matching the regular expression pattern <span class="term">&lt;(/?)([a-zA-Z][a-zA-Z1-6]&#42;)([^&gt;]&#42;?)\s?&gt;</span>&#160;is considered a tag (phrase enclosed by the angled brackets <span class="term">&lt;</span>&#160;and <span class="term">&gt;</span>, and starting [with an optional slash preceding] with an alphanumeric word that starts with an alphabet...), and is subjected to the <span class="term">keep_bad</span>&#160;value.<br />
<br />
&#160; Nesting/content rules for each of the 86 elements in htmLawed's default set (see <a href="#s3.3">section 3.3</a>) are defined in function <span class="term">hl_bal()</span>. This means that if a non-standard element besides <span class="term">embed</span>&#160;is being permitted through <span class="term">$config["elements"]</span>, the element's tag content will end up getting removed if <span class="term">$config["balance"]</span>&#160;is set to <span class="term">1</span>.<br />
<br />
@ -1079,6 +1222,8 @@ A PHP Labware internal utility &#45; <a href="http://www.bioinformatics.org/phpl
&#160; In some cases, the specs stipulate the number and/or the ordering of the child elements. A <span class="term">table</span>&#160;can have 0 or 1 <span class="term">caption</span>, <span class="term">tbody</span>, <span class="term">tfoot</span>, and <span class="term">thead</span>, but they must be in this order: <span class="term">caption</span>, <span class="term">thead</span>, <span class="term">tfoot</span>, <span class="term">tbody</span>.<br />
<br />
&#160; htmLawed currently does not check for conformance to these rules. Note that any non-compliance in this regard will not introduce security vulnerabilities, crash browser applications, or affect the rendering of web-pages.<br />
<br />
&#160; With <span class="term">$config["direct_list_nest"]</span>&#160;set to <span class="term">1</span>, htmLawed will allow direct nesting of an <span class="term">ol</span>&#160;or <span class="term">ul</span>&#160;list within another <span class="term">ol</span>&#160;or <span class="term">ul</span>&#160;without requiring the child list to be within an <span class="term">li</span>&#160;of the parent list. While this is not standard-compliant, directly nested lists are rendered properly by almost all browsers. The parameter <span class="term">$config["direct_list_nest"]</span>&#160;has no effect if tag-balancing (<a href="#s3.3.3">section 3.3.3</a>) is turned off.<br />
</div>
<div class="sub-section"><h3>
@ -1105,7 +1250,7 @@ A PHP Labware internal utility &#45; <a href="http://www.bioinformatics.org/phpl
<a name="s3.4" id="s3.4"></a><span class="item-no">3.4</span>&#160; Attributes
</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
<br />
&#160; htmLawed will only permit attributes described in the HTML specs (including deprecated ones). It also permits some attributes for use with the <span class="term">embed</span>&#160;element (the non-standard <span class="term">embed</span>&#160;element is supported in htmLawed because of its widespread use), and the the <span class="term">xml&#58;space</span>&#160;attribute (valid only in XHTML 1.1). A list of such 111 attributes and the elements they are allowed in is in <a href="#s5.2">section 5.2</a>.<br />
&#160; htmLawed will only permit attributes described in the HTML specs (including deprecated ones). It also permits some attributes for use with the <span class="term">embed</span>&#160;element (the non-standard <span class="term">embed</span>&#160;element is supported in htmLawed because of its widespread use), and the the <span class="term">xml&#58;space</span>&#160;attribute (valid only in XHTML 1.1). A list of such 111 attributes and the elements they are allowed in is in <a href="#s5.2">section 5.2</a>. Using the <span class="term">$spec</span>&#160;argument, htmLawed can be forced to permit custom, non-standard attributes as well as custom rules for standard attributes (<a href="#s2.3">section 2.3</a>).<br />
<br />
&#160; When <span class="term">$config["deny_attribute"]</span>&#160;is not set, or set to <span class="term">0</span>, or empty (<span class="term">""</span>), all the 111 attributes are permitted. Otherwise, <span class="term">$config["deny_attribute"]</span>&#160;can be set as a list of comma-separated names of the denied attributes. <span class="term">on&#42;</span>&#160;can be used to refer to the group of potentially dangerous, script-accepting attributes: <span class="term">onblur</span>, <span class="term">onchange</span>, <span class="term">onclick</span>, <span class="term">ondblclick</span>, <span class="term">onfocus</span>, <span class="term">onkeydown</span>, <span class="term">onkeypress</span>, <span class="term">onkeyup</span>, <span class="term">onmousedown</span>, <span class="term">onmousemove</span>, <span class="term">onmouseout</span>, <span class="term">onmouseover</span>, <span class="term">onmouseup</span>, <span class="term">onreset</span>, <span class="term">onselect</span>&#160;and <span class="term">onsubmit</span>.<br />
<br />
@ -1188,6 +1333,8 @@ A PHP Labware internal utility &#45; <a href="http://www.bioinformatics.org/phpl
<br />
&#160; As a side-note, one may find <span class="term">style&#58; &#42;</span>&#160;useful as URLs in <span class="term">style</span>&#160;attributes can be specified in a variety of ways, and the patterns that htmLawed uses to identify URLs may mistakenly identify non-URL text.<br />
<br />
&#160; <span class="term">!</span>&#160;can be put in the list of schemes to disallow all protocols as well as <em>local</em>&#160;URLs. Thus, with <span class="term">href&#58; http, style&#58; !</span>, '&lt;a href="http://cnn.com" style="background-image: url('local.jpg');"&gt;CNN&lt;/a&gt;' will become '&lt;a href="http://cnn.com" style="background-image: url('denied:local.jpg');"&gt;CNN&lt;/a&gt;'.<br />
<br />
&#160; <strong>Note</strong>: If URL-accepting attributes other than those listed above are being allowed, then the scheme will not be checked unless the attribute name contains the string <span class="term">src</span>&#160;(e.g., <span class="term">dynsrc</span>) or starts with <span class="term">o</span>&#160;(e.g., <span class="term">onbeforecopy</span>).<br />
<br />
&#160; With <span class="term">$config["safe"] = 1</span>, all URLs are disallowed in the <span class="term">style</span>&#160;attribute values.<br />
@ -1405,7 +1552,7 @@ A PHP Labware internal utility &#45; <a href="http://www.bioinformatics.org/phpl
<a name="s3.4.8" id="s3.4.8"></a><span class="item-no">3.4.8</span>&#160; Inline style properties
</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
<br />
&#160; htmLawed can check URL schemes and dynamic expressions (to guard against Javascript, etc., script-based insecurities) in inline CSS style property values in the <span class="term">style</span>&#160;attributes. (CSS properties like <span class="term">background-image</span>&#160;that accept URLs in their values are noted in <a href="#s5.3">section 5.3</a>.) Dynamic CSS expressions that allow scripting in the IE browser, and can be a vulnerability, can be removed from property values by setting <span class="term">$config["css_expression"]</span>&#160;to <span class="term">1</span>&#160;(default setting).<br />
&#160; htmLawed can check URL schemes and dynamic expressions (to guard against Javascript, etc., script-based insecurities) in inline CSS style property values in the <span class="term">style</span>&#160;attributes. (CSS properties like <span class="term">background-image</span>&#160;that accept URLs in their values are noted in <a href="#s5.3">section 5.3</a>.) Dynamic CSS expressions that allow scripting in the IE browser, and can be a vulnerability, can be removed from property values by setting <span class="term">$config["css_expression"]</span>&#160;to <span class="term">1</span>&#160;(default setting). Note that when <span class="term">$config["css_expression"]</span>&#160;is set to <span class="term">1</span>, htmLawed will remove <span class="term">/&#42;</span>&#160;from the <span class="term">style</span>&#160;values.<br />
<br />
&#160; <strong>Note</strong>: Because of the various ways of representing characters in attribute values (URL-escapement, entitification, etc.), htmLawed might alter the values of the <span class="term">style</span>&#160;attribute values, and may even falsely identify dynamic CSS expressions and URL schemes in them. If this is an important issue, checking of URLs and dynamic expressions can be turned off (<span class="term">$config["schemes"] = "...style&#58;&#42;..."</span>, see <a href="#s3.4.3">section 3.4.3</a>, and <span class="term">$config["css_expression"] = 0</span>). Alternately, admins can use their own custom function for finer handling of <span class="term">style</span>&#160;values through the <span class="term">hook_tag</span>&#160;parameter (see <a href="#s3.4.9">section 3.4.9</a>).<br />
<br />
@ -1420,14 +1567,30 @@ A PHP Labware internal utility &#45; <a href="http://www.bioinformatics.org/phpl
<br />
&#160; It is possible to utilize a custom hook function to alter the tag content htmLawed has finalized (i.e., after it has checked/corrected for required attributes, transformed attributes, lower-cased attribute names, etc.).<br />
<br />
&#160; When <span class="term">$config</span>&#160;parameter <span class="term">hook_tag</span>&#160;is set to the name of a function, htmLawed (function <span class="term">hl_tag()</span>) will pass on the element name, and the <em>finalized</em>&#160;attribute name-value pairs as array elements to the function. The function is expected to return the full opening tag string like <span class="term">&lt;element_name attribute_1_name="attribute_1_value"...&gt;</span>&#160;(for empty elements like <span class="term">img</span>&#160;and <span class="term">input</span>, the element-closing slash <span class="term">/</span>&#160;should also be included).<br />
&#160; When <span class="term">$config</span>&#160;parameter <span class="term">hook_tag</span>&#160;is set to the name of a function, htmLawed (function <span class="term">hl_tag()</span>) will pass on the element name, and, in the case of an opening tag, the <em>finalized</em>&#160;attribute name-value pairs as array elements to the function. The function, after completing a task such as filtering or tag transformation, will typically return an empty string, the full opening tag string like <span class="term">&lt;element_name attribute_1_name="attribute_1_value"...&gt;</span>&#160;(for empty elements like <span class="term">img</span>&#160;and <span class="term">input</span>, the element-closing slash <span class="term">/</span>&#160;should also be included), etc.<br />
<br />
&#160; Any <span class="term">hook_tag</span>&#160;function, since htmLawed version 1.1.11, also receives names of elements in closing tags, such as <span class="term">a</span>&#160;in the closing <span class="term">&lt;/a&gt;</span>&#160;tag of the element <span class="term">&lt;a href="http&#58;//cnn.com"&gt;CNN&lt;/a&gt;</span>. Unlike for opening tags, no other value (i.e., the attribute name-value array) is passed to the function since a closing tag contains only element names. Typically, the function will return an empty string or a full closing tag (like <span class="term">&lt;/a&gt;</span>).<br />
<br />
&#160; This is a <strong>powerful functionality</strong>&#160;that can be exploited for various objectives: consolidate-and-convert inline <span class="term">style</span>&#160;attributes to <span class="term">class</span>, convert <span class="term">embed</span>&#160;elements to <span class="term">object</span>, permit only one <span class="term">caption</span>&#160;element in a <span class="term">table</span>&#160;element, disallow embedding of certain types of media, <strong>inject HTML</strong>, use <a href="http://csstidy.sourceforge.net">CSSTidy</a>&#160;to sanitize <span class="term">style</span>&#160;attribute values, etc.<br />
<br />
&#160; As an example, the custom hook code below can be used to force a series of specifically ordered <span class="term">id</span>&#160;attributes on all elements, and a specific <span class="term">param</span>&#160;element inside all <span class="term">object</span>&#160;elements:<br />
<br />
<code class="code">&#160; &#160; function my_tag_function($element, $attribute_array){</code>
<code class="code">&#160; &#160; function my_tag_function($element, $attribute_array=0){</code>
<br />
<br />
<code class="code">&#160; &#160; &#160; // If second argument is not received, it means a closing tag is being handled</code>
<br />
<code class="code">&#160; &#160; &#160; if(is_numeric($attribute_array)){</code>
<br />
<code class="code">&#160; &#160; &#160; &#160; return "&lt;/$element&gt;";</code>
<br />
<code class="code">&#160; &#160; &#160; }</code>
<br />
<br />
<code class="code">&#160; &#160; &#160; static $id = 0;</code>
@ -1487,6 +1650,11 @@ A PHP Labware internal utility &#45; <a href="http://www.bioinformatics.org/phpl
<code class="code">&#160; &#160; &#160; }</code>
<br />
<br />
<code class="code">&#160; &#160; &#160; static $empty_elements = array(&#39;area&#39;=&gt;1, &#39;br&#39;=&gt;1, &#39;col&#39;=&gt;1, &#39;embed&#39;=&gt;1, &#39;hr&#39;=&gt;1, &#39;img&#39;=&gt;1, &#39;input&#39;=&gt;1, &#39;isindex&#39;=&gt;1, &#39;param&#39;=&gt;1);</code>
<br />
<br />
<code class="code">&#160; &#160; &#160; return "&lt;{$element}{$string}". (isset($in_array($element, $empty_elements) ? &#39; /&#39; &#58; &#39;&#39;). &#39;&gt;&#39;. $new_element;</code>
<br />
@ -1515,7 +1683,7 @@ A PHP Labware internal utility &#45; <a href="http://www.bioinformatics.org/phpl
<br />
&#160; htmLawed allows an admin to use <span class="term">$config["safe"]</span>&#160;to auto-adjust multiple <span class="term">$config</span>&#160;parameters (such as <span class="term">elements</span>&#160;which declares the allowed element-set), which otherwise would have to be manually set. The relevant parameters are indicated by <span class="term">"</span>&#160;in <a href="#s2.2">section 2.2</a>). Thus, one can pass the <span class="term">$config</span>&#160;argument with a simpler value.<br />
<br />
&#160; With the value of <span class="term">1</span>, htmLawed considers <span class="term">CDATA</span>&#160;sections and HTML comments as plain text, and prohibits the <span class="term">applet</span>, <span class="term">embed</span>, <span class="term">iframe</span>, <span class="term">object</span>&#160;and <span class="term">script</span>&#160;elements, and the <span class="term">on&#42;</span>&#160;attributes like <span class="term">onclick</span>. ( There are <span class="term">$config</span>&#160;parameters like <span class="term">css_expression</span>&#160;that are not affected by the value set for <span class="term">safe</span>&#160;but whose default values still contribute towards a more <em>safe</em>&#160;output.) Further, URLs with schemes (see <a href="#s3.4.3">section 3.4.3</a>) are neutralized so that, e.g., <span class="term">style="moz-binding&#58;url(http&#58;//danger)"</span>&#160;becomes <span class="term">style="moz-binding&#58;url(denied&#58;http&#58;//danger)"</span>&#160;while <span class="term">style="moz-binding&#58;url(ok)"</span>&#160;remains intact.<br />
&#160; With the value of <span class="term">1</span>, htmLawed considers <span class="term">CDATA</span>&#160;sections and HTML comments as plain text, and prohibits the <span class="term">applet</span>, <span class="term">embed</span>, <span class="term">iframe</span>, <span class="term">object</span>&#160;and <span class="term">script</span>&#160;elements, and the <span class="term">on&#42;</span>&#160;attributes like <span class="term">onclick</span>. ( There are <span class="term">$config</span>&#160;parameters like <span class="term">css_expression</span>&#160;that are not affected by the value set for <span class="term">safe</span>&#160;but whose default values still contribute towards a more <em>safe</em>&#160;output.) Further, URLs with schemes (see <a href="#s3.4.3">section 3.4.3</a>) are neutralized so that, e.g., <span class="term">style="moz-binding&#58;url(http&#58;//danger)"</span>&#160;becomes <span class="term">style="moz-binding&#58;url(denied&#58;http&#58;//danger)"</span>.<br />
<br />
&#160; Admins, however, may still want to completely deny the <span class="term">style</span>&#160;attribute, e.g., with code like<br />
<br />
@ -1523,6 +1691,8 @@ A PHP Labware internal utility &#45; <a href="http://www.bioinformatics.org/phpl
<code class="code">&#160; &#160; $processed = htmLawed($text, array(&#39;safe&#39;=&gt;1, &#39;deny_attribute&#39;=&gt;&#39;style&#39;));</code>
<br />
<br />
&#160; Permitting the <span class="term">style</span>&#160;attribute brings in risks of <em>click-jacking</em>, etc. CSS property values can render a page non-functional or be used to deface it. Except for URLs, dynamic expressions, and some other things, htmLawed does not completely check <span class="term">style</span>&#160;values. It does provide ways for the code-developer implementing htmLawed to do such checks through the <span class="term">$spec</span>&#160;argument, and through the <span class="term">hook_tag</span>&#160;parameter (see <a href="#s3.4.8">section 3.4.8</a>&#160;for more). Disallowing style completely and relying on CSS classes and stylesheet files is recommended.<br />
<br />
&#160; If a value for a parameter auto-set through <span class="term">safe</span>&#160;is still manually provided, then that value can over-ride the auto-set value. E.g., with <span class="term">$config["safe"] = 1</span>&#160;and <span class="term">$config["elements"] = "&#42;+script"</span>, <span class="term">script</span>, but not <span class="term">applet</span>, is allowed.<br />
<br />
&#160; A page illustrating the efficacy of htmLawed's anti-XSS abilities with <span class="term">safe</span>&#160;set to <span class="term">1</span>&#160;against XSS vectors listed by <a href="http://ha.ckers.org/xss.html">RSnake</a>&#160;may be available <a href="http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed/rsnake/RSnakeXSSTest.htm">here</a>.<br />
@ -1554,7 +1724,7 @@ A PHP Labware internal utility &#45; <a href="http://www.bioinformatics.org/phpl
<a name="s3.9" id="s3.9"></a><span class="item-no">3.9</span>&#160; Retaining non-HTML tags in input with mixed markup
</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
<br />
&#160; htmLawed does not remove certain characters that though invalid are nevertheless discouraged in HTML documents as per the specs (see <a href="#s5.1">section 5.1</a>). This can be utilized to deal with input that contains mixed markup. Input that may have HTML markup as well as some other markup that is based on the <span class="term">&lt;</span>, <span class="term">&gt;</span>&#160;and <span class="term">&amp;</span>&#160;characters is considered to have mixed markup. The non-HTML markup can be rather proprietary (like markup for emoticons/smileys), or standard (like MathML or SVG). Or it can be programming code meant for execution/evaluation (such as embedded PHP code).<br />
&#160; htmLawed does not remove certain characters that, though invalid, are nevertheless <em>discouraged</em>&#160;in HTML documents as per the specifications (see <a href="#s5.1">section 5.1</a>). This can be utilized to deal with input that contains mixed markup. Input that may have HTML markup as well as some other markup that is based on the <span class="term">&lt;</span>, <span class="term">&gt;</span>&#160;and <span class="term">&amp;</span>&#160;characters is considered to have mixed markup. The non-HTML markup can be rather proprietary (like markup for emoticons/smileys), or standard (like MathML or SVG). Or it can be programming code meant for execution/evaluation (such as embedded PHP code).<br />
<br />
&#160; To deal with such mixed markup, the input text can be pre-processed to hide the non-HTML markup by specifically replacing the <span class="term">&lt;</span>, <span class="term">&gt;</span>&#160;and <span class="term">&amp;</span>&#160;characters with some of the HTML-discouraged characters (see <a href="#s3.1.2">section 3.1.2</a>). Post-htmLawed processing, the replacements are reverted.<br />
<br />
@ -1583,7 +1753,7 @@ A PHP Labware internal utility &#45; <a href="http://www.bioinformatics.org/phpl
<a name="s4.1" id="s4.1"></a><span class="item-no">4.1</span>&#160; Support
</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
<br />
&#160; A careful re-reading of this documentation will very likely answer your questions.<br />
&#160; A careful reading of this documentation may provide an answer.<br />
<br />
&#160; Software updates and forum-based community-support may be found at <a href="http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed">http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed</a>. For general PHP issues (not htmLawed-specific), support may be found through internet searches and at <a href="http://php.net">http://php.net</a>.<br />
@ -1593,18 +1763,42 @@ A PHP Labware internal utility &#45; <a href="http://www.bioinformatics.org/phpl
</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
<br />
&#160; See <a href="#s2.8">section 2.8</a>.<br />
<br />
&#160; Readers are advised to cross-check information given in this document.<br />
</div>
<div class="sub-section"><h3>
<a name="s4.3" id="s4.3"></a><span class="item-no">4.3</span>&#160; Change-log
</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
<br />
&#160; (The release date for the downloadable package of files containing documentation, demo script, test-cases, etc., besides the <span class="term">htmLawed.php</span>&#160;file may be updated independently if the secondary files are revised.)<br />
&#160; (The release date for the downloadable package of files containing documentation, demo script, test-cases, etc., besides the <span class="term">htmLawed.php</span>&#160;file, may be updated without a change-log entry if the secondary files, but not htmLawed per se, are revised.)<br />
<br />
&#160; <em>Version number - Release date. Notes</em><br />
<br />
&#160; 1.1.16 - 29 August 2013. Fix for a potential security vulnerability arising from specialy encoded space characters in URL schemes/protocols<br />
<br />
&#160; 1.1.15 - 11 August 2013. Improved tidying/prettifying functionality<br />
<br />
&#160; 1.1.14 - 8 August 2012. Fix for possible segmental loss of incremental indentation during <span class="term">tidying</span>&#160;when <span class="term">balance</span>&#160;is disabled; fix for non-effectuation under some circumstances of a corrective behavior to preserve plain text within elements like <span class="term">blockquote</span>.<br />
<br />
&#160; 1.1.13 - 22 July 2012. Added feature allowing use of custom, non-standard attributes or custom rules for standard attributes<br />
<br />
&#160; 1.1.12 - 5 July 2012. Fix for a bug in identifying an unquoted value of the <span class="term">face</span>&#160;attribute<br />
<br />
&#160; 1.1.11 - 5 June 2012. Fix for possible problem with handling of multi-byte characters in attribute values in an mbstring.func_overload enviroment. <span class="term">$config["hook_tag"]</span>, if specified, now receives names of elements in closing tags.<br />
<br />
&#160; 1.1.10 - 22 October 2011. Fix for a bug in the <span class="term">tidy</span>&#160;functionality that caused the entire input to be replaced with a single space; new parameter, <span class="term">$config["direct_list_nest"]</span>&#160;to allow direct descendance of a list in a list. (5 April 2012. Dual licensing from LGPLv3 to LGPLv3 and GPLv2+.)<br />
<br />
&#160; 1.1.9.5 - 6 July 2011. Minor correction of a rule for nesting of <span class="term">li</span>&#160;within <span class="term">dir</span><br />
<br />
&#160; 1.1.9.4 - 3 July 2010. Parameter <span class="term">schemes</span>&#160;now accepts <span class="term">!</span>&#160;so any URL, even a local one, can be <em>denied</em>. An issue in which a second URL value in <span class="term">style</span>&#160;properties was not checked was fixed.<br />
<br />
&#160; 1.1.9.3 - 17 May 2010. Checks for correct nesting of <span class="term">param</span><br />
<br />
&#160; 1.1.9.2 - 26 April 2010. Minor fix regarding rendering of denied URL schemes<br />
<br />
&#160; 1.1.9.1 - 26 February 2010. htmLawed now uses the LGPL version 3 license; support for <span class="term">flashvars</span>&#160;attribute for <span class="term">embed</span><br />
<br />
&#160; 1.1.9 - 22 December 2009. Soft-hyphens are now removed only from URL-accepting attribute values<br />
<br />
&#160; 1.1.8.1 - 16 July 2009. Minor code-change to fix a PHP error notice<br />
<br />
&#160; 1.1.8 - 23 April 2009. Parameter <span class="term">deny_attribute</span>&#160;now accepts the wild-card <span class="term">&#42;</span>, making it simpler to specify its value when all but a few attributes are being denied; fixed a bug in interpreting <span class="term">$spec</span><br />
@ -1617,11 +1811,11 @@ A PHP Labware internal utility &#45; <a href="http://www.bioinformatics.org/phpl
<br />
&#160; 1.1.1 - 27 September 2008. Better nesting correction when omitable closing tags are absent<br />
<br />
&#160; 1.1 - 29 June 2008. <span class="term">$config["hook_tag"]</span>&#160;and <span class="term">$config["format"]</span>&#160;introduced for custom tag/attribute check/modification/injection and output compaction/beautification; fixed a regex-in-$spec parsing bug<br />
&#160; 1.1 - 29 June 2008. <span class="term">$config["hook_tag"]</span>&#160;and <span class="term">$config["tidy"]</span>&#160;introduced for custom tag/attribute check/modification/injection and output compaction/beautification; fixed a regex-in-$spec parsing bug<br />
<br />
&#160; 1.0.9 - 11 June 2008. Fixed bug in invalid HTML code-point entity check<br />
&#160; 1.0.9 - 11 June 2008. Fix for a bug in checks for invalid HTML code-point entities<br />
<br />
&#160; 1.0.8 - 15 May 2008. <span class="term">bordercolor</span>&#160;attribute for <span class="term">table</span>, <span class="term">td</span>&#160;and <span class="term">tr</span><br />
&#160; 1.0.8 - 15 May 2008. Permit <span class="term">bordercolor</span>&#160;attribute for <span class="term">table</span>, <span class="term">td</span>&#160;and <span class="term">tr</span><br />
<br />
&#160; 1.0.7 - 1 May 2008. Support for <span class="term">wmode</span>&#160;attribute for <span class="term">embed</span>; <span class="term">$config["show_setting"]</span>&#160;introduced; improved <span class="term">$config["elements"]</span>&#160;evaluation<br />
<br />
@ -1631,7 +1825,7 @@ A PHP Labware internal utility &#45; <a href="http://www.bioinformatics.org/phpl
<br />
&#160; 1.0.4 - 10 March 2008. Improved corrections for <span class="term">blockquote</span>, <span class="term">form</span>, <span class="term">map</span>&#160;and <span class="term">noscript</span><br />
<br />
&#160; 1.0.3 - 3 March 2008. Character entities for soft-hyphens are now replaced with spaces (instead of being removed); a bug allowing <span class="term">td</span>&#160;directly inside <span class="term">table</span>&#160;fixed; <span class="term">safe</span>&#160;<span class="term">$config</span>&#160;parameter added<br />
&#160; 1.0.3 - 3 March 2008. Character entities for soft-hyphens are now replaced with spaces (instead of being removed); fix for a bug allowing <span class="term">td</span>&#160;directly inside <span class="term">table</span>; <span class="term">$config["safe"]</span>&#160;introduced<br />
<br />
&#160; 1.0.2 - 13 February 2008. Improved implementation of <span class="term">$config["keep_bad"]</span><br />
<br />
@ -1653,6 +1847,10 @@ A PHP Labware internal utility &#45; <a href="http://www.bioinformatics.org/phpl
<br />
&#160; Upgrading is as simple as replacing the previous version of <span class="term">htmLawed.php</span>&#160;(assuming it was not modified for customized features). As htmLawed output is almost always used in static documents, upgrading should not affect old, finalized content.<br />
<br />
&#160; <strong>Important</strong>&#160; The following upgrades may affect the functionality of a specific htmLawed installation:<br />
<br />
&#160; (1) From version 1.1-1.1.10 to 1.1.11 (or later), if a <span class="term">hook_tag</span>&#160;function is in use: In version 1.1.11, elements in closing tags (and not just the opening tags) are also passed to the function. There are no attribute names/values to pass, so a <span class="term">hook_tag</span>&#160;function receives only the element name. The <span class="term">hook_tag</span>&#160;function therefore may have to be edited. See <a href="#s3.4.9">section 3.4.9</a>.<br />
<br />
&#160; Old versions of htmLawed may be available online. E.g., for version 1.0, check <a href="http://www.bioinformatics.org/phplabware/downloads/htmLawed1.zip">http://www.bioinformatics.org/phplabware/downloads/htmLawed1.zip</a>, for 1.1.1, htmLawed111.zip, and for 1.1.10, htmLawed1110.zip.<br />
</div>
@ -1660,7 +1858,7 @@ A PHP Labware internal utility &#45; <a href="http://www.bioinformatics.org/phpl
<a name="s4.6" id="s4.6"></a><span class="item-no">4.6</span>&#160; Comparison with <span class="term">HTMLPurifier</span>
</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
<br />
&#160; The HTMLPurifier PHP library by Edward Yang is a very good HTML filtering script that uses object oriented PHP code. Compared to htmLawed, it:<br />
&#160; The HTMLPurifier PHP library by Edward Yang is a very good HTML filtering script that uses object oriented PHP code. Compared to htmLawed, it (as of year 2010):<br />
<br />
&#160; * &#160;does not support PHP versions older than 5.0 (HTMLPurifier dropped PHP 4 support after version 2)<br />
<br />
@ -1704,7 +1902,7 @@ A PHP Labware internal utility &#45; <a href="http://www.bioinformatics.org/phpl
<a name="s4.10" id="s4.10"></a><span class="item-no">4.10</span>&#160; Acknowledgements
</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
<br />
&#160; Bryan Blakey, Ulf Harnhammer, Gareth Heyes, Lukasz Pilorz, Shelley Powers, Edward Yang, and many anonymous users.<br />
&#160; Nicholas Alipaz, Bryan Blakey, Pádraic Brady, Dac Chartrand, Ulf Harnhammer, Gareth Heyes, Klaus Leithoff, Lukasz Pilorz, Shelley Powers, Harro Verton, Edward Yang, and many anonymous users.<br />
<br />
&#160; Thank you!<br />
@ -1726,7 +1924,7 @@ A PHP Labware internal utility &#45; <a href="http://www.bioinformatics.org/phpl
<a name="s5.2" id="s5.2"></a><span class="item-no">5.2</span>&#160; Valid attribute-element combinations
</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
<br />
&#160; Valid attribute-element combinations as per W3C specs.<br />
&#160; Valid attribute-element combinations as per <a href="http://www.w3c.org">W3C</a>&#160;specs.<br />
<br />
&#160; * &#160;includes deprecated attributes (marked <span class="term">^</span>), attributes for the non-standard <span class="term">embed</span>&#160;element (marked <span class="term">&#42;</span>), and the proprietary <span class="term">bordercolor</span>&#160;(marked <span class="term">~</span>)<br />
&#160; * &#160;only non-frameset, HTML body elements<br />
@ -1771,6 +1969,7 @@ A PHP Labware internal utility &#45; <a href="http://www.bioinformatics.org/phpl
&#160; disabled - button, input, optgroup, option, select, textarea<br />
&#160; enctype - form<br />
&#160; face - font<br />
&#160; flashvars* - embed<br />
&#160; for - label<br />
&#160; frame - table<br />
&#160; frameborder - iframe<br />
@ -1935,11 +2134,11 @@ A PHP Labware internal utility &#45; <a href="http://www.bioinformatics.org/phpl
<br />
&#160; <strong>Function arguments</strong>&#160;for htmLawed are:<br />
<br />
&#160; * &#160;<span class="term">$in</span>&#160;- 1st argument; a text string; the <strong>input text</strong>&#160;to be processed. Any extraneous slashes added by PHP when <em>magic quotes</em>&#160;are enabled should be removed beforehand using PHP's <span class="term">stripslashes()</span>&#160;function.<br />
&#160; * &#160;<span class="term">$in</span>&#160;- first argument; a text string; the <strong>input text</strong>&#160;to be processed. Any extraneous slashes added by PHP when <em>magic quotes</em>&#160;are enabled should be removed beforehand using PHP's <span class="term">stripslashes()</span>&#160;function.<br />
<br />
&#160; * &#160;<span class="term">$config</span>&#160;- 2nd argument; an associative array; optional (named <span class="term">$C</span>&#160;in htmLawed code). The array has keys with names like <span class="term">balance</span>&#160;and <span class="term">keep_bad</span>, and the values, which can be boolean, string, or array, depending on the key, are read to accordingly set the <strong>configurable parameters</strong>&#160;(indicated by the keys). All configurable parameters receive some default value if the value to be used is not specified by the user through <span class="term">$config</span>. <em>Finalized</em>&#160;<span class="term">$config</span>&#160;is thus a filtered and possibly larger array.<br />
&#160; * &#160;<span class="term">$config</span>&#160;- second argument; an associative array; optional; named <span class="term">$C</span>&#160;within htmLawed code. The array has keys with names like <span class="term">balance</span>&#160;and <span class="term">keep_bad</span>, and the values, which can be boolean, string, or array, depending on the key, are read to accordingly set the <strong>configurable parameters</strong>&#160;(indicated by the keys). All configurable parameters receive some default value if the value to be used is not specified by the user through <span class="term">$config</span>. <em>Finalized</em>&#160;<span class="term">$config</span>&#160;is thus a filtered and possibly larger array.<br />
<br />
&#160; * &#160;<span class="term">$spec</span>&#160;- 3rd argument; a text string; optional. The string has rules, written in an htmLawed-designated format, <strong>specifying</strong>&#160;element-specific attribute and attribute value restrictions. Function <span class="term">hl_spec()</span>&#160;is used to convert the string to an associative-array for internal use. <em>Finalized</em>&#160;<span class="term">$spec</span>&#160;is thus an array.<br />
&#160; * &#160;<span class="term">$spec</span>&#160;- third argument; a text string; optional. The string has rules, written in an htmLawed-designated format, <strong>specifying</strong>&#160;element-specific attribute and attribute value restrictions. Function <span class="term">hl_spec()</span>&#160;is used to convert the string to an associative-array, named <span class="term">$S</span>&#160;within htmLawed code, for internal use. <em>Finalized</em>&#160;<span class="term">$spec</span>&#160;is thus an array.<br />
<br />
&#160; <em>Finalized</em>&#160;<span class="term">$config</span>&#160;and <span class="term">$spec</span>&#160;are made <strong>global variables</strong>&#160;while htmLawed is at work. Values of any pre-existing global variables with same names are noted, and their values are restored after htmLawed finishes processing the input (to capture the <em>finalized</em>&#160;values, the <span class="term">show_settings</span>&#160;parameter of <span class="term">$config</span>&#160;should be used). Depending on <span class="term">$config</span>, another global variable <span class="term">hl_Ids</span>, to track <span class="term">id</span>&#160;attribute values for uniqueness, may be set. Unlike the other two variables, this one is not reset (or unset) post-processing.<br />
<br />
@ -1966,13 +2165,13 @@ A PHP Labware internal utility &#45; <a href="http://www.bioinformatics.org/phpl
<br />
&#160; After this <em>initial processing</em>&#160;<span class="term">htmLawed()</span>&#160;identifies tags using regex and processes them with the help of <span class="term">hl_tag()</span>&#160;-- &#160;a large function that analyzes tag content, filtering it as per HTML standards, <span class="term">$config</span>&#160;and <span class="term">$spec</span>. Among other things, <span class="term">hl_tag()</span>&#160;transforms deprecated elements using <span class="term">hl_tag2()</span>, removes attributes from closing tags, checks attribute values as per <span class="term">$spec</span>&#160;rules using <span class="term">hl_attrval()</span>, and checks URL protocols using <span class="term">hl_prot()</span>. <span class="term">htmLawed()</span>&#160;performs tag balancing and nesting checks with a call to <span class="term">hl_bal()</span>, and optionally compacts/beautifies the output with proper white-spacing with a call to <span class="term">hl_tidy()</span>. The latter temporarily replaces white-space, and <span class="term">&lt;</span>, <span class="term">&gt;</span>&#160;and <span class="term">&amp;</span>&#160;characters inside <span class="term">pre</span>, <span class="term">script</span>&#160;and <span class="term">textarea</span>&#160;elements, and HTML comments and CDATA sections with control characters (code-points <span class="term">1</span>&#160;to <span class="term">5</span>, and <span class="term">7</span>).<br />
<br />
&#160; htmLawed permits the use of custom code or <strong>hook functions</strong>&#160;at two stages. The first, called inside <span class="term">htmLawed()</span>, allows the input text as well as the finalized $config and $spec values to be altered right after the initial processing (see <a href="#s3.7">section 3.7</a>). The second is called by <span class="term">hl_tag()</span>&#160;once the tag content is finalized (see <a href="#s3.4.9">section 3.4.9</a>).<br />
&#160; htmLawed permits the use of custom code or <strong>hook functions</strong>&#160;at two stages. The first, called inside <span class="term">htmLawed()</span>, allows the input text as well as the finalized <span class="term">$config</span>&#160;and <span class="term">$spec</span>&#160;values to be altered right after the initial processing (see <a href="#s3.7">section 3.7</a>). The second is called by <span class="term">hl_tag()</span>&#160;once the tag content is finalized (see <a href="#s3.4.9">section 3.4.9</a>).<br />
<br />
&#160; Being dictated by the external and stable HTML standard, htmLawed's objective is very clear-cut and less concerned with tweakability. The code is only minimally annotated with comments -- it is not meant to instruct; PHP developers familiar with the HTML specs will see the logic, and others can always refer to the htmLawed documentation. The compact structuring of the statements is meant to aid in quickly grasping the logic, at least when viewed with code syntax highlighted.
&#160; The functionality of htmLawed is dictated by the external HTML standard. It is thus coded for a clear-cut objective with not much concern for tweakability. The code is only minimally annotated with comments -- it is not meant to instruct; PHP developers familiar with the HTML specifications will see the logic, and others can always refer to the htmLawed documentation. The compact structuring of the statements is meant to aid a quick grasp of the logic.
</div>
</div>
<br />
<hr /><br /><br /><span class="subtle"><small>HTM version of <em><a href="htmLawed_README.txt">htmLawed_README.txt</a></em> generated on 23 Apr, 2009 using <a href="http://www.bioinformatics.org/phplabware/internal_utilities">rTxt2htm</a> from PHP Labware</small></span>
<hr /><br /><br /><span class="subtle"><small>HTM version of <em><a href="htmLawed_README.txt">htmLawed_README.txt</a></em> generated on 29 Aug, 2013 using <a href="http://www.bioinformatics.org/phplabware/internal_utilities">rTxt2htm</a> from PHP Labware</small></span>
</div><!-- ended div body -->
</div><!-- ended div top -->
</body>

View File

@ -1,8 +1,8 @@
/*
htmLawed_README.txt, 16 July 2009
htmLawed 1.1.8.1, 16 July 2009
htmLawed_README.txt, 29 August 2013
htmLawed 1.1.16, 29 August 2013
Copyright Santosh Patnaik
GPL v3 license
Dual licensed with LGPL 3 and GPL 2+
A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed
*/
@ -25,7 +25,7 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern
2.6 Use without modifying old 'kses()' code
2.7 Tolerance for ill-written HTML
2.8 Limitations & work-arounds
2.9 Examples
2.9 Examples of usage
3 Details
3.1 Invalid/dangerous characters
3.2 Character references/entities
@ -73,9 +73,9 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern
== 1 About htmLawed ================================================
htmLawed is a highly customizable single-file PHP script to make text secure, and standard- and admin policy-compliant for use in the body of HTML 4, XHTML 1 or 1.1, or generic XML documents. It is thus a configurable input (X)HTML filter, processor, purifier, sanitizer, beautifier, etc., and an alternative to the HTMLTidy:- http://tidy.sourceforge.net application.
htmLawed is a PHP script to process text with HTML markup to make it more compliant with HTML standards and administrative policies. It works by making HTML well-formed with balanced and properly nested tags, neutralizing code that may be used for cross-site scripting (XSS) attacks, allowing only specified HTML tags and attributes, and so on. Such `lawing in` of HTML in text used in (X)HTML or XML documents ensures that it is in accordance with the aesthetics, safety and usability requirements set by administrators.
The `lawing in` of input text is needed to ensure that HTML code in the text is standard-compliant, does not introduce security vulnerabilities, and does not break the aesthetics, design or layout of web-pages. htmLawed tries to do this by, for example, making HTML well-formed with balanced and properly nested tags, neutralizing code that may be used for cross-site scripting ('XSS') attacks, and allowing only specified HTML elements/tags and attributes.
htmLawed is highly customizable, and fast with low memory usage. Its free and open-source code is in one small file, does not require extensions or libraries, and works in older versions of PHP as well. It is a good alternative to the HTML Tidy:- http://tidy.sourceforge.net application.
-- 1.1 Example uses ------------------------------------------------
@ -102,8 +102,8 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern
* *beautify* or *compact* HTML ^~`
* *restrict elements* ^~`
* proper closure of empty elements like 'img' ^`
* can *restrict elements* ^~`
* ensures proper closure of empty elements like 'img' ^`
* *transform deprecated elements* like 'u' ^~`
* HTML *comments* and 'CDATA' sections can be permitted ^~`
* elements like 'script', 'object' and 'form' can be permitted ~
@ -112,7 +112,7 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern
* remove *invalid attributes* ^`
* element and attribute names are *lower-cased* ^
* provide *required attributes*, like 'alt' for 'image' ^`
* *transform deprecated attributes* ^~`
* *transforms deprecated attributes* ^~`
* attributes *declared only once* ^`
* *restrict attribute values*, including *element-specifically* ^~`
@ -121,6 +121,7 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern
* ensure *unique* 'id' attribute values ^~`
* *double-quote* attribute values ^
* lower-case *standard attribute values* like 'password' ^`
* permit custom, non-standard attributes as well as custom rules for standard attributes ~`
* *attribute-specific URL protocol/scheme restriction* *~`
* disable *dynamic expressions* in 'style' values *~`
@ -131,7 +132,7 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern
* remove *null* characters *
* neutralize potentially dangerous proprietary Netscape *Javascript entities* *
* replace potentially dangerous *soft-hyphen* character in attribute values with spaces *
* replace potentially dangerous *soft-hyphen* character in URL-accepting attribute values with spaces *
* remove common *invalid characters* not allowed in HTML or XML ^`
* replace *characters from Microsoft applications* like 'Word' that are discouraged in HTML or XML ^~`
@ -163,33 +164,36 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern
-- 1.3 History ----------------------------------------------------o
htmLawed was developed for use with 'LabWiki', a wiki software developed at PHP Labware, as a suitable software could not be found. Existing PHP software like 'Kses' and 'HTMLPurifier' were deemed inadequate, slow, resource-intensive, or dependent on external applications like 'HTML Tidy'.
htmLawed was created in 2007 for use with 'LabWiki', a wiki software developed at PHP Labware, as a suitable software could not be found. Existing PHP software like 'Kses' and 'HTMLPurifier' were deemed inadequate, slow, resource-intensive, or dependent on an extension or external application like 'HTML Tidy'. The core logic of htmLawed, that of identifying HTML elements and attributes, was based on the 'Kses' (version 0.2.2) HTML filter software of Ulf Harnhammar (it can still be used with code that uses 'Kses'; see section:- #2.6.).
htmLawed started as a modification of Ulf Harnhammar's 'Kses' (version 0.2.2) software, and is compatible with code that uses 'Kses'; see section:- #2.6.
See section:- #4.3 for a detailed log of changes in htmLawed over the years, and section:- #4.10 for acknowledgements.
-- 1.4 License & copyright ----------------------------------------o
htmLawed is free and open-source software licensed under GPL license version 3:- http://www.gnu.org/licenses/gpl-3.0.txt, and copyrighted by Santosh Patnaik, MD, PhD.
htmLawed is free and open-source software dual copyrighted by Santosh Patnaik, MD, PhD, and licensed under LGPL license version 3:- http://www.gnu.org/licenses/lgpl-3.0.txt, and GPL license version 2:- http://www.gnu.org/licenses/gpl-2.0.txt (or later).
-- 1.5 Terms used here --------------------------------------------o
* `administrator` - or admin; person setting up the code to pass input through htmLawed; also, `user`
In this document, only HTML body-level elements are considered. htmLawed does not have support for head-level elements, 'body', and the frame-level elements, 'frameset', 'frame' and 'noframes', and these elements are ignored here.
* `administrator` - or admin; person setting up the code that utilizes htmLawed; also, `user`
* `attributes` - name-value pairs like 'href="http://x.com"' in opening tags
* `author` - `writer`
* `author` - see `writer`
* `character` - atomic unit of text; internally represented by a numeric `code-point` as specified by the `encoding` or `charset` in use
* `entity` - markup like '&gt;' and '&#160;' used to refer to a character
* `element` - HTML element like 'a' and 'img'
* `element content` - content between the opening and closing tags of an element, like 'click' of '<a href="x">click</a>'
* `element content` - content between the opening and closing tags of an element, like 'click' of the '<a href="x">click</a>' element
* `HTML` - implies XHTML unless specified otherwise
* `input` - text string given to htmLawed to process
* `HTML body` - Complete HTML documents typically have a `head` and a `body` container. Information in `head` specifies title of the document, etc., whereas that in the body informs what is to be displayed on a web-page; it is only the elements for `body`, except 'frames', 'frameset' and 'noframes' that htmLawed is concerned with
* `input` - text given to htmLawed to process
* `processing` - involves filtering, correction, etc., of input
* `safe` - absence or reduction of certain characters and HTML elements and attributes in the input that can otherwise potentially and circumstantially expose web-site users to security vulnerabilities like cross-site scripting attacks (XSS)
* `scheme` - URL protocol like 'http' and 'ftp'
* `specs` - standard specifications
* `safe` - absence or reduction of certain characters and HTML elements and attributes in HTML of text that can otherwise potentially, and circumstantially, expose text readers to security vulnerabilities like cross-site scripting attacks (XSS)
* `scheme` - a URL protocol like 'http' and 'ftp'
* `specifications` - standard specifications, for HTML4, HTML5, Ruby, etc.
* `style property` - terms like 'border' and 'height' for which declarations are made in values for the 'style' attribute of elements
* `tag` - markers like '<a href="x">' and '</a>' delineating element content; the opening tag can contain attributes
* `tag content` - consists of tag markers '<' and '>', element names like 'div', and possibly attributes
@ -197,12 +201,22 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern
* `writer` - end-user like a blog commenter providing the input that is to be processed; also, `author`
-- 1.6 Availability ------------------------------------------------o
htmLawed can be downloaded for free at its website:- http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed. Besides the 'htmLawed.php' file, the download has the htmLawed documentation (this document) in plain text:- htmLawed_README.txt and HTML:- htmLawed_README.htm formats, a script for testing:- htmLawedTest.php, and a text file for test-cases:- htmLawed_TESTCASE.txt. htmLawed is also available as a PHP class (OOP code) on its website.
== 2 Usage ========================================================oo
htmLawed should work with PHP 4.3 and higher. Either 'include()' the 'htmLawed.php' file or copy-paste the entire code.
htmLawed works in PHP version 4.4 or higher. Either 'include()' the 'htmLawed.php' file, or copy-paste the entire code. To use with PHP 4.3, have the following code included:
To easily *test* htmLawed using a form-based interface, use the provided demo:- htmLawedTest.php ('htmLawed.php' and 'htmLawedTest.php' should be in the same directory on the web-server).
if(!function_exists('ctype_digit')){
function ctype_digit($var){
return ((int) $var == $var);
}
}
-- 2.1 Simple ------------------------------------------------------
@ -212,15 +226,17 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern
$processed = htmLawed($text);
*Note*: If input is from a '$_GET' or '$_POST' value, and 'magic quotes' are enabled on the PHP setup, run 'stripslashes()' on the input before passing to htmLawed.
With the 'htmLawed class' (section:- #1.6), usage is:
$processed = htmLawed::hl($text);
*Notes*: (1) If input is from a '$_GET' or '$_POST' value, and 'magic quotes' are enabled on the PHP setup, run 'stripslashes()' on the input before passing to htmLawed. (2) htmLawed does not have support for head-level elements, 'body', and the frame-level elements, 'frameset', 'frame' and 'noframes'.
By default, htmLawed will process the text allowing all valid HTML elements/tags, secure URL scheme/CSS style properties, etc. It will allow 'CDATA' sections and HTML comments, balance tags, and ensure proper nesting of elements. Such actions can be configured using two other optional arguments -- '$config' and '$spec':
$processed = htmLawed($text, $config, $spec);
These extra parameters are detailed below. Some examples are shown in section:- #2.9.
*Note*: For maximum protection against 'XSS' and other scripting attacks (e.g., by disallowing Javascript code), consider using the 'safe' parameter; see section:- #3.6.
The '$config' and '$spec' arguments are detailed below. Some examples are shown in section:- #2.9. For maximum protection against 'XSS' and other scripting attacks (e.g., by disallowing Javascript code), consider using the 'safe' parameter; see section:- #3.6.
-- 2.2 Configuring htmLawed using the '$config' parameter ---------o
@ -253,13 +269,13 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern
Anti-link-spam measure; see section:- #3.4.7
'0' - no measure taken *
'array("regex1", "regex2")' - will ensure a 'rel' attribute with 'nofollow' in its value in case the 'href' attribute value matches the regular expression pattern 'regex1', and/or will remove 'href' if its value matches the regular expression pattern 'regex2'. E.g., 'array("/./", "/://\W*(?!(abc\.com|xyz\.org))/")'; see section:- #3.4.7 for more.
`array("regex1", "regex2")` - will ensure a 'rel' attribute with 'nofollow' in its value in case the 'href' attribute value matches the regular expression pattern 'regex1', and/or will remove 'href' if its value matches the regular expression pattern 'regex2'. E.g., 'array("/./", "/://\W*(?!(abc\.com|xyz\.org))/")'; see section:- #3.4.7 for more.
*anti_mail_spam*
Anti-mail-spam measure; see section:- #3.4.7
'0' - no measure taken *
'word' - '@' in mail address in 'href' attribute value is replaced with specified 'word'
`word` - '@' in mail address in 'href' attribute value is replaced with specified `word`
*balance*
Balance tags for well-formedness and proper nesting; see section:- #3.3.3
@ -303,9 +319,15 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern
Denied HTML attributes; see section:- #3.4
'0' - none *
'string' - dictated by values in 'string'
`string` - dictated by values in `string`
'on*' (like 'onfocus') attributes not allowed - "
*direct_nest_list*
Allow direct nesting of a list within another without requiring it to be a list item; see section:- #3.3.4
'0' - no *
'1' - yes
*elements*
Allowed HTML elements; see section:- #3.3
@ -323,13 +345,13 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern
Name of an optional hook function to alter the input string, '$config' or '$spec' before htmLawed starts its main work; see section:- #3.7
'0' - no hook function *
'name' - 'name' is name of the hook function ('kses_hook' ^)
`name` - `name` is name of the hook function ('kses_hook' ^)
*hook_tag*
Name of an optional hook function to alter tag content finalized by htmLawed; see section:- #3.4.9
'0' - no hook function *
'name' - 'name' is name of the hook function
`name` - `name` is name of the hook function
*keep_bad*
Neutralize bad tags by converting '<' and '>' to entities, or remove them; see section:- #3.3.3
@ -376,11 +398,11 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern
'1' - will auto-adjust other relevant '$config' parameters (indicated by '"' in this list)
*schemes*
Array of attribute-specific, comma-separated, lower-cased list of schemes (protocols) allowed in attributes accepting URLs; '*' covers all unspecified attributes; see section:- #3.4.3
Array of attribute-specific, comma-separated, lower-cased list of schemes (protocols) allowed in attributes accepting URLs (or '!' to `deny` any URL); '*' covers all unspecified attributes; see section:- #3.4.3
'href: aim, feed, file, ftp, gopher, http, https, irc, mailto, news, nntp, sftp, ssh, telnet; *:file, http, https' *
'*: ftp, gopher, http, https, mailto, news, nntp, telnet' ^
'href: aim, feed, file, ftp, gopher, http, https, irc, mailto, news, nntp, sftp, ssh, telnet; style: nil; *:file, http, https' "
'href: aim, feed, file, ftp, gopher, http, https, irc, mailto, news, nntp, sftp, ssh, telnet; style: !; *:file, http, https' "
*show_setting*
Name of a PHP variable to assign the `finalized` '$config' and '$spec' values; see section:- #3.8
@ -403,7 +425,7 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern
'0' - no ^
'1' - remove duplicate and/or invalid ones *
'word' - remove invalid ones and replace duplicate ones with new and unique ones based on the 'word'; the admin-specified 'word', like 'my_', should begin with a letter (a-z) and can contain letters, digits, '.', '_', '-', and ':'.
`word` - remove invalid ones and replace duplicate ones with new and unique ones based on the `word`; the admin-specified `word`, like 'my_', should begin with a letter (a-z) and can contain letters, digits, '.', '_', '-', and ':'.
*valid_xhtml*
Magic parameter to make input the most valid XHTML without needing to specify other relevant '$config' parameters; see section:- #3.5
@ -422,7 +444,7 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern
-- 2.3 Extra HTML specifications using the $spec parameter --------o
The '$spec' argument can be used to disallow an otherwise legal attribute for an element, or to restrict the attribute's values. This can also be helpful as a security measure (e.g., in certain versions of browsers, certain values can cause buffer overflows and denial of service attacks), or in enforcing admin policy compliance. '$spec' is specified as a string of text containing one or more `rules`, with multiple rules separated from each other by a semi-colon (';'). E.g.,
The '$spec' argument of htmLawed can be used to disallow an otherwise legal attribute for an element, or to restrict the attribute's values. This can also be helpful as a security measure (e.g., in certain versions of browsers, certain values can cause buffer overflows and denial of service attacks), or in enforcing admin policies. '$spec' is specified as a string of text containing one or more `rules`, with multiple rules separated from each other by a semi-colon (';'). E.g.,
$spec = 'i=-*; td, tr=style, id, -*; a=id(match="/[a-z][a-z\d.:\-`"]*/i"/minval=2), href(maxlen=100/minlen=34); img=-width,-alt';
$processed = htmLawed($text, $config, $spec);
@ -445,7 +467,7 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern
* 'a=-*, href, title' - none except 'href' and 'title'
* 'a=-*, -id, href, title' - none except 'href' and 'title'
Rules regarding *attribute values* are optionally specified inside round brackets after attribute names in slash ('/')-separated `parameter = value` pairs. E.g., 'title(maxlen=30/minlen=5)'. None, or one or more of the following parameters may be specified:
Rules regarding *attribute values* are optionally specified inside round brackets after attribute names in slash ('/')-separated `parameter = value` pairs. E.g., 'title(maxlen=30/minlen=5)'. None or one or more of the following parameters may be specified:
* 'oneof' - one or more choices separated by '|' that the value should match; if only one choice is provided, then the value must match that choice
@ -469,7 +491,7 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern
`Rule`: 'input=title(), value(maxval=8/default=6)'
`Output`: '<input title="WIDTH" value="6" /><input title="length" value="5" />'
`Rule`: 'input=title(nomatch=$w.d$i), value(match=$em$/default=6em)'
`Rule`: 'input=title(nomatch=%w.d%i), value(match=%em%/default=6em)'
`Output`: '<input value="10em" /><input title="length" value="6em" />'
`Rule`: 'input=title(oneof=height|depth/default=depth), value(noneof=5|6)'
@ -477,13 +499,19 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern
*Special characters*: The characters ';', ',', '/', '(', ')', '|', '~' and space have special meanings in the rules. Words in the rules that use such characters, or the characters themselves, should be `escaped` by enclosing in pairs of double-quotes ('"'). A back-tick ('`') can be used to escape a literal '"'. An example rule illustrating this is 'input=value(maxlen=30/match="/^\w/"/default="your `"ID`"")'.
*Note*: To deny an attribute for all elements for which it is legal, '$config["deny_attribute"]' (see section:- #3.4) can be used instead of '$spec'. Also, attributes can be allowed element-specifically through '$spec' while being denied globally through '$config["deny_attribute"]'. The 'hook_tag' parameter (section:- #3.4.9) can also be used to implement the '$spec' functionality.
*Note*: To deny an attribute for all elements for which it is legal, '$config["deny_attribute"]' (see section:- #3.4) can be used instead of '$spec'. Also, attributes can be allowed element-specifically through '$spec' while being denied globally through '$config["deny_attribute"]'. The 'hook_tag' parameter (section:- #3.4.9) can also be possibly used to implement a functionality like that achieved using '$spec' functionality.
'$spec' can also be used to permit custom, non-standard attributes as well as custom rules for standard attributes. Thus, the following value of '$spec' will permit the custom uses of the standard 'rel' attribute in 'input' (not permitted as per standards) and of a non-standard attribute, 'vFlag', in 'img'.
$spec = 'img=vFlag; input=rel'
The attribute names can contain alphabets, colons (:) and hyphens (-), but they must start with an alphabet.
-- 2.4 Performance time & memory usage ----------------------------o
The time and memory used by htmLawed depends on its configuration and the size of the input, and the amount, nestedness and well-formedness of the HTML markup within it. In particular, tag balancing and beautification each can increase the processing time by about a quarter.
The time and memory consumed during text processing by htmLawed depends on its configuration, the size of the input, and the amount, nestedness and well-formedness of the HTML markup within the input. In particular, tag balancing and beautification each can increase the processing time by about a quarter.
The htmLawed demo:- htmLawedTest.php can be used to evaluate the performance and effects of different types of input and '$config'.
@ -491,17 +519,21 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern
-- 2.5 Some security risks to keep in mind ------------------------o
When setting the parameters/arguments (like those to allow certain HTML elements) for use with htmLawed, one should bear in mind that the setting may let through potentially `dangerous` HTML code. (This may not be a problem if the authors are trusted.)
For example, following increase security risks:
When setting the parameters/arguments (like those to allow certain HTML elements) for use with htmLawed, one should bear in mind that the setting may let through potentially `dangerous` HTML code which is meant to steal user-data, deface a website, render a page non-functional, etc. Unless end-users, either people or software, supplying the content are completely trusted, security issues arising from the degree of HTML usage permitted through htmLawed's setting should be considered. For example, following increase security risks:
* Allowing 'script', 'applet', 'embed', 'iframe' or 'object' elements, or certain of their attributes like 'allowscriptaccess'
* Allowing HTML comments (some Internet Explorer versions are vulnerable with, e.g., '<!--[if gte IE 4]><script>alert("xss");</script><![endif]-->'
* Allowing dynamic CSS expressions (a feature of the IE browser)
* Allowing dynamic CSS expressions (some Internet Explorer versions are vulnerable)
`Unsafe` HTML can be removed by setting '$config' appropriately. E.g., '$config["elements"] = "* -script"' (section:- #3.3), '$config["safe"] = 1' (section:- #3.6), etc.
* Allowing the 'style' attribute
To remove `unsecure` HTML, code-developers using htmLawed must set '$config' appropriately. E.g., '$config["elements"] = "* -script"' to deny the 'script' element (section:- #3.3), '$config["safe"] = 1' to auto-configure ceratin htmLawed parameters for maximizing security (section:- #3.6), etc.
Permitting the '*style*' attribute brings in risks of `click-jacking`, `phishing`, web-page overlays, etc., `even` when the 'safe' parameter is enabled (see section:- #3.6). Except for URLs and a few other things like CSS dynamic expressions, htmLawed currently does not check every CSS style property. It does provide ways for the code-developer implementing htmLawed to do such checks through htmLawed's '$spec' argument, and through the 'hook_tag' parameter (see section:- #3.4.8 for more). Disallowing 'style' completely and relying on CSS classes and stylesheet files is recommended.
htmLawed does not check or correct the character *encoding* of the input it receives. In conjunction with permissive circumstances, such as when the character encoding is left undefined through HTTP headers or HTML 'meta' tags, this can allow for an exploit (like Google's `UTF-7/XSS` vulnerability of the past).
-- 2.6 Use without modifying old 'kses()' code --------------------o
@ -538,7 +570,7 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern
-- 2.7 Tolerance for ill-written HTML -----------------------------o
htmLawed can work with ill-written HTML code in the input. However, HTML that is too ill-written may not be `read` as HTML, and be considered mere plain text instead. Following statements indicate the degree of `looseness` that htmLawed can work with, and can be provided in instructions to writers:
htmLawed can work with ill-written HTML code in the input. However, HTML that is too ill-written may not be `read` as HTML, and may therefore get identified as mere plain text. Following statements indicate the degree of `looseness` that htmLawed can work with, and can be provided in instructions to writers:
* Tags must be flanked by '<' and '>' with no '>' inside -- any needed '>' should be put in as '&gt;'. It is possible for tag content (element name and attributes) to be spread over many lines instead of being on one. A space may be present between the tag content and '>', like '<div >' and '<img / >', but not after the '<'.
@ -546,13 +578,13 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern
* Attribute string of elements may be liberally spaced with tabs, line-breaks, etc.
* Attribute values may not be double-quoted, or may be single-quoted.
* Attribute values may be single- and not double-quoted.
* Left-padding of numeric entities (like, '&#0160;', '&x07ff;') with '0' is okay as long as the number of characters between between the '&' and the ';' does not exceed 8. All entities must end with ';' though.
* Named character entities must be properly cased. E.g., '&Lt;' or '&TILDE;' will not be let through without modification.
* Named character entities must be properly cased. Thus, '&Lt;' or '&TILDE;' will not be recognized as entities and will be `neutralized`.
* HTML comments should not be inside element tags (okay between tags), and should begin with '<!--' and end with '-->'. Characters like '<', '>', and '&' may be allowed inside depending on '$config', but any '-->' inside should be put in as '--&gt;'. Any '--' inside will be automatically converted to '-', and a space will be added before the comment delimiter '-->'.
* HTML comments should not be inside element tags (they can be between tags), and should begin with '<!--' and end with '-->'. Characters like '<', '>', and '&' may be allowed inside depending on '$config', but any '-->' inside should be put in as '--&gt;'. Any '--' inside will be automatically converted to '-', and a space will be added before the comment delimiter '-->'.
* 'CDATA' sections should not be inside element tags, and can be in element content only if plain text is allowed for that element. They should begin with '<[CDATA[' and end with ']]>'. Characters like '<', '>', and '&' may be allowed inside depending on '$config', but any ']]>' inside should be put in as ']]&gt;'.
@ -567,21 +599,21 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern
* With '$config["unique_ids"]' not '0' and the 'id' attribute being permitted, writers should carefully avoid using duplicate or invalid 'id' values as even though htmLawed will correct/remove the values, the final output may not be the one desired. E.g., when '<a id="home"></a><input id="home" /><label for="home"></label>' is processed into
'<a id="home"></a><input id="prefix_home" /><label for="home"></label>'.
* Note that even if intended HTML is lost in a highly ill-written input, the processed output will be more secure and standard-compliant.
* Even if intended HTML is lost from an ill-written input, the processed output will be more secure and standard-compliant.
* For URLs, unless '$config["scheme"]' is appropriately set, writers should avoid using escape characters or entities in schemes. E.g., 'htt&#112;' (which many browsers will read as the harmless 'http') may be considered bad by htmLawed.
* htmLawed will attempt to put plain text present directly inside 'blockquote', 'form', 'map' and 'noscript' elements (illegal as per the specs) inside auto-generated 'div' elements.
* htmLawed will attempt to put plain text present directly inside 'blockquote', 'form', 'map' and 'noscript' elements (illegal as per the specifications) inside auto-generated 'div' elements.
-- 2.8 Limitations & work-arounds ---------------------------------o
htmLawed's main objective is to make the input text `more` standard-compliant, secure for web-page readers, and free of HTML elements and attributes considered undesirable by the administrator. Some of its current limitations, regardless of this objective, are noted below along with work-arounds.
htmLawed's main objective is to make the input text `more` standard-compliant, secure for readers, and free of HTML elements and attributes considered undesirable by the administrator. Some of its current limitations, regardless of this objective, are noted below along with work-arounds.
It should be borne in mind that no browser application is 100% standard-compliant, and that some of the standard specs (like asking for normalization of white-spacing within 'textarea' elements) are clearly wrong. Regarding security, note that `unsafe` HTML code is not necessarily legally invalid.
It should be borne in mind that no browser application is 100% standard-compliant, and that some of the standard specifications (like asking for normalization of white-spacing within 'textarea' elements) are clearly wrong. Regarding security, note that `unsafe` HTML code is not legally invalid per se.
* htmLawed is meant for input that goes into the 'body' of HTML documents. HTML's head-level elements are not supported, nor are the frameset elements 'frameset', 'frame' and 'noframes'.
* htmLawed is meant for input that goes into the 'body' of HTML documents. HTML's head-level elements are not supported, nor are the frameset elements 'frameset', 'frame' and 'noframes'. Content of the latter elements can, however, be individually filtered through htmLawed.
* It cannot transform the non-standard 'embed' elements to the standard-compliant 'object' elements. Yet, it can allow 'embed' elements if permitted ('embed' is widely used and supported). Admins can certainly use the 'hook_tag' parameter (section:- #3.4.9) to deploy a custom embed-to-object converter function.
@ -591,7 +623,7 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern
* By default, htmLawed won't check many attribute values for standard compliance. E.g., 'width="20m"' with the dimension in non-standard 'm' is let through. Implementing universal and strict attribute value checks can make htmLawed slow and resource-intensive. Admins should look at the 'hook_tag' parameter (section:- #3.4.9) or '$spec' to enforce finer checks.
* The attributes, deprecated (which can be transformed too) or not, that it supports are largely those that are in the specs. Only a few of the proprietary attributes are supported.
* The attributes, deprecated (which can be transformed too) or not, that it supports are largely those that are in the specifications. Only a few of the proprietary attributes are supported.
* Except for contained URLs and dynamic expressions (also optional), htmLawed does not check CSS style property values. Admins should look at using the 'hook_tag' parameter (section:- #3.4.9) or '$spec' for finer checks. Perhaps the best option is to disallow 'style' but allow 'class' attributes with the right 'oneof' or 'match' values for 'class', and have the various class style properties in '.css' CSS stylesheet files.
@ -603,11 +635,11 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern
* Except for optionally converting absolute or relative URLs to the other type, htmLawed will not alter URLs (e.g., to change the value of query strings or to convert 'http' to 'https'. Having absolute URLs may be a standard-requirement, e.g., when HTML is embedded in email messages, whereas altering URLs for other purposes is beyond htmLawed's goals. Admins may be able to use a custom hook function to enforce such checks ('hook_tag' parameter; see section:- #3.4.9).
* Pairs of opening and closing tags that do not enclose any content (like '<em></em>') are not removed. This may be against the standard specs for certain elements (e.g., 'table'). However, presence of such standard-incompliant code will not break the display or layout of content. Admins can also use simple regex-based code to filter out such code.
* Pairs of opening and closing tags that do not enclose any content (like '<em></em>') are not removed. This may be against the standard specifications for certain elements (e.g., 'table'). However, presence of such standard-incompliant code will not break the display or layout of content. Admins can also use simple regex-based code to filter out such code.
* htmLawed does not check for certain element orderings described in the standard specs (e.g., in a 'table', 'tbody' is allowed before 'tfoot'). Admins may be able to use a custom hook function to enforce such checks ('hook_tag' parameter; see section:- #3.4.9).
* htmLawed does not check for certain element orderings described in the standard specifications (e.g., in a 'table', 'tbody' is allowed before 'tfoot'). Admins may be able to use a custom hook function to enforce such checks ('hook_tag' parameter; see section:- #3.4.9).
* htmLawed does not check the number of nested elements. E.g., it will allow two 'caption' elements in a 'table' element, illegal as per the specs. Admins may be able to use a custom hook function to enforce such checks ('hook_tag' parameter; see section:- #3.4.9).
* htmLawed does not check the number of nested elements. E.g., it will allow two 'caption' elements in a 'table' element, illegal as per the specifications. Admins may be able to use a custom hook function to enforce such checks ('hook_tag' parameter; see section:- #3.4.9).
* htmLawed might convert certain entities to actual characters and remove backslashes and CSS comment-markers ('/*') in 'style' attribute values in order to detect malicious HTML like crafted IE-specific dynamic expressions like '&#101;xpression...'. If this is too harsh, admins can allow CSS expressions through htmLawed core but then use a custom function through the 'hook_tag' parameter (section:- #3.4.9) to more specifically identify CSS expressions in the 'style' attribute values. Also, using '$config["style_pass"]', it is possible to have htmLawed pass 'style' attribute values without even looking at them (section:- #3.4.8).
@ -615,13 +647,69 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern
* Because of poor Unicode support in PHP, htmLawed does not remove the `high value` HTML-invalid characters with multi-byte code-points. Such characters however are extremely unlikely to be in the input. (see section:- #3.1).
* htmLawed does not check or correct the character encoding of the input it receives. In conjunction with permitting circumstances such as when the character encoding is left undefined through HTTP headers or HTML 'meta' tags, this can permit an exploit (like Google's `UTF-7/XSS` vulnerability of the past). Also, htmLawed can mangle input text if it is not well-formed in terms of character encoding. Administrators can consider using code available elsewhere to check well-formedness of input text characters to correct any defect.
* htmLawed is expected to work with input texts in ASCII-compatible single byte encodings such as national variants of ASCII (like ISO-646-DE/German of the ISO 646 standard), extended ASCII variants (like ISO 8859-10/Turkish of the ISO 8859/ISO Latin standard), ISO 8859-based Windows variants (like Windows 1252), EBCDIC, Shift JIS (Japanese), GB-Roman (Chinese), and KS-Roman (Korean). It should also properly handle texts with variable byte encodings like UTF-7 (Unicode) and UTF-8 (Unicode). However, htmLawed may mangle input texts with double byte encodings like UTF-16 (Unicode), JIS X 0208:1997 (Japanese) and K SX 1001:1992 (Korean), or the UTF-32 (Unicode) quadruple byte encoding. If an input text has such an encoding, administrators can use PHP's iconv:- http://php.net/manual/en/book.iconv.php functions, or some other mean, to convert text to UTF-8 before passing it to htmLawed.
* Like any script using PHP's PCRE regex functions, PHP setup-specific low PCRE limit values can cause htmLawed to at least partially fail with very long input texts.
-- 2.9 Examples ---------------------------------------------------o
-- 2.9 Examples of usage -------------------------------------------o
*1.* A blog administrator wants to allow only 'a', 'em', 'strike', 'strong' and 'u' in comments, but needs 'strike' and 'u' transformed to 'span' for better XHTML 1-strict compliance, and, he wants the 'a' links to be to 'http' or 'https' resources:
Safest, allowing only `safe` HTML markup --
$config = array('safe'=>1);
$out = htmLawed($in);
Simplest, allowing all valid HTML markup except 'javascript:' --
$out = htmLawed($in);
Allowing all valid HTML markup including 'javascript:' --
$config = array('schemes'=>'*:*');
$out = htmLawed($in, $config);
Allowing only 'safe' HTML and the elements 'a', 'em', and 'strong' --
$config = array('safe'=>1, 'elements'=>'a, em, strong');
$out = htmLawed($in, $config);
Not allowing elements 'script' and 'object' --
$config = array('elements'=>'* -script -object');
$out = htmLawed($in, $config);
Not allowing attributes 'id' and 'style' --
$config = array('deny_attribute'=>'id, style');
$out = htmLawed($in, $config);
Permitting only attributes 'title' and 'href' --
$config = array('deny_attribute'=>'* -title -href');
$out = htmLawed($in, $config);
Remove bad/disallowed tags altogether instead of converting them to entities --
$config = array('keep_bad'=>0);
$out = htmLawed($in, $config);
Allowing attribute 'title' only in 'a' and not allowing attributes 'id', 'style', or scriptable `on*` attributes like 'onclick' --
$config = array('deny_attribute'=>'title, id, style, on*');
$spec = 'a=title';
$out = htmLawed($in, $config, $spec);
Allowing a custom attribute, 'vFlag', in 'img' and permitting custom use of the standard attribute, 'rel', in 'input' --
$spec = 'img=vFlag; input=rel';
$out = htmLawed($in, $config, $spec);
Some case-studies are presented below.
*1.* A blog administrator wants to allow only 'a', 'em', 'strike', 'strong' and 'u' in comments, but needs 'strike' and 'u' transformed to 'span' for better XHTML 1-strict compliance, and, he wants the 'a' links to point only to 'http' or 'https' resources:
$processed = htmLawed($in, array('elements'=>'a, em, strike, strong, u', 'make_tag_strict'=>1, 'safe'=>1, 'schemes'=>'*:http, https'), 'a=href');
@ -656,13 +744,13 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern
The character values are replaced with entities/characters and not character values referred to by the entities/characters to keep this task independent of the character-encoding of input text.
The '$config["clean_ms_char"]' parameter need not be used if authors do not copy-paste Microsoft-created text or if the input text is not believed to use the 'Windows 1252' or a similar encoding. Further, the input form and the web-pages displaying it or its content should have the character encoding appropriately marked-up.
The '$config["clean_ms_char"]' parameter should not be used if authors do not copy-paste Microsoft-created text, or if the input text is not believed to use the 'Windows 1252' ('Cp-1252') or a similar encoding like 'Cp-1251' (otherwise, for example when UTF-8 encoding is in use, Japanese or Korean characters can get mangled). Further, the input form and the web-pages displaying it or its content should have the character encoding appropriately marked-up.
-- 3.2 Character references/entities ------------------------------o
Valid character entities take the form '&*;' where '*' is '#x' followed by a hexadecimal number (hexadecimal numeric entity; like '&#xA0;' for non-breaking space), or alphanumeric like 'gt' (external or named entity; like '&nbsp;' for non-breaking space), or '#' followed by a number (decimal numeric entity; like '&#160;' for non-breaking space). Character entities referring to the soft-hyphen character (the '&shy;' or '\xad' character; hexadecimal code-point 'ad' [decimal '173']) in attribute values are always replaced with spaces; soft-hyphens in attribute values introduce vulnerabilities in some older versions of the Opera and Mozilla [Firefox] browsers.
Valid character entities take the form '&*;' where '*' is '#x' followed by a hexadecimal number (hexadecimal numeric entity; like '&#xA0;' for non-breaking space), or alphanumeric like 'gt' (external or named entity; like '&nbsp;' for non-breaking space), or '#' followed by a number (decimal numeric entity; like '&#160;' for non-breaking space). Character entities referring to the soft-hyphen character (the '&shy;' or '\xad' character; hexadecimal code-point 'ad' [decimal '173']) in URL-accepting attribute values are always replaced with spaces; soft-hyphens in attribute values introduce vulnerabilities in some older versions of the Opera and Mozilla [Firefox] browsers.
htmLawed (function 'hl_ent()'):
@ -861,7 +949,7 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern
An option like '1' is useful, e.g., when a writer previews his submission, whereas one like '3' is useful before content is finalized and made available to all.
*Note:* In the example above, unlike '<*>', '<xml>' gets considered as a tag (even though there is no HTML element named 'xml'). In general, text matching the regular expression pattern '<(/?)([a-zA-Z][a-zA-Z1-6]*)([^>]*?)\s?>' is considered a tag (phrase enclosed by the angled brackets '<' and '>', and starting [with an optional slash preceding] with an alphanumeric word that starts with an alphabet...).
*Note:* In the example above, unlike '<*>', '<xml>' gets considered as a tag (even though there is no HTML element named 'xml'). Thus, the 'keep_bad' parameter's value affects '<xml>' but not '<*>'. In general, text matching the regular expression pattern '<(/?)([a-zA-Z][a-zA-Z1-6]*)([^>]*?)\s?>' is considered a tag (phrase enclosed by the angled brackets '<' and '>', and starting [with an optional slash preceding] with an alphanumeric word that starts with an alphabet...), and is subjected to the 'keep_bad' value.
Nesting/content rules for each of the 86 elements in htmLawed's default set (see section:- #3.3) are defined in function 'hl_bal()'. This means that if a non-standard element besides 'embed' is being permitted through '$config["elements"]', the element's tag content will end up getting removed if '$config["balance"]' is set to '1'.
@ -879,6 +967,8 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern
htmLawed currently does not check for conformance to these rules. Note that any non-compliance in this regard will not introduce security vulnerabilities, crash browser applications, or affect the rendering of web-pages.
With '$config["direct_list_nest"]' set to '1', htmLawed will allow direct nesting of an 'ol' or 'ul' list within another 'ol' or 'ul' without requiring the child list to be within an 'li' of the parent list. While this is not standard-compliant, directly nested lists are rendered properly by almost all browsers. The parameter '$config["direct_list_nest"]' has no effect if tag-balancing (section:- #3.3.3) is turned off.
-- 3.3.5 Beautify or compact HTML ---------------------------------o
@ -901,7 +991,7 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern
-- 3.4 Attributes ------------------------------------------------oo
htmLawed will only permit attributes described in the HTML specs (including deprecated ones). It also permits some attributes for use with the 'embed' element (the non-standard 'embed' element is supported in htmLawed because of its widespread use), and the the 'xml:space' attribute (valid only in XHTML 1.1). A list of such 111 attributes and the elements they are allowed in is in section:- #5.2.
htmLawed will only permit attributes described in the HTML specs (including deprecated ones). It also permits some attributes for use with the 'embed' element (the non-standard 'embed' element is supported in htmLawed because of its widespread use), and the the 'xml:space' attribute (valid only in XHTML 1.1). A list of such 111 attributes and the elements they are allowed in is in section:- #5.2. Using the '$spec' argument, htmLawed can be forced to permit custom, non-standard attributes as well as custom rules for standard attributes (section:- #2.3).
When '$config["deny_attribute"]' is not set, or set to '0', or empty ('""'), all the 111 attributes are permitted. Otherwise, '$config["deny_attribute"]' can be set as a list of comma-separated names of the denied attributes. 'on*' can be used to refer to the group of potentially dangerous, script-accepting attributes: 'onblur', 'onchange', 'onclick', 'ondblclick', 'onfocus', 'onkeydown', 'onkeypress', 'onkeyup', 'onmousedown', 'onmousemove', 'onmouseout', 'onmouseover', 'onmouseup', 'onreset', 'onselect' and 'onsubmit'.
@ -974,6 +1064,8 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern
As a side-note, one may find 'style: *' useful as URLs in 'style' attributes can be specified in a variety of ways, and the patterns that htmLawed uses to identify URLs may mistakenly identify non-URL text.
'!' can be put in the list of schemes to disallow all protocols as well as `local` URLs. Thus, with 'href: http, style: !', '<a href="http://cnn.com" style="background-image: url('local.jpg');">CNN</a>' will become '<a href="http://cnn.com" style="background-image: url('denied:local.jpg');">CNN</a>'.
*Note*: If URL-accepting attributes other than those listed above are being allowed, then the scheme will not be checked unless the attribute name contains the string 'src' (e.g., 'dynsrc') or starts with 'o' (e.g., 'onbeforecopy').
With '$config["safe"] = 1', all URLs are disallowed in the 'style' attribute values.
@ -1102,7 +1194,7 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern
-- 3.4.8 Inline style properties ----------------------------------o
htmLawed can check URL schemes and dynamic expressions (to guard against Javascript, etc., script-based insecurities) in inline CSS style property values in the 'style' attributes. (CSS properties like 'background-image' that accept URLs in their values are noted in section:- #5.3.) Dynamic CSS expressions that allow scripting in the IE browser, and can be a vulnerability, can be removed from property values by setting '$config["css_expression"]' to '1' (default setting).
htmLawed can check URL schemes and dynamic expressions (to guard against Javascript, etc., script-based insecurities) in inline CSS style property values in the 'style' attributes. (CSS properties like 'background-image' that accept URLs in their values are noted in section:- #5.3.) Dynamic CSS expressions that allow scripting in the IE browser, and can be a vulnerability, can be removed from property values by setting '$config["css_expression"]' to '1' (default setting). Note that when '$config["css_expression"]' is set to '1', htmLawed will remove '/*' from the 'style' values.
*Note*: Because of the various ways of representing characters in attribute values (URL-escapement, entitification, etc.), htmLawed might alter the values of the 'style' attribute values, and may even falsely identify dynamic CSS expressions and URL schemes in them. If this is an important issue, checking of URLs and dynamic expressions can be turned off ('$config["schemes"] = "...style:*..."', see section:- #3.4.3, and '$config["css_expression"] = 0'). Alternately, admins can use their own custom function for finer handling of 'style' values through the 'hook_tag' parameter (see section:- #3.4.9).
@ -1116,13 +1208,21 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern
It is possible to utilize a custom hook function to alter the tag content htmLawed has finalized (i.e., after it has checked/corrected for required attributes, transformed attributes, lower-cased attribute names, etc.).
When '$config' parameter 'hook_tag' is set to the name of a function, htmLawed (function 'hl_tag()') will pass on the element name, and the `finalized` attribute name-value pairs as array elements to the function. The function is expected to return the full opening tag string like '<element_name attribute_1_name="attribute_1_value"...>' (for empty elements like 'img' and 'input', the element-closing slash '/' should also be included).
When '$config' parameter 'hook_tag' is set to the name of a function, htmLawed (function 'hl_tag()') will pass on the element name, and, in the case of an opening tag, the `finalized` attribute name-value pairs as array elements to the function. The function, after completing a task such as filtering or tag transformation, will typically return an empty string, the full opening tag string like '<element_name attribute_1_name="attribute_1_value"...>' (for empty elements like 'img' and 'input', the element-closing slash '/' should also be included), etc.
Any 'hook_tag' function, since htmLawed version 1.1.11, also receives names of elements in closing tags, such as 'a' in the closing '</a>' tag of the element '<a href="http://cnn.com">CNN</a>'. Unlike for opening tags, no other value (i.e., the attribute name-value array) is passed to the function since a closing tag contains only element names. Typically, the function will return an empty string or a full closing tag (like '</a>').
This is a *powerful functionality* that can be exploited for various objectives: consolidate-and-convert inline 'style' attributes to 'class', convert 'embed' elements to 'object', permit only one 'caption' element in a 'table' element, disallow embedding of certain types of media, *inject HTML*, use CSSTidy:- http://csstidy.sourceforge.net to sanitize 'style' attribute values, etc.
As an example, the custom hook code below can be used to force a series of specifically ordered 'id' attributes on all elements, and a specific 'param' element inside all 'object' elements:
function my_tag_function($element, $attribute_array){
function my_tag_function($element, $attribute_array=0){
// If second argument is not received, it means a closing tag is being handled
if(is_numeric($attribute_array)){
return "</$element>";
}
static $id = 0;
// Remove any duplicate element
if($element == 'param' && isset($attribute_array['allowscriptaccess'])){
@ -1145,6 +1245,9 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern
foreach($attribute_array as $k=>$v){
$string .= " {$k}=\"{$v}\"";
}
static $empty_elements = array('area'=>1, 'br'=>1, 'col'=>1, 'embed'=>1, 'hr'=>1, 'img'=>1, 'input'=>1, 'isindex'=>1, 'param'=>1);
return "<{$element}{$string}". (isset($in_array($element, $empty_elements) ? ' /' : ''). '>'. $new_element;
}
@ -1166,12 +1269,14 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern
htmLawed allows an admin to use '$config["safe"]' to auto-adjust multiple '$config' parameters (such as 'elements' which declares the allowed element-set), which otherwise would have to be manually set. The relevant parameters are indicated by '"' in section:- #2.2). Thus, one can pass the '$config' argument with a simpler value.
With the value of '1', htmLawed considers 'CDATA' sections and HTML comments as plain text, and prohibits the 'applet', 'embed', 'iframe', 'object' and 'script' elements, and the 'on*' attributes like 'onclick'. ( There are '$config' parameters like 'css_expression' that are not affected by the value set for 'safe' but whose default values still contribute towards a more `safe` output.) Further, URLs with schemes (see section:- #3.4.3) are neutralized so that, e.g., 'style="moz-binding:url(http://danger)"' becomes 'style="moz-binding:url(denied:http://danger)"' while 'style="moz-binding:url(ok)"' remains intact.
With the value of '1', htmLawed considers 'CDATA' sections and HTML comments as plain text, and prohibits the 'applet', 'embed', 'iframe', 'object' and 'script' elements, and the 'on*' attributes like 'onclick'. ( There are '$config' parameters like 'css_expression' that are not affected by the value set for 'safe' but whose default values still contribute towards a more `safe` output.) Further, URLs with schemes (see section:- #3.4.3) are neutralized so that, e.g., 'style="moz-binding:url(http://danger)"' becomes 'style="moz-binding:url(denied:http://danger)"'.
Admins, however, may still want to completely deny the 'style' attribute, e.g., with code like
$processed = htmLawed($text, array('safe'=>1, 'deny_attribute'=>'style'));
Permitting the 'style' attribute brings in risks of `click-jacking`, etc. CSS property values can render a page non-functional or be used to deface it. Except for URLs, dynamic expressions, and some other things, htmLawed does not completely check 'style' values. It does provide ways for the code-developer implementing htmLawed to do such checks through the '$spec' argument, and through the 'hook_tag' parameter (see section:- #3.4.8 for more). Disallowing style completely and relying on CSS classes and stylesheet files is recommended.
If a value for a parameter auto-set through 'safe' is still manually provided, then that value can over-ride the auto-set value. E.g., with '$config["safe"] = 1' and '$config["elements"] = "*+script"', 'script', but not 'applet', is allowed.
A page illustrating the efficacy of htmLawed's anti-XSS abilities with 'safe' set to '1' against XSS vectors listed by RSnake:- http://ha.ckers.org/xss.html may be available here:- http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed/rsnake/RSnakeXSSTest.htm.
@ -1200,7 +1305,7 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern
-- 3.9 Retaining non-HTML tags in input with mixed markup ---------o
htmLawed does not remove certain characters that though invalid are nevertheless discouraged in HTML documents as per the specs (see section:- #5.1). This can be utilized to deal with input that contains mixed markup. Input that may have HTML markup as well as some other markup that is based on the '<', '>' and '&' characters is considered to have mixed markup. The non-HTML markup can be rather proprietary (like markup for emoticons/smileys), or standard (like MathML or SVG). Or it can be programming code meant for execution/evaluation (such as embedded PHP code).
htmLawed does not remove certain characters that, though invalid, are nevertheless `discouraged` in HTML documents as per the specifications (see section:- #5.1). This can be utilized to deal with input that contains mixed markup. Input that may have HTML markup as well as some other markup that is based on the '<', '>' and '&' characters is considered to have mixed markup. The non-HTML markup can be rather proprietary (like markup for emoticons/smileys), or standard (like MathML or SVG). Or it can be programming code meant for execution/evaluation (such as embedded PHP code).
To deal with such mixed markup, the input text can be pre-processed to hide the non-HTML markup by specifically replacing the '<', '>' and '&' characters with some of the HTML-discouraged characters (see section:- #3.1.2). Post-htmLawed processing, the replacements are reverted.
@ -1221,7 +1326,7 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern
-- 4.1 Support -----------------------------------------------------
A careful re-reading of this documentation will very likely answer your questions.
A careful reading of this documentation may provide an answer.
Software updates and forum-based community-support may be found at http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed. For general PHP issues (not htmLawed-specific), support may be found through internet searches and at http://php.net.
@ -1231,16 +1336,40 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern
See section:- #2.8.
Readers are advised to cross-check information given in this document.
-- 4.3 Change-log -------------------------------------------------o
(The release date for the downloadable package of files containing documentation, demo script, test-cases, etc., besides the 'htmLawed.php' file may be updated independently if the secondary files are revised.)
(The release date for the downloadable package of files containing documentation, demo script, test-cases, etc., besides the 'htmLawed.php' file, may be updated without a change-log entry if the secondary files, but not htmLawed per se, are revised.)
`Version number - Release date. Notes`
1.1.16 - 29 August 2013. Fix for a potential security vulnerability arising from specialy encoded space characters in URL schemes/protocols
1.1.15 - 11 August 2013. Improved tidying/prettifying functionality
1.1.14 - 8 August 2012. Fix for possible segmental loss of incremental indentation during 'tidying' when 'balance' is disabled; fix for non-effectuation under some circumstances of a corrective behavior to preserve plain text within elements like 'blockquote'.
1.1.13 - 22 July 2012. Added feature allowing use of custom, non-standard attributes or custom rules for standard attributes
1.1.12 - 5 July 2012. Fix for a bug in identifying an unquoted value of the 'face' attribute
1.1.11 - 5 June 2012. Fix for possible problem with handling of multi-byte characters in attribute values in an mbstring.func_overload enviroment. '$config["hook_tag"]', if specified, now receives names of elements in closing tags.
1.1.10 - 22 October 2011. Fix for a bug in the 'tidy' functionality that caused the entire input to be replaced with a single space; new parameter, '$config["direct_list_nest"]' to allow direct descendance of a list in a list. (5 April 2012. Dual licensing from LGPLv3 to LGPLv3 and GPLv2+.)
1.1.9.5 - 6 July 2011. Minor correction of a rule for nesting of 'li' within 'dir'
1.1.9.4 - 3 July 2010. Parameter 'schemes' now accepts '!' so any URL, even a local one, can be `denied`. An issue in which a second URL value in 'style' properties was not checked was fixed.
1.1.9.3 - 17 May 2010. Checks for correct nesting of 'param'
1.1.9.2 - 26 April 2010. Minor fix regarding rendering of denied URL schemes
1.1.9.1 - 26 February 2010. htmLawed now uses the LGPL version 3 license; support for 'flashvars' attribute for 'embed'
1.1.9 - 22 December 2009. Soft-hyphens are now removed only from URL-accepting attribute values
1.1.8.1 - 16 July 2009. Minor code-change to fix a PHP error notice
1.1.8 - 23 April 2009. Parameter 'deny_attribute' now accepts the wild-card '*', making it simpler to specify its value when all but a few attributes are being denied; fixed a bug in interpreting '$spec'
@ -1253,11 +1382,11 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern
1.1.1 - 27 September 2008. Better nesting correction when omitable closing tags are absent
1.1 - 29 June 2008. '$config["hook_tag"]' and '$config["format"]' introduced for custom tag/attribute check/modification/injection and output compaction/beautification; fixed a regex-in-$spec parsing bug
1.1 - 29 June 2008. '$config["hook_tag"]' and '$config["tidy"]' introduced for custom tag/attribute check/modification/injection and output compaction/beautification; fixed a regex-in-$spec parsing bug
1.0.9 - 11 June 2008. Fixed bug in invalid HTML code-point entity check
1.0.9 - 11 June 2008. Fix for a bug in checks for invalid HTML code-point entities
1.0.8 - 15 May 2008. 'bordercolor' attribute for 'table', 'td' and 'tr'
1.0.8 - 15 May 2008. Permit 'bordercolor' attribute for 'table', 'td' and 'tr'
1.0.7 - 1 May 2008. Support for 'wmode' attribute for 'embed'; '$config["show_setting"]' introduced; improved '$config["elements"]' evaluation
@ -1267,7 +1396,7 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern
1.0.4 - 10 March 2008. Improved corrections for 'blockquote', 'form', 'map' and 'noscript'
1.0.3 - 3 March 2008. Character entities for soft-hyphens are now replaced with spaces (instead of being removed); a bug allowing 'td' directly inside 'table' fixed; 'safe' '$config' parameter added
1.0.3 - 3 March 2008. Character entities for soft-hyphens are now replaced with spaces (instead of being removed); fix for a bug allowing 'td' directly inside 'table'; '$config["safe"]' introduced
1.0.2 - 13 February 2008. Improved implementation of '$config["keep_bad"]'
@ -1287,13 +1416,17 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern
Upgrading is as simple as replacing the previous version of 'htmLawed.php' (assuming it was not modified for customized features). As htmLawed output is almost always used in static documents, upgrading should not affect old, finalized content.
*Important* The following upgrades may affect the functionality of a specific htmLawed installation:
(1) From version 1.1-1.1.10 to 1.1.11 (or later), if a 'hook_tag' function is in use: In version 1.1.11, elements in closing tags (and not just the opening tags) are also passed to the function. There are no attribute names/values to pass, so a 'hook_tag' function receives only the element name. The 'hook_tag' function therefore may have to be edited. See section:- #3.4.9.
Old versions of htmLawed may be available online. E.g., for version 1.0, check http://www.bioinformatics.org/phplabware/downloads/htmLawed1.zip, for 1.1.1, htmLawed111.zip, and for 1.1.10, htmLawed1110.zip.
-- 4.6 Comparison with 'HTMLPurifier' -----------------------------o
The HTMLPurifier PHP library by Edward Yang is a very good HTML filtering script that uses object oriented PHP code. Compared to htmLawed, it:
The HTMLPurifier PHP library by Edward Yang is a very good HTML filtering script that uses object oriented PHP code. Compared to htmLawed, it (as of year 2010):
* does not support PHP versions older than 5.0 (HTMLPurifier dropped PHP 4 support after version 2)
@ -1333,7 +1466,7 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern
-- 4.10 Acknowledgements ------------------------------------------o
Bryan Blakey, Ulf Harnhammer, Gareth Heyes, Lukasz Pilorz, Shelley Powers, Edward Yang, and many anonymous users.
Nicholas Alipaz, Bryan Blakey, Pádraic Brady, Dac Chartrand, Ulf Harnhammer, Gareth Heyes, Klaus Leithoff, Lukasz Pilorz, Shelley Powers, Harro Verton, Edward Yang, and many anonymous users.
Thank you!
@ -1352,7 +1485,7 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern
-- 5.2 Valid attribute-element combinations -----------------------o
Valid attribute-element combinations as per W3C specs.
Valid attribute-element combinations as per W3C:- http://www.w3c.org specs.
* includes deprecated attributes (marked '^'), attributes for the non-standard 'embed' element (marked '*'), and the proprietary 'bordercolor' (marked '~')
* only non-frameset, HTML body elements
@ -1397,6 +1530,7 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern
disabled - button, input, optgroup, option, select, textarea
enctype - form
face - font
flashvars* - embed
for - label
frame - table
frameborder - iframe
@ -1555,11 +1689,11 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern
*Function arguments* for htmLawed are:
* '$in' - 1st argument; a text string; the *input text* to be processed. Any extraneous slashes added by PHP when `magic quotes` are enabled should be removed beforehand using PHP's 'stripslashes()' function.
* '$in' - first argument; a text string; the *input text* to be processed. Any extraneous slashes added by PHP when `magic quotes` are enabled should be removed beforehand using PHP's 'stripslashes()' function.
* '$config' - 2nd argument; an associative array; optional (named '$C' in htmLawed code). The array has keys with names like 'balance' and 'keep_bad', and the values, which can be boolean, string, or array, depending on the key, are read to accordingly set the *configurable parameters* (indicated by the keys). All configurable parameters receive some default value if the value to be used is not specified by the user through '$config'. `Finalized` '$config' is thus a filtered and possibly larger array.
* '$config' - second argument; an associative array; optional; named '$C' within htmLawed code. The array has keys with names like 'balance' and 'keep_bad', and the values, which can be boolean, string, or array, depending on the key, are read to accordingly set the *configurable parameters* (indicated by the keys). All configurable parameters receive some default value if the value to be used is not specified by the user through '$config'. `Finalized` '$config' is thus a filtered and possibly larger array.
* '$spec' - 3rd argument; a text string; optional. The string has rules, written in an htmLawed-designated format, *specifying* element-specific attribute and attribute value restrictions. Function 'hl_spec()' is used to convert the string to an associative-array for internal use. `Finalized` '$spec' is thus an array.
* '$spec' - third argument; a text string; optional. The string has rules, written in an htmLawed-designated format, *specifying* element-specific attribute and attribute value restrictions. Function 'hl_spec()' is used to convert the string to an associative-array, named '$S' within htmLawed code, for internal use. `Finalized` '$spec' is thus an array.
`Finalized` '$config' and '$spec' are made *global variables* while htmLawed is at work. Values of any pre-existing global variables with same names are noted, and their values are restored after htmLawed finishes processing the input (to capture the `finalized` values, the 'show_settings' parameter of '$config' should be used). Depending on '$config', another global variable 'hl_Ids', to track 'id' attribute values for uniqueness, may be set. Unlike the other two variables, this one is not reset (or unset) post-processing.
@ -1586,9 +1720,9 @@ A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/intern
After this `initial processing` 'htmLawed()' identifies tags using regex and processes them with the help of 'hl_tag()' -- a large function that analyzes tag content, filtering it as per HTML standards, '$config' and '$spec'. Among other things, 'hl_tag()' transforms deprecated elements using 'hl_tag2()', removes attributes from closing tags, checks attribute values as per '$spec' rules using 'hl_attrval()', and checks URL protocols using 'hl_prot()'. 'htmLawed()' performs tag balancing and nesting checks with a call to 'hl_bal()', and optionally compacts/beautifies the output with proper white-spacing with a call to 'hl_tidy()'. The latter temporarily replaces white-space, and '<', '>' and '&' characters inside 'pre', 'script' and 'textarea' elements, and HTML comments and CDATA sections with control characters (code-points '1' to '5', and '7').
htmLawed permits the use of custom code or *hook functions* at two stages. The first, called inside 'htmLawed()', allows the input text as well as the finalized $config and $spec values to be altered right after the initial processing (see section:- #3.7). The second is called by 'hl_tag()' once the tag content is finalized (see section:- #3.4.9).
htmLawed permits the use of custom code or *hook functions* at two stages. The first, called inside 'htmLawed()', allows the input text as well as the finalized '$config' and '$spec' values to be altered right after the initial processing (see section:- #3.7). The second is called by 'hl_tag()' once the tag content is finalized (see section:- #3.4.9).
Being dictated by the external and stable HTML standard, htmLawed's objective is very clear-cut and less concerned with tweakability. The code is only minimally annotated with comments -- it is not meant to instruct; PHP developers familiar with the HTML specs will see the logic, and others can always refer to the htmLawed documentation. The compact structuring of the statements is meant to aid in quickly grasping the logic, at least when viewed with code syntax highlighted.
The functionality of htmLawed is dictated by the external HTML standard. It is thus coded for a clear-cut objective with not much concern for tweakability. The code is only minimally annotated with comments -- it is not meant to instruct; PHP developers familiar with the HTML specifications will see the logic, and others can always refer to the htmLawed documentation. The compact structuring of the statements is meant to aid a quick grasp of the logic.
___________________________________________________________________oo

View File

@ -1,8 +1,8 @@
/*
htmLawed_TESTCASE.txt, 23 April 2009
htmLawed 1.1.8.1, 16 July 2009
htmLawed_TESTCASE.txt, 27 August 2013
htmLawed 1.1.16, 29 August 2013
Copyright Santosh Patnaik
GPL v3 license
Dual licensed with LGPL 3 and GPL 2+
A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed
*/
@ -27,6 +27,8 @@ character encoding to Unicode/UTF-8
<strong>Duplicated:</strong> <a id="id5" id="id6">a</a><br />
<strong>Deprecated:</strong> <a id="id7" target="self" name="n">a</a>, <hr noshade="noshade" /><br />
<strong>Casing:</strong> <a HREF=""></a><br />
<strong>Custom:</strong> <img alt="image" my:data="portrait" /><br />
<strong>Data-*:</strong> <a data-xml="x" data-xmnt="x" data-xmlnt="x" data-xmn:t="x" data-xmxm="x">a</a><br />
<strong>Admin-restricted?:</strong> <a href="x" onclick="alert();"></a>
<h6>Attribute values</h6>
@ -46,6 +48,11 @@ character encoding to Unicode/UTF-8
<blockquote><div>abc</div>def</blockquote><br />
<blockquote>abc<div>def</div>ghi</blockquote><br />
abc<div>def</div>ghi<br />
<blockquote>QQQ<div>x</div><!-- comment --></blockquote><br />
<blockquote><div>x</div><!-- comment -->QQQ</blockquote><br />
<blockquote><!-- comment --><div>x</div>QQQ<div>x</div></blockquote><br />
<blockquote><div>x<!-- comment --></div>QQQ</blockquote><p>x</p><br />
<br />
(try with blockquote parent)
<h6>CDATA sections</h6>
@ -94,6 +101,15 @@ The PHP <s>software</s> script used for this <strike>web-page</strike> webpage i
<area href="5" shape="Rect" coords="0,0,118,28">
</map></object>
<param name="name">value</param>
<object id="obj1">
<param name="param1">
<object id="obj2">
<param name="param2">
</object>
</object>
<h6>Complex-4: nested and other tables</h6>
<table border="1" bgcolor="red"> <tr> <td> Cell </td> <td colspan="2" rowspan="2"> <table border="1" bgcolor="green"> <tr> <td> Cell </td> <td colspan="2" rowspan="2"> </td> </tr> <tr> <td> Cell </td> </tr> <tr> <td> Cell </td> <td> Cell </td> <td> Cell </td> </tr> </table> </td> </tr> <tr> <td> Cell </td> </tr> <tr> <td> Cell </td> <td> Cell </td> <td> Cell </td> </tr> </table><br />
@ -119,8 +135,9 @@ Disallowed tag p
<strong>Invalid:</strong> <image src="s" alt="a" /><br />
<strong>Empty:</strong> <img src="s" alt="a" />, <img src="s" alt="a"></img>, <img src="s" alt="a">text</img><br />
<strong>Content invalid:</strong> <a href="h">1<a>2</a></a><br />
<strong>Content invalid?:</strong> <form></form><br /> (try setting 'form' as parent)
<strong>Casing:</strong> <A href=""></a>
<strong>Content invalid?:</strong> <form></form><br /> (try setting 'form' as parent)<br />
<strong>Casing:</strong> <A href=""></a><br />
<strong>Check for tidy:</strong> <br /><hr /></div><hr /></div><hr /></div><div>hi</div>
<h6>Entities</h6>
@ -181,10 +198,20 @@ text <img src="none" alt="none" /> <b>t<em> e <strong> x </strong> t</em></b>
<h6>HTML comments (also CDATA)</h6>
Special characters inside: <!-- <![CDATA check ]]> -->, <!-- 3 < 4 > 3.5, & 4 &gt; 4 -->, <!-- che--ck -->, <!--[if !IE]> <--><a>c</a><!--> <![endif]--><br />
Normal: <!-- check -->, <!--check -->, <em>comment:<!-- check --></em><!-- check -->, <table><!-- check --><tr><td>text not allowed</td></tr></table><br />
Malformed: <![cdata check ]]>, < ![CDATA check ]]>, < ![CDATA check ] ]><br />
Invalid: <em <!-- check -->>comment in tag content</em>, <!--check-->
<strong>Script inside:</strong> <!--[if gte IE 4]>
<SCRIPT>alert('XSS');</SCRIPT>
<![endif]--><br />
<strong>Special characters inside: <!-- <![CDATA check ]]> -->, <!-- 3 < 4 > 3.5, & 4 &gt; 4 -->, <!-- che--ck -->, <!--[if !IE]> <--><a>c</a><!--> <![endif]--><br />
<strong>Normal:</strong> <!-- check -->, <!--check -->, <em>comment:<!-- check --></em><!-- check -->, <table><!-- check --><tr><td>text not allowed</td></tr></table><br />
<strong>Malformed:</strong> <![cdata check ]]>, < ![CDATA check ]]>, < ![CDATA check ] ]><br />
Invalid:</strong> <em <!-- check -->>comment in tag content</em>, <!--check-->
<h6>HTML5</h6>
<strong>figure and figcaption:</strong> <figure><img src="picture.jpg" alt="picture"><figcaption>Caption for the awesome picture</figcaption></figure>
<strong>article:</strong> <h1>A</h1><p>B</p><article><h2>C</h2></article><article><h2>E</h2><p>F</p><p>G</p></article>
<strong>meter</strong>: <p>Heat <meter min="100" max="200" value="150">150</meter>.</p>
<strong>datalist</strong>: <input list="b" /><datalist id="b"><option value="c"><option value="d"></datalist>
<h6>Ins-Del</h6>
@ -224,6 +251,11 @@ Invalid: <em <!-- check -->>comment in tag content</em>, <!--check-->
<li>l3</li>
<li>l4<ol><li>lo3</li><li>lo4<ol><li>lo5</li></ol></li></ol></li>
</ul><br />
<strong>Nested, directly</strong>: <ul>
<li>l1</li>
<ol>l2</ol>
<li>l3</li>
</ul><br />
<strong>Nested, close-tags omitted</strong>: <ul>
<li>l1</li>
<li>l2<ol><li>lo1<li>lo2</ol>
@ -241,12 +273,34 @@ Invalid: <em <!-- check -->>comment in tag content</em>, <!--check-->
</form>
</li></ul>
</td></tr></table></li></ol>
<strong>Menu</strong>: <menu type="toolbar"><li><menu label="File">
<button type="button" onclick="new()">New...</button>
</menu></li><li><menu label="Edit"><button type="button" onclick="cut()">Cut...</button></menu></li>
</menu>
<h6>Microdata</h6>
<div itemscope itemtype="http://data-vocabulary.org/Person">
I am <span itemprop="name">X</span> but people call me <span itemprop="nickname">Y</span>.
Find me at <a href="http://www.xy.com" itemprop="url">www.xy.com</a>
</div>
<h6>Microsoft Word</h6>
<strong>Proprietary tag</strong>: <p class=3DMsoNormal><o:p>&nbsp;</o:p></p><br />
<strong>XML declaration</strong>: <?xml:namespace prefix = o ns = "urn:schemas-microsoft-com:office:office" /><br />
<strong>XML-invalid character code-point (may not replicate)</strong>: <p class=3DMsoNormal>“Where is he?” asked both Mary the one so lovely and Jane.</p>
<h6>Nesting</h6>
<strong>Block or inline a</strong>: <p><a href="link">text</a></p><a href="link"><div>hi</div></a><br />
<h6>Non-English text-1</h6>
Inscrieţi-vă acum la a Zecea Conferinţă Internaţională<br />
გთხოვთ ახლავე გაიაროთ რეგისტრაცია<br />
večjezično računalništvo<br />
<a title="อ.อ่าง">อ.อ่าง</a><br />
<a title="הירשמו
כעת לכנס ">Зарегистрируйтесь сейчас
на Десятую Международную Конференцию по</a><br />
@ -288,6 +342,7 @@ na Alemanha.
<rp>(</rp><rt>aaa</rt><rp>)</rp>
</ruby>
<h6>Tables</h6>
<strong>Omitted closing tags:</strong> <table>
@ -314,12 +369,21 @@ na Alemanha.
<tr><td>r2c1<td>r2c2
</table><br />
<h6>Tag transformation</h6>
<strong>Font element intended as 'inline' element:</strong> <p><font color='red'>hi</font></p><br />
<strong>Font element intended as 'block' element:</strong> <div><font color='red'><div>hi</div></font></div><br />
<strong>Font element intended as 'block' element:</strong> <center><font color='red'><div>hi</div><div>QQQ</div></font></center><br />
<h6>Tidy</h6>
<strong>White-space handling:</strong> abc<em> def </em> ghi abc <em>def</em> ghi
<h6>URLs</h6>
<strong>Relative and absolute:</strong> <a href="mailto:x"></a>, <a href="http://a.com/b/c/d.f"></a>, <a href="./../d.f"></a>, <a href="./d.f"></a>, <a href="d.f"></a>, <a href="#s"></a>, <a href="./../../d.f#s"></a><br />
(try base URL value of 'http://a.com/b/')<br />
<strong>CSS URLs:</strong> <div style="background-image: url('a.gif');"></div>, <div style="background-image: URL(&quot;a.gif&quot;);"></div>, <div style="background-image: url('http://a.com/a.gif');"></div>, <div style="background-image: url('./../a.gif');"></div>, <div style="background-image: &#117;r&#x6C;('js&#58;xss'&#x29;"></div><br />
<strong>Anti-spam:</strong> (try regex for 'http://a.com', etc.) <a href="mailto:x@y.com"></a>, <a href="http://a.com/b@d.f"></a>, <a href="a.com/d.f" rel="nofollow"></a>, <a href="a.com/d.f" rel="1, 2"></a>, <a href="a.com/d.f"></a>, <a href="b.com/d.f"></a>, <a href="c.com/d.f"></a><br />
<strong>Double URLs:</strong> <a style="behaviour: url(foo) url(http://example.com/xss.htc)">b</a><br />
<strong>Anti-spam:</strong> (try regex for 'http://a.com', etc.) <a href="mailto:x@y.com"></a>, <a href="http://a.com/b@d.f"></a>, <a href="a.com/d.f" rel="nofollow"></a>, <a href="a.com/d.f" rel="1, 2"></a>, <a href="a.com/d.f"></a>, <a href="b.com/d.f"></a>, <a href="c.com/d.f">, <a href="denied:http://c.com/d.f"></a><br />
<h6>XSS</h6>
@ -338,6 +402,7 @@ src=&#106;&#97;&#118;&#97;&#115;&#99;&#114;&#105;&#112;&#116;&#58;&#97;&#108;&#1
<a style=";-moz-binding:url(http://lukasz.pilorz.net/xss/xss.xml#xss)" href="http://example.com">test</a><br />
<strong>Bad IE7:</strong> <a href="http://x&x=%22+style%3d%22background-image%3a+expression%28alert
%28%27xss%3f%29%29">x</a><br />
<strong>Opera:</strong> <a href="\xE2\x80\x83javascript:alert(123)">link</a>
<strong>Bad IE7:</strong> <a style=color:expr/*comment*/ession(alert(document.domain))>xxx</a><br />
<strong>Bad IE7:</strong> <a href="xxx" style="background: exp&#x72;ession(alert('xss'));">xxx</a><br />
<strong>Bad IE7:</strong> <a href="xxx" style="background: &#101;xpression(alert('xss'));">xxx</a><br />
@ -368,3 +433,18 @@ script:eval(document.all.mycode.expr)')">hi</a><br />
3 < 4 <br />
3 > 4 <br />
> 3 <br />
<._.> hi! <br />
<<< ALERT >>> <br />
<![if !vml]> some stuff <![endif]> <br />
<?xml:namespace prefix = o ns = "urn:schemas-microsoft-com:office:office" /> <br />
<uml:ns ns = "urn:www"> <br />
<uml:ns ns = 'urn:www'> <br />
if(13<age AND 21>age){say 'teen'} <br />
age >51 and a smoking history of >51 pack-years <b>was</b> <br />
age > 51 and a smoking history of >51 pack-years <b>was</b> <br />
age <51 and a smoking history of <51 pack-years <b>was</b> <br />
age < 51 and a smoking history of < 51 pack-years <b>was</b> <br />
<b>age >51 and a smoking history of >51 pack-years</b> <br />
<b>age > 51 and a smoking history of >51 pack-years</b> <br />
<b>age <51 and a smoking history of <51 pack-years</b> <br />
<b>age < 51 and a smoking history of < 51 pack-years</b> <br />