includes/Sanitizer.php

   1 <?php
   2 /**
   3  * XHTML sanitizer for MediaWiki
   4  *
   5  * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
   6  * http://www.mediawiki.org/
   7  *
   8  * This program is free software; you can redistribute it and/or modify
   9  * it under the terms of the GNU General Public License as published by
  10  * the Free Software Foundation; either version 2 of the License, or
  11  * (at your option) any later version.
  12  *
  13  * This program is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16  * GNU General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU General Public License along
  19  * with this program; if not, write to the Free Software Foundation, Inc.,
  20  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  21  * http://www.gnu.org/copyleft/gpl.html
  22  *
  23  * @file
  24  * @ingroup Parser
  25  */
  26
  27 /**
  28  * Regular expression to match various types of character references in
  29  * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences
  30  */
  31 define( 'MW_CHAR_REFS_REGEX',
  32         '/&([A-Za-z0-9\x80-\xff]+);
  33          |&\#([0-9]+);
  34          |&\#x([0-9A-Za-z]+);
  35          |&\#X([0-9A-Za-z]+);
  36          |(&)/x' );
  37
/**
 * Regular expression to match HTML/XML attribute pairs within a tag.
 * Allows some... latitude.
 * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
 *
 * Capture groups:
 *  1. attribute name (one or more $attrib characters)
 *  2. the optional "= value" part, including the whitespace around "="
 *  3. value when written in double quotes (quotes not included)
 *  4. value when written in single quotes (quotes not included)
 *  5. value when written without quotes
 *  6. value written as "#" followed by hex digits
 *
 * A match must start at the beginning of the string or after whitespace,
 * and must be followed by whitespace or the end of the string.
 */
// Character class for a single attribute-name character; interpolated below.
$attrib = '[A-Za-z0-9]';
// Character class for a single whitespace character (tab, LF, CR, space); interpolated below.
$space = '[\x09\x0a\x0d\x20]';
define( 'MW_ATTRIBS_REGEX',
        "/(?:^|$space)($attrib+)
          ($space*=$space*
                (?:
                 # The attribute value: quoted or alone
                  \"([^<\"]*)\"
                 | '([^<']*)'
                 |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
                 |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
                                                         # colors are specified like this.
                                                         # We'll be normalizing it.
                )
           )?(?=$space|\$)/sx" );
  58
/**
 * List of all named character entities defined in HTML 4.01
 * http://www.w3.org/TR/html4/sgml/entities.html
 *
 * Keys are the case-sensitive entity names (without the leading "&" and
 * trailing ";"); values are the corresponding code points as decimal
 * integers.
 * @private
 */
global $wgHtmlEntities;
$wgHtmlEntities = array(
        'Aacute'   => 193,
        'aacute'   => 225,
        'Acirc'    => 194,
        'acirc'    => 226,
        'acute'    => 180,
        'AElig'    => 198,
        'aelig'    => 230,
        'Agrave'   => 192,
        'agrave'   => 224,
        'alefsym'  => 8501,
        'Alpha'    => 913,
        'alpha'    => 945,
        'amp'      => 38,
        'and'      => 8743,
        'ang'      => 8736,
        'Aring'    => 197,
        'aring'    => 229,
        'asymp'    => 8776,
        'Atilde'   => 195,
        'atilde'   => 227,
        'Auml'     => 196,
        'auml'     => 228,
        'bdquo'    => 8222,
        'Beta'     => 914,
        'beta'     => 946,
        'brvbar'   => 166,
        'bull'     => 8226,
        'cap'      => 8745,
        'Ccedil'   => 199,
        'ccedil'   => 231,
        'cedil'    => 184,
        'cent'     => 162,
        'Chi'      => 935,
        'chi'      => 967,
        'circ'     => 710,
        'clubs'    => 9827,
        'cong'     => 8773,
        'copy'     => 169,
        'crarr'    => 8629,
        'cup'      => 8746,
        'curren'   => 164,
        'dagger'   => 8224,
        'Dagger'   => 8225,
        'darr'     => 8595,
        'dArr'     => 8659,
        'deg'      => 176,
        'Delta'    => 916,
        'delta'    => 948,
        'diams'    => 9830,
        'divide'   => 247,
        'Eacute'   => 201,
        'eacute'   => 233,
        'Ecirc'    => 202,
        'ecirc'    => 234,
        'Egrave'   => 200,
        'egrave'   => 232,
        'empty'    => 8709,
        'emsp'     => 8195,
        'ensp'     => 8194,
        'Epsilon'  => 917,
        'epsilon'  => 949,
        'equiv'    => 8801,
        'Eta'      => 919,
        'eta'      => 951,
        'ETH'      => 208,
        'eth'      => 240,
        'Euml'     => 203,
        'euml'     => 235,
        'euro'     => 8364,
        'exist'    => 8707,
        'fnof'     => 402,
        'forall'   => 8704,
        'frac12'   => 189,
        'frac14'   => 188,
        'frac34'   => 190,
        'frasl'    => 8260,
        'Gamma'    => 915,
        'gamma'    => 947,
        'ge'       => 8805,
        'gt'       => 62,
        'harr'     => 8596,
        'hArr'     => 8660,
        'hearts'   => 9829,
        'hellip'   => 8230,
        'Iacute'   => 205,
        'iacute'   => 237,
        'Icirc'    => 206,
        'icirc'    => 238,
        'iexcl'    => 161,
        'Igrave'   => 204,
        'igrave'   => 236,
        'image'    => 8465,
        'infin'    => 8734,
        'int'      => 8747,
        'Iota'     => 921,
        'iota'     => 953,
        'iquest'   => 191,
        'isin'     => 8712,
        'Iuml'     => 207,
        'iuml'     => 239,
        'Kappa'    => 922,
        'kappa'    => 954,
        'Lambda'   => 923,
        'lambda'   => 955,
        'lang'     => 9001,
        'laquo'    => 171,
        'larr'     => 8592,
        'lArr'     => 8656,
        'lceil'    => 8968,
        'ldquo'    => 8220,
        'le'       => 8804,
        'lfloor'   => 8970,
        'lowast'   => 8727,
        'loz'      => 9674,
        'lrm'      => 8206,
        'lsaquo'   => 8249,
        'lsquo'    => 8216,
        'lt'       => 60,
        'macr'     => 175,
        'mdash'    => 8212,
        'micro'    => 181,
        'middot'   => 183,
        'minus'    => 8722,
        'Mu'       => 924,
        'mu'       => 956,
        'nabla'    => 8711,
        'nbsp'     => 160,
        'ndash'    => 8211,
        'ne'       => 8800,
        'ni'       => 8715,
        'not'      => 172,
        'notin'    => 8713,
        'nsub'     => 8836,
        'Ntilde'   => 209,
        'ntilde'   => 241,
        'Nu'       => 925,
        'nu'       => 957,
        'Oacute'   => 211,
        'oacute'   => 243,
        'Ocirc'    => 212,
        'ocirc'    => 244,
        'OElig'    => 338,
        'oelig'    => 339,
        'Ograve'   => 210,
        'ograve'   => 242,
        'oline'    => 8254,
        'Omega'    => 937,
        'omega'    => 969,
        'Omicron'  => 927,
        'omicron'  => 959,
        'oplus'    => 8853,
        'or'       => 8744,
        'ordf'     => 170,
        'ordm'     => 186,
        'Oslash'   => 216,
        'oslash'   => 248,
        'Otilde'   => 213,
        'otilde'   => 245,
        'otimes'   => 8855,
        'Ouml'     => 214,
        'ouml'     => 246,
        'para'     => 182,
        'part'     => 8706,
        'permil'   => 8240,
        'perp'     => 8869,
        'Phi'      => 934,
        'phi'      => 966,
        'Pi'       => 928,
        'pi'       => 960,
        'piv'      => 982,
        'plusmn'   => 177,
        'pound'    => 163,
        'prime'    => 8242,
        'Prime'    => 8243,
        'prod'     => 8719,
        'prop'     => 8733,
        'Psi'      => 936,
        'psi'      => 968,
        'quot'     => 34,
        'radic'    => 8730,
        'rang'     => 9002,
        'raquo'    => 187,
        'rarr'     => 8594,
        'rArr'     => 8658,
        'rceil'    => 8969,
        'rdquo'    => 8221,
        'real'     => 8476,
        'reg'      => 174,
        'rfloor'   => 8971,
        'Rho'      => 929,
        'rho'      => 961,
        'rlm'      => 8207,
        'rsaquo'   => 8250,
        'rsquo'    => 8217,
        'sbquo'    => 8218,
        'Scaron'   => 352,
        'scaron'   => 353,
        'sdot'     => 8901,
        'sect'     => 167,
        'shy'      => 173,
        'Sigma'    => 931,
        'sigma'    => 963,
        'sigmaf'   => 962,
        'sim'      => 8764,
        'spades'   => 9824,
        'sub'      => 8834,
        'sube'     => 8838,
        'sum'      => 8721,
        'sup'      => 8835,
        'sup1'     => 185,
        'sup2'     => 178,
        'sup3'     => 179,
        'supe'     => 8839,
        'szlig'    => 223,
        'Tau'      => 932,
        'tau'      => 964,
        'there4'   => 8756,
        'Theta'    => 920,
        'theta'    => 952,
        'thetasym' => 977,
        'thinsp'   => 8201,
        'THORN'    => 222,
        'thorn'    => 254,
        'tilde'    => 732,
        'times'    => 215,
        'trade'    => 8482,
        'Uacute'   => 218,
        'uacute'   => 250,
        'uarr'     => 8593,
        'uArr'     => 8657,
        'Ucirc'    => 219,
        'ucirc'    => 251,
        'Ugrave'   => 217,
        'ugrave'   => 249,
        'uml'      => 168,
        'upsih'    => 978,
        'Upsilon'  => 933,
        'upsilon'  => 965,
        'Uuml'     => 220,
        'uuml'     => 252,
        'weierp'   => 8472,
        'Xi'       => 926,
        'xi'       => 958,
        'Yacute'   => 221,
        'yacute'   => 253,
        'yen'      => 165,
        'Yuml'     => 376,
        'yuml'     => 255,
        'Zeta'     => 918,
        'zeta'     => 950,
        'zwj'      => 8205,
        'zwnj'     => 8204 );
 318
/**
 * Character entity aliases accepted by MediaWiki
 *
 * Maps an alternative entity name (here written in Hebrew and in Arabic
 * script) to the name of the $wgHtmlEntities entry it stands for.
 * NOTE(review): how aliases are resolved is not visible in this part of
 * the file -- confirm against the character reference callbacks.
 */
global $wgHtmlEntityAliases;
$wgHtmlEntityAliases = array(
        'רלמ' => 'rlm',
        'رلم' => 'rlm',
);
 327
 328
 329 /**
 330  * XHTML sanitizer for MediaWiki
 331  * @ingroup Parser
 332  */
 333 class Sanitizer {
 334         /**
 335          * Cleans up HTML, removes dangerous tags and attributes, and
 336          * removes HTML comments
 337          * @private
 338          * @param string $text
 339          * @param callback $processCallback to do any variable or parameter replacements in HTML attribute values
 340          * @param array $args for the processing callback
 341          * @return string
 342          */
 343         static function removeHTMLtags( $text, $processCallback = null, $args = array(), $extratags = array() ) {
 344                 global $wgUseTidy;
 345
 346                 static $htmlpairs, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
 347                         $htmllist, $listtags, $htmlsingleallowed, $htmlelements, $staticInitialised;
 348
 349                 wfProfileIn( __METHOD__ );
 350
 351                 if ( !$staticInitialised ) {
 352
 353                         $htmlpairs = array_merge( $extratags, array( # Tags that must be closed
 354                                 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
 355                                 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
 356                                 'strike', 'strong', 'tt', 'var', 'div', 'center',
 357                                 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
 358                                 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u'
 359                         ) );
 360                         $htmlsingle = array(
 361                                 'br', 'hr', 'li', 'dt', 'dd'
 362                         );
 363                         $htmlsingleonly = array( # Elements that cannot have close tags
 364                                 'br', 'hr'
 365                         );
 366                         $htmlnest = array( # Tags that can be nested--??
 367                                 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
 368                                 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
 369                         );
 370                         $tabletags = array( # Can only appear inside table, we will close them
 371                                 'td', 'th', 'tr',
 372                         );
 373                         $htmllist = array( # Tags used by list
 374                                 'ul','ol',
 375                         );
 376                         $listtags = array( # Tags that can appear in a list
 377                                 'li',
 378                         );
 379
 380                         $htmlsingleallowed = array_merge( $htmlsingle, $tabletags );
 381                         $htmlelements = array_merge( $htmlsingle, $htmlpairs, $htmlnest );
 382
 383                         # Convert them all to hashtables for faster lookup
 384                         $vars = array( 'htmlpairs', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
 385                                 'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelements' );
 386                         foreach ( $vars as $var ) {
 387                                 $$var = array_flip( $$var );
 388                         }
 389                         $staticInitialised = true;
 390                 }
 391
 392                 # Remove HTML comments
 393                 $text = Sanitizer::removeHTMLcomments( $text );
 394                 $bits = explode( '<', $text );
 395                 $text = str_replace( '>', '&gt;', array_shift( $bits ) );
 396                 if(!$wgUseTidy) {
 397                         $tagstack = $tablestack = array();
 398                         foreach ( $bits as $x ) {
 399                                 $regs = array();
 400                                 if( preg_match( '!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ) ) {
 401                                         list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
 402                                 } else {
 403                                         $slash = $t = $params = $brace = $rest = null;
 404                                 }
 405
 406                                 $badtag = 0 ;
 407                                 if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
 408                                         # Check our stack
 409                                         if ( $slash ) {
 410                                                 # Closing a tag...
 411                                                 if( isset( $htmlsingleonly[$t] ) ) {
 412                                                         $badtag = 1;
 413                                                 } elseif ( ( $ot = @array_pop( $tagstack ) ) != $t ) {
 414                                                         if ( isset( $htmlsingleallowed[$ot] ) ) {
 415                                                                 # Pop all elements with an optional close tag
 416                                                                 # and see if we find a match below them
 417                                                                 $optstack = array();
 418                                                                 array_push ($optstack, $ot);
 419                                                                 while ( ( ( $ot = @array_pop( $tagstack ) ) != $t ) &&
 420                                                                                 isset( $htmlsingleallowed[$ot] ) )
 421                                                                 {
 422                                                                         array_push ($optstack, $ot);
 423                                                                 }
 424                                                                 if ( $t != $ot ) {
 425                                                                         # No match. Push the optinal elements back again
 426                                                                         $badtag = 1;
 427                                                                         while ( $ot = @array_pop( $optstack ) ) {
 428                                                                                 array_push( $tagstack, $ot );
 429                                                                         }
 430                                                                 }
 431                                                         } else {
 432                                                                 @array_push( $tagstack, $ot );
 433                                                                 # <li> can be nested in <ul> or <ol>, skip those cases:
 434                                                                 if(!(isset( $htmllist[$ot] ) && isset( $listtags[$t] ) )) {
 435                                                                         $badtag = 1;
 436                                                                 }
 437                                                         }
 438                                                 } else {
 439                                                         if ( $t == 'table' ) {
 440                                                                 $tagstack = array_pop( $tablestack );
 441                                                         }
 442                                                 }
 443                                                 $newparams = '';
 444                                         } else {
 445                                                 # Keep track for later
 446                                                 if ( isset( $tabletags[$t] ) &&
 447                                                 ! in_array( 'table', $tagstack ) ) {
 448                                                         $badtag = 1;
 449                                                 } else if ( in_array( $t, $tagstack ) &&
 450                                                 ! isset( $htmlnest [$t ] ) ) {
 451                                                         $badtag = 1 ;
 452                                                 # Is it a self closed htmlpair ? (bug 5487)
 453                                                 } else if( $brace == '/>' &&
 454                                                 isset( $htmlpairs[$t] ) ) {
 455                                                         $badtag = 1;
 456                                                 } elseif( isset( $htmlsingleonly[$t] ) ) {
 457                                                         # Hack to force empty tag for uncloseable elements
 458                                                         $brace = '/>';
 459                                                 } else if( isset( $htmlsingle[$t] ) ) {
 460                                                         # Hack to not close $htmlsingle tags
 461                                                         $brace = NULL;
 462                                                 } else if( isset( $tabletags[$t] )
 463                                                 &&  in_array($t ,$tagstack) ) {
 464                                                         // New table tag but forgot to close the previous one
 465                                                         $text .= "</$t>";
 466                                                 } else {
 467                                                         if ( $t == 'table' ) {
 468                                                                 array_push( $tablestack, $tagstack );
 469                                                                 $tagstack = array();
 470                                                         }
 471                                                         array_push( $tagstack, $t );
 472                                                 }
 473
 474                                                 # Replace any variables or template parameters with
 475                                                 # plaintext results.
 476                                                 if( is_callable( $processCallback ) ) {
 477                                                         call_user_func_array( $processCallback, array( &$params, $args ) );
 478                                                 }
 479
 480                                                 # Strip non-approved attributes from the tag
 481                                                 $newparams = Sanitizer::fixTagAttributes( $params, $t );
 482                                         }
 483                                         if ( ! $badtag ) {
 484                                                 $rest = str_replace( '>', '&gt;', $rest );
 485                                                 $close = ( $brace == '/>' && !$slash ) ? ' /' : '';
 486                                                 $text .= "<$slash$t$newparams$close>$rest";
 487                                                 continue;
 488                                         }
 489                                 }
 490                                 $text .= '&lt;' . str_replace( '>', '&gt;', $x);
 491                         }
 492                         # Close off any remaining tags
 493                         while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
 494                                 $text .= "</$t>\n";
 495                                 if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
 496                         }
 497                 } else {
 498                         # this might be possible using tidy itself
 499                         foreach ( $bits as $x ) {
 500                                 preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
 501                                 $x, $regs );
 502                                 @list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
 503                                 if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
 504                                         if( is_callable( $processCallback ) ) {
 505                                                 call_user_func_array( $processCallback, array( &$params, $args ) );
 506                                         }
 507                                         $newparams = Sanitizer::fixTagAttributes( $params, $t );
 508                                         $rest = str_replace( '>', '&gt;', $rest );
 509                                         $text .= "<$slash$t$newparams$brace$rest";
 510                                 } else {
 511                                         $text .= '&lt;' . str_replace( '>', '&gt;', $x);
 512                                 }
 513                         }
 514                 }
 515                 wfProfileOut( __METHOD__ );
 516                 return $text;
 517         }
 518
 519         /**
 520          * Remove '<!--', '-->', and everything between.
 521          * To avoid leaving blank lines, when a comment is both preceded
 522          * and followed by a newline (ignoring spaces), trim leading and
 523          * trailing spaces and one of the newlines.
 524          *
 525          * @private
 526          * @param string $text
 527          * @return string
 528          */
 529         static function removeHTMLcomments( $text ) {
 530                 wfProfileIn( __METHOD__ );
 531                 while (($start = strpos($text, '<!--')) !== false) {
 532                         $end = strpos($text, '-->', $start + 4);
 533                         if ($end === false) {
 534                                 # Unterminated comment; bail out
 535                                 break;
 536                         }
 537
 538                         $end += 3;
 539
 540                         # Trim space and newline if the comment is both
 541                         # preceded and followed by a newline
 542                         $spaceStart = max($start - 1, 0);
 543                         $spaceLen = $end - $spaceStart;
 544                         while (substr($text, $spaceStart, 1) === ' ' && $spaceStart > 0) {
 545                                 $spaceStart--;
 546                                 $spaceLen++;
 547                         }
 548                         while (substr($text, $spaceStart + $spaceLen, 1) === ' ')
 549                                 $spaceLen++;
 550                         if (substr($text, $spaceStart, 1) === "\n" and substr($text, $spaceStart + $spaceLen, 1) === "\n") {
 551                                 # Remove the comment, leading and trailing
 552                                 # spaces, and leave only one newline.
 553                                 $text = substr_replace($text, "\n", $spaceStart, $spaceLen + 1);
 554                         }
 555                         else {
 556                                 # Remove just the comment.
 557                                 $text = substr_replace($text, '', $start, $end - $start);
 558                         }
 559                 }
 560                 wfProfileOut( __METHOD__ );
 561                 return $text;
 562         }
 563
 564         /**
 565          * Take an array of attribute names and values and normalize or discard
 566          * illegal values for the given element type.
 567          *
 568          * - Discards attributes not on a whitelist for the given element
 569          * - Unsafe style attributes are discarded
 570          * - Invalid id attributes are reencoded
 571          *
 572          * @param array $attribs
 573          * @param string $element
 574          * @return array
 575          *
 576          * @todo Check for legal values where the DTD limits things.
 577          * @todo Check for unique id attribute :P
 578          */
 579         static function validateTagAttributes( $attribs, $element ) {
 580                 return Sanitizer::validateAttributes( $attribs,
 581                         Sanitizer::attributeWhitelist( $element ) );
 582         }
 583
 584         /**
 585          * Take an array of attribute names and values and normalize or discard
 586          * illegal values for the given whitelist.
 587          *
 588          * - Discards attributes not the given whitelist
 589          * - Unsafe style attributes are discarded
 590          * - Invalid id attributes are reencoded
 591          *
 592          * @param array $attribs
 593          * @param array $whitelist list of allowed attribute names
 594          * @return array
 595          *
 596          * @todo Check for legal values where the DTD limits things.
 597          * @todo Check for unique id attribute :P
 598          */
 599         static function validateAttributes( $attribs, $whitelist ) {
 600                 $whitelist = array_flip( $whitelist );
 601                 $out = array();
 602                 foreach( $attribs as $attribute => $value ) {
 603                         if( !isset( $whitelist[$attribute] ) ) {
 604                                 continue;
 605                         }
 606                         # Strip javascript "expression" from stylesheets.
 607                         # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
 608                         if( $attribute == 'style' ) {
 609                                 $value = Sanitizer::checkCss( $value );
 610                                 if( $value === false ) {
 611                                         # haxx0r
 612                                         continue;
 613                                 }
 614                         }
 615
 616                         if ( $attribute === 'id' ) {
 617                                 global $wgEnforceHtmlIds;
 618                                 $value = Sanitizer::escapeId( $value,
 619                                         $wgEnforceHtmlIds ? 'noninitial' : 'xml' );
 620                         }
 621
 622                         // If this attribute was previously set, override it.
 623                         // Output should only have one attribute of each name.
 624                         $out[$attribute] = $value;
 625                 }
 626                 return $out;
 627         }
 628
 629         /**
 630          * Merge two sets of HTML attributes.  Conflicting items in the second set
 631          * will override those in the first, except for 'class' attributes which
 632          * will be combined (if they're both strings).
 633          *
 634          * @todo implement merging for other attributes such as style
 635          * @param array $a
 636          * @param array $b
 637          * @return array
 638          */
 639         static function mergeAttributes( $a, $b ) {
 640                 $out = array_merge( $a, $b );
 641                 if( isset( $a['class'] ) && isset( $b['class'] )
 642                 && is_string( $a['class'] ) && is_string( $b['class'] )
 643                 && $a['class'] !== $b['class'] ) {
 644                         $classes = preg_split( '/\s+/', "{$a['class']} {$b['class']}",
 645                                 -1, PREG_SPLIT_NO_EMPTY );
 646                         $out['class'] = implode( ' ', array_unique( $classes ) );
 647                 }
 648                 return $out;
 649         }
 650
 651         /**
 652          * Pick apart some CSS and check it for forbidden or unsafe structures.
 653          * Returns a sanitized string (character references decoded, comments
 654          * stripped), or false if it was just too evil.
 655          *
 656          * URL references (url(...), http://, https://) and 'expression' are forbidden.
 657          * @param string $value
 658          * @return mixed String on success; false if rejected or not decodable
 659          */
 660         static function checkCss( $value ) {
 661                 $value = Sanitizer::decodeCharReferences( $value );
 662
 663                 // Remove any comments; IE gets token splitting wrong
 664                 $value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );
 665
 666                 // Decode escape sequences and line continuation
 667                 // See the grammar in the CSS 2 spec, appendix D, Mozilla implements it accurately.
 668                 // IE 8 doesn't implement it at all, but there's no way to introduce url() into
 669                 // IE that doesn't hit Mozilla also.
 670                 static $decodeRegex;
 671                 if ( !$decodeRegex ) {
 672                         $space = '[\\x20\\t\\r\\n\\f]';
 673                         $nl = '(?:\\n|\\r\\n|\\r|\\f)';
 674                         $backslash = '\\\\';
 675                         $decodeRegex = "/ $backslash
 676                                 (?:
 677                                         ($nl) |  # 1. Line continuation
 678                                         ([0-9A-Fa-f]{1,6})$space? |  # 2. character number
 679                                         (.) # 3. backslash cancelling special meaning
 680                                 )/xu";
 681                 }
 682                 $decoded = preg_replace_callback( $decodeRegex,
 683                         array( __CLASS__, 'cssDecodeCallback' ), $value );
 684                 // NULL means PCRE gave up (e.g. invalid UTF-8 under /u); without this
 685                 // check preg_match() below would see nothing and let the value through.
 686                 if ( $decoded === null || preg_match( '!expression|https?://|url\s*\(!i', $decoded ) ) {
 687                         return false;
 688                 }
 689                 // Allowed, return CSS with comments stripped
 690                 return $value;
 691         }
 692
 693         /** Callback for the escape regex in checkCss(): give the text one matched escape stands for. */
 694         static function cssDecodeCallback( $matches ) {
 695                 if ( $matches[1] !== '' ) {
 696                         return ''; # 1. line continuation: contributes nothing
 697                 } elseif ( $matches[2] !== '' ) {
 698                         return codepointToUtf8( hexdec( $matches[2] ) ); # 2. hex character number
 699                 } elseif ( $matches[3] === '' ) {
 700                         throw new MWException( __METHOD__.': invalid match' );
 701                 }
 702                 return $matches[3]; # 3. any other escaped character, taken literally
 703         }
 704
 705         /**
 706          * Turn the raw attribute text of an HTML element (tag soup) into a
 707          * well-formed, whitelisted XML attribute string.
 708          *
 709          * The text is parsed by decodeTagAttributes() and filtered for $element by
 710          * validateTagAttributes(); values are armored by safeEncodeAttribute() so
 711          * the output is safe for further wikitext processing.
 712          *
 713          * - Attribute names come out in lowercase
 714          * - Attributes not on the whitelist for the element are discarded
 715          * - Broken or invalid entities end up as plaintext
 716          * - Every value is double-quoted
 717          * - An attribute without value gets its own name as value
 718          * - Double attributes are discarded
 719          * - Unsafe style attributes are discarded
 720          *
 721          * @param string $text
 722          * @param string $element
 723          * @return string Attributes with a leading space each, or '' if there are none
 724          */
 725         static function fixTagAttributes( $text, $element ) {
 726                 if( trim( $text ) == '' ) {
 727                         return '';
 728                 }
 729
 730                 $decoded = Sanitizer::decodeTagAttributes( $text );
 731                 $allowed = Sanitizer::validateTagAttributes( $decoded, $element );
 732
 733                 $html = '';
 734                 foreach( $allowed as $name => $rawValue ) {
 735                         # Every pair brings its own leading space, so the result starts with one
 736                         $html .= ' ' . htmlspecialchars( $name ) . '="'
 737                                 . Sanitizer::safeEncodeAttribute( $rawValue ) . '"';
 738                 }
 739                 return $html;
 740         }
 741
 742         /**
 743          * Encode an attribute value for HTML output.
 744          *
 745          * On top of htmlspecialchars() escaping (both kinds of quotes included),
 746          * line feeds, carriage returns and tabs become numeric character
 747          * references: whitespace is normalized during attribute decoding, so
 748          * they would not be preserved otherwise.
 749          *
 750          * @param string $text
 751          * @return string HTML-encoded text fragment
 752          */
 753         static function encodeAttribute( $text ) {
 754                 $whitespaceRefs = array(
 755                         "\n" => '&#10;',
 756                         "\r" => '&#13;',
 757                         "\t" => '&#9;',
 758                 );
 759                 return strtr( htmlspecialchars( $text, ENT_QUOTES ), $whitespaceRefs );
 760         }
 761
 762         /**
 763          * Encode an attribute value for HTML tags, with extra armoring against
 764          * further wiki processing.
 765          *
 766          * Builds on encodeAttribute(), then replaces markup that later parsing
 767          * could expand with character references, and finally hides the colon
 768          * of anything wfUrlProtocols() matches via armorLinksCallback().
 769          * @param string $text
 770          * @return string HTML-encoded text fragment
 771          */
 772         static function safeEncodeAttribute( $text ) {
 773                 # Later parsing may expand templates and links into invalid or dangerous output
 774                 $armor = array(
 775                         '<'    => '&lt;',   // This should never happen,
 776                         '>'    => '&gt;',   // we've received invalid input
 777                         '"'    => '&quot;', // which should have been escaped.
 778                         '{'    => '&#123;',
 779                         '['    => '&#91;',
 780                         "''"   => '&#39;&#39;',
 781                         'ISBN' => '&#73;SBN',
 782                         'RFC'  => '&#82;FC',
 783                         'PMID' => '&#80;MID',
 784                         '|'    => '&#124;',
 785                         '__'   => '&#95;_',
 786                 );
 787                 $armored = strtr( Sanitizer::encodeAttribute( $text ), $armor );
 788
 789                 # Stupid hack
 790                 $protocols = '/(' . wfUrlProtocols() . ')/';
 791                 return preg_replace_callback( $protocols,
 792                         array( 'Sanitizer', 'armorLinksCallback' ), $armored );
 793         }
 794
 795         /**
 796          * Escape a value so that it can be used in an id attribute, and return
 797          * it. The value is not validated beyond that, however (see first link).
 798          *
 799          * @see http://www.w3.org/TR/html401/types.html#type-name Valid characters
 800          *                                                          in the id and
 801          *                                                          name attributes
 802          * @see http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
 803          *
 804          * By default the HTML4-compatible dot-encoding is produced: spaces become
 805          * underscores, character references are decoded, the result is
 806          * URL-encoded, and then '%3A' is turned back into ':' and every other
 807          * '%' into '.'. An 'x' is put in front if the id would not start with
 808          * an ASCII letter.
 809          *
 810          * @param string $id      Id to escape
 811          * @param mixed  $options String or array of strings (default is array()):
 812          *   'noninitial': This is a non-initial fragment of an id, not a full id,
 813          *       so no prefix is added when the first character isn't valid at
 814          *       the beginning of an id.
 815          *   'xml': Don't restrict the id to be HTML4-compatible.  This option
 816          *       allows any alphabetic character to be used, per the XML standard.
 817          *       Therefore, it also completely changes the type of escaping: runs
 818          *       of invalid characters (mostly whitespace) and underscores are
 819          *       compressed into a single underscore, underscores at either end
 820          *       are trimmed, and '_' is prepended if the id doesn't start with
 821          *       a valid XML name start character.
 822          * @return string
 823          */
 824         static function escapeId( $id, $options = array() ) {
 825                 $options = (array)$options;
 826                 $xmlStyle = in_array( 'xml', $options );
 827                 $mayPrefix = !in_array( 'noninitial', $options );
 828
 829                 if ( $xmlStyle ) {
 830                         # XML-style escaping.  For the patterns used, see the XML 1.0 standard,
 831                         # 5th edition, NameStartChar and NameChar: <http://www.w3.org/TR/REC-xml/>
 832                         $nameStartChar = ':a-zA-Z_\xC0-\xD6\xD8-\xF6\xF8-\x{2FF}\x{370}-\x{37D}'
 833                                 . '\x{37F}-\x{1FFF}\x{200C}-\x{200D}\x{2070}-\x{218F}\x{2C00}-\x{2FEF}'
 834                                 . '\x{3001}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFFD}\x{10000}-\x{EFFFF}';
 835                         $nameChar = $nameStartChar . '.\-0-9\xB7\x{0300}-\x{036F}'
 836                                 . '\x{203F}-\x{2040}';
 837
 838                         # Replace _ as well so we don't get multiple consecutive underscores
 839                         $id = trim( preg_replace( "/([^$nameChar]|_)+/u", '_', $id ), '_' );
 840                         if ( $mayPrefix && !preg_match( "/^[$nameStartChar]/u", $id ) ) {
 841                                 $id = "_$id";
 842                         }
 843                         return $id;
 844                 }
 845
 846                 # HTML4-style escaping
 847                 $id = Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) );
 848                 $id = str_replace( array( '%3A', '%' ), array( ':', '.' ), urlencode( $id ) );
 849                 if ( $mayPrefix && !preg_match( '/^[a-zA-Z]/', $id ) ) {
 850                         // Initial character must be a letter!
 851                         $id = "x$id";
 852                 }
 853                 return $id;
 854         }
 855
 856         /**
 857          * Given a value, escape it so that it can be used as a CSS class and
 858          * return it. Ugly characters are converted to underscores, runs of
 859          * underscores are collapsed, and trailing underscores are removed.
 860          *
 861          * @todo For extra validity, input should be validated UTF-8.
 862          * @see http://www.w3.org/TR/CSS21/syndata.html Valid characters/format
 863          * @param string $class
 864          * @return string
 865          */
 866         static function escapeClass( $class ) {
 867                 // Ugly stuff: a leading digit or hyphen, bytes 0x00-0x20, most ASCII punctuation, NBSP
 868                 $ugly = '/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/';
 869                 $class = preg_replace( $ugly, '_', $class );
 870                 // Kill underscores in ugly places: squeeze runs, strip the trailing ones
 871                 $class = preg_replace( '/_+/', '_', $class );
 872                 return rtrim( $class, '_' );
 873         }
 874
 875         /**
 876          * Given HTML input, escape with htmlspecialchars but un-escape entities.
 877          * This allows (generally harmless) entities like &nbsp; to survive.
 878          * The ampersands are restored after escaping and the text is then
 879          * passed through normalizeCharReferences().
 880          *
 881          * @param  string $html String to escape
 882          * @return string Escaped input
 883          */
 884         static function escapeHtmlAllowEntities( $html ) {
 885                 # ENT_QUOTES: it seems wise to escape ' as well as ", as a matter of course.
 886                 $escaped = htmlspecialchars( $html, ENT_QUOTES );
 887                 $entitiesBack = strtr( $escaped, array( '&amp;' => '&' ) );
 888                 return Sanitizer::normalizeCharReferences( $entitiesBack );
 889         }
 890
 891         /**
 892          * Regex replace callback for armoring links against further processing:
 893          * replaces every colon in the first captured group by &#58;.
 894          * @param array $matches
 895          * @return string
 896          */
 897         private static function armorLinksCallback( $matches ) {
 898                 return strtr( $matches[1], array( ':' => '&#58;' ) );
 899         }
 900
 901         /**
 902          * Return an associative array of attribute names and values from
 903          * a partial tag string.
 904          *
 905          * Pairs are found with MW_ATTRIBS_REGEX. Attribute names are forced to
 906          * lowercase; in values, whitespace runs are collapsed to one space, the
 907          * ends are trimmed, and character references are decoded to UTF-8 text.
 908          * If a name occurs more than once, its last value is the one returned.
 909          *
 910          * @param string $text
 911          * @return array Attribute name => decoded value; empty if $text is
 912          *               blank or nothing matches
 913          */
 914         public static function decodeTagAttributes( $text ) {
 915                 if( trim( $text ) == '' ) {
 916                         return array();
 917                 }
 918
 919                 $matchSets = array();
 920                 $found = preg_match_all( MW_ATTRIBS_REGEX, $text, $matchSets, PREG_SET_ORDER );
 921                 if( !$found ) {
 922                         return array();
 923                 }
 924
 925                 $decoded = array();
 926                 foreach( $matchSets as $match ) {
 927                         $name = strtolower( $match[1] );
 928                         $raw = Sanitizer::getTagAttributeCallback( $match );
 929
 930                         // Normalize whitespace
 931                         $raw = trim( preg_replace( '/[\t\r\n ]+/', ' ', $raw ) );
 932
 933                         // Decode character references
 934                         $decoded[$name] = Sanitizer::decodeCharReferences( $raw );
 935                 }
 936                 return $decoded;
 937         }
 938
 939         /**
 940          * Pick the appropriate attribute value from a match set from the
 941          * MW_ATTRIBS_REGEX matches.
 942          *
 943          * The value is taken from the highest-numbered group that is set:
 944          * 6 (illegal #XXXXXX color with no quotes), 5 (no quotes),
 945          * 4 (single-quoted) or 3 (double-quoted). If not even group 2 is set,
 946          * the attribute came without a value and its name is returned, since
 947          * in XHTML attributes must have a value.
 948          *
 949          * @param array $set
 950          * @return string
 951          * @throws MWException if group 2 is set but none of groups 3 to 6 is
 952          */
 953         private static function getTagAttributeCallback( $set ) {
 954                 # Value groups, tried from the highest one down
 955                 foreach( array( 6, 5, 4, 3 ) as $group ) {
 956                         if( isset( $set[$group] ) ) {
 957                                 return $set[$group];
 958                         }
 959                 }
 960
 961                 if( isset( $set[2] ) ) {
 962                         throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
 963                 }
 964
 965                 # For 'reduced' form, return explicitly the attribute name here.
 966                 return $set[1];
 967         }
 968
 969         /**
 970          * Normalize whitespace and character references in an XML source-
 971          * encoded text for an attribute value, and escape double quotes.
 972          *
 973          * See http://www.w3.org/TR/REC-xml/#AVNormalize for background,
 974          * but note that what is returned is not the value itself: it is an
 975          * XML source fragment that will be slapped into output.
 976          *
 977          * @param string $text
 978          * @return string
 979          * @private
 980          */
 981         private static function normalizeAttributeValue( $text ) {
 982                 $text = Sanitizer::normalizeCharReferences( $text );
 983                 $text = self::normalizeWhitespace( $text );
 984                 return str_replace( '"', '&quot;', $text );
 985         }
 986
 987         /** Replace each CRLF pair and each single space, CR, LF or tab in $text by one space. */
 988         private static function normalizeWhitespace( $text ) {
 989                 $whitespace = '/\r\n|[\x20\x0d\x0a\x09]/';
 990                 $spaced = preg_replace( $whitespace, ' ', $text );
 991                 return $spaced;
 992         }
 993
 994         /**
 995          * Ensure that any entities and character references are legal
 996          * for XML and XHTML specifically. Any stray bits will be
 997          * &amp;-escaped to result in a valid text fragment.
 998          *
 999          * Every MW_CHAR_REFS_REGEX match in the text is rewritten by
1000          * normalizeCharReferencesCallback(), with this outcome:
1001          *
1002          * a. named char refs must be known in XHTML
1003          * b. numeric char refs must be legal chars, not invalid or forbidden
1004          * c. hexadecimal char refs use &#x, not &#X
1005          * d. whatever fails a or b, and any lone ampersand, is &amp;-escaped
1006          *
1007          * @param string $text
1008          * @return string
1009          */
1010         static function normalizeCharReferences( $text ) {
1011                 $callback = array( 'Sanitizer', 'normalizeCharReferencesCallback' );
1012                 return preg_replace_callback( MW_CHAR_REFS_REGEX, $callback, $text );
1013         }
1014         /**
1015          * Callback for normalizeCharReferences(): normalize a single match of
1016          * MW_CHAR_REFS_REGEX.
1017          * @param array $matches Groups: 1 entity name, 2 decimal number, 3 and 4
1018          *   hex number (after &#x and &#X); a lone ampersand fills none of them
1019          * @return string The normalized reference, or the whole match run
1020          *   through htmlspecialchars() when no legal reference comes out
1021          */
1022         static function normalizeCharReferencesCallback( $matches ) {
1023                 $normalized = null;
1024                 if( $matches[1] != '' ) {
1025                         $normalized = Sanitizer::normalizeEntity( $matches[1] );
1026                 } elseif( $matches[2] != '' ) {
1027                         $normalized = Sanitizer::decCharReference( $matches[2] );
1028                 } elseif( $matches[3] != '' ) {
1029                         $normalized = Sanitizer::hexCharReference( $matches[3] );
1030                 } elseif( $matches[4] != '' ) {
1031                         $normalized = Sanitizer::hexCharReference( $matches[4] );
1032                 }
1033                 return ( $normalized === null ) ? htmlspecialchars( $matches[0] ) : $normalized;
1034         }
1035
1036         /**
1037          * Normalize a named entity reference, given the name between & and ;.
1038          *
1039          * If the entity is a MediaWiki-specific alias ($wgHtmlEntityAliases),
1040          * returns the HTML equivalent. If the named entity is defined in the
1041          * HTML 4.0/XHTML 1.0 DTD ($wgHtmlEntities), return the named entity
1042          * reference as is. Otherwise, returns HTML-escaped text of
1043          * pseudo-entity source (eg &amp;foo;)
1044          *
1045          * @param string $name
1046          * @return string
1047          */
1048         static function normalizeEntity( $name ) {
1049                 global $wgHtmlEntities, $wgHtmlEntityAliases;
1050                 if ( isset( $wgHtmlEntityAliases[$name] ) ) {
1051                         return '&' . $wgHtmlEntityAliases[$name] . ';';
1052                 }
1053                 $ampersand = isset( $wgHtmlEntities[$name] ) ? '&' : '&amp;';
1054                 return $ampersand . $name . ';';
1055         }
1056
1057         /** Canonical "&#N;" form of a decimal character reference, or null if N is not a valid codepoint. */
1058         static function decCharReference( $codepoint ) {
1059                 $number = intval( $codepoint );
1060                 if( !Sanitizer::validateCodepoint( $number ) ) {
1061                         return null;
1062                 }
1063                 return sprintf( '&#%d;', $number );
1064         }
1065
1066         /** Canonical "&#x...;" form of a hex character reference, or null if malformed or not a valid codepoint. */
1067         static function hexCharReference( $codepoint ) {
1068                 if( !preg_match( '/^[0-9A-Fa-f]+$/D', $codepoint ) ) {
1069                         return null; // hexdec() would silently skip the stray letters ("4g1" -> 0x41)
1070                 }
1071                 $point = hexdec( $codepoint );
1072                 return Sanitizer::validateCodepoint( $point ) ? sprintf( '&#x%x;', $point ) : null;
1073         }
1074
1075         /**
1076          * Returns true if a given Unicode codepoint is a valid character in XML
1077          * (tab, LF, CR, U+0020-U+D7FF, U+E000-U+FFFD or U+10000-U+10FFFF).
1078          * @param int $codepoint
1079          * @return bool
1080          */
1081         private static function validateCodepoint( $codepoint ) {
1082                 $isWhitespace = $codepoint == 0x09 || $codepoint == 0x0a || $codepoint == 0x0d;
1083                 return $isWhitespace
1084                         || ( $codepoint >= 0x20 && $codepoint <= 0xd7ff )
1085                         || ( $codepoint >= 0xe000 && $codepoint <= 0xfffd )
1086                         || ( $codepoint >= 0x10000 && $codepoint <= 0x10ffff );
1087         }
1088
1089         /**
1090          * Decode any character references, numeric or named entities,
1091          * in the text and return a UTF-8 string.
1092          *
1093          * Each MW_CHAR_REFS_REGEX match is replaced by whatever
1094          * decodeCharReferencesCallback() returns for it.
1095          *
1096          * @param string $text
1097          * @return string
1098          * @public
1099          */
1100         public static function decodeCharReferences( $text ) {
1101                 $callback = array( 'Sanitizer', 'decodeCharReferencesCallback' );
1102                 return preg_replace_callback( MW_CHAR_REFS_REGEX, $callback, $text );
1103         }
1104
1105         /**
1106          * Callback for decodeCharReferences(): decode one MW_CHAR_REFS_REGEX match.
1107          * @param array $matches Groups: 1 entity name, 2 decimal, 3 and 4 hex digits
1108          * @return string Decoded text, or the match itself if it is no well-formed reference
1109          */
1110         static function decodeCharReferencesCallback( $matches ) {
1111                 if( $matches[1] != '' ) {
1112                         return Sanitizer::decodeEntity( $matches[1] );
1113                 } elseif( $matches[2] != '' ) {
1114                         return Sanitizer::decodeChar( intval( $matches[2] ) );
1115                 }
1116                 $hex = ( $matches[3] != '' ) ? $matches[3] : $matches[4];
1117                 if( preg_match( '/^[0-9A-Fa-f]+$/D', $hex ) ) { # hexdec() would silently skip stray letters ("4g1" -> 0x41)
1118                         return Sanitizer::decodeChar( hexdec( $hex ) );
1119                 }
1120                 return $matches[0]; # lone ampersand or malformed hex reference
1121         }
1122
1123         /**
1124          * Return UTF-8 string for a codepoint if that is a valid
1125          * character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
1126          * Validity is decided by validateCodepoint(); the replacement
1127          * character comes from the UTF8_REPLACEMENT constant.
1128          *
1129          * @param int $codepoint
1130          * @return string
1131          * @private
1132          */
1133         static function decodeChar( $codepoint ) {
1134                 $isValid = Sanitizer::validateCodepoint( $codepoint );
1135                 return $isValid ? codepointToUtf8( $codepoint ) : UTF8_REPLACEMENT;
1136         }
1137
1138         /**
1139          * Decode a named entity, given the name between & and ;.
1140          *
1141          * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD, return
1142          * the UTF-8 encoding of that character; an alias listed in
1143          * $wgHtmlEntityAliases is resolved first. Otherwise, returns
1144          * pseudo-entity source (eg &foo;)
1145          * @param string $name
1146          * @return string
1147          */
1148         static function decodeEntity( $name ) {
1149                 global $wgHtmlEntities, $wgHtmlEntityAliases;
1150                 # An alias stands for the entity it names
1151                 $resolved = isset( $wgHtmlEntityAliases[$name] ) ? $wgHtmlEntityAliases[$name] : $name;
1152                 if( !isset( $wgHtmlEntities[$resolved] ) ) {
1153                         return '&' . $resolved . ';';
1154                 }
1155                 return codepointToUtf8( $wgHtmlEntities[$resolved] );
1156         }
1157
1158         /**
1159          * Fetch the whitelist of acceptable attributes for a given element
1160          * name. The table from setupAttributeWhitelist() is built on first use.
1161          * @param string $element
1162          * @return array Allowed attribute names; empty for an unlisted element
1163          */
1164         static function attributeWhitelist( $element ) {
1165                 static $whitelist = null;
1166                 if( $whitelist === null ) {
1167                         $whitelist = Sanitizer::setupAttributeWhitelist();
1168                 }
1169                 if( !isset( $whitelist[$element] ) ) {
1170                         return array();
1171                 }
1172                 return $whitelist[$element];
1173         }
1174
1175         /**
1176          * Build the attribute whitelist: for each array key (an allowed HTML
1177          * element), the value is the array of attribute names allowed on it.
1178          * @return array Element name => array of allowed attribute names
1179          */
1180         static function setupAttributeWhitelist() {
1181                 $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' ); # shared by most elements below
1182                 $block = array_merge( $common, array( 'align' ) ); # $common plus 'align'
1183                 $tablealign = array( 'align', 'char', 'charoff', 'valign' ); # alignment of table parts
1184                 $tablecell = array( 'abbr', # used for td and th only
1185                                     'axis',
1186                                     'headers',
1187                                     'scope',
1188                                     'rowspan',
1189                                     'colspan',
1190                                     'nowrap', # deprecated
1191                                     'width',  # deprecated
1192                                     'height', # deprecated
1193                                     'bgcolor' # deprecated
1194                                     );
1195
1196                 # Numbers refer to sections in HTML 4.01 standard describing the element.
1197                 # See: http://www.w3.org/TR/html4/
1198                 $whitelist = array (
1199                         # 7.5.4
1200                         'div'        => $block,
1201                         'center'     => $common, # deprecated
1202                         'span'       => $block, # ??
1203
1204                         # 7.5.5
1205                         'h1'         => $block,
1206                         'h2'         => $block,
1207                         'h3'         => $block,
1208                         'h4'         => $block,
1209                         'h5'         => $block,
1210                         'h6'         => $block,
1211
1212                         # 7.5.6
1213                         # address
1214
1215                         # 8.2.4
1216                         # bdo
1217
1218                         # 9.2.1
1219                         'em'         => $common,
1220                         'strong'     => $common,
1221                         'cite'       => $common,
1222                         # dfn
1223                         'code'       => $common,
1224                         # samp
1225                         # kbd
1226                         'var'        => $common,
1227                         # abbr
1228                         # acronym
1229
1230                         # 9.2.2
1231                         'blockquote' => array_merge( $common, array( 'cite' ) ),
1232                         # q
1233
1234                         # 9.3.1
1235                         'sub'        => $common,
1236                         'sup'        => $common,
1237
1238                         # 9.3.1
1239                         'p'          => $block,
1240
1241                         # 9.3.2
1242                         'br'         => array( 'id', 'class', 'title', 'style', 'clear' ),
1243
1244                         # 9.3.4
1245                         'pre'        => array_merge( $common, array( 'width' ) ),
1246
1247                         # 9.4
1248                         'ins'        => array_merge( $common, array( 'cite', 'datetime' ) ),
1249                         'del'        => array_merge( $common, array( 'cite', 'datetime' ) ),
1250
1251                         # 10.2
1252                         'ul'         => array_merge( $common, array( 'type' ) ),
1253                         'ol'         => array_merge( $common, array( 'type', 'start' ) ),
1254                         'li'         => array_merge( $common, array( 'type', 'value' ) ),
1255
1256                         # 10.3
1257                         'dl'         => $common,
1258                         'dd'         => $common,
1259                         'dt'         => $common,
1260
1261                         # 11.2.1
1262                         'table'      => array_merge( $common,
1263                                                                 array( 'summary', 'width', 'border', 'frame',
1264                                                                                 'rules', 'cellspacing', 'cellpadding',
1265                                                                                 'align', 'bgcolor',
1266                                                                 ) ),
1267
1268                         # 11.2.2
1269                         'caption'    => array_merge( $common, array( 'align' ) ),
1270
1271                         # 11.2.3
1272                         'thead'      => array_merge( $common, $tablealign ),
1273                         'tfoot'      => array_merge( $common, $tablealign ),
1274                         'tbody'      => array_merge( $common, $tablealign ),
1275
1276                         # 11.2.4
1277                         'colgroup'   => array_merge( $common, array( 'span', 'width' ), $tablealign ),
1278                         'col'        => array_merge( $common, array( 'span', 'width' ), $tablealign ),
1279
1280                         # 11.2.5
1281                         'tr'         => array_merge( $common, array( 'bgcolor' ), $tablealign ),
1282
1283                         # 11.2.6
1284                         'td'         => array_merge( $common, $tablecell, $tablealign ),
1285                         'th'         => array_merge( $common, $tablecell, $tablealign ),
1286
1287                         # 13.2
1288                         # Not usually allowed, but may be used for extension-style hooks
1289                         # such as <math> when it is rasterized
1290                         'img'        => array_merge( $common, array( 'alt' ) ),
1291
1292                         # 15.2.1
1293                         'tt'         => $common,
1294                         'b'          => $common,
1295                         'i'          => $common,
1296                         'big'        => $common,
1297                         'small'      => $common,
1298                         'strike'     => $common,
1299                         's'          => $common,
1300                         'u'          => $common,
1301
1302                         # 15.2.2
1303                         'font'       => array_merge( $common, array( 'size', 'color', 'face' ) ),
1304                         # basefont
1305
1306                         # 15.3
1307                         'hr'         => array_merge( $common, array( 'noshade', 'size', 'width' ) ),
1308
1309                         # XHTML Ruby annotation text module, simple ruby only.
1310                         # http://www.w3c.org/TR/ruby/
1311                         'ruby'       => $common,
1312                         # rbc
1313                         # rtc
1314                         'rb'         => $common,
1315                         'rt'         => $common, #array_merge( $common, array( 'rbspan' ) ),
1316                         'rp'         => $common,
1317
1318                         # MathML root element, where used for extensions
1319                         # 'title' may not be 100% valid here; it's XHTML
1320                         # http://www.w3.org/TR/REC-MathML/
1321                         'math'       => array( 'class', 'style', 'id', 'title' ),
1322                         );
1323                 return $whitelist;
1324         }
1325
1326         /**
1327          * Take a fragment of (potentially invalid) HTML and return
1328          * a version with any tags removed, encoded as plain text.
1329          *
1330          * Once the tags are gone, character references are decoded and
1331          * whitespace is normalized with normalizeWhitespace().
1332          *
1333          * Warning: this return value must be further escaped for literal
1334          * inclusion in HTML output as of 1.10!
1335          *
1336          * @param string $text HTML fragment
1337          * @return string
1338          */
1339         static function stripAllTags( $text ) {
1340                 # Actual <tags>
1341                 $tagless = StringUtils::delimiterReplace( '<', '>', '', $text );
1342
1343                 # Normalize &entities and whitespace
1344                 return self::normalizeWhitespace( self::decodeCharReferences( $tagless ) );
1345         }
1346
1347         /**
1348          * Hack up a private DOCTYPE with HTML's standard entity declarations.
1349          * PHP 4 seemed to know these if you gave it an HTML doctype, but
1350          * PHP 5.1 doesn't.
1351          *
1352          * Use for passing XHTML fragments to PHP's XML parsing functions.
1353          * Each entry of $wgHtmlEntities (name => codepoint) is declared as
1354          * the numeric character reference of its codepoint.
1355          *
1356          * @return string
1357          */
1358         static function hackDocType() {
1359                 global $wgHtmlEntities;
1360                 $declarations = '';
1361                 foreach( $wgHtmlEntities as $name => $number ) {
1362                         $declarations .= '<!ENTITY ' . $name . ' "&#' . $number . ';">';
1363                 }
1364                 return "<!DOCTYPE html [\n" . $declarations . "]>\n";
1365         }
1366
1367         static function cleanUrl( $url ) {
1368                 # Normalize any HTML entities in input. They will be
1369                 # re-escaped by makeExternalLink().
1370                 $url = Sanitizer::decodeCharReferences( $url );
1371
1372                 # Escape any control characters introduced by the above step
1373                 $url = preg_replace( '/[\][<>"\\x00-\\x20\\x7F]/e', "urlencode('\\0')", $url );
1374
1375                 # Validate hostname portion
1376                 $matches = array();
1377                 if( preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) {
1378                         list( /* $whole */, $protocol, $host, $rest ) = $matches;
1379
1380                         // Characters that will be ignored in IDNs.
1381                         // http://tools.ietf.org/html/3454#section-3.1
1382                         // Strip them before further processing so blacklists and such work.
1383                         $strip = "/
1384                                 \\s|          # general whitespace
1385                                 \xc2\xad|     # 00ad SOFT HYPHEN
1386                                 \xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
1387                                 \xe2\x80\x8b| # 200b ZERO WIDTH SPACE
1388                                 \xe2\x81\xa0| # 2060 WORD JOINER
1389                                 \xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
1390                                 \xcd\x8f|     # 034f COMBINING GRAPHEME JOINER
1391                                 \xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
1392                                 \xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
1393                                 \xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
1394                                 \xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
1395                                 \xe2\x80\x8d| # 200d ZERO WIDTH JOINER
1396                                 [\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe00f VARIATION SELECTOR-1-16
1397                                 /xuD";
1398
1399                         $host = preg_replace( $strip, '', $host );
1400
1401                         // @fixme: validate hostnames here
1402
1403                         return $protocol . $host . $rest;
1404                 } else {
1405                         return $url;
1406                 }
1407         }
1408
1409 }