includes/Sanitizer.php

   1 <?php
   2 /**
   3  * (X)HTML sanitizer for MediaWiki
   4  *
   5  * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
   6  * http://www.mediawiki.org/
   7  *
   8  * This program is free software; you can redistribute it and/or modify
   9  * it under the terms of the GNU General Public License as published by
  10  * the Free Software Foundation; either version 2 of the License, or
  11  * (at your option) any later version.
  12  *
  13  * This program is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16  * GNU General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU General Public License along
  19  * with this program; if not, write to the Free Software Foundation, Inc.,
  20  * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  21  * http://www.gnu.org/copyleft/gpl.html
  22  *
  23  * @package MediaWiki
  24  * @subpackage Parser
  25  */
  26
  27 /**
  28  * Regular expression to match various types of character references in
  29  * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences
  30  */
  31 define( 'MW_CHAR_REFS_REGEX',
  32         '/&([A-Za-z0-9]+);
  33          |&\#([0-9]+);
  34          |&\#x([0-9A-Za-z]+);
  35          |&\#X([0-9A-Za-z]+);
  36          |(&)/x' );
  37
  38 /**
  39  * Regular expression to match HTML/XML attribute pairs within a tag.
  40  * Allows some... latitude.
  41  * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
  42  */
# Fragments interpolated into MW_ATTRIBS_REGEX below.
# $attrib matches one character of an attribute name (ASCII letter or digit).
$attrib = '[A-Za-z0-9]';
# $space matches one whitespace character: tab, LF, CR or space.
$space = '[\x09\x0a\x0d\x20]';
# Capture groups of the resulting pattern:
#   1 = attribute name
#   2 = the whole "= value" part (absent for a bare attribute)
#   3 = double-quoted value, 4 = single-quoted value,
#   5 = unquoted value, 6 = unquoted "#hex" colour value
# The pattern uses the x (extended) and s flags, so the layout whitespace and
# the "#" comments inside the string are not part of what is matched.
# NOTE(review): the character class of group 5 already contains "#", so the
# group 6 alternative looks unreachable -- confirm before relying on it.
define( 'MW_ATTRIBS_REGEX',
        "/(?:^|$space)($attrib+)
          ($space*=$space*
                (?:
                 # The attribute value: quoted or alone
                  \"([^<\"]*)\"
                 | '([^<']*)'
                 |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
                 |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
                                                        # colors are specified like this.
                                                        # We'll be normalizing it.
                )
           )?(?=$space|\$)/sx" );
  58
  59 /**
  60  * List of all named character entities defined in HTML 4.01
  61  * http://www.w3.org/TR/html4/sgml/entities.html
  62  * @access private
  63  */
  64 global $wgHtmlEntities;
  65 $wgHtmlEntities = array(
  66         'Aacute'   => 193,
  67         'aacute'   => 225,
  68         'Acirc'    => 194,
  69         'acirc'    => 226,
  70         'acute'    => 180,
  71         'AElig'    => 198,
  72         'aelig'    => 230,
  73         'Agrave'   => 192,
  74         'agrave'   => 224,
  75         'alefsym'  => 8501,
  76         'Alpha'    => 913,
  77         'alpha'    => 945,
  78         'amp'      => 38,
  79         'and'      => 8743,
  80         'ang'      => 8736,
  81         'Aring'    => 197,
  82         'aring'    => 229,
  83         'asymp'    => 8776,
  84         'Atilde'   => 195,
  85         'atilde'   => 227,
  86         'Auml'     => 196,
  87         'auml'     => 228,
  88         'bdquo'    => 8222,
  89         'Beta'     => 914,
  90         'beta'     => 946,
  91         'brvbar'   => 166,
  92         'bull'     => 8226,
  93         'cap'      => 8745,
  94         'Ccedil'   => 199,
  95         'ccedil'   => 231,
  96         'cedil'    => 184,
  97         'cent'     => 162,
  98         'Chi'      => 935,
  99         'chi'      => 967,
 100         'circ'     => 710,
 101         'clubs'    => 9827,
 102         'cong'     => 8773,
 103         'copy'     => 169,
 104         'crarr'    => 8629,
 105         'cup'      => 8746,
 106         'curren'   => 164,
 107         'dagger'   => 8224,
 108         'Dagger'   => 8225,
 109         'darr'     => 8595,
 110         'dArr'     => 8659,
 111         'deg'      => 176,
 112         'Delta'    => 916,
 113         'delta'    => 948,
 114         'diams'    => 9830,
 115         'divide'   => 247,
 116         'Eacute'   => 201,
 117         'eacute'   => 233,
 118         'Ecirc'    => 202,
 119         'ecirc'    => 234,
 120         'Egrave'   => 200,
 121         'egrave'   => 232,
 122         'empty'    => 8709,
 123         'emsp'     => 8195,
 124         'ensp'     => 8194,
 125         'Epsilon'  => 917,
 126         'epsilon'  => 949,
 127         'equiv'    => 8801,
 128         'Eta'      => 919,
 129         'eta'      => 951,
 130         'ETH'      => 208,
 131         'eth'      => 240,
 132         'Euml'     => 203,
 133         'euml'     => 235,
 134         'euro'     => 8364,
 135         'exist'    => 8707,
 136         'fnof'     => 402,
 137         'forall'   => 8704,
 138         'frac12'   => 189,
 139         'frac14'   => 188,
 140         'frac34'   => 190,
 141         'frasl'    => 8260,
 142         'Gamma'    => 915,
 143         'gamma'    => 947,
 144         'ge'       => 8805,
 145         'gt'       => 62,
 146         'harr'     => 8596,
 147         'hArr'     => 8660,
 148         'hearts'   => 9829,
 149         'hellip'   => 8230,
 150         'Iacute'   => 205,
 151         'iacute'   => 237,
 152         'Icirc'    => 206,
 153         'icirc'    => 238,
 154         'iexcl'    => 161,
 155         'Igrave'   => 204,
 156         'igrave'   => 236,
 157         'image'    => 8465,
 158         'infin'    => 8734,
 159         'int'      => 8747,
 160         'Iota'     => 921,
 161         'iota'     => 953,
 162         'iquest'   => 191,
 163         'isin'     => 8712,
 164         'Iuml'     => 207,
 165         'iuml'     => 239,
 166         'Kappa'    => 922,
 167         'kappa'    => 954,
 168         'Lambda'   => 923,
 169         'lambda'   => 955,
 170         'lang'     => 9001,
 171         'laquo'    => 171,
 172         'larr'     => 8592,
 173         'lArr'     => 8656,
 174         'lceil'    => 8968,
 175         'ldquo'    => 8220,
 176         'le'       => 8804,
 177         'lfloor'   => 8970,
 178         'lowast'   => 8727,
 179         'loz'      => 9674,
 180         'lrm'      => 8206,
 181         'lsaquo'   => 8249,
 182         'lsquo'    => 8216,
 183         'lt'       => 60,
 184         'macr'     => 175,
 185         'mdash'    => 8212,
 186         'micro'    => 181,
 187         'middot'   => 183,
 188         'minus'    => 8722,
 189         'Mu'       => 924,
 190         'mu'       => 956,
 191         'nabla'    => 8711,
 192         'nbsp'     => 160,
 193         'ndash'    => 8211,
 194         'ne'       => 8800,
 195         'ni'       => 8715,
 196         'not'      => 172,
 197         'notin'    => 8713,
 198         'nsub'     => 8836,
 199         'Ntilde'   => 209,
 200         'ntilde'   => 241,
 201         'Nu'       => 925,
 202         'nu'       => 957,
 203         'Oacute'   => 211,
 204         'oacute'   => 243,
 205         'Ocirc'    => 212,
 206         'ocirc'    => 244,
 207         'OElig'    => 338,
 208         'oelig'    => 339,
 209         'Ograve'   => 210,
 210         'ograve'   => 242,
 211         'oline'    => 8254,
 212         'Omega'    => 937,
 213         'omega'    => 969,
 214         'Omicron'  => 927,
 215         'omicron'  => 959,
 216         'oplus'    => 8853,
 217         'or'       => 8744,
 218         'ordf'     => 170,
 219         'ordm'     => 186,
 220         'Oslash'   => 216,
 221         'oslash'   => 248,
 222         'Otilde'   => 213,
 223         'otilde'   => 245,
 224         'otimes'   => 8855,
 225         'Ouml'     => 214,
 226         'ouml'     => 246,
 227         'para'     => 182,
 228         'part'     => 8706,
 229         'permil'   => 8240,
 230         'perp'     => 8869,
 231         'Phi'      => 934,
 232         'phi'      => 966,
 233         'Pi'       => 928,
 234         'pi'       => 960,
 235         'piv'      => 982,
 236         'plusmn'   => 177,
 237         'pound'    => 163,
 238         'prime'    => 8242,
 239         'Prime'    => 8243,
 240         'prod'     => 8719,
 241         'prop'     => 8733,
 242         'Psi'      => 936,
 243         'psi'      => 968,
 244         'quot'     => 34,
 245         'radic'    => 8730,
 246         'rang'     => 9002,
 247         'raquo'    => 187,
 248         'rarr'     => 8594,
 249         'rArr'     => 8658,
 250         'rceil'    => 8969,
 251         'rdquo'    => 8221,
 252         'real'     => 8476,
 253         'reg'      => 174,
 254         'rfloor'   => 8971,
 255         'Rho'      => 929,
 256         'rho'      => 961,
 257         'rlm'      => 8207,
 258         'rsaquo'   => 8250,
 259         'rsquo'    => 8217,
 260         'sbquo'    => 8218,
 261         'Scaron'   => 352,
 262         'scaron'   => 353,
 263         'sdot'     => 8901,
 264         'sect'     => 167,
 265         'shy'      => 173,
 266         'Sigma'    => 931,
 267         'sigma'    => 963,
 268         'sigmaf'   => 962,
 269         'sim'      => 8764,
 270         'spades'   => 9824,
 271         'sub'      => 8834,
 272         'sube'     => 8838,
 273         'sum'      => 8721,
 274         'sup'      => 8835,
 275         'sup1'     => 185,
 276         'sup2'     => 178,
 277         'sup3'     => 179,
 278         'supe'     => 8839,
 279         'szlig'    => 223,
 280         'Tau'      => 932,
 281         'tau'      => 964,
 282         'there4'   => 8756,
 283         'Theta'    => 920,
 284         'theta'    => 952,
 285         'thetasym' => 977,
 286         'thinsp'   => 8201,
 287         'THORN'    => 222,
 288         'thorn'    => 254,
 289         'tilde'    => 732,
 290         'times'    => 215,
 291         'trade'    => 8482,
 292         'Uacute'   => 218,
 293         'uacute'   => 250,
 294         'uarr'     => 8593,
 295         'uArr'     => 8657,
 296         'Ucirc'    => 219,
 297         'ucirc'    => 251,
 298         'Ugrave'   => 217,
 299         'ugrave'   => 249,
 300         'uml'      => 168,
 301         'upsih'    => 978,
 302         'Upsilon'  => 933,
 303         'upsilon'  => 965,
 304         'Uuml'     => 220,
 305         'uuml'     => 252,
 306         'weierp'   => 8472,
 307         'Xi'       => 926,
 308         'xi'       => 958,
 309         'Yacute'   => 221,
 310         'yacute'   => 253,
 311         'yen'      => 165,
 312         'Yuml'     => 376,
 313         'yuml'     => 255,
 314         'Zeta'     => 918,
 315         'zeta'     => 950,
 316         'zwj'      => 8205,
 317         'zwnj'     => 8204 );
 318
 319 /** @package MediaWiki */
 320 class Sanitizer {
 321         /**
 322          * Cleans up HTML, removes dangerous tags and attributes, and
 323          * removes HTML comments
 324          * @access private
 325          * @param string $text
 326          * @param callback $processCallback to do any variable or parameter replacements in HTML attribute values
 327          * @param array $args for the processing callback
 328          * @return string
 329          */
 330         function removeHTMLtags( $text, $processCallback = null, $args = array() ) {
 331                 global $wgUseTidy, $wgUserHtml;
 332                 $fname = 'Parser::removeHTMLtags';
 333                 wfProfileIn( $fname );
 334
 335                 if( $wgUserHtml ) {
 336                         $htmlpairs = array( # Tags that must be closed
 337                                 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
 338                                 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
 339                                 'strike', 'strong', 'tt', 'var', 'div', 'center',
 340                                 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
 341                                 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span'
 342                         );
 343                         $htmlsingle = array(
 344                                 'br', 'hr', 'li', 'dt', 'dd'
 345                         );
 346                         $htmlsingleonly = array( # Elements that cannot have close tags
 347                                 'br', 'hr'
 348                         );
 349                         $htmlnest = array( # Tags that can be nested--??
 350                                 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
 351                                 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
 352                         );
 353                         $tabletags = array( # Can only appear inside table
 354                                 'td', 'th', 'tr'
 355                         );
 356                 } else {
 357                         $htmlpairs = array();
 358                         $htmlsingle = array();
 359                         $htmlnest = array();
 360                         $tabletags = array();
 361                 }
 362
 363                 $htmlsingle = array_merge( $tabletags, $htmlsingle );
 364                 $htmlelements = array_merge( $htmlsingle, $htmlpairs );
 365
 366                 # Remove HTML comments
 367                 $text = Sanitizer::removeHTMLcomments( $text );
 368
 369                 $bits = explode( '<', $text );
 370                 $text = array_shift( $bits );
 371                 if(!$wgUseTidy) {
 372                         $tagstack = array(); $tablestack = array();
 373                         foreach ( $bits as $x ) {
 374                                 $prev = error_reporting( E_ALL & ~( E_NOTICE | E_WARNING ) );
 375                                 preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
 376                                 $x, $regs );
 377                                 list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
 378                                 error_reporting( $prev );
 379
 380                                 $badtag = 0 ;
 381                                 if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
 382                                         # Check our stack
 383                                         if ( $slash ) {
 384                                                 # Closing a tag...
 385                                                 if( in_array( $t, $htmlsingleonly ) ) {
 386                                                         $badtag = 1;
 387                                                 } elseif( !in_array( $t, $htmlsingle ) &&
 388                                                 ( $ot = @array_pop( $tagstack ) ) != $t ) {
 389                                                         @array_push( $tagstack, $ot );
 390                                                         $badtag = 1;
 391                                                 } else {
 392                                                         if ( $t == 'table' ) {
 393                                                                 $tagstack = array_pop( $tablestack );
 394                                                         }
 395                                                         $newparams = '';
 396                                                 }
 397                                         } else {
 398                                                 # Keep track for later
 399                                                 if ( in_array( $t, $tabletags ) &&
 400                                                 ! in_array( 'table', $tagstack ) ) {
 401                                                         $badtag = 1;
 402                                                 } else if ( in_array( $t, $tagstack ) &&
 403                                                 ! in_array ( $t , $htmlnest ) ) {
 404                                                         $badtag = 1 ;
 405                                                 } elseif( in_array( $t, $htmlsingleonly ) ) {
 406                                                         # Hack to force empty tag for uncloseable elements
 407                                                         $brace = '/>';
 408                                                 } else if ( ! in_array( $t, $htmlsingle ) ) {
 409                                                         if ( $t == 'table' ) {
 410                                                                 array_push( $tablestack, $tagstack );
 411                                                                 $tagstack = array();
 412                                                         }
 413                                                         array_push( $tagstack, $t );
 414                                                 }
 415
 416                                                 # Replace any variables or template parameters with
 417                                                 # plaintext results.
 418                                                 if( is_callable( $processCallback ) ) {
 419                                                         call_user_func_array( $processCallback, array( &$params, $args ) );
 420                                                 }
 421
 422                                                 # Strip non-approved attributes from the tag
 423                                                 $newparams = Sanitizer::fixTagAttributes( $params, $t );
 424                                         }
 425                                         if ( ! $badtag ) {
 426                                                 $rest = str_replace( '>', '&gt;', $rest );
 427                                                 $close = ( $brace == '/>' ) ? ' /' : '';
 428                                                 $text .= "<$slash$t$newparams$close>$rest";
 429                                                 continue;
 430                                         }
 431                                 }
 432                                 $text .= '&lt;' . str_replace( '>', '&gt;', $x);
 433                         }
 434                         # Close off any remaining tags
 435                         while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
 436                                 $text .= "</$t>\n";
 437                                 if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
 438                         }
 439                 } else {
 440                         # this might be possible using tidy itself
 441                         foreach ( $bits as $x ) {
 442                                 preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
 443                                 $x, $regs );
 444                                 @list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
 445                                 if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
 446                                         if( is_callable( $processCallback ) ) {
 447                                                 call_user_func_array( $processCallback, array( &$params, $args ) );
 448                                         }
 449                                         $newparams = Sanitizer::fixTagAttributes( $params, $t );
 450                                         $rest = str_replace( '>', '&gt;', $rest );
 451                                         $text .= "<$slash$t$newparams$brace$rest";
 452                                 } else {
 453                                         $text .= '&lt;' . str_replace( '>', '&gt;', $x);
 454                                 }
 455                         }
 456                 }
 457                 wfProfileOut( $fname );
 458                 return $text;
 459         }
 460
 461         /**
 462          * Remove '<!--', '-->', and everything between.
 463          * To avoid leaving blank lines, when a comment is both preceded
 464          * and followed by a newline (ignoring spaces), trim leading and
 465          * trailing spaces and one of the newlines.
 466          *
 467          * @access private
 468          * @param string $text
 469          * @return string
 470          */
 471         function removeHTMLcomments( $text ) {
 472                 $fname='Parser::removeHTMLcomments';
 473                 wfProfileIn( $fname );
 474                 while (($start = strpos($text, '<!--')) !== false) {
 475                         $end = strpos($text, '-->', $start + 4);
 476                         if ($end === false) {
 477                                 # Unterminated comment; bail out
 478                                 break;
 479                         }
 480
 481                         $end += 3;
 482
 483                         # Trim space and newline if the comment is both
 484                         # preceded and followed by a newline
 485                         $spaceStart = max($start - 1, 0);
 486                         $spaceLen = $end - $spaceStart;
 487                         while (substr($text, $spaceStart, 1) === ' ' && $spaceStart > 0) {
 488                                 $spaceStart--;
 489                                 $spaceLen++;
 490                         }
 491                         while (substr($text, $spaceStart + $spaceLen, 1) === ' ')
 492                                 $spaceLen++;
 493                         if (substr($text, $spaceStart, 1) === "\n" and substr($text, $spaceStart + $spaceLen, 1) === "\n") {
 494                                 # Remove the comment, leading and trailing
 495                                 # spaces, and leave only one newline.
 496                                 $text = substr_replace($text, "\n", $spaceStart, $spaceLen + 1);
 497                         }
 498                         else {
 499                                 # Remove just the comment.
 500                                 $text = substr_replace($text, '', $start, $end - $start);
 501                         }
 502                 }
 503                 wfProfileOut( $fname );
 504                 return $text;
 505         }
 506
 507         /**
 508          * Take a tag soup fragment listing an HTML element's attributes
 509          * and normalize it to well-formed XML, discarding unwanted attributes.
 510          *
 511          * - Normalizes attribute names to lowercase
 512          * - Discards attributes not on a whitelist for the given element
 513          * - Turns broken or invalid entities into plaintext
 514          * - Double-quotes all attribute values
 515          * - Attributes without values are given the name as attribute
 516          * - Double attributes are discarded
 517          * - Unsafe style attributes are discarded
 518          * - Prepends space if there are attributes.
 519          *
 520          * @param string $text
 521          * @param string $element
 522          * @return string
 523          *
 524          * @todo Check for legal values where the DTD limits things.
 525          * @todo Check for unique id attribute :P
 526          */
 527         function fixTagAttributes( $text, $element ) {
 528                 global $wgUrlProtocols;
 529                 if( trim( $text ) == '' ) {
 530                         return '';
 531                 }
 532
 533                 # Unquoted attribute
 534                 # Since we quote this later, this can be anything distinguishable
 535                 # from the end of the attribute
 536                 if( !preg_match_all(
 537                         MW_ATTRIBS_REGEX,
 538                         $text,
 539                         $pairs,
 540                         PREG_SET_ORDER ) ) {
 541                         return '';
 542                 }
 543
 544                 $whitelist = array_flip( Sanitizer::attributeWhitelist( $element ) );
 545                 $attribs = array();
 546                 foreach( $pairs as $set ) {
 547                         $attribute = strtolower( $set[1] );
 548                         if( !isset( $whitelist[$attribute] ) ) {
 549                                 continue;
 550                         }
 551
 552                         $raw   = Sanitizer::getTagAttributeCallback( $set );
 553                         $value = Sanitizer::normalizeAttributeValue( $raw );
 554
 555                         # Strip javascript "expression" from stylesheets.
 556                         # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
 557                         if( $attribute == 'style' ) {
 558                                 $stripped = Sanitizer::decodeCharReferences( $value );
 559
 560                                 // Remove any comments; IE gets token splitting wrong
 561                                 $stripped = preg_replace( '!/\\*.*?\\*/!S', ' ', $stripped );
 562                                 $value = htmlspecialchars( $stripped );
 563
 564                                 // ... and continue checks
 565                                 $stripped = preg_replace( '!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!e',
 566                                         'codepointToUtf8(hexdec("$1"))', $stripped );
 567                                 $stripped = str_replace( '\\', '', $stripped );
 568                                 if( preg_match( '/(expression|tps*:\/\/|url\\s*\().*/is',
 569                                                 $stripped ) ) {
 570                                         # haxx0r
 571                                         continue;
 572                                 }
 573                         }
 574
 575                         # Templates and links may be expanded in later parsing,
 576                         # creating invalid or dangerous output. Suppress this.
 577                         $value = strtr( $value, array(
 578                                 '{'    => '&#123;',
 579                                 '['    => '&#91;',
 580                                 "''"   => '&#39;&#39;',
 581                                 'ISBN' => '&#73;SBN',
 582                                 'RFC'  => '&#82;FC',
 583                                 'PMID' => '&#80;MID',
 584                         ) );
 585
 586                         # Stupid hack
 587                         $value = preg_replace_callback(
 588                                 '/(' . $wgUrlProtocols . ')/',
 589                                 array( 'Sanitizer', 'armorLinksCallback' ),
 590                                 $value );
 591
 592                         // If this attribute was previously set, override it.
 593                         // Output should only have one attribute of each name.
 594                         $attribs[$attribute] = "$attribute=\"$value\"";
 595                 }
 596                 if( empty( $attribs ) ) {
 597                         return '';
 598                 } else {
 599                         return ' ' . implode( ' ', $attribs );
 600                 }
 601         }
 602
 603         /**
 604          * Regex replace callback for armoring links against further processing.
 605          * @param array $matches
 606          * @return string
 607          * @access private
 608          */
 609         function armorLinksCallback( $matches ) {
 610                 return str_replace( ':', '&#58;', $matches[1] );
 611         }
 612
 613         /**
 614          * Return an associative array of attribute names and values from
 615          * a partial tag string. Attribute names are forced to lowercase,
 616          * character references are decoded to UTF-8 text.
 617          *
 618          * @param string
 619          * @return array
 620          */
 621         function decodeTagAttributes( $text ) {
 622                 $attribs = array();
 623
 624                 if( trim( $text ) == '' ) {
 625                         return $attribs;
 626                 }
 627
 628                 if( !preg_match_all(
 629                         MW_ATTRIBS_REGEX,
 630                         $text,
 631                         $pairs,
 632                         PREG_SET_ORDER ) ) {
 633                         return $attribs;
 634                 }
 635
 636                 foreach( $pairs as $set ) {
 637                         $attribute = strtolower( $set[1] );
 638                         $value = Sanitizer::getTagAttributeCallback( $set );
 639                         $attribs[$attribute] = Sanitizer::decodeCharReferences( $value );
 640                 }
 641                 return $attribs;
 642         }
 643
 644         /**
 645          * Pick the appropriate attribute value from a match set from the
 646          * MW_ATTRIBS_REGEX matches.
 647          *
 648          * @param array $set
 649          * @return string
 650          * @access private
 651          */
 652         function getTagAttributeCallback( $set ) {
 653                 if( isset( $set[6] ) ) {
 654                         # Illegal #XXXXXX color with no quotes.
 655                         return $set[6];
 656                 } elseif( isset( $set[5] ) ) {
 657                         # No quotes.
 658                         return $set[5];
 659                 } elseif( isset( $set[4] ) ) {
 660                         # Single-quoted
 661                         return $set[4];
 662                 } elseif( isset( $set[3] ) ) {
 663                         # Double-quoted
 664                         return $set[3];
 665                 } elseif( !isset( $set[2] ) ) {
 666                         # In XHTML, attributes must have a value.
 667                         # For 'reduced' form, return explicitly the attribute name here.
 668                         return $set[1];
 669                 } else {
 670                         wfDebugDieBacktrace( "Tag conditions not met. This should never happen and is a bug." );
 671                 }
 672         }
 673
 674         /**
 675          * Normalize whitespace and character references in an XML source-
 676          * encoded text for an attribute value.
 677          *
 678          * See http://www.w3.org/TR/REC-xml/#AVNormalize for background,
 679          * but note that we're not returning the value, but are returning
 680          * XML source fragments that will be slapped into output.
 681          *
 682          * @param string $text
 683          * @return string
 684          * @access private
 685          */
 686         function normalizeAttributeValue( $text ) {
 687                 return str_replace( '"', '&quot;',
 688                         preg_replace(
 689                                 '/\r\n|[\x20\x0d\x0a\x09]/',
 690                                 ' ',
 691                                 Sanitizer::normalizeCharReferences( $text ) ) );
 692         }
 693
 694         /**
 695          * Ensure that any entities and character references are legal
 696          * for XML and XHTML specifically. Any stray bits will be
 697          * &amp;-escaped to result in a valid text fragment.
 698          *
 699          * a. any named char refs must be known in XHTML
 700          * b. any numeric char refs must be legal chars, not invalid or forbidden
 701          * c. use &#x, not &#X
 702          * d. fix or reject non-valid attributes
 703          *
 704          * @param string $text
 705          * @return string
 706          * @access private
 707          */
 708         function normalizeCharReferences( $text ) {
 709                 return preg_replace_callback(
 710                         MW_CHAR_REFS_REGEX,
 711                         array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
 712                         $text );
 713         }
 714         /**
 715          * @param array $matches
 716          * @return string
 717          */
 718         function normalizeCharReferencesCallback( $matches ) {
 719                 $ret = null;
 720                 if( $matches[1] != '' ) {
 721                         $ret = Sanitizer::normalizeEntity( $matches[1] );
 722                 } elseif( $matches[2] != '' ) {
 723                         $ret = Sanitizer::decCharReference( $matches[2] );
 724                 } elseif( $matches[3] != ''  ) {
 725                         $ret = Sanitizer::hexCharReference( $matches[3] );
 726                 } elseif( $matches[4] != '' ) {
 727                         $ret = Sanitizer::hexCharReference( $matches[4] );
 728                 }
 729                 if( is_null( $ret ) ) {
 730                         return htmlspecialchars( $matches[0] );
 731                 } else {
 732                         return $ret;
 733                 }
 734         }
 735
 736         /**
 737          * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
 738          * return the named entity reference as is. Otherwise, returns
 739          * HTML-escaped text of pseudo-entity source (eg &amp;foo;)
 740          *
 741          * @param string $name
 742          * @return string
 743          */
 744         function normalizeEntity( $name ) {
 745                 global $wgHtmlEntities;
 746                 if( isset( $wgHtmlEntities[$name] ) ) {
 747                         return "&$name;";
 748                 } else {
 749                         return "&amp;$name;";
 750                 }
 751         }
 752
 753         function decCharReference( $codepoint ) {
 754                 $point = IntVal( $codepoint );
 755                 if( Sanitizer::validateCodepoint( $point ) ) {
 756                         return sprintf( '&#%d;', $point );
 757                 } else {
 758                         return null;
 759                 }
 760         }
 761
 762         function hexCharReference( $codepoint ) {
 763                 $point = hexdec( $codepoint );
 764                 if( Sanitizer::validateCodepoint( $point ) ) {
 765                         return sprintf( '&#x%x;', $point );
 766                 } else {
 767                         return null;
 768                 }
 769         }
 770
 771         /**
 772          * Returns true if a given Unicode codepoint is a valid character in XML.
 773          * @param int $codepoint
 774          * @return bool
 775          */
 776         function validateCodepoint( $codepoint ) {
 777                 return ($codepoint ==    0x09)
 778                         || ($codepoint ==    0x0a)
 779                         || ($codepoint ==    0x0d)
 780                         || ($codepoint >=    0x20 && $codepoint <=   0xd7ff)
 781                         || ($codepoint >=  0xe000 && $codepoint <=   0xfffd)
 782                         || ($codepoint >= 0x10000 && $codepoint <= 0x10ffff);
 783         }
 784
 785         /**
 786          * Decode any character references, numeric or named entities,
 787          * in the text and return a UTF-8 string.
 788          *
 789          * @param string $text
 790          * @return string
 791          * @access public
 792          */
 793         function decodeCharReferences( $text ) {
 794                 return preg_replace_callback(
 795                         MW_CHAR_REFS_REGEX,
 796                         array( 'Sanitizer', 'decodeCharReferencesCallback' ),
 797                         $text );
 798         }
 799
 800         /**
 801          * @param array $matches
 802          * @return string
 803          */
 804         function decodeCharReferencesCallback( $matches ) {
 805                 if( $matches[1] != '' ) {
 806                         return Sanitizer::decodeEntity( $matches[1] );
 807                 } elseif( $matches[2] != '' ) {
 808                         return  Sanitizer::decodeChar( intval( $matches[2] ) );
 809                 } elseif( $matches[3] != ''  ) {
 810                         return  Sanitizer::decodeChar( hexdec( $matches[3] ) );
 811                 } elseif( $matches[4] != '' ) {
 812                         return  Sanitizer::decodeChar( hexdec( $matches[4] ) );
 813                 }
 814                 # Last case should be an ampersand by itself
 815                 return $matches[0];
 816         }
 817
 818         /**
 819          * Return UTF-8 string for a codepoint if that is a valid
 820          * character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
 821          * @param int $codepoint
 822          * @return string
 823          * @access private
 824          */
 825         function decodeChar( $codepoint ) {
 826                 if( Sanitizer::validateCodepoint( $codepoint ) ) {
 827                         return codepointToUtf8( $codepoint );
 828                 } else {
 829                         return UTF8_REPLACEMENT;
 830                 }
 831         }
 832
 833         /**
 834          * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
 835          * return the UTF-8 encoding of that character. Otherwise, returns
 836          * pseudo-entity source (eg &foo;)
 837          *
 838          * @param string $name
 839          * @return string
 840          */
 841         function decodeEntity( $name ) {
 842                 global $wgHtmlEntities;
 843                 if( isset( $wgHtmlEntities[$name] ) ) {
 844                         return codepointToUtf8( $wgHtmlEntities[$name] );
 845                 } else {
 846                         return "&$name;";
 847                 }
 848         }
 849
 850         /**
 851          * Fetch the whitelist of acceptable attributes for a given
 852          * element name.
 853          *
 854          * @param string $element
 855          * @return array
 856          */
 857         function attributeWhitelist( $element ) {
 858                 static $list;
 859                 if( !isset( $list ) ) {
 860                         $list = Sanitizer::setupAttributeWhitelist();
 861                 }
 862                 return isset( $list[$element] )
 863                         ? $list[$element]
 864                         : array();
 865         }
 866
 867         /**
 868          * @return array
 869          */
 870         function setupAttributeWhitelist() {
 871                 $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
 872                 $block = array_merge( $common, array( 'align' ) );
 873                 $tablealign = array( 'align', 'char', 'charoff', 'valign' );
 874                 $tablecell = array( 'abbr',
 875                                     'axis',
 876                                     'headers',
 877                                     'scope',
 878                                     'rowspan',
 879                                     'colspan',
 880                                     'nowrap', # deprecated
 881                                     'width',  # deprecated
 882                                     'height', # deprecated
 883                                     'bgcolor' # deprecated
 884                                     );
 885
 886                 # Numbers refer to sections in HTML 4.01 standard describing the element.
 887                 # See: http://www.w3.org/TR/html4/
 888                 $whitelist = array (
 889                         # 7.5.4
 890                         'div'        => $block,
 891                         'center'     => $common, # deprecated
 892                         'span'       => $block, # ??
 893
 894                         # 7.5.5
 895                         'h1'         => $block,
 896                         'h2'         => $block,
 897                         'h3'         => $block,
 898                         'h4'         => $block,
 899                         'h5'         => $block,
 900                         'h6'         => $block,
 901
 902                         # 7.5.6
 903                         # address
 904
 905                         # 8.2.4
 906                         # bdo
 907
 908                         # 9.2.1
 909                         'em'         => $common,
 910                         'strong'     => $common,
 911                         'cite'       => $common,
 912                         # dfn
 913                         'code'       => $common,
 914                         # samp
 915                         # kbd
 916                         'var'        => $common,
 917                         # abbr
 918                         # acronym
 919
 920                         # 9.2.2
 921                         'blockquote' => array_merge( $common, array( 'cite' ) ),
 922                         # q
 923
 924                         # 9.2.3
 925                         'sub'        => $common,
 926                         'sup'        => $common,
 927
 928                         # 9.3.1
 929                         'p'          => $block,
 930
 931                         # 9.3.2
 932                         'br'         => array( 'id', 'class', 'title', 'style', 'clear' ),
 933
 934                         # 9.3.4
 935                         'pre'        => array_merge( $common, array( 'width' ) ),
 936
 937                         # 9.4
 938                         'ins'        => array_merge( $common, array( 'cite', 'datetime' ) ),
 939                         'del'        => array_merge( $common, array( 'cite', 'datetime' ) ),
 940
 941                         # 10.2
 942                         'ul'         => array_merge( $common, array( 'type' ) ),
 943                         'ol'         => array_merge( $common, array( 'type', 'start' ) ),
 944                         'li'         => array_merge( $common, array( 'type', 'value' ) ),
 945
 946                         # 10.3
 947                         'dl'         => $common,
 948                         'dd'         => $common,
 949                         'dt'         => $common,
 950
 951                         # 11.2.1
 952                         'table'      => array_merge( $common,
 953                                                                 array( 'summary', 'width', 'border', 'frame',
 954                                                                                          'rules', 'cellspacing', 'cellpadding',
 955                                                                                          'align', 'bgcolor', 'frame', 'rules',
 956                                                                                          'border' ) ),
 957
 958                         # 11.2.2
 959                         'caption'    => array_merge( $common, array( 'align' ) ),
 960
 961                         # 11.2.3
 962                         'thead'      => array_merge( $common, $tablealign ),
 963                         'tfoot'      => array_merge( $common, $tablealign ),
 964                         'tbody'      => array_merge( $common, $tablealign ),
 965
 966                         # 11.2.4
 967                         'colgroup'   => array_merge( $common, array( 'span', 'width' ), $tablealign ),
 968                         'col'        => array_merge( $common, array( 'span', 'width' ), $tablealign ),
 969
 970                         # 11.2.5
 971                         'tr'         => array_merge( $common, array( 'bgcolor' ), $tablealign ),
 972
 973                         # 11.2.6
 974                         'td'         => array_merge( $common, $tablecell, $tablealign ),
 975                         'th'         => array_merge( $common, $tablecell, $tablealign ),
 976
 977                         # 15.2.1
 978                         'tt'         => $common,
 979                         'b'          => $common,
 980                         'i'          => $common,
 981                         'big'        => $common,
 982                         'small'      => $common,
 983                         'strike'     => $common,
 984                         's'          => $common,
 985                         'u'          => $common,
 986
 987                         # 15.2.2
 988                         'font'       => array_merge( $common, array( 'size', 'color', 'face' ) ),
 989                         # basefont
 990
 991                         # 15.3
 992                         'hr'         => array_merge( $common, array( 'noshade', 'size', 'width' ) ),
 993
 994                         # XHTML Ruby annotation text module, simple ruby only.
 995                         # http://www.w3c.org/TR/ruby/
 996                         'ruby'       => $common,
 997                         # rbc
 998                         # rtc
 999                         'rb'         => $common,
1000                         'rt'         => $common, #array_merge( $common, array( 'rbspan' ) ),
1001                         'rp'         => $common,
1002                         );
1003                 return $whitelist;
1004         }
1005
1006         /**
1007          * Take a fragment of (potentially invalid) HTML and return
1008          * a version with any tags removed, encoded suitably for literal
1009          * inclusion in an attribute value.
1010          *
1011          * @param string $text HTML fragment
1012          * @return string
1013          */
1014         function stripAllTags( $text ) {
1015                 # Actual <tags>
1016                 $text = preg_replace( '/<[^>]*>/', '', $text );
1017
1018                 # Normalize &entities and whitespace
1019                 $text = Sanitizer::normalizeAttributeValue( $text );
1020
1021                 # Will be placed into "double-quoted" attributes,
1022                 # make sure remaining bits are safe.
1023                 $text = str_replace(
1024                         array('<', '>', '"'),
1025                         array('&lt;', '&gt;', '&quot;'),
1026                         $text );
1027
1028                 return $text;
1029         }
1030
1031 }
1032
1033 ?>