Wordpress 3.7
[autoinstalls/wordpress.git] / wp-includes / SimplePie / Sanitize.php
1 <?php
2 /**
3  * SimplePie
4  *
5  * A PHP-Based RSS and Atom Feed Framework.
6  * Takes the hard work out of managing a complete RSS/Atom solution.
7  *
8  * Copyright (c) 2004-2012, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
9  * All rights reserved.
10  *
11  * Redistribution and use in source and binary forms, with or without modification, are
12  * permitted provided that the following conditions are met:
13  *
14  *      * Redistributions of source code must retain the above copyright notice, this list of
15  *        conditions and the following disclaimer.
16  *
17  *      * Redistributions in binary form must reproduce the above copyright notice, this list
18  *        of conditions and the following disclaimer in the documentation and/or other materials
19  *        provided with the distribution.
20  *
21  *      * Neither the name of the SimplePie Team nor the names of its contributors may be used
22  *        to endorse or promote products derived from this software without specific prior
23  *        written permission.
24  *
25  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS
26  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
27  * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS
28  * AND CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
29  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
30  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
31  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
32  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
33  * POSSIBILITY OF SUCH DAMAGE.
34  *
35  * @package SimplePie
36  * @version 1.3.1
37  * @copyright 2004-2012 Ryan Parman, Geoffrey Sneddon, Ryan McCue
38  * @author Ryan Parman
39  * @author Geoffrey Sneddon
40  * @author Ryan McCue
41  * @link http://simplepie.org/ SimplePie
42  * @license http://www.opensource.org/licenses/bsd-license.php BSD License
43  */
44
/**
 * Used for data cleanup and post-processing
 *
 * This class can be overloaded with {@see SimplePie::set_sanitize_class()}
 *
 * @package SimplePie
 * @todo Move to using an actual HTML parser (this will allow tags to be properly stripped, and to switch between HTML and XHTML), this will also make it easier to shorten a string while preserving HTML tags
 */
class SimplePie_Sanitize
{
    // Private vars

    /** Base URL of the current sanitize() call; read by replace_urls() */
    public $base;

    /** Registry assigned through set_registry(); used to reach Misc, Cache and File */
    public $registry;

    // Options

    /** Whether sanitize() drops the outer <div> wrapper from HTML output */
    public $remove_div = true;

    /** URL prefix for cached images; '' or false disables image rewriting */
    public $image_handler = '';

    /** Tag names removed (or encoded) by sanitize(); false disables stripping */
    public $strip_htmltags = array('base', 'blink', 'body', 'doctype', 'embed', 'font', 'form', 'frame', 'frameset', 'html', 'iframe', 'input', 'marquee', 'meta', 'noscript', 'object', 'param', 'script', 'style');

    /** When true, stripped tags are shown as escaped text instead of being removed */
    public $encode_instead_of_strip = false;

    /** Attribute names removed from every element; false disables stripping */
    public $strip_attributes = array('bgsound', 'class', 'expr', 'id', 'style', 'onclick', 'onerror', 'onfinish', 'onmouseover', 'onmouseout', 'onfocus', 'onblur', 'lowsrc', 'dynsrc');

    /** Whether comment nodes are removed from HTML content */
    public $strip_comments = false;

    /** Encoding of the strings returned by sanitize() */
    public $output_encoding = 'UTF-8';

    /** Image rewriting in sanitize() only runs while this is true */
    public $enable_cache = true;

    /** Location handed to the cache handler for cached images */
    public $cache_location = './cache';

    /** Callable that turns an image URL into a cache key */
    public $cache_name_function = 'md5';

    /** Timeout passed to the File object when fetching images */
    public $timeout = 10;

    /** User agent passed to the File object when fetching images */
    public $useragent = '';

    /** fsockopen flag passed to the File object when fetching images */
    public $force_fsockopen = false;

    /** Map of element name => attribute name(s) holding URLs to absolutize */
    public $replace_url_attributes = null;

    /**
     * Install the default element/attribute URL replacement map.
     */
    public function __construct()
    {
        // Set defaults
        $this->set_url_replacements(null);
    }

    /**
     * Choose whether sanitize() drops the outer <div> wrapper.
     *
     * @param mixed $enable Interpreted as a boolean
     */
    public function remove_div($enable = true)
    {
        $this->remove_div = (bool) $enable;
    }

    /**
     * Set the URL prefix that cached image keys are appended to.
     *
     * @param string|false $page Prefix, or any falsy value to disable image rewriting
     */
    public function set_image_handler($page = false)
    {
        $this->image_handler = $page ? (string) $page : false;
    }

    /**
     * Provide the registry used to reach the Misc, Cache and File classes.
     *
     * @param SimplePie_Registry $registry
     */
    public function set_registry(SimplePie_Registry $registry)
    {
        $this->registry = $registry;
    }

    /**
     * Receive cache settings. Falsy location/name-function values keep the
     * current setting; $cache_class is accepted but not used here.
     *
     * @param bool|null $enable_cache null keeps the current value
     * @param string $cache_location
     * @param string $cache_name_function
     * @param string $cache_class Unused
     */
    public function pass_cache_data($enable_cache = true, $cache_location = './cache', $cache_name_function = 'md5', $cache_class = 'SimplePie_Cache')
    {
        if ($enable_cache !== null)
        {
            $this->enable_cache = (bool) $enable_cache;
        }

        if ($cache_location)
        {
            $this->cache_location = (string) $cache_location;
        }

        if ($cache_name_function)
        {
            $this->cache_name_function = (string) $cache_name_function;
        }
    }

    /**
     * Receive file-fetching settings. Only truthy arguments overwrite the
     * current values; $file_class is accepted but not used here.
     *
     * Note: every stored value is cast to string, exactly as before, so a
     * truthy $force_fsockopen is stored as a non-empty string.
     *
     * @param string $file_class Unused
     * @param int $timeout
     * @param string $useragent
     * @param bool $force_fsockopen
     */
    public function pass_file_data($file_class = 'SimplePie_File', $timeout = 10, $useragent = '', $force_fsockopen = false)
    {
        if ($timeout)
        {
            $this->timeout = (string) $timeout;
        }

        if ($useragent)
        {
            $this->useragent = (string) $useragent;
        }

        if ($force_fsockopen)
        {
            $this->force_fsockopen = (string) $force_fsockopen;
        }
    }

    /**
     * Set the tags that sanitize() strips from HTML content.
     *
     * @param array|string|false $tags Array of tag names, a comma-separated
     *        string (surrounding whitespace and empty entries are ignored),
     *        or a falsy value to disable tag stripping
     */
    public function strip_htmltags($tags = array('base', 'blink', 'body', 'doctype', 'embed', 'font', 'form', 'frame', 'frameset', 'html', 'iframe', 'input', 'marquee', 'meta', 'noscript', 'object', 'param', 'script', 'style'))
    {
        $this->strip_htmltags = $this->normalize_name_list($tags);
    }

    /**
     * Choose between removing stripped tags and showing them as escaped text.
     *
     * @param mixed $encode Interpreted as a boolean
     */
    public function encode_instead_of_strip($encode = false)
    {
        $this->encode_instead_of_strip = (bool) $encode;
    }

    /**
     * Set the attributes that sanitize() removes from every element.
     *
     * @param array|string|false $attribs Array of attribute names, a
     *        comma-separated string (surrounding whitespace and empty entries
     *        are ignored), or a falsy value to disable attribute stripping
     */
    public function strip_attributes($attribs = array('bgsound', 'class', 'expr', 'id', 'style', 'onclick', 'onerror', 'onfinish', 'onmouseover', 'onmouseout', 'onfocus', 'onblur', 'lowsrc', 'dynsrc'))
    {
        $this->strip_attributes = $this->normalize_name_list($attribs);
    }

    /**
     * Choose whether comment nodes are removed from HTML content.
     *
     * @param mixed $strip Interpreted as a boolean
     */
    public function strip_comments($strip = false)
    {
        $this->strip_comments = (bool) $strip;
    }

    /**
     * Set the encoding of the strings returned by sanitize().
     *
     * @param string $encoding
     */
    public function set_output_encoding($encoding = 'UTF-8')
    {
        $this->output_encoding = (string) $encoding;
    }

    /**
     * Set element/attribute key/value pairs of HTML attributes
     * containing URLs that need to be resolved relative to the feed
     *
     * Defaults to |a|@href, |area|@href, |blockquote|@cite, |del|@cite,
     * |form|@action, |img|@longdesc, |img|@src, |input|@src, |ins|@cite,
     * |q|@cite
     *
     * @since 1.0
     * @param array|null $element_attribute Element/attribute key/value pairs, null for default
     */
    public function set_url_replacements($element_attribute = null)
    {
        if ($element_attribute === null)
        {
            $element_attribute = array(
                'a' => 'href',
                'area' => 'href',
                'blockquote' => 'cite',
                'del' => 'cite',
                'form' => 'action',
                'img' => array(
                    'longdesc',
                    'src'
                ),
                'input' => 'src',
                'ins' => 'cite',
                'q' => 'cite'
            );
        }
        $this->replace_url_attributes = (array) $element_attribute;
    }

    /**
     * Clean a piece of feed data according to its construct type.
     *
     * Depending on the bits set in $type the data is base64-decoded, parsed
     * as HTML and stripped of the configured tags, attributes and comments,
     * has its URLs made absolute against $base, is escaped with
     * htmlspecialchars(), and is finally converted to the output encoding.
     *
     * @param string $data Raw data
     * @param int $type Bitmask of SIMPLEPIE_CONSTRUCT_* flags
     * @param string $base Base URL used to resolve relative URLs
     * @return string Sanitized data; '' when HTML handling is requested but
     *         DOMDocument is unavailable or no content survives
     */
    public function sanitize($data, $type, $base = '')
    {
        $data = trim($data);
        if ($data === '' && !($type & SIMPLEPIE_CONSTRUCT_IRI))
        {
            return $data;
        }

        if ($type & SIMPLEPIE_CONSTRUCT_MAYBE_HTML)
        {
            // Treat the data as HTML when it contains an entity or a closing tag
            if (preg_match('/(&(#(x[0-9a-fA-F]+|[0-9]+)|[a-zA-Z0-9]+)|<\/[A-Za-z][^\x09\x0A\x0B\x0C\x0D\x20\x2F\x3E]*' . SIMPLEPIE_PCRE_HTML_ATTRIBUTE . '>)/', $data))
            {
                $type |= SIMPLEPIE_CONSTRUCT_HTML;
            }
            else
            {
                $type |= SIMPLEPIE_CONSTRUCT_TEXT;
            }
        }

        if ($type & SIMPLEPIE_CONSTRUCT_BASE64)
        {
            $data = base64_decode($data);
        }

        if ($type & (SIMPLEPIE_CONSTRUCT_HTML | SIMPLEPIE_CONSTRUCT_XHTML))
        {
            if (!class_exists('DOMDocument'))
            {
                $this->registry->call('Misc', 'error', array('DOMDocument not found, unable to use sanitizer', E_USER_WARNING, __FILE__, __LINE__));
                return '';
            }
            $document = new DOMDocument();
            $document->encoding = 'UTF-8';
            $data = $this->preprocess($data, $type);

            // Parser diagnostics for malformed feed markup are silenced
            set_error_handler(array('SimplePie_Misc', 'silence_errors'));
            $document->loadHTML($data);
            restore_error_handler();

            // Strip comments
            if ($this->strip_comments)
            {
                $xpath = new DOMXPath($document);
                $comments = $xpath->query('//comment()');

                foreach ($comments as $comment)
                {
                    $comment->parentNode->removeChild($comment);
                }
            }

            // Strip out HTML tags and attributes that might cause various security problems.
            // Based on recommendations by Mark Pilgrim at:
            // http://diveintomark.org/archives/2003/06/12/how_to_consume_rss_safely
            if ($this->strip_htmltags)
            {
                foreach ($this->strip_htmltags as $tag)
                {
                    $this->strip_tag($tag, $document, $type);
                }
            }

            if ($this->strip_attributes)
            {
                foreach ($this->strip_attributes as $attrib)
                {
                    $this->strip_attr($attrib, $document);
                }
            }

            // Replace relative URLs
            $this->base = $base;
            foreach ($this->replace_url_attributes as $element => $attributes)
            {
                $this->replace_urls($document, $element, $attributes);
            }

            // If image handling (caching, etc.) is enabled, cache and rewrite all the image tags.
            if (isset($this->image_handler) && ((string) $this->image_handler) !== '' && $this->enable_cache)
            {
                // REMOTE_ADDR is absent outside a web request (e.g. CLI); do not read it blindly
                $request_headers = isset($_SERVER['REMOTE_ADDR']) ? array('X-FORWARDED-FOR' => $_SERVER['REMOTE_ADDR']) : array();

                $images = $document->getElementsByTagName('img');
                foreach ($images as $img)
                {
                    if (!$img->hasAttribute('src'))
                    {
                        continue;
                    }

                    $image_url = call_user_func($this->cache_name_function, $img->getAttribute('src'));
                    $cache = $this->registry->call('Cache', 'get_handler', array($this->cache_location, $image_url, 'spi'));

                    if ($cache->load())
                    {
                        $img->setAttribute('src', $this->image_handler . $image_url);
                        continue;
                    }

                    $file = $this->registry->create('File', array($img->getAttribute('src'), $this->timeout, 5, $request_headers, $this->useragent, $this->force_fsockopen));

                    // The bitmask test needs its own parentheses: "===" binds tighter than "&"
                    if ($file->success && (($file->method & SIMPLEPIE_FILE_SOURCE_REMOTE) === 0 || ($file->status_code === 200 || ($file->status_code > 206 && $file->status_code < 300))))
                    {
                        if ($cache->save(array('headers' => $file->headers, 'body' => $file->body)))
                        {
                            $img->setAttribute('src', $this->image_handler . $image_url);
                        }
                        else
                        {
                            trigger_error("$this->cache_location is not writeable. Make sure you've set the correct relative or absolute path, and that the location is server-writable.", E_USER_WARNING);
                        }
                    }
                }
            }

            // Remove the DOCTYPE
            // Seems to cause segfaulting if we don't do this
            if ($document->firstChild instanceof DOMDocumentType)
            {
                $document->removeChild($document->firstChild);
            }

            // Move everything from the body to the root
            $body = $document->getElementsByTagName('body')->item(0);
            $real_body = ($body !== null) ? $body->childNodes->item(0) : null;

            if ($real_body === null || $document->firstChild === null)
            {
                // No content node survived parsing/stripping, so there is nothing to serialize
                $data = '';
            }
            else
            {
                $document->replaceChild($real_body, $document->firstChild);

                // Finally, convert to a HTML string
                $data = trim($document->saveHTML());

                if ($this->remove_div)
                {
                    $data = preg_replace('/^<div' . SIMPLEPIE_PCRE_XML_ATTRIBUTE . '>/', '', $data);
                    $data = preg_replace('/<\/div>$/', '', $data);
                }
                else
                {
                    $data = preg_replace('/^<div' . SIMPLEPIE_PCRE_XML_ATTRIBUTE . '>/', '<div>', $data);
                }
            }
        }

        if ($type & SIMPLEPIE_CONSTRUCT_IRI)
        {
            $absolute = $this->registry->call('Misc', 'absolutize_url', array($data, $base));
            if ($absolute !== false)
            {
                $data = $absolute;
            }
        }

        if ($type & (SIMPLEPIE_CONSTRUCT_TEXT | SIMPLEPIE_CONSTRUCT_IRI))
        {
            $data = htmlspecialchars($data, ENT_COMPAT, 'UTF-8');
        }

        if ($this->output_encoding !== 'UTF-8')
        {
            $data = $this->registry->call('Misc', 'change_encoding', array($data, 'UTF-8', $this->output_encoding));
        }

        return $data;
    }

    /**
     * Wrap a markup fragment in a complete document for DOMDocument::loadHTML().
     *
     * @param string $html Fragment to wrap
     * @param int $type Bitmask of SIMPLEPIE_CONSTRUCT_* flags
     * @return string Full document with doctype, UTF-8 meta tag and body
     */
    protected function preprocess($html, $type)
    {
        // True whenever $type has any bit set other than the XHTML flag
        if ($type & ~SIMPLEPIE_CONSTRUCT_XHTML)
        {
            // Atom XHTML constructs are wrapped with a div by default
            // Note: No protection if $html contains a stray </div>!
            $html = '<div>' . $html . '</div>';
            $doctype = '<!DOCTYPE html>';
            $content_type = 'text/html';
        }
        else
        {
            $doctype = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">';
            $content_type = 'application/xhtml+xml';
        }

        return $doctype
            . '<html><head>'
            . '<meta http-equiv="Content-Type" content="' . $content_type . '; charset=utf-8" />'
            . '</head><body>' . $html . '</body></html>';
    }

    /**
     * Make the URL attributes of every $tag element absolute against $this->base.
     *
     * Does nothing for tags that are listed in $this->strip_htmltags.
     *
     * @param DOMDocument $document
     * @param string $tag Element name
     * @param string|array $attributes Attribute name(s) holding URLs
     */
    public function replace_urls($document, $tag, $attributes)
    {
        if (!is_array($attributes))
        {
            $attributes = array($attributes);
        }

        if (is_array($this->strip_htmltags) && in_array($tag, $this->strip_htmltags))
        {
            return;
        }

        $elements = $document->getElementsByTagName($tag);
        foreach ($elements as $element)
        {
            foreach ($attributes as $attribute)
            {
                if (!$element->hasAttribute($attribute))
                {
                    continue;
                }

                $value = $this->registry->call('Misc', 'absolutize_url', array($element->getAttribute($attribute), $this->base));
                if ($value !== false)
                {
                    $element->setAttribute($attribute, $value);
                }
            }
        }
    }

    /**
     * Build the replacement for one regex match of a tag to strip.
     *
     * Nothing in this class calls it. It reads $match[0] as the whole match,
     * $match[1] as the tag name, $match[2] as the attribute text, $match[3]
     * as the inner content used when encoding, and $match[4] as the inner
     * content used when stripping.
     *
     * @param array $match
     * @return string
     */
    public function do_strip_htmltags($match)
    {
        $keep_content = isset($match[4]) && !in_array(strtolower($match[1]), array('script', 'style'));

        if (!$this->encode_instead_of_strip)
        {
            return $keep_content ? $match[4] : '';
        }

        if (!$keep_content)
        {
            return htmlspecialchars($match[0], ENT_COMPAT, 'UTF-8');
        }

        $name = htmlspecialchars($match[1], ENT_COMPAT, 'UTF-8');
        $attributes = htmlspecialchars($match[2], ENT_COMPAT, 'UTF-8');
        return '&lt;' . $name . $attributes . '&gt;' . $match[3] . '&lt;/' . $name . '&gt;';
    }

    /**
     * Strip (or encode) every $tag element found below <body>.
     *
     * - Not encoding, script/style: the element is removed with its content.
     * - Not encoding, other tags: the element is replaced by its children.
     * - Encoding, script/style: the element is replaced by its children.
     * - Encoding, other tags: the element is replaced by its children,
     *   surrounded by text nodes spelling out the opening and closing tag.
     *
     * @param string $tag Element name
     * @param DOMDocument $document
     * @param int $type Bitmask of SIMPLEPIE_CONSTRUCT_* flags
     */
    protected function strip_tag($tag, $document, $type)
    {
        $xpath = new DOMXPath($document);
        $elements = $xpath->query('body//' . $tag);
        if ($elements === false)
        {
            // The expression was malformed (e.g. an empty tag name)
            return;
        }

        $is_raw_text = in_array($tag, array('script', 'style'), true);

        if (!$this->encode_instead_of_strip && $is_raw_text)
        {
            foreach ($elements as $element)
            {
                $element->parentNode->removeChild($element);
            }

            return;
        }

        // For elements which aren't script or style, include the tag itself
        $show_markup = $this->encode_instead_of_strip && !$is_raw_text;

        foreach ($elements as $element)
        {
            $fragment = $document->createDocumentFragment();

            if ($show_markup)
            {
                $text = '<' . $tag;
                if ($element->hasAttributes())
                {
                    $attrs = array();
                    foreach ($element->attributes as $name => $attr)
                    {
                        $value = $attr->value;

                        // Only a truly empty value is special; "0" is a real value
                        if ($value === '')
                        {
                            // In XHTML, empty values should never exist, so we repeat the name
                            if ($type & SIMPLEPIE_CONSTRUCT_XHTML)
                            {
                                $value = $name;
                            }
                            // For HTML, empty is fine
                            elseif ($type & SIMPLEPIE_CONSTRUCT_HTML)
                            {
                                $attrs[] = $name;
                                continue;
                            }
                        }

                        // Standard attribute text
                        $attrs[] = $name . '="' . $value . '"';
                    }
                    $text .= ' ' . implode(' ', $attrs);
                }
                $text .= '>';
                $fragment->appendChild(new DOMText($text));
            }

            // Move the children over; each append detaches the current first child
            while ($element->firstChild !== null)
            {
                $fragment->appendChild($element->firstChild);
            }

            if ($show_markup)
            {
                $fragment->appendChild(new DOMText('</' . $tag . '>'));
            }

            $element->parentNode->replaceChild($fragment, $element);
        }
    }

    /**
     * Remove the $attrib attribute from every element that carries it.
     *
     * @param string $attrib Attribute name
     * @param DOMDocument $document
     */
    protected function strip_attr($attrib, $document)
    {
        $xpath = new DOMXPath($document);
        $elements = $xpath->query('//*[@' . $attrib . ']');
        if ($elements === false)
        {
            // The expression was malformed (e.g. an empty attribute name)
            return;
        }

        foreach ($elements as $element)
        {
            $element->removeAttribute($attrib);
        }
    }

    /**
     * Normalize a tag/attribute list given to strip_htmltags() or strip_attributes().
     *
     * @param array|string|false $names
     * @return array|false Arrays are returned unchanged, falsy input becomes
     *         false, and strings are split on commas with whitespace trimmed
     *         and empty entries dropped
     */
    private function normalize_name_list($names)
    {
        if (!$names)
        {
            return false;
        }

        if (is_array($names))
        {
            return $names;
        }

        return array_values(array_filter(array_map('trim', explode(',', $names)), 'strlen'));
    }
}