3 namespace RemexHtml\Serializer;
4 use RemexHtml\HTMLData;
5 use RemexHtml\DOM\DOMUtils;
6 use RemexHtml\DOM\DOMFormatter;
9 * A formatter which follows the HTML 5 fragment serialization algorithm.
11 class HtmlFormatter implements Formatter, DOMFormatter {
13 * The elements for which a closing tag is omitted.
15 protected $voidElements = [
// Looked up by element name with isset() when an HTML-namespace element is
// serialized; for a matching name neither the contents nor an end tag is
// appended to the output.
// NOTE(review): the entries of this table are not visible in this view.
38 * The elements which need a leading newline in their contents to be
39 * duplicated, since the parser strips a leading newline.
41 protected $prefixLfElements = [
// Looked up by element name with isset(); when the serialized contents of a
// matching HTML-namespace element begin with "\n", one extra "\n" is written
// in front of them.
// NOTE(review): the entries of this table are not visible in this view.
48 * The elements which have unescaped contents.
50 protected $rawTextElements = [
// Looked up by the name of an HTML-namespace parent; text whose parent matches
// is emitted without applying $textEscapes. The constructor adds 'noscript'
// to this table when the scriptingFlag option is truthy.
// NOTE(review): the initial entries of this table are not visible in this view.
60 * The escape table for attribute values
62 protected $attributeEscapes = [
// Passed to strtr() as a search => replacement map for every attribute value.
// NOTE(review): only one entry is visible in this view.
64 "\xc2\xa0" => ' ',
// NOTE(review): "\xc2\xa0" is the UTF-8 byte sequence for U+00A0 (no-break
// space). The replacement above reads as a bare space, whereas the HTML
// fragment serialization algorithm escapes U+00A0 as the &nbsp; entity.
// Confirm this literal has not been entity-decoded somewhere upstream.
68 * The escape table for text nodes
70 protected $textEscapes = [
// Passed to strtr() as a search => replacement map for text that is not
// inside a raw text element.
// NOTE(review): only one entry is visible in this view.
72 "\xc2\xa0" => ' ',
// NOTE(review): "\xc2\xa0" is the UTF-8 byte sequence for U+00A0 (no-break
// space). The replacement above reads as a bare space, whereas the HTML
// fragment serialization algorithm escapes U+00A0 as the &nbsp; entity.
// Confirm this literal has not been entity-decoded somewhere upstream.
78 * Attribute namespaces which have unqualified local names
80 protected $unqualifiedNamespaces = [
// Keyed by namespace URI. formatDOMElement() tests the element's own
// namespaceURI against these keys and, on a match, serializes the element
// under its bare localName with no prefix.
81 HTMLData::NS_HTML => true,
82 HTMLData::NS_MATHML => true,
83 HTMLData::NS_SVG => true,
// Copy of the 'useSourceDoctype' constructor option. formatDOMNode() reads it
// to choose between a fixed "<!DOCTYPE html>" prefix on the document node and
// a doctype built from the DOM's own document type node.
86 protected $useSourceDoctype;
// Copy of the 'reverseCoercion' constructor option. When truthy,
// formatDOMElement() passes element and attribute names through
// DOMUtils::uncoerceName() before writing them.
87 protected $reverseCoercion;
92 * @param array $options An associative array of options:
93 * - scriptingFlag : Set this to false to disable scripting. True by default.
94 * - useSourceDoctype : Emit the doctype used in the source. If this is
95 * false or absent, an HTML doctype will be used.
96 * - reverseCoercion : When formatting a DOM node, reverse the encoding
97 * of invalid names. False by default.
99 public function __construct( $options = [] ) {
// Default values for the three recognised option keys.
// NOTE(review): the statement that applies these defaults to $options is not
// visible in this view; confirm that caller-supplied keys take precedence.
101 'scriptingFlag' => true,
102 'useSourceDoctype' => false,
103 'reverseCoercion' => false,
// With scripting enabled, <noscript> joins the raw text elements, so text
// directly inside it is written without escaping.
105 if ( $options['scriptingFlag'] ) {
106 $this->rawTextElements['noscript'] = true;
// Stored for later use by formatDOMNode() and formatDOMElement().
108 $this->useSourceDoctype = $options['useSourceDoctype'];
109 $this->reverseCoercion = $options['reverseCoercion'];
// Returns the fixed string "<!DOCTYPE html>". Neither $fragmentNamespace nor
// $fragmentName is read in the visible body.
112 public function startDocument( $fragmentNamespace, $fragmentName ) {
113 return "<!DOCTYPE html>";
// Serializes a run of character data taken from a larger buffer.
//
// $parent is the node that contains the text, $text the buffer, and
// $start / $length the byte offset and byte count handed to substr().
116 public function characters( SerializerNode $parent, $text, $start, $length ) {
// Cut the requested slice out of the buffer (substr() counts bytes).
117 $text = substr( $text, $start, $length );
// Escape unless the parent is an HTML-namespace element listed in
// $rawTextElements; text under any non-HTML parent is always escaped.
118 if ( $parent->namespace !== HTMLData::NS_HTML
119 || !isset( $this->rawTextElements[$parent->name] )
121 $text = strtr( $text, $this->textEscapes );
// NOTE(review): the return statement is not visible in this view; presumably
// the (possibly escaped) $text is returned.
// Serializes one element given the already-serialized $contents of its
// children. $parent is not read in the visible body.
126 public function element( SerializerNode $parent, SerializerNode $node, $contents ) {
// NOTE(review): the initialisation of $name and $s is not visible in this
// view; presumably $name is the element name and $s starts as the opening
// "<name" of the start tag.
// Append each attribute as ` name="value"`, with the value run through the
// attribute escape table. Attribute names are written as supplied.
129 foreach ( $node->attrs->getValues() as $attrName => $attrValue ) {
130 $encValue = strtr( $attrValue, $this->attributeEscapes );
131 $s .= " $attrName=\"$encValue\"";
// HTML-namespace elements get special handling for leading newlines and for
// void elements.
134 if ( $node->namespace === HTMLData::NS_HTML ) {
// isset( $contents[0] ) guards against empty contents before the first byte
// is compared.
135 if ( isset( $contents[0] ) && $contents[0] === "\n"
136 && isset( $this->prefixLfElements[$name] )
// Write an extra "\n" in front of contents that already start with one.
138 $s .= "\n$contents</$name>";
// Void elements fall through both branches: no contents and no end tag.
139 } elseif ( !isset( $this->voidElements[$name] ) ) {
140 $s .= "$contents</$name>";
// NOTE(review): presumably the else branch for elements outside the HTML
// namespace, which always get their contents and an end tag; the `else`
// line itself is not visible in this view.
143 $s .= "$contents</$name>";
// Serializes a comment by wrapping $text between "<!--" and "-->". The text is
// inserted as-is with no escaping; $parent is not read.
148 public function comment( SerializerNode $parent, $text ) {
149 return "<!--$text-->";
// Handles a doctype described by its $name, $public and $system parts.
// NOTE(review): the body of this method is not visible in this view, so what
// it returns cannot be stated from here.
152 public function doctype( $name, $public, $system ) {
// Serializes a DOM node and, recursively, all of its descendants.
156 public function formatDOMNode( \DOMNode $node ) {
// Serialize the children first and concatenate their output into $contents.
// NOTE(review): the initialisation of $contents is not visible in this view.
158 if ( $node->firstChild ) {
159 foreach ( $node->childNodes as $child ) {
160 $contents .= $this->formatDOMNode( $child );
// Dispatch on the libxml node type constant.
164 switch ( $node->nodeType ) {
165 case XML_ELEMENT_NODE:
166 return $this->formatDOMElement( $node, $contents );
168 case XML_DOCUMENT_NODE:
// Without useSourceDoctype, the document always starts with the fixed HTML
// doctype.
169 if ( !$this->useSourceDoctype ) {
170 return "<!DOCTYPE html>" . $contents;
// NOTE(review): the rest of the document case and the body of the fragment
// case are not visible in this view.
175 case XML_DOCUMENT_FRAG_NODE:
// NOTE(review): the case label and the assignment of $text for the following
// branch are not visible; it appears to handle text nodes.
// NOTE(review): parentNode is null for a detached node, in which case the
// property reads below would operate on null - confirm callers never pass a
// detached text node.
180 $parent = $node->parentNode;
// Escape unless the parent is an HTML-namespace element listed in
// $rawTextElements (matched here by nodeName).
181 if ( $parent->namespaceURI !== HTMLData::NS_HTML
182 || !isset( $this->rawTextElements[$parent->nodeName] )
184 $text = strtr( $text, $this->textEscapes );
188 case XML_CDATA_SECTION_NODE:
// NOTE(review): same null-parent caveat as for the text branch above.
189 $parent = $node->parentNode;
190 if ( $parent->namespaceURI === HTMLData::NS_HTML ) {
191 // CDATA is not allowed in HTML nodes
// NOTE(review): what is emitted for the HTML-parent case is not visible.
// Otherwise the data is written verbatim inside a CDATA section.
194 return "<![CDATA[{$node->data}]]>";
// NOTE(review): the case label is not visible; the target/data pair suggests
// the processing-instruction case. Output has the form "<?target data>".
198 return "<?{$node->target} {$node->data}>";
200 case XML_COMMENT_NODE:
// Comment data is written as-is with no escaping.
201 return "<!--{$node->data}-->";
203 case XML_DOCUMENT_TYPE_NODE:
// Only the doctype name is written; no public or system identifier appears
// in this string.
204 if ( $this->useSourceDoctype ) {
205 return "<!DOCTYPE {$node->name}>";
// Serializes a DOM element given the already-serialized $contents of its
// children.
215 public function formatDOMElement( \DOMElement $node, $contents ) {
216 $ns = $node->namespaceURI;
// Choose the tag name: the bare localName when the namespace is listed in
// $unqualifiedNamespaces or the prefix is null, otherwise "prefix:localName".
// NOTE(review): the first condition of this if-chain is not visible here.
// NOTE(review): this compares the prefix to null, while the attribute branch
// further down tests the prefix with strlen(). PHP documents DOMNode::$prefix
// as a string - confirm that an empty-string prefix cannot reach the
// qualified branch and produce a name of the form ":localName".
218 || isset( $this->unqualifiedNamespaces[$ns] )
219 || $node->prefix === null
221 $name = $node->localName;
223 $name = $node->prefix . ':' . $node->localName;
// Optionally undo the name coercion applied when the DOM was built.
225 if ( $this->reverseCoercion ) {
226 $name = DOMUtils::uncoerceName( $name );
// NOTE(review): the initialisation of $s is not visible in this view.
// Build each attribute's qualified name from its namespace.
230 foreach ( $node->attributes as $attr ) {
231 switch ( $attr->namespaceURI ) {
232 case HTMLData::NS_XML:
233 $attrName = 'xml:' . $attr->localName;
235 case HTMLData::NS_XMLNS:
// A localName of exactly 'xmlns' is special-cased.
// NOTE(review): the name assigned in that case is not visible here.
236 if ( $attr->localName === 'xmlns' ) {
239 $attrName = 'xmlns:' . $attr->localName;
242 case HTMLData::NS_XLINK:
243 $attrName = 'xlink:' . $attr->localName;
// Remaining namespaces: keep the attribute's own prefix when it has one.
246 if ( strlen( $attr->prefix ) ) {
247 $attrName = $attr->prefix . ':' . $attr->localName;
249 $attrName = $attr->localName;
252 if ( $this->reverseCoercion ) {
253 $attrName = DOMUtils::uncoerceName( $attrName );
// Escape the value with the attribute escape table and append ` name="value"`.
255 $encValue = strtr( $attr->value, $this->attributeEscapes );
256 $s .= " $attrName=\"$encValue\"";
// HTML-namespace elements get special handling for leading newlines and for
// void elements.
259 if ( $ns === HTMLData::NS_HTML ) {
260 if ( isset( $contents[0] ) && $contents[0] === "\n"
261 && isset( $this->prefixLfElements[$name] )
// Write an extra "\n" in front of contents that already start with one.
263 $s .= "\n$contents</$name>";
// Void elements fall through both branches: no contents and no end tag.
264 } elseif ( !isset( $this->voidElements[$name] ) ) {
265 $s .= "$contents</$name>";
// NOTE(review): presumably the else branch for elements outside the HTML
// namespace; the `else` line and the return are not visible in this view.
268 $s .= "$contents</$name>";