/**
 * Word or character counting functionality. Count words or characters in a provided text string.
 *
 * @summary Count words or characters in a text.
 */
/**
 * Word counting utility.
 *
 * Creates a counter whose settings start from the defaults found on
 * `WordCounter.prototype.settings` and are then overridden, key by key, with
 * the values passed in.
 *
 * @namespace wp.utils.wordcounter
 *
 * @param {Object} settings                                   Optional. Key-value object containing overrides for
 *                                                            the default settings.
 * @param {RegExp} settings.HTMLRegExp                        Optional. Regular expression to find HTML elements.
 * @param {RegExp} settings.HTMLcommentRegExp                 Optional. Regular expression to find HTML comments.
 * @param {RegExp} settings.spaceRegExp                       Optional. Regular expression to find irregular space
 *                                                            characters.
 * @param {RegExp} settings.HTMLEntityRegExp                  Optional. Regular expression to find HTML entities.
 * @param {RegExp} settings.connectorRegExp                   Optional. Regular expression to find connectors that
 *                                                            join words and should be treated as spaces.
 * @param {RegExp} settings.removeRegExp                      Optional. Regular expression to find and remove unwanted
 *                                                            characters to reduce false-positives.
 * @param {RegExp} settings.astralRegExp                      Optional. Regular expression to find unwanted
 *                                                            characters when searching for non-words.
 * @param {RegExp} settings.wordsRegExp                       Optional. Regular expression to find words by spaces.
 * @param {RegExp} settings.characters_excluding_spacesRegExp Optional. Regular expression to find characters which
 *                                                            are not whitespace.
 * @param {RegExp} settings.characters_including_spacesRegExp Optional. Regular expression to find characters
 *                                                            including spaces.
 * @param {RegExp} settings.shortcodesRegExp                  Optional. Regular expression to find shortcodes.
 * @param {Object} settings.l10n                              Optional. Localization object containing specific
 *                                                            configuration for the current localization.
 * @param {String} settings.l10n.type                         Optional. Method of finding words to count.
 * @param {Array}  settings.l10n.shortcodes                   Optional. Array of shortcodes that should be removed
 *                                                            from the text before counting.
 */
function WordCounter( settings ) {
	var key,
		shortcodes;

	/*
	 * Give this instance its own settings object that inherits the shared
	 * defaults. Writing overrides directly onto the prototype's object would
	 * leak them into every other WordCounter instance.
	 */
	this.settings = Object.create( this.settings );

	// Apply provided settings to object settings. A missing argument simply yields no iterations.
	for ( key in settings ) {

		// Only apply the caller's own properties, never inherited ones.
		if ( Object.prototype.hasOwnProperty.call( settings, key ) ) {
			this.settings[ key ] = settings[ key ];
		}
	}

	shortcodes = this.settings.l10n.shortcodes;

	// If there are any localization shortcodes, build the expression that matches their opening and closing tags.
	if ( shortcodes && shortcodes.length ) {
		this.settings.shortcodesRegExp = new RegExp( '\\[\\/?(?:' + shortcodes.join( '|' ) + ')[^\\]]*?\\]', 'g' );
	}
}
/*
 * Default settings. Each `*RegExp` entry is read by name when counting, and
 * any of them can be overridden through the constructor argument.
 */
WordCounter.prototype.settings = {
	HTMLRegExp: /<\/?[a-z][^>]*?>/gi,
	HTMLcommentRegExp: /<!--[\s\S]*?-->/g,

	// Non-breaking spaces, written as an HTML entity or as the literal U+00A0 character.
	spaceRegExp: /&nbsp;|&#160;|\u00A0/gi,
	HTMLEntityRegExp: /&\S+?;/g,

	// Double hyphen, or \u2014 = em-dash.
	connectorRegExp: /--|\u2014/g,

	// Characters to be removed from input text.
	removeRegExp: new RegExp( [
		'[',

			// Basic Latin (extract): ASCII punctuation, symbols and digits.
			'\u0021-\u0040\u005B-\u0060\u007B-\u007E',

			// Latin-1 Supplement (extract)
			'\u0080-\u00BF\u00D7\u00F7',

			/*
			 * The following range consists of:
			 * General Punctuation
			 * Superscripts and Subscripts
			 * Currency Symbols
			 * Combining Diacritical Marks for Symbols
			 * Letterlike Symbols
			 * Number Forms
			 * Arrows
			 * Mathematical Operators
			 * Miscellaneous Technical
			 * Control Pictures
			 * Optical Character Recognition
			 * Enclosed Alphanumerics
			 * Box Drawing
			 * Block Elements
			 * Geometric Shapes
			 * Miscellaneous Symbols
			 * Dingbats
			 * Miscellaneous Mathematical Symbols-A
			 * Supplemental Arrows-A
			 * Braille Patterns
			 * Supplemental Arrows-B
			 * Miscellaneous Mathematical Symbols-B
			 * Supplemental Mathematical Operators
			 * Miscellaneous Symbols and Arrows
			 */
			'\u2000-\u2BFF',

			// Supplemental Punctuation
			'\u2E00-\u2E7F',
		']'
	].join( '' ), 'g' ),

	// Match UTF-16 surrogate pairs, see https://en.wikipedia.org/wiki/UTF-16#U.2BD800_to_U.2BDFFF
	astralRegExp: /[\uD800-\uDBFF][\uDC00-\uDFFF]/g,

	// A word is counted at each non-space character that is followed by whitespace.
	wordsRegExp: /\S\s+/g,
	characters_excluding_spacesRegExp: /\S/g,

	/*
	 * Match anything that is not a formatting character, excluding:
	 * \f = form feed
	 * \n = new line
	 * \r = carriage return
	 * \t = tab
	 * \v = vertical tab
	 * \u00AD = soft hyphen
	 * \u2028 = line separator
	 * \u2029 = paragraph separator
	 */
	characters_including_spacesRegExp: /[^\f\n\r\t\v\u00AD\u2028\u2029]/g,

	// Localized configuration, when the page provides it; guarded so the defaults also load without a `window`.
	l10n: ( typeof window !== 'undefined' && window.wordCountL10n ) || {}
};
/**
 * Counts the number of words (or other specified type) in the specified text.
 *
 * @summary Count the number of elements in a text.
 *
 * @memberof wp.utils.wordcounter
 *
 * @param {String} text Text to count elements in.
 * @param {String} type Optional. Specify type to use: 'words', 'characters_excluding_spaces'
 *                      or 'characters_including_spaces'. Falls back to `settings.l10n.type`,
 *                      and to 'words' for anything else.
 *
 * @return {Number} The number of items counted; 0 for empty text or when nothing matches.
 */
WordCounter.prototype.count = function( text, type ) {
	var settings = this.settings,
		countType = type || settings.l10n.type, // Use default type if none was provided.
		content,
		matches;

	// Sanitize type to one of three possibilities: 'words', 'characters_excluding_spaces' or 'characters_including_spaces'.
	if ( countType !== 'characters_excluding_spaces' && countType !== 'characters_including_spaces' ) {
		countType = 'words';
	}

	// Nothing to count if we have no text at all.
	if ( ! text ) {
		return 0;
	}

	/*
	 * Terminate the text with a new-line: the default words expression only
	 * matches a character that is followed by whitespace, so without it the
	 * final word would go uncounted.
	 */
	content = text + '\n';

	// Replace all HTML with a new-line.
	content = content.replace( settings.HTMLRegExp, '\n' );

	// Remove all HTML comments.
	content = content.replace( settings.HTMLcommentRegExp, '' );

	// If a shortcode regular expression has been provided use it to remove shortcodes.
	if ( settings.shortcodesRegExp ) {
		content = content.replace( settings.shortcodesRegExp, '\n' );
	}

	// Normalize non-breaking space to a normal space.
	content = content.replace( settings.spaceRegExp, ' ' );

	if ( countType === 'words' ) {

		// Remove HTML Entities.
		content = content.replace( settings.HTMLEntityRegExp, '' );

		// Convert connectors to spaces to count attached text as words.
		content = content.replace( settings.connectorRegExp, ' ' );

		// Remove unwanted characters.
		content = content.replace( settings.removeRegExp, '' );
	} else {

		// Convert HTML Entities to "a" so that each entity counts as one character.
		content = content.replace( settings.HTMLEntityRegExp, 'a' );

		// Collapse each surrogate pair to "a" so that an astral character counts once, not twice.
		content = content.replace( settings.astralRegExp, 'a' );
	}

	// Match with the selected type regular expression to count the items.
	matches = content.match( settings[ countType + 'RegExp' ] );

	// If we have any matches, the count is the number of items found.
	return matches ? matches.length : 0;
};
// Add the WordCounter to the WP Utils, creating the `wp` and `wp.utils` namespaces only when they do not exist yet.
window.wp = window.wp || {};
window.wp.utils = window.wp.utils || {};
window.wp.utils.WordCounter = WordCounter;