/**
 * Word or character counting functionality. Count words or characters in a
 * provided text string.
 *
 * @namespace wp.utils
 *
 * @since 2.6.0
 * @output wp-admin/js/word-count.js
 */
( function() {
  /**
   * Word counting utility.
   *
   * Every instance receives its own settings object that inherits from the
   * defaults found on the prototype. Overrides passed to the constructor, and the
   * shortcode expression built from the localization data, are stored on the
   * instance only and never modify the shared defaults.
   *
   * @namespace wp.utils.wordcounter
   * @memberof  wp.utils
   *
   * @class
   *
   * @param {Object} settings                                   Optional. Key-value object containing overrides for
   *                                                            settings.
   * @param {RegExp} settings.HTMLRegExp                        Optional. Regular expression to find HTML elements.
   * @param {RegExp} settings.HTMLcommentRegExp                 Optional. Regular expression to find HTML comments.
   * @param {RegExp} settings.spaceRegExp                       Optional. Regular expression to find irregular space
   *                                                            characters.
   * @param {RegExp} settings.HTMLEntityRegExp                  Optional. Regular expression to find HTML entities.
   * @param {RegExp} settings.connectorRegExp                   Optional. Regular expression to find connectors that
   *                                                            split words.
   * @param {RegExp} settings.removeRegExp                      Optional. Regular expression to find and remove unwanted
   *                                                            characters to reduce false-positives.
   * @param {RegExp} settings.astralRegExp                      Optional. Regular expression to find unwanted
   *                                                            characters when searching for non-words.
   * @param {RegExp} settings.wordsRegExp                       Optional. Regular expression to find words by spaces.
   * @param {RegExp} settings.characters_excluding_spacesRegExp Optional. Regular expression to find characters which
   *                                                            are non-spaces.
   * @param {RegExp} settings.characters_including_spacesRegExp Optional. Regular expression to find characters
   *                                                            including spaces.
   * @param {RegExp} settings.shortcodesRegExp                  Optional. Regular expression to find shortcodes.
   * @param {Object} settings.l10n                              Optional. Localization object containing specific
   *                                                            configuration for the current localization.
   * @param {string} settings.l10n.type                         Optional. Method of finding words to count.
   * @param {Array}  settings.l10n.shortcodes                   Optional. Array of shortcodes that should be removed
   *                                                            from the text.
   *
   * @return {void}
   */
  function WordCounter( settings ) {
    var key,
      shortcodes,
      hasOwn = Object.prototype.hasOwnProperty;

    /*
     * At this point `this.settings` still resolves to the shared defaults on the
     * prototype. Shadow it with a per-instance object that falls back to those
     * defaults, so the writes below cannot leak into other instances.
     */
    this.settings = Object.create( this.settings );

    // Apply provided settings to the instance settings.
    if ( settings ) {
      for ( key in settings ) {
        // Only copy own properties; the borrowed method also works for prototype-less objects.
        if ( hasOwn.call( settings, key ) ) {
          this.settings[ key ] = settings[ key ];
        }
      }
    }

    shortcodes = this.settings.l10n.shortcodes;

    // If there are any localization shortcodes, build the expression that matches their opening and closing tags.
    if ( shortcodes && shortcodes.length ) {
      this.settings.shortcodesRegExp = new RegExp( '\\[\\/?(?:' + shortcodes.join( '|' ) + ')[^\\]]*?\\]', 'g' );
    }
  }
  // Character ranges that are stripped from the text before words are counted.
  var removableCharacterRanges = [
    // Basic Latin (extract).
    '\u0021-\u0040\u005B-\u0060\u007B-\u007E',

    // Latin-1 Supplement (extract).
    '\u0080-\u00BF\u00D7\u00F7',

    /*
     * One contiguous range covering these Unicode blocks:
     *     General Punctuation
     *     Superscripts and Subscripts
     *     Currency Symbols
     *     Combining Diacritical Marks for Symbols
     *     Letterlike Symbols
     *     Number Forms
     *     Arrows
     *     Mathematical Operators
     *     Miscellaneous Technical
     *     Control Pictures
     *     Optical Character Recognition
     *     Enclosed Alphanumerics
     *     Box Drawing
     *     Block Elements
     *     Geometric Shapes
     *     Miscellaneous Symbols
     *     Dingbats
     *     Miscellaneous Mathematical Symbols-A
     *     Supplemental Arrows-A
     *     Braille Patterns
     *     Supplemental Arrows-B
     *     Miscellaneous Mathematical Symbols-B
     *     Supplemental Mathematical Operators
     *     Miscellaneous Symbols and Arrows
     */
    '\u2000-\u2BFF',

    // Supplemental Punctuation.
    '\u2E00-\u2E7F'
  ];

  // Default settings, defined on the prototype.
  WordCounter.prototype.settings = {
    // Opening and closing HTML tags.
    HTMLRegExp: /<\/?[a-z][^>]*?>/gi,

    // HTML comments, including multi-line ones.
    HTMLcommentRegExp: /<!--[\s\S]*?-->/g,

    // Non-breaking space, written as a named or numeric entity.
    spaceRegExp: /&nbsp;|&#160;/gi,

    // Any HTML entity.
    HTMLEntityRegExp: /&\S+?;/g,

    // A double hyphen or an em-dash (\u2014).
    connectorRegExp: /--|\u2014/g,

    // Characters to be removed from input text, as one character class.
    removeRegExp: new RegExp( '[' + removableCharacterRanges.join( '' ) + ']', 'g' ),

    // A UTF-16 surrogate pair, see https://en.wikipedia.org/wiki/UTF-16#U.2BD800_to_U.2BDFFF
    astralRegExp: /[\uD800-\uDBFF][\uDC00-\uDFFF]/g,

    // A non-space character directly followed by whitespace.
    wordsRegExp: /\S\s+/g,

    // Every non-space character.
    characters_excluding_spacesRegExp: /\S/g,

    /*
     * Every character except the following formatting characters:
     *     \f = form feed
     *     \n = new line
     *     \r = carriage return
     *     \t = tab
     *     \v = vertical tab
     *     \u00AD = soft hyphen
     *     \u2028 = line separator
     *     \u2029 = paragraph separator
     */
    characters_including_spacesRegExp: /[^\f\n\r\t\v\u00AD\u2028\u2029]/g,

    // Localization data read from the `wordCountL10n` global, or an empty object when it is not set.
    l10n: window.wordCountL10n || {}
  };
  /**
   * Counts the words or characters found in a piece of text.
   *
   * HTML tags, HTML comments and configured shortcodes are stripped and
   * non-breaking spaces are normalized before the text is matched against the
   * regular expression that belongs to the requested type.
   *
   * @since 2.6.0
   *
   * @memberof wp.utils.wordcounter
   *
   * @param {string} text Text to count elements in.
   * @param {string} type Optional. Either 'words', 'characters_excluding_spaces' or
   *                      'characters_including_spaces'. Falls back to the localized
   *                      type; any other value is treated as 'words'.
   *
   * @return {number} The number of items counted, 0 when no text was given.
   */
  WordCounter.prototype.count = function( text, type ) {
    var config = this.settings,
      selectedType = type || config.l10n.type,
      countsCharacters = selectedType === 'characters_excluding_spaces' ||
        selectedType === 'characters_including_spaces',
      cleaned,
      found;

    // Anything that is not one of the two character types is counted as words.
    if ( ! countsCharacters ) {
      selectedType = 'words';
    }

    // Nothing to count.
    if ( ! text ) {
      return 0;
    }

    /*
     * The trailing new-line lets the final word be followed by whitespace.
     * HTML elements become new-lines, HTML comments are dropped entirely.
     */
    cleaned = ( text + '\n' )
      .replace( config.HTMLRegExp, '\n' )
      .replace( config.HTMLcommentRegExp, '' );

    // Shortcodes are only stripped when an expression for them is available.
    if ( config.shortcodesRegExp ) {
      cleaned = cleaned.replace( config.shortcodesRegExp, '\n' );
    }

    // Turn non-breaking spaces into regular spaces.
    cleaned = cleaned.replace( config.spaceRegExp, ' ' );

    if ( countsCharacters ) {
      // Each HTML entity and each surrogate pair counts as the single character "a".
      cleaned = cleaned
        .replace( config.HTMLEntityRegExp, 'a' )
        .replace( config.astralRegExp, 'a' );
    } else {
      // Drop entities, split on connectors so attached text counts as separate words, then strip unwanted characters.
      cleaned = cleaned
        .replace( config.HTMLEntityRegExp, '' )
        .replace( config.connectorRegExp, ' ' )
        .replace( config.removeRegExp, '' );
    }

    // The number of matches for the selected type is the count.
    found = cleaned.match( config[ selectedType + 'RegExp' ] );

    return found ? found.length : 0;
  };
  // Expose the WordCounter as wp.utils.WordCounter, creating the wp and wp.utils containers when they are missing.
  var root = window;

  root.wp = root.wp || {};
  root.wp.utils = root.wp.utils || {};
  root.wp.utils.WordCounter = WordCounter;
} )();