Use PHP functions in JavaScript

JavaScript str_word_count

Counts the number of words inside a string. If a format of 1 is specified, then the function will return an array containing all the words found inside the string. If a format of 2 is specified, then the function will return an associative array where the position of the word is the key and the word itself is the value. For the purpose of this function, 'word' is defined as a locale-dependent string containing alphabetic characters, which also may contain, but not start with, "'" and "-" characters.

1
2
3
4
56
7
8
9
1011
12
13
14
1516
17
18
19
2021
22
23
24
2526
27
28
29
3031
32
33
34
3536
37
38
39
4041
42
43
44
4546
47
48
49
5051
52
53
54
5556
57
58
59
6061
62
63
64
6566
67
68
69
7071
72
73
74
7576
77
78
79
8081
82
83
84
8586
87
88
89
9091
92
93
94
9596
97
98
99
100101
102
103
104
105
function str_word_count (str, format, charlist) {
    // Counts the words inside a string, mirroring PHP's str_word_count().
    //
    // A character belongs to a word when any of the following holds:
    //   - this.ctype_alpha() accepts it,
    //   - it is one of the characters listed in charlist,
    //   - it is "-" and is neither the first nor the last character of str,
    //   - it is "'" and is not the first character of str.
    // The "-" and "'" rules look at the position inside the whole input string,
    // not inside the current word.
    //
    // Parameters:
    //   str      - the string to scan.
    //   format   - falsy: return the number of words;
    //              1: return an array of the words;
    //              2: return an object mapping each word's start index in str to the word.
    //   charlist - optional string of extra characters to treat as word characters.
    //
    // Throws (as plain strings, kept for backward compatibility):
    //   - 'You have supplied an incorrect format' for any other format value
    //     (raised only after the string has been scanned),
    //   - a surrogate error message when str or charlist contains an unpaired surrogate.
    //
    // version: 1109.2015
    // discuss at: http://phpjs.org/functions/str_word_count
    // +   original by: Ole Vrijenhoek
    // +   bugfixed by: Kevin van Zonneveld (http://kevin.vanzonneveld.net)
    // +   bugfixed by: Brett Zamir (http://brett-zamir.me)
    // +   input by: Bug?
    // +   bugfixed by: Brett Zamir (http://brett-zamir.me)
    // +   improved by: Brett Zamir (http://brett-zamir.me)
    // -   depends on: ctype_alpha
    // *     example 1: str_word_count("Hello fri3nd, you're\r\n       looking          good today!", 1);
    // *     returns 1: ['Hello', 'fri', 'nd', "you're", 'looking', 'good', 'today']
    // *     example 2: str_word_count("Hello fri3nd, you're\r\n       looking          good today!", 2);
    // *     returns 2: {0: 'Hello', 6: 'fri', 10: 'nd', 14: "you're", 29: 'looking', 46: 'good', 51: 'today'}
    // *     example 3: str_word_count("Hello fri3nd, you're\r\n       looking          good today!", 1, '\u00e0\u00e1\u00e3\u00e73');
    // *     returns 3: ['Hello', 'fri3nd', "you're", 'looking', 'good', 'today']
    var len = str.length,
        cl = charlist && charlist.length,
        chr = '',
        tmpStr = '', // word currently being accumulated
        i = 0,
        c = '',
        wArr = [], // words in order of appearance (used unless format === 2)
        wC = 0, // number of words stored so far
        assoc = {}, // start index -> word (used when format === 2)
        aC = 0, // start index of the word currently being accumulated
        reg = '', // stays '' (falsy) when no charlist is given
        match = false;

    // BEGIN STATIC
    var _preg_quote = function (str) {
        return (str + '').replace(/([\\\.\+\*\?\[\^\]\$\(\)\{\}\=\!<>\|\:])/g, '\\$1');
    },
        _getWholeChar = function (str, i) { // Use for rare cases of non-BMP characters
            var code = str.charCodeAt(i);
            if (code < 0xD800 || code > 0xDFFF) {
                return str.charAt(i);
            }
            if (0xD800 <= code && code <= 0xDBFF) { // High surrogate (could change last hex to 0xDB7F to treat high private surrogates as single characters)
                if (str.length <= (i + 1)) {
                    throw 'High surrogate without following low surrogate';
                }
                var next = str.charCodeAt(i + 1);
                if (0xDC00 > next || next > 0xDFFF) {
                    throw 'High surrogate without following low surrogate';
                }
                return str.charAt(i) + str.charAt(i + 1);
            }
            // Low surrogate (0xDC00 <= code && code <= 0xDFFF)
            if (i === 0) {
                throw 'Low surrogate without preceding high surrogate';
            }
            var prev = str.charCodeAt(i - 1);
            if (0xD800 > prev || prev > 0xDBFF) { // (could change last hex to 0xDB7F to treat high private surrogates as single characters)
                throw 'Low surrogate without preceding high surrogate';
            }
            return false; // We can pass over low surrogates now as the second component in a pair which we have already processed
        };
    // END STATIC

    // Stores the pending word, if there is one, and resets the accumulator.
    // Doing nothing for an empty accumulator is what keeps runs of separators
    // (and trailing separators) from being recorded as empty words.
    var _flushWord = function () {
        if (tmpStr === '') {
            return;
        }
        if (format !== 2) {
            wArr[wArr.length] = tmpStr;
        } else {
            assoc[aC] = tmpStr;
        }
        tmpStr = '';
        wC++;
    };

    if (cl) {
        // Build an anchored alternation, ^(a|b|c)$, that matches exactly one charlist character
        reg = '^(' + _preg_quote(_getWholeChar(charlist, 0));
        for (i = 1; i < cl; i++) {
            if ((chr = _getWholeChar(charlist, i)) === false) {
                continue;
            }
            reg += '|' + _preg_quote(chr);
        }
        reg += ')$';
        reg = new RegExp(reg);
    }

    for (i = 0; i < len; i++) {
        if ((c = _getWholeChar(str, i)) === false) {
            continue; // second half of a surrogate pair that was already consumed
        }
        match = this.ctype_alpha(c) || (reg && c.search(reg) !== -1) || ((i !== 0 && i !== len - 1) && c === '-') || // No hyphen at beginning or end unless allowed in charlist (or locale)
        (i !== 0 && c === "'"); // No apostrophe at beginning unless allowed in charlist (or locale)
        if (match) {
            if (tmpStr === '' && format === 2) {
                aC = i;
            }
            tmpStr = tmpStr + c;
        } else {
            _flushWord();
        }
    }
    // Flush after the loop rather than on the last index: the last iteration may be
    // skipped (trailing surrogate pair) or may land on a separator with nothing pending.
    _flushWord();

    if (!format) {
        return wC;
    } else if (format === 1) {
        return wArr;
    } else if (format === 2) {
        return assoc;
    }
    throw 'You have supplied an incorrect format';
}
external links: original PHP docs | raw js source

Examples

» Example 1

Running

1
str_word_count("Hello fri3nd, you're\r\n       looking          good today!", 1);

Should return

1
['Hello', 'fri', 'nd', "you're", 'looking', 'good', 'today']

» Example 2

Running

1
str_word_count("Hello fri3nd, you're\r\n       looking          good today!", 2);

Should return

1
{0: 'Hello', 6: 'fri', 10: 'nd', 14: "you're", 29: 'looking', 46: 'good', 51: 'today'}

Dependencies

In order to use this function, you also need: ctype_alpha (which in turn depends on setlocale).

Open syntax issues

php.js uses JsLint to help us keep our code consistent and prevent some common bugs.

Eventually we want all code to pass or at least take into consideration most fixes suggested by JsLint, following this JsLint configuration we’ve decided on.


Authors

Thanks to the following developers, you get to have str_word_count goodness in JavaScript.

Comments

Add Comment
Use:
[CODE]
your_stuff('here');
[/CODE]
for proper code formatting
By submitting code here you are allowing us to use it in php.js hence dual licensing it under the MIT and GPL licenses

Gravatar
Brett Zamir
13 Feb '10 Permalink

q  @Bug?: Yes, you are correct. Thanks for the feedback. I have now fixed it in Git: http://github.com/kvz/phpjs/raw/master/functions/strings/str_word_count.js . Note that the new version requires ctype_alpha now (and which I also needed to update now along with a lot of other functions dependent on RegExp.test()), and that function depends on setlocale() because this function should check in a way that potentially supports what other locales consider a word. I also added support for the very rare non-BMP characters, and as per PHP, allowed hyphens in the middle or apostrophes at the middle or end (and everywhere if the charlist includes these).

Gravatar
Bug?
3 Feb '10 Permalink

q  The javascript function returns 5 words for this string:

Lorem ipsum dolor asdf asdf asdf



And 6 words for this one:

Lorem ipsum dolor asdf asdf asdf.



The PHP function returns 6 for both.

Cheers,

Chris

Gravatar
Brett Zamir
18 Jun '09 Permalink

q  I believe that to do this correctly for not only Chinese but other languages, we'll need to take a good look at the source code, specifically for PHP 6, since that is where full Unicode support is being added.

We might be able to use XRegExp (see http://stevenlevithan.com/regex/xregexp/ ) and its Unicode plug-in (at http://blog.stevenlevithan.com/archives/xregexp-unicode-plugin ) for our preg_ functions and then make str_word_count() dependent on it, though that won't really help determine what a "word" is (since, in Chinese, a character is technically only a graphical morpheme, and not necessarily also an independent word), though at least it will tell us definitively what a "letter" is. Of course, we can just go back to the source to see how PHP interprets a "word" since we're aiming for that anyways, but again, that will take some work, especially if we wish to make it work for other languages as well as Chinese. I'm pretty busy for now, but feel free to take a shot at it if you like.

FYI, as you can see by your Chinese characters getting mangled, the site is having some problems at the moment with Unicode characters, so if you need to refer to any in the future, maybe you could try using entities or the JavaScript Unicode escape sequences instead (e.g., \u0020). But I think the issue is beyond just Chinese (though Chinese in particular also raises the particular need for also handling characters beyond the Basic Multilingual Plane (BMP) since some Chinese characters fall beyond this plane--in JavaScript, such characters must be represented by two Unicode characters called surrogates (characters which are not used outside of such pairs), so we can't rely on the length of the string--see http://phpjs.org/functions/strlen for a solution).

Gravatar
Chris
13 Jun '09 Permalink

q  This function doesn't work quite like PHP's in that it fails to count each single Chinese character as an entire word, eg.:

?? hello ?

Is four words...


Contribute a New function