To split a str on any run of non-word characters I.e. Not A-Z, 0-9, and underscore.
var words=str.split(/\W+/); // assumes str does not begin nor end with whitespace
Or, assuming your target language is English, you can extract all semantically useful values from a string (i.e. “tokenizing” a string) using:
var str="Here\"s a (good, bad, indifferent, ...) '+
'example sentence to be used in this test '+
'of English language "token-extraction".',
punct="\\["+ '\\!'+ '\\"'+ '\\#'+ '\\$'+ // since javascript does not
'\\%'+ '\\&'+ '\\\''+ '\\('+ '\\)'+ // support POSIX character
'\\*'+ '\\+'+ '\\,'+ '\\\\'+ '\\-'+ // classes, we'll need our
'\\.'+ '\\/'+ '\\:'+ '\\;'+ '\\<'+ // own version of [:punct:]
'\\='+ '\\>'+ '\\?'+ '\\@'+ '\\['+
'\\]'+ '\\^'+ '\\_'+ '\\`'+ '\\{'+
'\\|'+ '\\}'+ '\\~'+ '\\]',
re=new RegExp( // tokenizer
'\\s*'+ // discard possible leading whitespace
'('+ // start capture group
'\\.{3}'+ // ellipsis (must appear before punct)
'|'+ // alternator
'\\w+\\-\\w+'+ // hyphenated words (must appear before punct)
'|'+ // alternator
'\\w+\'(?:\\w+)?'+ // compound words (must appear before punct)
'|'+ // alternator
'\\w+'+ // other words
'|'+ // alternator
'['+punct+']'+ // punct
')' // end capture group
);
// grep(ary[,filt]) - filters an array
// note: could use jQuery.grep() instead
// @param {Array} ary array of members to filter
// @param {Function} filt function to test truthiness of member,
// if omitted, "function(member){ if(member) return member; }" is assumed
// @returns {Array} all members of ary where result of filter is truthy
function grep(ary,filt) {
var result=[];
for(var i=0,len=ary.length;i++<len;) {
var member=ary[i]||'';
if(filt && (typeof filt === 'Function') ? filt(member) : member) {
result.push(member);
}
}
return result;
}
var tokens=grep( str.split(re) ); // note: filter function omitted
// since all we need to test
// for is truthiness
which produces:
tokens=[
'Here\'s',
'a',
'(',
'good',
',',
'bad',
',',
'indifferent',
',',
'...',
')',
'example',
'sentence',
'to',
'be',
'used',
'in',
'this',
'test',
'of',
'English',
'language',
'"',
'token-extraction',
'"',
'.'
]
EDIT
Also available as a Github Gist