Optimize long lists of fixed string alternatives in regex

You can build a regex trie out of the words at the MyRegexTester.com site:

  1. Grab the list of all the time zones from the site you trust as a list of words each on its own line
  2. Go to https://www.myregextester.com/index.php
  3. Select WORDLIST from the dropdown box right below HIGHLIGHT MATCHES check box
    enter image description here
  4. Paste the time zone string list and click enter image description here

You will get something like

^(?:A(?:C(?:WS|[DS])?T|DT|E[DS]?T|FT|K[DS]T|LMT|M[S]?T|NAS?T|QTT|RT|ST|W[DS]T|Z(?:OS|[OS])?T|oE|T)|B(?:RS|[NORST])T|C(?:AS|ES|H(?:A[DS]|OS|[OU])|ID?S|LS|hS|[ACDEKLOSVX])?T|D(?:AV|DU)T|E(?:ASS?|ES|GS|[ACDEGS])?T|F(?:JS|KS|[EJKN])T|G(?:A[LM]|IL|[EFMSY])T|H(?:OVS?|[DKS])T|I(?:R(?:KS|[DKS])|[CDOS])T|JST|K(?:OS|RAS?|UY|[GS])T|L(?:H[DS]|IN)T|M(?:A(?:GS|[GRW])T|DT|HT|MT|S[DKT]|UT|VT|YT|T)|N(?:FD|OVS?|Z[DS]|[CDFPRSU])T|O(?:MSS?|RA)T|P(?:ETS?|HO|M[DS]|ON|YS|[DEGHKSWY])?T|QYZT|R(?:OT|E)T|S(?:A[KMS]|RE|YO|[BCGRS])T|T(?:AH|OS|[FJKLMORV])T|U(?:LAS?T|TC|YS?T|ZT)|V(?:LAS?|OS|[EU])T|W(?:A(?:RS|[KS])?T|ES?T|FT|GS?T|I(?:TA|[BT])|ST|T)|Y(?:A(?:KS|[KP])|EKS?)T|[ABCDEFGHIKLMNOPQRSTUVWXYZ])$

This codes 244 time zones.

Now, it is time to build the final regex. Take out the anchors from the above, you do not need the ^ and $ when the pattern is used in the middle of another regex.

const tz = "(?:A(?:C(?:WS|[DS])?T|DT|E[DS]?T|FT|K[DS]T|LMT|M[S]?T|NAS?T|QTT|RT|ST|W[DS]T|Z(?:OS|[OS])?T|oE|T)|B(?:RS|[NORST])T|C(?:AS|ES|H(?:A[DS]|OS|[OU])|ID?S|LS|hS|[ACDEKLOSVX])?T|D(?:AV|DU)T|E(?:ASS?|ES|GS|[ACDEGS])?T|F(?:JS|KS|[EJKN])T|G(?:A[LM]|IL|[EFMSY])T|H(?:OVS?|[DKS])T|I(?:R(?:KS|[DKS])|[CDOS])T|JST|K(?:OS|RAS?|UY|[GS])T|L(?:H[DS]|IN)T|M(?:A(?:GS|[GRW])T|DT|HT|MT|S[DKT]|UT|VT|YT|T)|N(?:FD|OVS?|Z[DS]|[CDFPRSU])T|O(?:MSS?|RA)T|P(?:ETS?|HO|M[DS]|ON|YS|[DEGHKSWY])?T|QYZT|R(?:OT|E)T|S(?:A[KMS]|RE|YO|[BCGRS])T|T(?:AH|OS|[FJKLMORV])T|U(?:LAS?T|TC|YS?T|ZT)|V(?:LAS?|OS|[EU])T|W(?:A(?:RS|[KS])?T|ES?T|FT|GS?T|I(?:TA|[BT])|ST|T)|Y(?:A(?:KS|[KP])|EKS?)T|[ABCDEFGHIKLMNOPQRSTUVWXYZ])"
const regex = new RegExp(String.raw`(\d+)(?:[.: ](\d\d))*\s*([aApP][.: ]?[mM][.: ]?|o['’]clock)*\s*${tz}`, 'gm');
console.log(regex.source)

I am not sure the regex will work for you in the end, see its demo online, but now you see how you can use a collection of literal string patterns inside a regex pattern in the most optimal way.

To build regex tries dynamically inside your code, you need something like trie-regex. There are other applications that are capable of creating regex tries out of string lists.

Below, you can generate a regex trie for your given comma or linebreak separated word list:

!function(){return function t(e,n,r){function i(s,u){if(!n[s]){if(!e[s]){var c="function"==typeof require&&require;if(!u&&c)return c(s,!0);if(o)return o(s,!0);var f=new Error("Cannot find module '"+s+"'");throw f.code="MODULE_NOT_FOUND",f}var a=n[s]={exports:{}};e[s][0].call(a.exports,function(t){return i(e[s][1][t]||t)},a,a.exports,t,e,n,r)}return n[s].exports}for(var o="function"==typeof require&&require,s=0;s<r.length;s++)i(r[s]);return i}}()({1:[function(t,e,n){(function(t){(function(){!function(r){var i="object"==typeof n&&n,o="object"==typeof e&&e&&e.exports==i&&e,s="object"==typeof t&&t;s.global!==s&&s.window!==s||(r=s);var u={},c=u.hasOwnProperty,f=function(t,e){var n;for(n in t)c.call(t,n)&&e(n,t[n])},a=u.toString,p={'"':'\\"',"'":"\\'","\\":"\\\\","\b":"\\b","\f":"\\f","\n":"\\n","\r":"\\r","\t":"\\t"},_=/["'\\\b\f\n\r\t]/,h=/[0-9]/,l=/[ !#-&\(-\[\]-~]/,d=function(t,e){var n,r,i={escapeEverything:!1,quotes:"single",wrap:!1,es6:!1,json:!1,compact:!0,indent:"\t",__indent__:""},o=e&&e.json;o&&(i.quotes="double",i.wrap=!0),n=i,"single"!=(e=(r=e)?(f(r,function(t,e){n[t]=e}),n):n).quotes&&"double"!=e.quotes&&(e.quotes="single");var s,u,c,g="double"==e.quotes?'"':"'",y=e.compact,v=e.indent,b=y?"":"\n",j=!0;if(o&&t&&("function"==typeof(c=t.toJSON)||"[object Function]"==a.call(c))&&(t=t.toJSON()),!function(t){return"string"==typeof t||"[object String]"==a.call(t)}(t))return function(t){return"[object Array]"==a.call(t)}(t)?(u=[],e.wrap=!0,s=e.__indent__,v+=s,e.__indent__=v,function(t,e){for(var n=t.length,r=-1;++r<n;)e(t[r])}(t,function(t){j=!1,u.push((y?"":v)+d(t,e))}),j?"[]":"["+b+u.join(","+b)+b+(y?"":s)+"]"):function(t){return"[object Object]"==a.call(t)}(t)?(u=[],e.wrap=!0,s=e.__indent__,v+=s,e.__indent__=v,f(t,function(t,n){j=!1,u.push((y?"":v)+d(t,e)+":"+(y?"":" ")+d(n,e))}),j?"{}":"{"+b+u.join(","+b)+b+(y?"":s)+"}"):o?JSON.stringify(t)||"null":String(t);var w,m,x=t,q=-1,E=x.length;for(u="";++q<E;){var S=x.charAt(q);if(e.es6&&(w=x.charCodeAt(q))>=55296&&w<=56319&&E>q+1&&(m=x.charCodeAt(q+1))>=56320&&m<=57343)u+="\\u{"+(1024*(w-55296)+m-56320+65536).toString(16).toUpperCase()+"}",q++;else{if(!e.escapeEverything){if(l.test(S)){u+=S;continue}if('"'==S){u+=g==S?'\\"':S;continue}if("'"==S){u+=g==S?"\\'":S;continue}}if("\0"!=S||o||h.test(x.charAt(q+1)))if(_.test(S))u+=p[S];else{var O=S.charCodeAt(0).toString(16).toUpperCase(),A=O.length>2||o,N="\\"+(A?"u":"x")+("0000"+O).slice(A?-4:-2);u+=N}else u+="\\0"}}return e.wrap&&(u=g+u+g),u};d.version="0.5.0","function"==typeof define&&"object"==typeof define.amd&&define.amd?define(function(){return d}):i&&!i.nodeType?o?o.exports=d:i.jsesc=d:r.jsesc=d}(this)}).call(this)}).call(this,"undefined"!=typeof global?global:"undefined"!=typeof self?self:"undefined"!=typeof window?window:{})},{}],2:[function(t,e,n){var r=t("jsesc"),i=function(){"use strict";var t=function(){return this instanceof t?(this._num_phrases_in_trie=0,this._trie={},this):new t};return t.prototype.add=function(t){if(t instanceof Array&&t.forEach(this.add,this),t=this._coerce_to_string(t),!this._is_phrase_valid(t))return this;if(this.contains(t))return this;var e=this._trie;return t.split("").forEach(function(t){t in e?e=e[t]:(e[t]={},e=e[t])},this),e.end=!0,this._num_phrases_in_trie++,this},t.prototype.toRegExp=function(){if(0!==this._num_phrases_in_trie){var t=this.toString();return new RegExp(t)}},t.prototype.toString=function(){if(0!==this._num_phrases_in_trie){var t=function(e,n){var r=Object.keys(e),i=[],o=[],s=!1;return r.forEach(function(u){var c;"end"!==u?(c=this._quotemeta(u)+t(e[u],n),(r.length>1?[].push.bind(i):[].push.bind(o))(c)):s=!0},n),n._to_regex(i,o,s)};return t(this._trie,this)}},t.prototype._to_regex=function(t,e,n){var r="";return t.length>0?1===t.length?r+=t[0]:t.every(function(t){return 1===t.length})?r+="["+t.join("")+"]":r+="(?:"+t.join("|")+")":e.length>0&&(r+=e[0]),n&&r&&(1===r.length?r+="?":r="(?:"+r+")?"),r},t.prototype.contains=function(t){if(!this._is_phrase_valid(t)&&this._num_phrases_in_trie>0)return!1;var e=this._trie;try{return t.split("").forEach(function(t){e=e[t]}),e.hasOwnProperty("end")&&!0===e.end}catch(t){}return!1},t.prototype._coerce_to_string=function(t){return"number"!=typeof t||isNaN(t)||(t=t.toString()),t},t.prototype._is_phrase_valid=function(t){return"string"==typeof t&&t.length>0},t.prototype._quotemeta=function(t){return this._is_phrase_valid(t)?t.replace(/([\t\n\f\r\\\$\(\)\*\+\-\.\?\[\]\^\{\|\}])/g,"\\$1").replace(/[^\x20-\x7E]/g,r):t},t}();e.exports=i},{jsesc:1}],3:[function(t,e,n){$(document).ready(function(){$("#run").click(function(e){e.preventDefault(),$("#result").html("");const n=$("#inputdata").val().match(/[^\r\n,]+/g);let r=new(t("regex-trie"));r.add(n);const i=r.toRegExp();$("#result").append("<b>"+i.source.replace(/&/g,"&amp;").replace(/</g,"&lt;").replace(/>/g,"&gt;")+"</b><br/>")})})},{"regex-trie":2}]},{},[3]);
<script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.3.1/jquery.min.js"></script>
<form>
  <p>Add the comma or linebreak separated words to the text area and click <i>Generate</i>.</p>
  <textarea id="inputdata"></textarea>
  <br/>
  <button id="run">Generate</BUTTON>
  <pre><code id="result" /></pre>
</form>

Leave a Comment