// ---------------------------------------------------------------------------
// Porter-style suffix-stripping stemmer.
//
// Every table and regular expression below is built once at module load
// instead of on every call; none of them carries state between uses (the
// stemmer regexes have no `g`/`y` flag).
// ---------------------------------------------------------------------------

// Step 2 rewrites: matched suffix -> replacement.
const step2list: Record<string, string> = {
  ational: 'ate',
  tional: 'tion',
  enci: 'ence',
  anci: 'ance',
  izer: 'ize',
  bli: 'ble',
  alli: 'al',
  entli: 'ent',
  eli: 'e',
  ousli: 'ous',
  ization: 'ize',
  ation: 'ate',
  ator: 'ate',
  alism: 'al',
  iveness: 'ive',
  fulness: 'ful',
  ousness: 'ous',
  aliti: 'al',
  iviti: 'ive',
  biliti: 'ble',
  logi: 'log'
}

// Step 3 rewrites: matched suffix -> replacement (empty string drops the suffix).
const step3list: Record<string, string> = {
  icate: 'ic',
  ative: '',
  alize: 'al',
  iciti: 'ic',
  ical: 'ic',
  ful: '',
  ness: ''
}

// Regex source fragments describing consonant / vowel runs.
const consonant = '[^aeiou]'
const vowel = '[aeiouy]'
const consonants = '(' + consonant + '[^aeiouy]*)'
const vowels = '(' + vowel + '[aeiou]*)'

// Stem starts with [consonants] vowels consonants (at least one vowel-consonant pair).
const gt0 = new RegExp('^' + consonants + '?' + vowels + consonants)
// Whole stem is exactly [consonants] vowels consonants [vowels].
const eq1 = new RegExp(
  '^' + consonants + '?' + vowels + consonants + vowels + '?$'
)
// Stem starts with [consonants] followed by two or more vowel-consonant pairs.
const gt1 = new RegExp('^' + consonants + '?(' + vowels + consonants + '){2,}')
// Stem contains a vowel after an optional leading consonant run.
const vowelInStem = new RegExp('^' + consonants + '?' + vowel)
// Stem is consonants + vowel + a final consonant other than w, x or y.
const consonantLike = new RegExp('^' + consonants + vowel + '[^aeiouwxy]$')

const sfxLl = /ll$/
const sfxE = /^(.+?)e$/
const sfxY = /^(.+?)y$/
const sfxIon = /^(.+?(s|t))(ion)$/
const sfxEdOrIng = /^(.+?)(ed|ing)$/
const sfxAtOrBlOrIz = /(at|bl|iz)$/
const sfxEED = /^(.+?)eed$/
const sfxS = /^.+?[^s]s$/
const sfxSsesOrIes = /^.+?(ss|i)es$/
const sfxMultiConsonantLike = /([^aeiouylsz])\1$/
const step2 =
  /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/
const step3 = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/
const step4 =
  /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/

/**
 * Reduce a word to its stem by stripping common English suffixes.
 *
 * @param value - Word to stem; it is lower-cased before processing.
 * @returns The stem. Inputs shorter than three characters are returned
 *   lower-cased and otherwise unchanged.
 */
function stemmer(value: string): string {
  let result = value.toLowerCase()

  // Exit early.
  if (result.length < 3) {
    return result
  }

  let firstCharacterWasLowerCaseY = false

  // Detect initial `y`, make sure it never matches (the vowel class contains
  // lowercase `y` only, so an uppercase `Y` is treated as a consonant).
  if (
    result.codePointAt(0) === 121 // Lowercase Y
  ) {
    firstCharacterWasLowerCaseY = true
    result = 'Y' + result.slice(1)
  }

  // Step 1a.
  if (sfxSsesOrIes.test(result)) {
    // Remove last two characters.
    result = result.slice(0, -2)
  } else if (sfxS.test(result)) {
    // Remove last character.
    result = result.slice(0, -1)
  }

  let match: RegExpExecArray | null

  // Step 1b.
  if ((match = sfxEED.exec(result))) {
    if (gt0.test(match[1])) {
      // Remove last character.
      result = result.slice(0, -1)
    }
  } else if (
    (match = sfxEdOrIng.exec(result)) &&
    vowelInStem.test(match[1])
  ) {
    result = match[1]

    if (sfxAtOrBlOrIz.test(result)) {
      // Append `e`.
      result += 'e'
    } else if (sfxMultiConsonantLike.test(result)) {
      // Remove last character.
      result = result.slice(0, -1)
    } else if (consonantLike.test(result)) {
      // Append `e`.
      result += 'e'
    }
  }

  // Step 1c.
  if ((match = sfxY.exec(result)) && vowelInStem.test(match[1])) {
    // Remove suffixing `y` and append `i`.
    result = match[1] + 'i'
  }

  // Step 2.
  if ((match = step2.exec(result)) && gt0.test(match[1])) {
    result = match[1] + step2list[match[2]]
  }

  // Step 3.
  if ((match = step3.exec(result)) && gt0.test(match[1])) {
    result = match[1] + step3list[match[2]]
  }

  // Step 4. Note: when a step-4 suffix matches but the stem is too short,
  // the `ion` rule is deliberately NOT tried.
  if ((match = step4.exec(result))) {
    if (gt1.test(match[1])) {
      result = match[1]
    }
  } else if ((match = sfxIon.exec(result)) && gt1.test(match[1])) {
    result = match[1]
  }

  // Step 5.
  if (
    (match = sfxE.exec(result)) &&
    (gt1.test(match[1]) ||
      (eq1.test(match[1]) && !consonantLike.test(match[1])))
  ) {
    result = match[1]
  }

  if (sfxLl.test(result) && gt1.test(result)) {
    result = result.slice(0, -1)
  }

  // Turn initial `Y` back to `y`.
  if (firstCharacterWasLowerCaseY) {
    result = 'y' + result.slice(1)
  }

  return result
}

// adapted from these two sources
// https://gist.github.com/sebleier/554280
// https://meta.wikimedia.org/wiki/Stop_word_list/google_stop_word_list
const stopWords = new Set<string>([
  'a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an',
  'and', 'any', 'are', 'aren', 'as', 'at', 'be', 'because', 'been', 'before',
  'being', 'below', 'between', 'both', 'but', 'by', 'can', 'cannot', 'com',
  'could', 'couldn', 'did', 'didn', 'do', 'does', 'doesn', 'doing', 'down',
  'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', 'has',
  'hasn', 'have', 'haven', 'having', 'he', 'her', 'here', 'hers', 'herself',
  'him', 'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is', 'isn', 'it',
  'its', 'itself', 'just', 'let', 'll', 'me', 'more', 'most', 'mustn', 'my',
  'myself', 'no', 'nor', 'not', 'now', 'of', 'off', 'on', 'once', 'only',
  'or', 'other', 'ought', 'our', 'ours', 'ourselves', 'out', 'over', 'own',
  're', 's', 'same', 'shan', 'she', 'should', 'shouldn', 'so', 'some', 'such',
  't', 'than', 'that', 'the', 'their', 'theirs', 'them', 'themselves', 'then',
  'there', 'these', 'they', 'this', 'those', 'through', 'to', 'too', 'under',
  'until', 'up', 've', 'very', 'was', 'wasn', 'we', 'were', 'weren', 'what',
  'when', 'where', 'which', 'while', 'who', 'whom', 'why', 'will', 'with',
  'won', 'would', 'wouldn', 'you', 'your', 'yours', 'yourself', 'yourselves'
])

/**
 * Normalize a single token for indexing or searching.
 *
 * Dots are removed and the token is lower-cased; stop words are then
 * discarded and every other token is stemmed.
 *
 * @param token - Raw token.
 * @returns `null` when the normalized token is a stop word, otherwise the
 *   stemmed token (an empty string if nothing is left after dot removal).
 */
export const customTokenProcessor = (token: string): string | null => {
  // Remove dots and normalize case before processing
  const normalizedToken = token.replace(/\./g, '').toLowerCase()

  return stopWords.has(normalizedToken) ? null : stemmer(normalizedToken)
}

// A dot that sits directly between two ASCII letters. The following letter is
// only looked at (lookahead), not consumed, so it can in turn precede the next
// dot: "U.S.A" collapses to "USA" rather than "US.A".
const dotBetweenLetters = /([A-Za-z])\.(?=[A-Za-z])/g

// A lowercase ASCII letter immediately followed by an uppercase one.
const lowerToUpperBoundary = /([a-z])([A-Z])/g

// This regular expression matches any Unicode space or punctuation character
// Copied from https://github.com/lucaong/minisearch
// which adapted from https://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7BZ%7D%5Cp%7BP%7D&abb=on&c=on&esc=on
const SPACE_OR_PUNCTUATION =
  /[\n\r -#%-*,-/:;?@[-\]_{}\u00A0\u00A1\u00A7\u00AB\u00B6\u00B7\u00BB\u00BF\u037E\u0387\u055A-\u055F\u0589\u058A\u05BE\u05C0\u05C3\u05C6\u05F3\u05F4\u0609\u060A\u060C\u060D\u061B\u061E\u061F\u066A-\u066D\u06D4\u0700-\u070D\u07F7-\u07F9\u0830-\u083E\u085E\u0964\u0965\u0970\u09FD\u0A76\u0AF0\u0C77\u0C84\u0DF4\u0E4F\u0E5A\u0E5B\u0F04-\u0F12\u0F14\u0F3A-\u0F3D\u0F85\u0FD0-\u0FD4\u0FD9\u0FDA\u104A-\u104F\u10FB\u1360-\u1368\u1400\u166E\u1680\u169B\u169C\u16EB-\u16ED\u1735\u1736\u17D4-\u17D6\u17D8-\u17DA\u1800-\u180A\u1944\u1945\u1A1E\u1A1F\u1AA0-\u1AA6\u1AA8-\u1AAD\u1B5A-\u1B60\u1BFC-\u1BFF\u1C3B-\u1C3F\u1C7E\u1C7F\u1CC0-\u1CC7\u1CD3\u2000-\u200A\u2010-\u2029\u202F-\u2043\u2045-\u2051\u2053-\u205F\u207D\u207E\u208D\u208E\u2308-\u230B\u2329\u232A\u2768-\u2775\u27C5\u27C6\u27E6-\u27EF\u2983-\u2998\u29D8-\u29DB\u29FC\u29FD\u2CF9-\u2CFC\u2CFE\u2CFF\u2D70\u2E00-\u2E2E\u2E30-\u2E4F\u3000-\u3003\u3008-\u3011\u3014-\u301F\u3030\u303D\u30A0\u30FB\uA4FE\uA4FF\uA60D-\uA60F\uA673\uA67E\uA6F2-\uA6F7\uA874-\uA877\uA8CE\uA8CF\uA8F8-\uA8FA\uA8FC\uA92E\uA92F\uA95F\uA9C1-\uA9CD\uA9DE\uA9DF\uAA5C-\uAA5F\uAADE\uAADF\uAAF0\uAAF1\uABEB\uFD3E\uFD3F\uFE10-\uFE19\uFE30-\uFE52\uFE54-\uFE61\uFE63\uFE68\uFE6A\uFE6B\uFF01-\uFF03\uFF05-\uFF0A\uFF0C-\uFF0F\uFF1A\uFF1B\uFF1F\uFF20\uFF3B-\uFF3D\uFF3F\uFF5B\uFF5D\uFF5F-\uFF65]+/u

/**
 * Split text into tokens on Unicode whitespace and punctuation.
 *
 * Two adjustments are made on top of the plain split:
 * - dots between letters are removed first, so "V.R" yields the token "VR";
 * - a token containing a lowercase-to-uppercase boundary is emitted in full
 *   AND as its parts, so "xManager" yields "xManager", "x", "Manager".
 *
 * @param text - Text to tokenize.
 * @returns Tokens in order of appearance; never contains empty strings.
 */
export const customTokenize = (text: string): string[] => {
  // Remove dots between letters (like V.R -> VR, U.S.A -> USA)
  const preprocessedText = text.replace(dotBetweenLetters, '$1')

  // Split on any space or punctuation; same as the minisearch default
  // tokenizer, except that empty strings are filtered out.
  const tokens = preprocessedText.split(SPACE_OR_PUNCTUATION).filter(Boolean)

  const expandedTokens: string[] = []

  for (const token of tokens) {
    expandedTokens.push(token)

    // Handle capital letters in the middle: also emit the pieces on either
    // side of each lowercase->uppercase boundary ("xManager" -> "x", "Manager").
    const splitOnCapitals = token.replace(lowerToUpperBoundary, '$1 $2')

    if (splitOnCapitals !== token) {
      expandedTokens.push(...splitOnCapitals.split(' ').filter(Boolean))
    }
  }

  return expandedTokens
}