wip: init

2025-08-01 00:32:30 +10:00 · 2025-05-01 20:56:22 +00:00 · 2025-05-01 20:56:22 +00:00 · 2606a39f8d
commit 2606a39f8d
parent 45d9157281
10 changed files with 1670 additions and 55 deletions
--- a/docs/.vitepress/search.ts
+++ b/docs/.vitepress/search.ts
@ -0,0 +1,360 @@
+export const customTokenProcessor = (token: string): string | null => {
+  // Remove dots and normalize case before processing
+  const normalizedToken = token.replace(/\./g, '').toLowerCase()
+
+  const step2list: Record<string, string> = {
+    ational: 'ate',
+    tional: 'tion',
+    enci: 'ence',
+    anci: 'ance',
+    izer: 'ize',
+    bli: 'ble',
+    alli: 'al',
+    entli: 'ent',
+    eli: 'e',
+    ousli: 'ous',
+    ization: 'ize',
+    ation: 'ate',
+    ator: 'ate',
+    alism: 'al',
+    iveness: 'ive',
+    fulness: 'ful',
+    ousness: 'ous',
+    aliti: 'al',
+    iviti: 'ive',
+    biliti: 'ble',
+    logi: 'log'
+  }
+
+  const step3list: Record<string, string> = {
+    icate: 'ic',
+    ative: '',
+    alize: 'al',
+    iciti: 'ic',
+    ical: 'ic',
+    ful: '',
+    ness: ''
+  }
+
+  const consonant = '[^aeiou]'
+  const vowel = '[aeiouy]'
+  const consonants = '(' + consonant + '[^aeiouy]*)'
+  const vowels = '(' + vowel + '[aeiou]*)'
+
+  const gt0 = new RegExp('^' + consonants + '?' + vowels + consonants)
+  const eq1 = new RegExp(
+    '^' + consonants + '?' + vowels + consonants + vowels + '?$'
+  )
+  const gt1 = new RegExp(
+    '^' + consonants + '?(' + vowels + consonants + '){2,}'
+  )
+  const vowelInStem = new RegExp('^' + consonants + '?' + vowel)
+  const consonantLike = new RegExp('^' + consonants + vowel + '[^aeiouwxy]$')
+
+  const sfxLl = /ll$/
+  const sfxE = /^(.+?)e$/
+  const sfxY = /^(.+?)y$/
+  const sfxIon = /^(.+?(s|t))(ion)$/
+  const sfxEdOrIng = /^(.+?)(ed|ing)$/
+  const sfxAtOrBlOrIz = /(at|bl|iz)$/
+  const sfxEED = /^(.+?)eed$/
+  const sfxS = /^.+?[^s]s$/
+  const sfxSsesOrIes = /^.+?(ss|i)es$/
+  const sfxMultiConsonantLike = /([^aeiouylsz])\1$/
+  const step2 =
+    /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/
+  const step3 = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/
+  const step4 =
+    /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/
+
+  function stemmer(value: string) {
+    let result = String(value).toLowerCase()
+
+    // Exit early.
+    if (result.length < 3) {
+      return result
+    }
+
+    /** @type {boolean} */
+    let firstCharacterWasLowerCaseY = false
+
+    // Detect initial `y`, make sure it never matches.
+    if (
+      result.codePointAt(0) === 121 // Lowercase Y
+    ) {
+      firstCharacterWasLowerCaseY = true
+      result = 'Y' + result.slice(1)
+    }
+
+    // Step 1a.
+    if (sfxSsesOrIes.test(result)) {
+      // Remove last two characters.
+      result = result.slice(0, -2)
+    } else if (sfxS.test(result)) {
+      // Remove last character.
+      result = result.slice(0, -1)
+    }
+
+    /** @type {RegExpMatchArray|null} */
+    let match
+
+    // Step 1b.
+    if ((match = sfxEED.exec(result))) {
+      if (gt0.test(match[1])) {
+        // Remove last character.
+        result = result.slice(0, -1)
+      }
+    } else if (
+      (match = sfxEdOrIng.exec(result)) &&
+      vowelInStem.test(match[1])
+    ) {
+      result = match[1]
+
+      if (sfxAtOrBlOrIz.test(result)) {
+        // Append `e`.
+        result += 'e'
+      } else if (sfxMultiConsonantLike.test(result)) {
+        // Remove last character.
+        result = result.slice(0, -1)
+      } else if (consonantLike.test(result)) {
+        // Append `e`.
+        result += 'e'
+      }
+    }
+
+    // Step 1c.
+    if ((match = sfxY.exec(result)) && vowelInStem.test(match[1])) {
+      // Remove suffixing `y` and append `i`.
+      result = match[1] + 'i'
+    }
+
+    // Step 2.
+    if ((match = step2.exec(result)) && gt0.test(match[1])) {
+      result = match[1] + step2list[match[2]]
+    }
+
+    // Step 3.
+    if ((match = step3.exec(result)) && gt0.test(match[1])) {
+      result = match[1] + step3list[match[2]]
+    }
+
+    // Step 4.
+    if ((match = step4.exec(result))) {
+      if (gt1.test(match[1])) {
+        result = match[1]
+      }
+    } else if ((match = sfxIon.exec(result)) && gt1.test(match[1])) {
+      result = match[1]
+    }
+
+    // Step 5.
+    if (
+      (match = sfxE.exec(result)) &&
+      (gt1.test(match[1]) ||
+        (eq1.test(match[1]) && !consonantLike.test(match[1])))
+    ) {
+      result = match[1]
+    }
+
+    if (sfxLl.test(result) && gt1.test(result)) {
+      result = result.slice(0, -1)
+    }
+
+    // Turn initial `Y` back to `y`.
+    if (firstCharacterWasLowerCaseY) {
+      result = 'y' + result.slice(1)
+    }
+
+    return result
+  }
+  // adapted from these two sources
+  // https://gist.github.com/sebleier/554280
+  // https://meta.wikimedia.org/wiki/Stop_word_list/google_stop_word_list
+  const stopWords = new Set([
+    'a',
+    'about',
+    'above',
+    'after',
+    'again',
+    'against',
+    'all',
+    'am',
+    'an',
+    'and',
+    'any',
+    'are',
+    'aren',
+    'as',
+    'at',
+    'be',
+    'because',
+    'been',
+    'before',
+    'being',
+    'below',
+    'between',
+    'both',
+    'but',
+    'by',
+    'can',
+    'cannot',
+    'com',
+    'could',
+    'couldn',
+    'did',
+    'didn',
+    'do',
+    'does',
+    'doesn',
+    'doing',
+    'down',
+    'during',
+    'each',
+    'few',
+    'for',
+    'from',
+    'further',
+    'had',
+    'hadn',
+    'has',
+    'hasn',
+    'have',
+    'haven',
+    'having',
+    'he',
+    'her',
+    'here',
+    'hers',
+    'herself',
+    'him',
+    'himself',
+    'his',
+    'how',
+    'i',
+    'if',
+    'in',
+    'into',
+    'is',
+    'isn',
+    'it',
+    'its',
+    'itself',
+    'just',
+    'let',
+    'll',
+    'me',
+    'more',
+    'most',
+    'mustn',
+    'my',
+    'myself',
+    'no',
+    'nor',
+    'not',
+    'now',
+    'of',
+    'off',
+    'on',
+    'once',
+    'only',
+    'or',
+    'other',
+    'ought',
+    'our',
+    'ours',
+    'ourselves',
+    'out',
+    'over',
+    'own',
+    're',
+    's',
+    'same',
+    'shan',
+    'she',
+    'should',
+    'shouldn',
+    'so',
+    'some',
+    'such',
+    't',
+    'than',
+    'that',
+    'the',
+    'their',
+    'theirs',
+    'them',
+    'themselves',
+    'then',
+    'there',
+    'these',
+    'they',
+    'this',
+    'those',
+    'through',
+    'to',
+    'too',
+    'under',
+    'until',
+    'up',
+    've',
+    'very',
+    'was',
+    'wasn',
+    'we',
+    'were',
+    'weren',
+    'what',
+    'when',
+    'where',
+    'which',
+    'while',
+    'who',
+    'whom',
+    'why',
+    'will',
+    'with',
+    'won',
+    'would',
+    'wouldn',
+    'you',
+    'your',
+    'yours',
+    'yourself',
+    'yourselves'
+  ])
+
+  return stopWords.has(normalizedToken) ? null : stemmer(normalizedToken)
+}
+
+export const customTokenize = (text: string): string[] => {
+  // Pre-process the text to handle dots in special cases
+  // This will help with cases like "V.R" to match with "vr" by removing dots
+  const preprocessedText = text.replace(/([A-Za-z])\.([A-Za-z])/g, '$1$2') // Remove dots between letters (like V.R -> VR)
+
+  // This regular expression matches any Unicode space or punctuation character
+  // Copied from https://github.com/lucaong/minisearch
+  // which adapted from https://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7BZ%7D%5Cp%7BP%7D&abb=on&c=on&esc=on
+  const SPACE_OR_PUNCTUATION =
+    /[\n\r -#%-*,-/:;?@[-\]_{}\u00A0\u00A1\u00A7\u00AB\u00B6\u00B7\u00BB\u00BF\u037E\u0387\u055A-\u055F\u0589\u058A\u05BE\u05C0\u05C3\u05C6\u05F3\u05F4\u0609\u060A\u060C\u060D\u061B\u061E\u061F\u066A-\u066D\u06D4\u0700-\u070D\u07F7-\u07F9\u0830-\u083E\u085E\u0964\u0965\u0970\u09FD\u0A76\u0AF0\u0C77\u0C84\u0DF4\u0E4F\u0E5A\u0E5B\u0F04-\u0F12\u0F14\u0F3A-\u0F3D\u0F85\u0FD0-\u0FD4\u0FD9\u0FDA\u104A-\u104F\u10FB\u1360-\u1368\u1400\u166E\u1680\u169B\u169C\u16EB-\u16ED\u1735\u1736\u17D4-\u17D6\u17D8-\u17DA\u1800-\u180A\u1944\u1945\u1A1E\u1A1F\u1AA0-\u1AA6\u1AA8-\u1AAD\u1B5A-\u1B60\u1BFC-\u1BFF\u1C3B-\u1C3F\u1C7E\u1C7F\u1CC0-\u1CC7\u1CD3\u2000-\u200A\u2010-\u2029\u202F-\u2043\u2045-\u2051\u2053-\u205F\u207D\u207E\u208D\u208E\u2308-\u230B\u2329\u232A\u2768-\u2775\u27C5\u27C6\u27E6-\u27EF\u2983-\u2998\u29D8-\u29DB\u29FC\u29FD\u2CF9-\u2CFC\u2CFE\u2CFF\u2D70\u2E00-\u2E2E\u2E30-\u2E4F\u3000-\u3003\u3008-\u3011\u3014-\u301F\u3030\u303D\u30A0\u30FB\uA4FE\uA4FF\uA60D-\uA60F\uA673\uA67E\uA6F2-\uA6F7\uA874-\uA877\uA8CE\uA8CF\uA8F8-\uA8FA\uA8FC\uA92E\uA92F\uA95F\uA9C1-\uA9CD\uA9DE\uA9DF\uAA5C-\uAA5F\uAADE\uAADF\uAAF0\uAAF1\uABEB\uFD3E\uFD3F\uFE10-\uFE19\uFE30-\uFE52\uFE54-\uFE61\uFE63\uFE68\uFE6A\uFE6B\uFF01-\uFF03\uFF05-\uFF0A\uFF0C-\uFF0F\uFF1A\uFF1B\uFF1F\uFF20\uFF3B-\uFF3D\uFF3F\uFF5B\uFF5D\uFF5F-\uFF65]+/u
+
+  // Split on any space or punctuation; same as minisearch default tokenizer
+  // except i've corrected for the possibility for returning empty string
+  const tokens = preprocessedText.split(SPACE_OR_PUNCTUATION).filter(Boolean)
+
+  // Handle cases with capital letters in the middle (like "xManager" -> "x Manager")
+  const expandedTokens: string[] = []
+
+  for (const token of tokens) {
+    expandedTokens.push(token)
+
+    // If token has a capital letter in the middle, add a version with space before it
+    // This helps with cases like "xManager" to match with "x Manager"
+    const splitOnCapitals = token.replace(/([a-z])([A-Z])/g, '$1 $2')
+    if (splitOnCapitals !== token) {
+      const additionalTokens = splitOnCapitals.split(' ').filter(Boolean)
+      expandedTokens.push(...additionalTokens)
+    }
+  }
+
+  return expandedTokens
+}