mirror of
https://github.com/fmhy/edit.git
synced 2025-08-01 00:32:30 +10:00
wip: init
This commit is contained in:
parent
45d9157281
commit
2606a39f8d
10 changed files with 1670 additions and 55 deletions
360
docs/.vitepress/search.ts
Normal file
360
docs/.vitepress/search.ts
Normal file
|
@ -0,0 +1,360 @@
|
|||
// Suffix rewrites applied in step 2 of the stemmer (longer derivational
// suffixes collapsed to a shorter canonical form).
const step2list: Record<string, string> = {
  ational: 'ate',
  tional: 'tion',
  enci: 'ence',
  anci: 'ance',
  izer: 'ize',
  bli: 'ble',
  alli: 'al',
  entli: 'ent',
  eli: 'e',
  ousli: 'ous',
  ization: 'ize',
  ation: 'ate',
  ator: 'ate',
  alism: 'al',
  iveness: 'ive',
  fulness: 'ful',
  ousness: 'ous',
  aliti: 'al',
  iviti: 'ive',
  biliti: 'ble',
  logi: 'log'
}

// Suffix rewrites applied in step 3 of the stemmer.
const step3list: Record<string, string> = {
  icate: 'ic',
  ative: '',
  alize: 'al',
  iciti: 'ic',
  ical: 'ic',
  ful: '',
  ness: ''
}

// Building blocks for the consonant/vowel "measure" expressions below.
const consonant = '[^aeiou]'
const vowel = '[aeiouy]'
const consonants = '(' + consonant + '[^aeiouy]*)'
const vowels = '(' + vowel + '[aeiou]*)'

// Stem contains at least one vowel-consonant sequence.
const gt0 = new RegExp('^' + consonants + '?' + vowels + consonants)
// Stem contains exactly one vowel-consonant sequence.
const eq1 = new RegExp(
  '^' + consonants + '?' + vowels + consonants + vowels + '?$'
)
// Stem contains two or more vowel-consonant sequences.
const gt1 = new RegExp('^' + consonants + '?(' + vowels + consonants + '){2,}')
const vowelInStem = new RegExp('^' + consonants + '?' + vowel)
const consonantLike = new RegExp('^' + consonants + vowel + '[^aeiouwxy]$')

// Suffix matchers used by the individual stemming steps.
const sfxLl = /ll$/
const sfxE = /^(.+?)e$/
const sfxY = /^(.+?)y$/
const sfxIon = /^(.+?(s|t))(ion)$/
const sfxEdOrIng = /^(.+?)(ed|ing)$/
const sfxAtOrBlOrIz = /(at|bl|iz)$/
const sfxEED = /^(.+?)eed$/
const sfxS = /^.+?[^s]s$/
const sfxSsesOrIes = /^.+?(ss|i)es$/
const sfxMultiConsonantLike = /([^aeiouylsz])\1$/
const step2 =
  /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/
const step3 = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/
const step4 =
  /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/

/**
 * Reduce an English word to its stem with a Porter-style suffix-stripping
 * algorithm (steps 1a through 5).
 *
 * @param value - Word to stem; it is lower-cased before processing.
 * @returns The stemmed word. Words shorter than three characters are returned
 *   lower-cased but otherwise untouched.
 */
function stemmer(value: string): string {
  let result = value.toLowerCase()

  // Exit early: very short words carry no strippable suffix.
  if (result.length < 3) {
    return result
  }

  // Detect an initial `y` and temporarily upper-case it so that it never
  // matches the (lower-case) vowel classes.
  const firstCharacterWasLowerCaseY = result.charAt(0) === 'y'
  if (firstCharacterWasLowerCaseY) {
    result = 'Y' + result.slice(1)
  }

  // Step 1a: plurals.
  if (sfxSsesOrIes.test(result)) {
    // `sses` -> `ss`, `ies` -> `i`: remove last two characters.
    result = result.slice(0, -2)
  } else if (sfxS.test(result)) {
    // Plain trailing `s` (not `ss`): remove last character.
    result = result.slice(0, -1)
  }

  let match: RegExpExecArray | null

  // Step 1b: `eed`, `ed`, `ing`.
  if ((match = sfxEED.exec(result))) {
    if (gt0.test(match[1])) {
      // `eed` -> `ee`: remove last character.
      result = result.slice(0, -1)
    }
  } else if (
    (match = sfxEdOrIng.exec(result)) &&
    vowelInStem.test(match[1])
  ) {
    result = match[1]

    if (sfxAtOrBlOrIz.test(result)) {
      // `at` / `bl` / `iz` -> append `e`.
      result += 'e'
    } else if (sfxMultiConsonantLike.test(result)) {
      // Doubled consonant: remove last character.
      result = result.slice(0, -1)
    } else if (consonantLike.test(result)) {
      // Short consonant-vowel-consonant stem: append `e`.
      result += 'e'
    }
  }

  // Step 1c: trailing `y` -> `i` when the stem contains a vowel.
  if ((match = sfxY.exec(result)) && vowelInStem.test(match[1])) {
    result = match[1] + 'i'
  }

  // Step 2.
  if ((match = step2.exec(result)) && gt0.test(match[1])) {
    result = match[1] + step2list[match[2]]
  }

  // Step 3.
  if ((match = step3.exec(result)) && gt0.test(match[1])) {
    result = match[1] + step3list[match[2]]
  }

  // Step 4: the `ion` rule is only tried when no step-4 suffix matched.
  if ((match = step4.exec(result))) {
    if (gt1.test(match[1])) {
      result = match[1]
    }
  } else if ((match = sfxIon.exec(result)) && gt1.test(match[1])) {
    result = match[1]
  }

  // Step 5: drop a trailing `e`, then collapse a trailing `ll`.
  if (
    (match = sfxE.exec(result)) &&
    (gt1.test(match[1]) ||
      (eq1.test(match[1]) && !consonantLike.test(match[1])))
  ) {
    result = match[1]
  }

  if (sfxLl.test(result) && gt1.test(result)) {
    result = result.slice(0, -1)
  }

  // Turn initial `Y` back to `y`.
  if (firstCharacterWasLowerCaseY) {
    result = 'y' + result.slice(1)
  }

  return result
}

// adapted from these two sources
// https://gist.github.com/sebleier/554280
// https://meta.wikimedia.org/wiki/Stop_word_list/google_stop_word_list
const stopWords = new Set([
  'a',
  'about',
  'above',
  'after',
  'again',
  'against',
  'all',
  'am',
  'an',
  'and',
  'any',
  'are',
  'aren',
  'as',
  'at',
  'be',
  'because',
  'been',
  'before',
  'being',
  'below',
  'between',
  'both',
  'but',
  'by',
  'can',
  'cannot',
  'com',
  'could',
  'couldn',
  'did',
  'didn',
  'do',
  'does',
  'doesn',
  'doing',
  'down',
  'during',
  'each',
  'few',
  'for',
  'from',
  'further',
  'had',
  'hadn',
  'has',
  'hasn',
  'have',
  'haven',
  'having',
  'he',
  'her',
  'here',
  'hers',
  'herself',
  'him',
  'himself',
  'his',
  'how',
  'i',
  'if',
  'in',
  'into',
  'is',
  'isn',
  'it',
  'its',
  'itself',
  'just',
  'let',
  'll',
  'me',
  'more',
  'most',
  'mustn',
  'my',
  'myself',
  'no',
  'nor',
  'not',
  'now',
  'of',
  'off',
  'on',
  'once',
  'only',
  'or',
  'other',
  'ought',
  'our',
  'ours',
  'ourselves',
  'out',
  'over',
  'own',
  're',
  's',
  'same',
  'shan',
  'she',
  'should',
  'shouldn',
  'so',
  'some',
  'such',
  't',
  'than',
  'that',
  'the',
  'their',
  'theirs',
  'them',
  'themselves',
  'then',
  'there',
  'these',
  'they',
  'this',
  'those',
  'through',
  'to',
  'too',
  'under',
  'until',
  'up',
  've',
  'very',
  'was',
  'wasn',
  'we',
  'were',
  'weren',
  'what',
  'when',
  'where',
  'which',
  'while',
  'who',
  'whom',
  'why',
  'will',
  'with',
  'won',
  'would',
  'wouldn',
  'you',
  'your',
  'yours',
  'yourself',
  'yourselves'
])

/**
 * Normalize a single search token: strip dots, lower-case it, drop stop
 * words, and stem whatever remains.
 *
 * The lookup tables, regular expressions and stop-word set live at module
 * scope so they are built once rather than on every token.
 *
 * @param token - Raw token as produced by the tokenizer.
 * @returns The stemmed term, or `null` when the token is a stop word or is
 *   empty after normalization (e.g. it consisted only of dots).
 */
export const customTokenProcessor = (token: string): string | null => {
  // Remove dots and normalize case before processing
  const normalizedToken = token.replace(/\./g, '').toLowerCase()

  // An empty term carries no information; discard it like a stop word.
  if (normalizedToken === '' || stopWords.has(normalizedToken)) {
    return null
  }

  return stemmer(normalizedToken)
}
|
||||
|
||||
// This regular expression matches any Unicode space or punctuation character
// Copied from https://github.com/lucaong/minisearch
// which adapted from https://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7BZ%7D%5Cp%7BP%7D&abb=on&c=on&esc=on
const SPACE_OR_PUNCTUATION =
  /[\n\r -#%-*,-/:;?@[-\]_{}\u00A0\u00A1\u00A7\u00AB\u00B6\u00B7\u00BB\u00BF\u037E\u0387\u055A-\u055F\u0589\u058A\u05BE\u05C0\u05C3\u05C6\u05F3\u05F4\u0609\u060A\u060C\u060D\u061B\u061E\u061F\u066A-\u066D\u06D4\u0700-\u070D\u07F7-\u07F9\u0830-\u083E\u085E\u0964\u0965\u0970\u09FD\u0A76\u0AF0\u0C77\u0C84\u0DF4\u0E4F\u0E5A\u0E5B\u0F04-\u0F12\u0F14\u0F3A-\u0F3D\u0F85\u0FD0-\u0FD4\u0FD9\u0FDA\u104A-\u104F\u10FB\u1360-\u1368\u1400\u166E\u1680\u169B\u169C\u16EB-\u16ED\u1735\u1736\u17D4-\u17D6\u17D8-\u17DA\u1800-\u180A\u1944\u1945\u1A1E\u1A1F\u1AA0-\u1AA6\u1AA8-\u1AAD\u1B5A-\u1B60\u1BFC-\u1BFF\u1C3B-\u1C3F\u1C7E\u1C7F\u1CC0-\u1CC7\u1CD3\u2000-\u200A\u2010-\u2029\u202F-\u2043\u2045-\u2051\u2053-\u205F\u207D\u207E\u208D\u208E\u2308-\u230B\u2329\u232A\u2768-\u2775\u27C5\u27C6\u27E6-\u27EF\u2983-\u2998\u29D8-\u29DB\u29FC\u29FD\u2CF9-\u2CFC\u2CFE\u2CFF\u2D70\u2E00-\u2E2E\u2E30-\u2E4F\u3000-\u3003\u3008-\u3011\u3014-\u301F\u3030\u303D\u30A0\u30FB\uA4FE\uA4FF\uA60D-\uA60F\uA673\uA67E\uA6F2-\uA6F7\uA874-\uA877\uA8CE\uA8CF\uA8F8-\uA8FA\uA8FC\uA92E\uA92F\uA95F\uA9C1-\uA9CD\uA9DE\uA9DF\uAA5C-\uAA5F\uAADE\uAADF\uAAF0\uAAF1\uABEB\uFD3E\uFD3F\uFE10-\uFE19\uFE30-\uFE52\uFE54-\uFE61\uFE63\uFE68\uFE6A\uFE6B\uFF01-\uFF03\uFF05-\uFF0A\uFF0C-\uFF0F\uFF1A\uFF1B\uFF1F\uFF20\uFF3B-\uFF3D\uFF3F\uFF5B\uFF5D\uFF5F-\uFF65]+/u

/**
 * Split text into search tokens.
 *
 * Dots between letters are removed first so dotted abbreviations collapse
 * into one word ("V.R" -> "VR", "U.S.A" -> "USA"). The text is then split on
 * Unicode whitespace/punctuation. Tokens containing a lower-case letter
 * followed by an upper-case letter are emitted both whole and split at that
 * boundary ("xManager" -> "xManager", "x", "Manager").
 *
 * @param text - Raw text to tokenize.
 * @returns The tokens in document order; never contains empty strings.
 */
export const customTokenize = (text: string): string[] => {
  // Remove dots between letters (like V.R -> VR). The letter after the dot is
  // matched with a lookahead so it is not consumed and can begin the next
  // match; otherwise chained abbreviations such as "U.S.A" would only lose
  // every other dot.
  const preprocessedText = text.replace(/([A-Za-z])\.(?=[A-Za-z])/g, '$1')

  // Split on any space or punctuation; same as the minisearch default
  // tokenizer, except that empty strings are filtered out.
  const tokens = preprocessedText.split(SPACE_OR_PUNCTUATION).filter(Boolean)

  const expandedTokens: string[] = []

  for (const token of tokens) {
    expandedTokens.push(token)

    // If the token has a capital letter after a lower-case one, also add the
    // pieces on either side of that boundary. This helps "xManager" match a
    // query for "x Manager".
    const splitOnCapitals = token.replace(/([a-z])([A-Z])/g, '$1 $2')
    if (splitOnCapitals !== token) {
      expandedTokens.push(...splitOnCapitals.split(' ').filter(Boolean))
    }
  }

  return expandedTokens
}
|
Loading…
Add table
Add a link
Reference in a new issue