teidesu-scripts/scripts/infra/navidrome-find-duplicates.ts

import type { NavidromeSong } from '../../utils/navidrome.ts'
import { createRequire } from 'node:module'

import { join } from 'node:path'
import kuromoji from 'kuromoji'
import { isKana, toRomaji } from 'wanakana'

import { fetchSongs, navidromeFfetch as ffetch } from '../../utils/navidrome.ts'

/**
 * Keys confirmed to be different tracks that merely share artist and title.
 * Entries use the same shape `getSongKey` produces: a JSON array of the
 * cleaned artist followed by the cleaned title.
 */
const DISTINCT_TRACK_KEYS: readonly string[] = [
  // actual different tracks with the same title
  '["sorry about my face","untitled track"]',
  '["kooeetekumogeemusu","neko bushou sengoku emaki"]',
  '["eve","merufuakutorii"]',
]

/** Keys the original author marked as "todo" (presumably still to be sorted out — confirm). */
const TODO_KEYS: readonly string[] = [
  // todo
  '["arm","legend of zelda"]',
  '["arm","tomorrow heart beat ~ ashita anata ni dokkidoki☆ ~"]',
  '["dwat","rotladatormarf"]',
  '["fujiwara mari sai","zenbuatashinokawaiino"]',
]

/** Song keys that are skipped entirely while scanning, so they are never reported. */
const WHITELIST_KEYS = new Set<string>([
  ...DISTINCT_TRACK_KEYS,
  ...TODO_KEYS,
])

// Build the kuromoji tokenizer once at module load; `clean` uses it to
// tokenize strings that contain Japanese characters.
const moji = await new Promise<any>((resolve, reject) => {
  // The dictionary directory is addressed relative to whatever path
  // `kuromoji/` resolves to from this module: two levels up, then `dict`.
  const resolvedKuromoji = createRequire(import.meta.url).resolve('kuromoji/')
  const builder = kuromoji.builder({
    dicPath: join(resolvedKuromoji, '../../dict'),
  })

  builder.build((err, tokenizer) => {
    if (err) {
      reject(err)
      return
    }

    resolve(tokenizer)
  })
})

/**
 * Normalizes an artist or title string so that cosmetic differences do not
 * prevent two songs from producing the same key.
 *
 * Steps:
 * 1. lowercase the input;
 * 2. drop the first "(explicit)" marker;
 * 3. drop the listed ASCII punctuation characters;
 * 4. if the result contains characters from the Japanese ranges below,
 *    tokenize it with kuromoji and romanize it token by token;
 * 5. otherwise collapse whitespace runs and trim, so that e.g.
 *    "Song (Explicit)" and "Song" both normalize to "song".
 *
 * @param s raw artist or title string
 * @returns the normalized string used as one component of a song key
 */
function clean(s: string): string {
  const str = s.toLowerCase()
    .replace(/\(Explicit\)/i, '')
    .replace(/[!@#$%^&*()_+=[\]{}\\|/,.;':"<>`~-]/g, '')

  // CJK punctuation, hiragana, katakana, full/half-width forms and kanji ranges
  const hasJapanese = /[\u3000-\u303F\u3040-\u309F\u30A0-\u30FF\uFF00-\uFF9F\u4E00-\u9FAF\u3400-\u4DBF]/.test(str)

  if (!hasJapanese) {
    // Stripping "(explicit)" and punctuation leaves stray whitespace behind
    // ("song (explicit)" -> "song "), which would make otherwise identical
    // names produce different keys. Normalize it away.
    return str.replace(/\s+/g, ' ').trim()
  }

  // The Japanese path is intentionally left exactly as it was: existing
  // whitelist keys depend on its precise output.
  const tokens = moji.tokenize(str)

  let res = ''

  for (const token of tokens) {
    if (token.word_type === 'UNKNOWN') {
      // not in the dictionary: romanize pure-kana tokens, keep the rest verbatim
      res += isKana(token.surface_form) ? toRomaji(token.surface_form) : token.surface_form
    } else if (token.word_type === 'KNOWN') {
      // dictionary word: romanize its reading, followed by a separating space
      res += `${toRomaji(token.reading)} `
    }
  }

  return res.trimEnd()
}

// Passed as the second argument to `fetchSongs` and used as the offset step
// between consecutive requests in the fetch loop below.
const CHUNK_SIZE = 1000

/**
 * Builds the key used to group songs: the JSON encoding of a two-element
 * array holding the cleaned artist followed by the cleaned title.
 */
function getSongKey(song: NavidromeSong) {
  const artistPart = clean(song.artist)
  const titlePart = clean(song.title)

  return JSON.stringify([artistPart, titlePart])
}

// Songs grouped by their key; any group with more than one entry is a
// duplicate candidate.
const seen = new Map<string, NavidromeSong[]>()

// Fetch the library chunk by chunk until an empty chunk comes back.
for (let chunk = 0; ; chunk += 1) {
  const songs = await fetchSongs(chunk * CHUNK_SIZE, CHUNK_SIZE)
  if (songs.length === 0) break

  for (const song of songs) {
    const key = getSongKey(song)
    // whitelisted keys are never grouped, hence never reported
    if (WHITELIST_KEYS.has(key)) continue

    const bucket = seen.get(key)
    if (bucket) {
      bucket.push(song)
    } else {
      seen.set(key, [song])
    }
  }

  console.log('⌛ fetched chunk %d (%d items)', chunk, songs.length)
}

// Report groups in sorted key order.
const keysSorted = [...seen.keys()].sort()

// Number of keys that map to more than one song.
let duplicates = 0
for (const key of keysSorted) {
  const group = seen.get(key)
  // every stored group holds at least one song; a single entry is not a duplicate
  if (group === undefined || group.length === 1) continue

  duplicates += 1
  console.log()
  console.log('found duplicates for %s:', key)
  for (const { artist, title, albumArtist, album, path } of group) {
    console.log('  %s - %s (from %s - %s) (at %s)', artist, title, albumArtist, album, path)
  }
}

if (duplicates > 0) {
  console.log('🚨 %d duplicates found', duplicates)
} else {
  console.log('✅ no duplicates found')
}