test: add LLM retrieval accuracy tests

Johann Schopplich
2025-10-27 11:48:33 +01:00
parent eb8f7e28e1
commit 3c840259fe
25 changed files with 21404 additions and 723 deletions

View File

@@ -0,0 +1,140 @@
/**
* TOON LLM Accuracy Benchmark
*
* Main entry point that orchestrates the full benchmark:
* 1. Generate questions from datasets
* 2. Format data in all formats (JSON, TOON, YAML, Markdown-kv)
* 3. Evaluate each question with each format using LLMs
* 4. Generate reports
*/
import type { EvaluationResult, Question } from '../src/types'
import * as fsp from 'node:fs/promises'
import * as path from 'node:path'
import { consola } from 'consola'
import pMap from 'p-map'
import { BENCHMARKS_DIR, DEFAULT_CONCURRENCY, DRY_RUN, DRY_RUN_LIMITS, ROOT_DIR } from '../src/constants'
import { datasets } from '../src/datasets'
import { evaluateQuestion, models } from '../src/evaluate'
import { formatters } from '../src/formatters'
import { generateQuestions } from '../src/questions'
import { calculateFormatResults, calculateTokenCounts, saveResults } from '../src/report'
consola.start('LLM Accuracy Benchmark for TOON')
// Check if results already exist
const resultsDir = path.join(BENCHMARKS_DIR, 'results', 'accuracy')
const rawResultsPath = path.join(resultsDir, 'raw-results.json')
const summaryPath = path.join(resultsDir, 'summary.json')
let existingResults: EvaluationResult[] | undefined
let existingTokenCounts: Record<string, number> | undefined
try {
const [rawData, summaryData] = await Promise.all([
fsp.readFile(rawResultsPath, 'utf-8'),
fsp.readFile(summaryPath, 'utf-8'),
])
existingResults = JSON.parse(rawData)
const summary = JSON.parse(summaryData)
existingTokenCounts = summary.tokenCounts
consola.info('Found existing results, regenerating report only')
}
catch {
// Results don't exist, will run full evaluation
}
if (DRY_RUN) {
consola.info('Limiting questions and models for dry run')
}
let questions = generateQuestions()
// Apply dry run limits if enabled
if (DRY_RUN && DRY_RUN_LIMITS.maxQuestions) {
questions = questions.slice(0, DRY_RUN_LIMITS.maxQuestions)
}
// Filter models for dry run
const activeModels = DRY_RUN && DRY_RUN_LIMITS.allowedModels.length > 0
? Object.fromEntries(
Object.entries(models).filter(([name]) => DRY_RUN_LIMITS.allowedModels.includes(name)),
)
: models
let results: EvaluationResult[]
let tokenCounts: Record<string, number>
if (existingResults && existingTokenCounts) {
// Reuse existing results
results = existingResults
tokenCounts = existingTokenCounts
}
else {
// Run full evaluation
consola.info(`Evaluating ${questions.length} questions`)
consola.info(`Testing ${Object.keys(formatters).length} formats`)
consola.info(`Using ${Object.keys(activeModels).length} models: ${Object.keys(activeModels).join(', ')}`)
// Calculate token counts for all format+dataset combinations
tokenCounts = calculateTokenCounts(formatters)
// Format datasets once (reuse for all questions)
const formattedDatasets: Record<string, Record<string, string>> = {}
for (const [formatName, formatter] of Object.entries(formatters)) {
formattedDatasets[formatName] = {}
for (const dataset of datasets) {
const formatted = formatter(dataset.data)
formattedDatasets[formatName]![dataset.name] = formatted
}
}
// Generate evaluation tasks
const tasks: { question: Question, formatName: string, modelName: string }[] = []
for (const question of questions) {
for (const [formatName] of Object.entries(formatters)) {
for (const [modelName] of Object.entries(activeModels)) {
tasks.push({ question, formatName, modelName })
}
}
}
const total = tasks.length
consola.start(`Running ${total} evaluations with concurrency: ${DEFAULT_CONCURRENCY}`)
// Evaluate all tasks in parallel
results = await pMap(
tasks,
async (task, index) => {
const formattedData = formattedDatasets[task.formatName]![task.question.dataset]!
const model = activeModels[task.modelName as keyof typeof activeModels]
const result = await evaluateQuestion(
task.question,
task.formatName,
formattedData,
model,
task.modelName,
)
// Progress update
if ((index + 1) % 10 === 0) {
const percent = (((index + 1) / total) * 100).toFixed(1)
console.log(`⏳ Progress: ${index + 1}/${total} (${percent}%)`)
}
return result
},
{ concurrency: DEFAULT_CONCURRENCY },
)
consola.success('Evaluation complete!')
}
// Generate/regenerate markdown report
const formatResults = calculateFormatResults(results, tokenCounts)
await saveResults(results, formatResults, questions, tokenCounts)
consola.info(`Results saved to: \`${path.relative(ROOT_DIR, resultsDir)}\``)
consola.success(existingResults ? 'Markdown report regenerated!' : 'Benchmark complete!')

View File

@@ -0,0 +1,78 @@
import * as fsp from 'node:fs/promises'
import * as path from 'node:path'
import process from 'node:process'
import { consola } from 'consola'
import { ofetch } from 'ofetch'
import { BENCHMARKS_DIR } from '../src/constants'
try {
// Fetch top 100 repos from GitHub
const repoList = await searchTop100Repos()
const repos = await fetchRepoDetails(repoList)
if (repos.length === 0) {
consola.error('❌ No repositories fetched. Exiting.')
process.exit(1)
}
// Sort by stars descending
repos.sort((a, b) => b.stars - a.stars)
await saveRepos(repos)
consola.success('Done!')
}
catch (error) {
consola.error(error)
process.exit(1)
}
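// Query the GitHub Search API for the 100 most-starred repositories and return their `owner/name` slugs.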
async function searchTop100Repos(): Promise<string[]> {
consola.start('Fetching top 100 starred repositories from GitHub API…')
const response = await ofetch<{ items: { full_name: string }[] }>(
'https://api.github.com/search/repositories',
{
query: {
q: 'stars:>1',
sort: 'stars',
order: 'desc',
per_page: 100,
},
headers: {
'Accept': 'application/vnd.github+json',
'X-GitHub-Api-Version': '2022-11-28',
},
},
)
return response.items.map(item => item.full_name)
}
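// Resolve each repository slug to its full metadata via the ungh.cc API, one request at a time.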
async function fetchRepoDetails(repoList: string[]): Promise<Record<string, any>[]> {
consola.start(`Fetching ${repoList.length} GitHub repositories…`)
const repos: Record<string, any>[] = []
for (let i = 0; i < repoList.length; i++) {
const repoPath = repoList[i]!
console.log(`[${i + 1}/${repoList.length}] Fetching ${repoPath}`)
const { repo } = await ofetch(`https://ungh.cc/repos/${repoPath}`)
repos.push(repo)
}
consola.success(`Successfully fetched ${repos.length}/${repoList.length} repositories`)
return repos
}
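// Persist the repository list as pretty-printed JSON under `data/` in the benchmarks directory.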
async function saveRepos(repos: Record<string, any>[]): Promise<void> {
const outputDir = path.join(BENCHMARKS_DIR, 'data')
const outputFile = path.join(outputDir, 'github-repos.json')
await fsp.mkdir(outputDir, { recursive: true })
await fsp.writeFile(outputFile, JSON.stringify(repos, undefined, 2))
const relativePath = path.relative(BENCHMARKS_DIR, outputFile)
consola.info(`Saved to \`${relativePath}\``)
}

View File

@@ -0,0 +1,228 @@
import * as fsp from 'node:fs/promises'
import * as path from 'node:path'
import { faker } from '@faker-js/faker'
import { consola } from 'consola'
import { encode as encodeTokens } from 'gpt-tokenizer' // o200k_base encoding (default)
import { encode } from '../../src/index'
import githubRepos from '../data/github-repos.json' with { type: 'json' }
import { BENCHMARKS_DIR, ROOT_DIR } from '../src/constants'
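// Token comparison for a single example: JSON vs. TOON token counts and the resulting savings.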
interface BenchmarkResult {
name: string
emoji: string
description: string
data: any
jsonTokens: number
toonTokens: number
savings: number
savingsPercent: string
showDetailed: boolean
}
const outputFilePath = path.join(BENCHMARKS_DIR, 'results', 'token-efficiency.md')
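// Example datasets to benchmark; `showDetailed` marks which ones get a full JSON vs. TOON listing in the report.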
const BENCHMARK_EXAMPLES = [
{
name: 'GitHub Repositories',
emoji: '⭐',
description: 'Top 100 GitHub repositories with stars, forks, and metadata',
getData: () => ({ repositories: githubRepos }),
showDetailed: true,
},
{
name: 'Analytics Time Series',
emoji: '📈',
description: '180 days of web metrics (views, clicks, conversions, revenue)',
getData: () => generateAnalytics(180),
showDetailed: true,
},
{
name: 'API Response',
emoji: '👥',
description: '50 user records with metadata and timestamps',
getData: () => generateUsers(50),
showDetailed: false,
},
{
name: 'E-commerce Order',
emoji: '🛒',
description: 'Nested order with customer and items',
getData: generateOrder,
showDetailed: false,
},
] as const
// Calculate total savings
let totalJsonTokens = 0
let totalToonTokens = 0
const results: BenchmarkResult[] = []
for (const example of BENCHMARK_EXAMPLES) {
const data = await example.getData()
const jsonString = JSON.stringify(data, undefined, 2)
const toonString = encode(data)
const jsonTokens = encodeTokens(jsonString).length
const toonTokens = encodeTokens(toonString).length
const savings = jsonTokens - toonTokens
const savingsPercent = ((savings / jsonTokens) * 100).toFixed(1)
totalJsonTokens += jsonTokens
totalToonTokens += toonTokens
results.push({
name: example.name,
emoji: example.emoji,
description: example.description,
data,
jsonTokens,
toonTokens,
savings,
savingsPercent,
showDetailed: example.showDetailed,
})
}
const totalSavings = totalJsonTokens - totalToonTokens
const totalSavingsPercent = ((totalSavings / totalJsonTokens) * 100).toFixed(1)
// Generate ASCII bar chart visualization
const barChartSection = results
.map((result) => {
const percentage = Number.parseFloat(result.savingsPercent)
const bar = generateBarChart(100 - percentage) // Invert to show TOON tokens
const jsonStr = result.jsonTokens.toLocaleString('en-US')
const toonStr = result.toonTokens.toLocaleString('en-US')
return `${result.emoji} ${result.name.padEnd(25)} ${bar} ${toonStr.padStart(6)} tokens (JSON: ${jsonStr.padStart(6)}) 💰 ${result.savingsPercent}% saved`
})
.join('\n')
// Generate detailed examples (only for selected examples)
const detailedExamples = results
.filter(result => result.showDetailed)
.map((result, i, filtered) => {
// Truncate large datasets for display
let displayData = result.data
if (result.name === 'GitHub Repositories') {
displayData = {
repositories: result.data.repositories.slice(0, 3).map((repo: any) => ({
...repo,
description: repo.description && repo.description.length > 80 ? `${repo.description.slice(0, 80)}...` : repo.description,
})),
}
}
else if (result.name === 'Analytics Time Series') {
displayData = { metrics: result.data.metrics.slice(0, 5) }
}
const separator = i < filtered.length - 1 ? '\n\n---' : ''
return `#### ${result.emoji} ${result.name}
**Configuration:** ${result.description}
**Savings:** ${result.savings.toLocaleString('en-US')} tokens (${result.savingsPercent}% reduction)
**JSON** (${result.jsonTokens.toLocaleString('en-US')} tokens):
\`\`\`json
${JSON.stringify(displayData, undefined, 2)}
\`\`\`
**TOON** (${result.toonTokens.toLocaleString('en-US')} tokens):
\`\`\`
${encode(displayData)}
\`\`\`${separator}`
})
.join('\n\n')
const markdown = `### Token Efficiency
\`\`\`
${barChartSection}
\`\`\`
**Total:** ${totalToonTokens.toLocaleString('en-US')} tokens (TOON) vs ${totalJsonTokens.toLocaleString('en-US')} tokens (JSON) → ${totalSavingsPercent}% savings
<details>
<summary><strong>View detailed examples</strong></summary>
${detailedExamples}
</details>
`.trimStart()
console.log(markdown)
await fsp.mkdir(path.join(BENCHMARKS_DIR, 'results'), { recursive: true })
await fsp.writeFile(outputFilePath, markdown, 'utf-8')
consola.success(`Benchmark written to \`${path.relative(ROOT_DIR, outputFilePath)}\``)
// Generate ASCII bar chart
function generateBarChart(percentage: number, maxWidth: number = 25): string {
const filled = Math.round((percentage / 100) * maxWidth)
const empty = maxWidth - filled
return '█'.repeat(filled) + '░'.repeat(empty)
}
// Generate analytics time series data
function generateAnalytics(days: number) {
return {
metrics: Array.from({ length: days }, (_, i) => {
const date = new Date(2025, 0, 1)
date.setDate(date.getDate() + i)
return {
date: date.toISOString().split('T')[0],
views: Math.floor(Math.random() * 5000) + 1000,
clicks: Math.floor(Math.random() * 500) + 50,
conversions: Math.floor(Math.random() * 100) + 10,
revenue: Number((Math.random() * 1000 + 100).toFixed(2)),
}
}),
}
}
// Generate user API response
function generateUsers(count: number) {
return {
users: Array.from({ length: count }, (_, i) => ({
id: i + 1,
name: faker.person.fullName(),
email: faker.internet.email(),
role: faker.helpers.arrayElement(['admin', 'user', 'moderator']),
active: faker.datatype.boolean(),
createdAt: faker.date.past({ years: 2 }).toISOString(),
lastLogin: faker.date.recent({ days: 30 }).toISOString(),
})),
total: count,
page: 1,
}
}
// Generate nested e-commerce order
function generateOrder() {
return {
orderId: faker.string.alphanumeric({ length: 12, casing: 'upper' }),
customer: {
id: faker.number.int({ min: 1000, max: 9999 }),
name: faker.person.fullName(),
email: faker.internet.email(),
phone: faker.phone.number(),
},
items: Array.from({ length: faker.number.int({ min: 2, max: 5 }) }, () => ({
sku: faker.string.alphanumeric({ length: 8, casing: 'upper' }),
name: faker.commerce.productName(),
quantity: faker.number.int({ min: 1, max: 5 }),
price: Number(faker.commerce.price({ min: 10, max: 200 })),
})),
subtotal: Number(faker.commerce.price({ min: 100, max: 500 })),
tax: Number(faker.commerce.price({ min: 10, max: 50 })),
total: Number(faker.commerce.price({ min: 110, max: 550 })),
status: faker.helpers.arrayElement(['pending', 'processing', 'shipped', 'delivered']),
createdAt: faker.date.recent({ days: 7 }).toISOString(),
}
}