aboutsummaryrefslogtreecommitdiff
path: root/apps/docs/memorybench/extend-benchmark.mdx
blob: c66cf4d9572b1107a308c52e7f13f40bdb1b6e53 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
---
title: "Extend Benchmark"
description: "Add a custom benchmark dataset to MemoryBench"
sidebarTitle: "Extend Benchmark"
---

## Benchmark Interface

```typescript
/** Contract that every benchmark dataset implements. */
interface Benchmark {
  /** Name identifying the benchmark. */
  name: string
  /** Asynchronously prepares the benchmark; `config` may be omitted. */
  load(config?: BenchmarkConfig): Promise<void>
  /** Returns the benchmark's questions; `filter` is optional. */
  getQuestions(filter?: QuestionFilter): UnifiedQuestion[]
  /** Returns the haystack sessions for the question with the given id. */
  getHaystackSessions(questionId: string): UnifiedSession[]
  /** Returns the ground truth for the question with the given id, as a string. */
  getGroundTruth(questionId: string): string
  /** Returns the benchmark's question type registry. */
  getQuestionTypes(): QuestionTypeRegistry
}
```

---

## Adding a Custom Benchmark

### 1. Create the Benchmark

```typescript
// src/benchmarks/mybenchmark/index.ts
import type { Benchmark, QuestionFilter, UnifiedQuestion, UnifiedSession } from "../../types"

/**
 * Skeleton for a custom benchmark.
 *
 * Questions and their haystack sessions are held in memory and are populated
 * by `load()`. Replace the two placeholder helpers at the bottom of the class
 * with logic for your own dataset.
 */
export class MyBenchmark implements Benchmark {
  name = "mybenchmark"
  private questions: UnifiedQuestion[] = []
  private sessions: Map<string, UnifiedSession[]> = new Map()

  /** Loads the raw dataset and converts it into questions and sessions. */
  async load(): Promise<void> {
    const data = await this.loadDataset()
    this.processData(data)
  }

  /**
   * Returns a copy of the loaded questions.
   *
   * @param filter - Optional filter; when `filter.limit` is a positive number,
   *   only the first `limit` questions are returned.
   */
  getQuestions(filter?: QuestionFilter): UnifiedQuestion[] {
    let result = [...this.questions]
    // Truthiness check on purpose: a missing limit or a limit of 0 returns everything.
    if (filter?.limit) result = result.slice(0, filter.limit)
    return result
  }

  /** Returns the sessions stored for `questionId`, or an empty array if there are none. */
  getHaystackSessions(questionId: string): UnifiedSession[] {
    return this.sessions.get(questionId) ?? []
  }

  /** Returns the ground truth of the matching question, or "" if the id is unknown. */
  getGroundTruth(questionId: string): string {
    const match = this.questions.find(q => q.questionId === questionId)
    // `||` rather than `??` so an empty or missing ground truth still yields "".
    return match?.groundTruth || ""
  }

  /** Returns the question types of this benchmark, keyed by type id. */
  getQuestionTypes() {
    return {
      "type1": { id: "type1", description: "Type 1 questions" },
      "type2": { id: "type2", description: "Type 2 questions" },
    }
  }

  /** Placeholder: read or download your raw dataset here. */
  private async loadDataset(): Promise<unknown[]> {
    // TODO: replace with your dataset loading logic.
    return []
  }

  /** Placeholder: convert raw records into questions and sessions. */
  private processData(data: unknown[]): void {
    for (const record of data) {
      // TODO: build a UnifiedQuestion from `record`, push it onto
      // `this.questions`, and store its sessions in `this.sessions`
      // under the question's id.
      void record
    }
  }
}
```

### 2. Register the Benchmark

```typescript
// src/benchmarks/index.ts
import { MyBenchmark } from "./mybenchmark"

// Benchmark classes keyed by name.
// (Excerpt: the imports of the other benchmark classes are omitted here.)
export const benchmarks = {
  locomo: LoComoBenchmark,
  longmemeval: LongMemEvalBenchmark,
  convomem: ConvoMemBenchmark,
  mybenchmark: MyBenchmark,  // Add your benchmark class here
}
```