From 0a8c5fa0497754731cf8ff97eb0f503aaabfc97a Mon Sep 17 00:00:00 2001 From: Prasanna721 <106952318+Prasanna721@users.noreply.github.com> Date: Wed, 21 Jan 2026 03:58:26 +0000 Subject: Re - feat(pipecat-sdk): add speech-to-speech model support (Gemini Live) (#683) #### RE-RAISING Pipecat live speech PR ### Added native speech-to-speech model support ### Summary: - Speech-to-speech support - Auto-detect audio frames and inject memories to system prompt for native audio models (Gemini Live, etc.) - Fix memory bloating - Replace memories each turn using XML tags instead of accumulating - Add temporal context - Show recency on search results ([2d ago], [15 Jan]) - New inject_mode param - auto (default), system, or user ### Docs update - Update the docs for native speech-2-speech models --- apps/docs/integrations/pipecat.mdx | 66 ++++++++++++++++++++++++++++++-------- 1 file changed, 53 insertions(+), 13 deletions(-) (limited to 'apps/docs/integrations') diff --git a/apps/docs/integrations/pipecat.mdx b/apps/docs/integrations/pipecat.mdx index c96f27e1..a5cae1e9 100644 --- a/apps/docs/integrations/pipecat.mdx +++ b/apps/docs/integrations/pipecat.mdx @@ -28,7 +28,8 @@ You can obtain an API key from [console.supermemory.ai](https://console.supermem Supermemory integration is provided through the `SupermemoryPipecatService` class in Pipecat: ```python -from supermemory_pipecat import SupermemoryPipecatService, InputParams +from supermemory_pipecat import SupermemoryPipecatService +from supermemory_pipecat.service import InputParams memory = SupermemoryPipecatService( api_key=os.getenv("SUPERMEMORY_API_KEY"), @@ -78,11 +79,11 @@ Retrieved memories are formatted and injected into the LLM context before genera ## Memory Modes -| Mode | Static Profile | Dynamic Profile | Search Results | Use Case | -|------|----------------|-----------------|----------------|----------| -| `"profile"` | Yes | Yes | No | Personalization without search | -| `"query"` | No | No | Yes | Finding 
relevant past context | -| `"full"` | Yes | Yes | Yes | Complete memory (default) | +| Mode | Static Profile | Dynamic Profile | Search Results | Use Case | +| ----------- | -------------- | --------------- | -------------- | ------------------------------ | +| `"profile"` | Yes | Yes | No | Personalization without search | +| `"query"` | No | No | Yes | Finding relevant past context | +| `"full"` | Yes | Yes | Yes | Complete memory (default) | ## Configuration Options @@ -96,15 +97,41 @@ InputParams( search_limit=10, # Max memories to retrieve (default: 10) search_threshold=0.1, # Similarity threshold 0.0-1.0 (default: 0.1) system_prompt="Based on previous conversations:\n\n", + inject_mode="auto", # "auto" | "system" | "user" ) ``` -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `search_limit` | int | 10 | Maximum number of memories to retrieve per query | -| `search_threshold` | float | 0.1 | Minimum similarity threshold for memory retrieval | -| `mode` | str | "full" | Memory retrieval mode: `"profile"`, `"query"`, or `"full"` | -| `system_prompt` | str | "Based on previous conversations:\n\n" | Prefix text for memory context | +| Parameter | Type | Default | Description | +| ------------------ | ----- | -------------------------------------- | ------------------------------------------------------------ | +| `search_limit` | int | 10 | Maximum number of memories to retrieve per query | +| `search_threshold` | float | 0.1 | Minimum similarity threshold for memory retrieval | +| `mode` | str | "full" | Memory retrieval mode: `"profile"`, `"query"`, or `"full"` | +| `system_prompt` | str | "Based on previous conversations:\n\n" | Prefix text for memory context | +| `inject_mode` | str | "auto" | How memories are injected: `"auto"`, `"system"`, or `"user"` | + +## Injection Modes + +The `inject_mode` parameter controls how memories are added to the LLM context: + +| Mode | Behavior | +| ---------- | 
----------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `"auto"` | **Auto-detects** based on frame types. If audio frames are detected → injects into the system prompt (speech-to-speech). If only text frames are detected → injects as a user message (STT/TTS). | +| `"system"` | Always injects memories into the system prompt | +| `"user"` | Always injects memories as a user message | + +## Speech-to-Speech Models (Gemini Live, etc.) + +For speech-to-speech models like Gemini Live, the SDK **automatically detects** audio frames and injects memories into the system prompt. No configuration needed: + +```python +from supermemory_pipecat import SupermemoryPipecatService + +# Auto-detection works out of the box +memory = SupermemoryPipecatService( + api_key=os.getenv("SUPERMEMORY_API_KEY"), + user_id="unique_user_id", +) +``` ## Example: Voice Agent with Memory @@ -130,7 +157,8 @@ from pipecat.transports.websocket.fastapi import ( FastAPIWebsocketTransport, ) -from supermemory_pipecat import SupermemoryPipecatService, InputParams +from supermemory_pipecat import SupermemoryPipecatService +from supermemory_pipecat.service import InputParams app = FastAPI() @@ -201,3 +229,15 @@ if __name__ == "__main__": import uvicorn uvicorn.run(app, host="0.0.0.0", port=8000) ``` + +## Example: Gemini Live with Memory + +For a complete example using Gemini Live speech-to-speech with Supermemory, check out the reference implementation: + + + Full working example with Gemini Live, including frontend and backend code. + -- cgit v1.2.3