diff --git a/backend/open_webui/config.py b/backend/open_webui/config.py
index bfd01eed78..048c05de29 100644
--- a/backend/open_webui/config.py
+++ b/backend/open_webui/config.py
@@ -2906,6 +2906,12 @@ AUDIO_STT_MODEL = PersistentConfig(
     os.getenv("AUDIO_STT_MODEL", ""),
 )
 
+AUDIO_STT_SUPPORTED_CONTENT_TYPES = PersistentConfig(
+    "AUDIO_STT_SUPPORTED_CONTENT_TYPES",
+    "audio.stt.supported_content_types",
+    os.getenv("AUDIO_STT_SUPPORTED_CONTENT_TYPES", "").split(","),
+)
+
 AUDIO_STT_AZURE_API_KEY = PersistentConfig(
     "AUDIO_STT_AZURE_API_KEY",
     "audio.stt.azure.api_key",
diff --git a/backend/open_webui/main.py b/backend/open_webui/main.py
index 8fa2cf42fa..edc7431a85 100644
--- a/backend/open_webui/main.py
+++ b/backend/open_webui/main.py
@@ -159,6 +159,7 @@ from open_webui.config import (
     # Audio
     AUDIO_STT_ENGINE,
     AUDIO_STT_MODEL,
+    AUDIO_STT_SUPPORTED_CONTENT_TYPES,
     AUDIO_STT_OPENAI_API_BASE_URL,
     AUDIO_STT_OPENAI_API_KEY,
     AUDIO_STT_AZURE_API_KEY,
@@ -959,10 +960,12 @@ app.state.config.IMAGE_STEPS = IMAGE_STEPS
 #
 ########################################
 
-app.state.config.STT_OPENAI_API_BASE_URL = AUDIO_STT_OPENAI_API_BASE_URL
-app.state.config.STT_OPENAI_API_KEY = AUDIO_STT_OPENAI_API_KEY
 app.state.config.STT_ENGINE = AUDIO_STT_ENGINE
 app.state.config.STT_MODEL = AUDIO_STT_MODEL
+app.state.config.STT_SUPPORTED_CONTENT_TYPES = AUDIO_STT_SUPPORTED_CONTENT_TYPES
+
+app.state.config.STT_OPENAI_API_BASE_URL = AUDIO_STT_OPENAI_API_BASE_URL
+app.state.config.STT_OPENAI_API_KEY = AUDIO_STT_OPENAI_API_KEY
 
 app.state.config.WHISPER_MODEL = WHISPER_MODEL
 app.state.config.WHISPER_VAD_FILTER = WHISPER_VAD_FILTER
diff --git a/backend/open_webui/routers/audio.py b/backend/open_webui/routers/audio.py
index ebf5e7a667..8821087dd9 100644
--- a/backend/open_webui/routers/audio.py
+++ b/backend/open_webui/routers/audio.py
@@ -10,7 +10,7 @@ from pydub.silence import split_on_silence
 from concurrent.futures import ThreadPoolExecutor
 from typing import Optional
 
-
+from fnmatch import fnmatch
 import aiohttp
 import aiofiles
 import requests
@@ -168,6 +168,7 @@ class STTConfigForm(BaseModel):
     OPENAI_API_KEY: str
     ENGINE: str
     MODEL: str
+    SUPPORTED_CONTENT_TYPES: list[str] = []
     WHISPER_MODEL: str
     DEEPGRAM_API_KEY: str
     AZURE_API_KEY: str
@@ -202,6 +203,7 @@ async def get_audio_config(request: Request, user=Depends(get_admin_user)):
             "OPENAI_API_KEY": request.app.state.config.STT_OPENAI_API_KEY,
             "ENGINE": request.app.state.config.STT_ENGINE,
             "MODEL": request.app.state.config.STT_MODEL,
+            "SUPPORTED_CONTENT_TYPES": request.app.state.config.STT_SUPPORTED_CONTENT_TYPES,
             "WHISPER_MODEL": request.app.state.config.WHISPER_MODEL,
             "DEEPGRAM_API_KEY": request.app.state.config.DEEPGRAM_API_KEY,
             "AZURE_API_KEY": request.app.state.config.AUDIO_STT_AZURE_API_KEY,
@@ -236,6 +238,10 @@ async def update_audio_config(
     request.app.state.config.STT_OPENAI_API_KEY = form_data.stt.OPENAI_API_KEY
     request.app.state.config.STT_ENGINE = form_data.stt.ENGINE
     request.app.state.config.STT_MODEL = form_data.stt.MODEL
+    request.app.state.config.STT_SUPPORTED_CONTENT_TYPES = (
+        form_data.stt.SUPPORTED_CONTENT_TYPES
+    )
+
     request.app.state.config.WHISPER_MODEL = form_data.stt.WHISPER_MODEL
     request.app.state.config.DEEPGRAM_API_KEY = form_data.stt.DEEPGRAM_API_KEY
     request.app.state.config.AUDIO_STT_AZURE_API_KEY = form_data.stt.AZURE_API_KEY
@@ -269,6 +275,7 @@ async def update_audio_config(
             "OPENAI_API_KEY": request.app.state.config.STT_OPENAI_API_KEY,
             "ENGINE": request.app.state.config.STT_ENGINE,
             "MODEL": request.app.state.config.STT_MODEL,
+            "SUPPORTED_CONTENT_TYPES": request.app.state.config.STT_SUPPORTED_CONTENT_TYPES,
             "WHISPER_MODEL": request.app.state.config.WHISPER_MODEL,
             "DEEPGRAM_API_KEY": request.app.state.config.DEEPGRAM_API_KEY,
             "AZURE_API_KEY": request.app.state.config.AUDIO_STT_AZURE_API_KEY,
@@ -910,10 +917,14 @@ def transcription(
 ):
     log.info(f"file.content_type: {file.content_type}")
 
-    SUPPORTED_CONTENT_TYPES = {"video/webm"}  # Extend if you add more video types!
-    if not (
-        file.content_type.startswith("audio/")
-        or file.content_type in SUPPORTED_CONTENT_TYPES
+    supported_content_types = request.app.state.config.STT_SUPPORTED_CONTENT_TYPES or [
+        "audio/*",
+        "video/webm",
+    ]
+
+    if not any(
+        fnmatch(file.content_type, content_type)
+        for content_type in supported_content_types
     ):
         raise HTTPException(
             status_code=status.HTTP_400_BAD_REQUEST,
diff --git a/backend/open_webui/routers/files.py b/backend/open_webui/routers/files.py
index ba6758671e..b9bb15c7b4 100644
--- a/backend/open_webui/routers/files.py
+++ b/backend/open_webui/routers/files.py
@@ -155,9 +155,18 @@ def upload_file(
     if process:
         try:
             if file.content_type:
-                if file.content_type.startswith("audio/") or file.content_type in {
-                    "video/webm"
-                }:
+                stt_supported_content_types = (
+                    request.app.state.config.STT_SUPPORTED_CONTENT_TYPES
+                    or [
+                        "audio/*",
+                        "video/webm",
+                    ]
+                )
+
+                if any(
+                    fnmatch(file.content_type, content_type)
+                    for content_type in stt_supported_content_types
+                ):
                     file_path = Storage.get_file(file_path)
                     result = transcribe(request, file_path, file_metadata)
 
diff --git a/src/lib/components/admin/Settings/Audio.svelte b/src/lib/components/admin/Settings/Audio.svelte
index 960f3497ac..e3ddb1fd23 100644
--- a/src/lib/components/admin/Settings/Audio.svelte
+++ b/src/lib/components/admin/Settings/Audio.svelte
@@ -39,6 +39,7 @@
 	let STT_OPENAI_API_KEY = '';
 	let STT_ENGINE = '';
 	let STT_MODEL = '';
+	let STT_SUPPORTED_CONTENT_TYPES = '';
 	let STT_WHISPER_MODEL = '';
 	let STT_AZURE_API_KEY = '';
 	let STT_AZURE_REGION = '';
@@ -114,6 +115,7 @@
 			OPENAI_API_KEY: STT_OPENAI_API_KEY,
 			ENGINE: STT_ENGINE,
 			MODEL: STT_MODEL,
+			SUPPORTED_CONTENT_TYPES: STT_SUPPORTED_CONTENT_TYPES.split(','),
 			WHISPER_MODEL: STT_WHISPER_MODEL,
 			DEEPGRAM_API_KEY: STT_DEEPGRAM_API_KEY,
 			AZURE_API_KEY: STT_AZURE_API_KEY,
@@ -160,6 +162,7 @@
 		STT_ENGINE = res.stt.ENGINE;
 		STT_MODEL = res.stt.MODEL;
+		STT_SUPPORTED_CONTENT_TYPES = (res?.stt?.SUPPORTED_CONTENT_TYPES ?? []).join(',');
 		STT_WHISPER_MODEL = res.stt.WHISPER_MODEL;
 		STT_AZURE_API_KEY = res.stt.AZURE_API_KEY;
 		STT_AZURE_REGION = res.stt.AZURE_REGION;
 
[The remaining Audio.svelte hunks (@@ -184,9 +187,11 @@, @@ -220,7 +238,7 @@, @@ -416,12 +434,12 @@, and the small hunks between them) change only template markup, and the markup itself did not survive extraction; only the diff structure and the user-facing strings are recoverable. They show the 'STT Settings' heading renamed to 'Speech-to-Text', 'TTS Settings' renamed to 'Text-to-Speech', and the engine-specific controls re-nested under new wrapper markup: the 'Speech-to-Text Engine' selector; the 'STT Model' inputs for the {#if STT_ENGINE === 'openai'} and local {:else if STT_ENGINE === ''} branches; the Azure STT fields ('Azure Region', 'Language Locales', 'Endpoint URL', 'Max Speakers'); the 'Text-to-Speech Engine' selector and its 'Endpoint URL' field; and the per-engine TTS blocks for '', 'transformers', 'openai', 'elevenlabs', and 'azure' — the 'TTS Voice' and 'TTS Model' selects with their {#each voices} / {#each models} loops, the SpeechT5/CMU Arctic speaker-embeddings note with its 'click here' links in the 'transformers' branch, and the Azure 'Output format' field with its 'Available list' link. The hunks end just before the 'Response splitting' setting.]
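
Reviewer note on the matching semantics: fnmatch treats each configured entry as a shell-style glob, so the default "audio/*" covers every audio MIME type while "video/webm" matches only itself. A minimal standalone sketch of the check both routers now share (the is_supported helper and DEFAULT_TYPES name are illustrative, not part of the diff):

from fnmatch import fnmatch

# Fallback patterns used by routers/audio.py and routers/files.py when
# no content types are configured.
DEFAULT_TYPES = ["audio/*", "video/webm"]


def is_supported(content_type: str, patterns: list[str] | None = None) -> bool:
    """True if the upload's MIME type matches any configured glob."""
    return any(fnmatch(content_type, p) for p in (patterns or DEFAULT_TYPES))


assert is_supported("audio/mpeg")  # matched by "audio/*"
assert is_supported("video/webm")  # exact match
assert not is_supported("video/mp4")  # no default pattern covers it
assert is_supported("video/mp4", ["audio/*", "video/*"])  # admin override

In practice this means an admin can, for example, allow mp4 transcription by setting AUDIO_STT_SUPPORTED_CONTENT_TYPES=audio/*,video/webm,video/mp4 (or editing the same list in the admin Audio settings) without a code change.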
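
One edge case worth flagging before merge: str.split never returns an empty list, so on a fresh install with the environment variable unset the config default becomes [""], which is truthy, and the `or ["audio/*", "video/webm"]` fallback in both routers never fires; since fnmatch(x, "") matches nothing, the transcription endpoint would then reject every file and files.py would stop routing audio uploads to STT. The Svelte form has the same round trip (''.split(',') is ['']). A sketch of the failure plus one possible parse that avoids it (a suggestion, not what this diff currently does):

import os
from fnmatch import fnmatch

# The failure mode: an unset/empty env var still yields a truthy list.
assert "".split(",") == [""]
assert ([""] or ["audio/*", "video/webm"]) == [""]  # fallback never fires
assert not fnmatch("audio/mpeg", "")  # the empty pattern matches nothing

# One possible fix (hypothetical, not in this diff): drop empty entries
# at parse time so the unset case stays falsy and the fallback works.
raw = os.getenv("AUDIO_STT_SUPPORTED_CONTENT_TYPES", "")
patterns = [part.strip() for part in raw.split(",") if part.strip()]
assert (patterns or ["audio/*", "video/webm"]) == ["audio/*", "video/webm"]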