diff --git a/backend/open_webui/routers/retrieval.py b/backend/open_webui/routers/retrieval.py index 25e223dccc..5a1e798d41 100644 --- a/backend/open_webui/routers/retrieval.py +++ b/backend/open_webui/routers/retrieval.py @@ -5,6 +5,7 @@ import os import shutil import asyncio +import re import uuid from datetime import datetime from pathlib import Path @@ -1690,50 +1691,34 @@ def process_text( ) -@router.post("/process/youtube") -def process_youtube_video( - request: Request, form_data: ProcessUrlForm, user=Depends(get_verified_user) -): - try: - collection_name = form_data.collection_name - if not collection_name: - collection_name = calculate_sha256_string(form_data.url)[:63] +def is_youtube_url(url: str) -> bool: + youtube_regex = r"^(https?://)?(www\.)?(youtube\.com|youtu\.be)/.+$" + return re.match(youtube_regex, url) is not None - loader = YoutubeLoader( - form_data.url, + +def get_loader(request, url: str): + if is_youtube_url(url): + return YoutubeLoader( + url, language=request.app.state.config.YOUTUBE_LOADER_LANGUAGE, proxy_url=request.app.state.config.YOUTUBE_LOADER_PROXY_URL, ) - - docs = loader.load() - content = " ".join([doc.page_content for doc in docs]) - log.debug(f"text_content: {content}") - - save_docs_to_vector_db( - request, docs, collection_name, overwrite=True, user=user - ) - - return { - "status": True, - "collection_name": collection_name, - "filename": form_data.url, - "file": { - "data": { - "content": content, - }, - "meta": { - "name": form_data.url, - }, - }, - } - except Exception as e: - log.exception(e) - raise HTTPException( - status_code=status.HTTP_400_BAD_REQUEST, - detail=ERROR_MESSAGES.DEFAULT(e), + else: + return get_web_loader( + url, + verify_ssl=request.app.state.config.ENABLE_WEB_LOADER_SSL_VERIFICATION, + requests_per_second=request.app.state.config.WEB_LOADER_CONCURRENT_REQUESTS, ) +def get_content_from_url(request, url: str) -> str: + loader = get_loader(request, url) + docs = loader.load() + content = " ".join([doc.page_content for doc in docs]) + return content, docs + + +@router.post("/process/youtube") @router.post("/process/web") def process_web( request: Request, form_data: ProcessUrlForm, user=Depends(get_verified_user) @@ -1743,14 +1728,7 @@ def process_web( if not collection_name: collection_name = calculate_sha256_string(form_data.url)[:63] - loader = get_web_loader( - form_data.url, - verify_ssl=request.app.state.config.ENABLE_WEB_LOADER_SSL_VERIFICATION, - requests_per_second=request.app.state.config.WEB_LOADER_CONCURRENT_REQUESTS, - ) - docs = loader.load() - content = " ".join([doc.page_content for doc in docs]) - + content, docs = get_content_from_url(request, form_data.url) log.debug(f"text_content: {content}") if not request.app.state.config.BYPASS_WEB_SEARCH_EMBEDDING_AND_RETRIEVAL: diff --git a/backend/open_webui/utils/middleware.py b/backend/open_webui/utils/middleware.py index 1ae340ae7b..7501f803f1 100644 --- a/backend/open_webui/utils/middleware.py +++ b/backend/open_webui/utils/middleware.py @@ -1001,11 +1001,11 @@ async def process_chat_payload(request, form_data, user, metadata, model): log.debug(f"form_data: {form_data}") system_message = get_system_message(form_data.get("messages", [])) - if system_message: + if system_message: # Chat Controls/User Settings try: form_data = apply_system_prompt_to_body( system_message.get("content"), form_data, metadata, user, replace=True - ) + ) # Required to handle system prompt variables except: pass diff --git a/package-lock.json b/package-lock.json index b9aeb596a5..38cf347d3e 100644 --- a/package-lock.json +++ b/package-lock.json @@ -94,6 +94,7 @@ "undici": "^7.3.0", "uuid": "^9.0.1", "vega": "^6.2.0", + "vega-lite": "^6.4.1", "vite-plugin-static-copy": "^2.2.0", "y-prosemirror": "^1.3.7", "yaml": "^2.7.1", @@ -5593,6 +5594,99 @@ "node": ">=8" } }, + "node_modules/cliui": { + "version": "9.0.1", + "resolved": "https://registry.npmjs.org/cliui/-/cliui-9.0.1.tgz", + "integrity": "sha512-k7ndgKhwoQveBL+/1tqGJYNz097I7WOvwbmmU2AR5+magtbjPWQTS1C5vzGkBC8Ym8UWRzfKUzUUqFLypY4Q+w==", + "license": "ISC", + "dependencies": { + "string-width": "^7.2.0", + "strip-ansi": "^7.1.0", + "wrap-ansi": "^9.0.0" + }, + "engines": { + "node": ">=20" + } + }, + "node_modules/cliui/node_modules/ansi-regex": { + "version": "6.2.2", + "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-6.2.2.tgz", + "integrity": "sha512-Bq3SmSpyFHaWjPk8If9yc6svM8c56dB5BAtW4Qbw5jHTwwXXcTLoRMkpDJp6VL0XzlWaCHTXrkFURMYmD0sLqg==", + "license": "MIT", + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/chalk/ansi-regex?sponsor=1" + } + }, + "node_modules/cliui/node_modules/ansi-styles": { + "version": "6.2.3", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-6.2.3.tgz", + "integrity": "sha512-4Dj6M28JB+oAH8kFkTLUo+a2jwOFkuqb3yucU0CANcRRUbxS0cP0nZYCGjcc3BNXwRIsUVmDGgzawme7zvJHvg==", + "license": "MIT", + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/chalk/ansi-styles?sponsor=1" + } + }, + "node_modules/cliui/node_modules/emoji-regex": { + "version": "10.5.0", + "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-10.5.0.tgz", + "integrity": "sha512-lb49vf1Xzfx080OKA0o6l8DQQpV+6Vg95zyCJX9VB/BqKYlhG7N4wgROUUHRA+ZPUefLnteQOad7z1kT2bV7bg==", + "license": "MIT" + }, + "node_modules/cliui/node_modules/string-width": { + "version": "7.2.0", + "resolved": "https://registry.npmjs.org/string-width/-/string-width-7.2.0.tgz", + "integrity": "sha512-tsaTIkKW9b4N+AEj+SVA+WhJzV7/zMhcSu78mLKWSk7cXMOSHsBKFWUs0fWwq8QyK3MgJBQRX6Gbi4kYbdvGkQ==", + "license": "MIT", + "dependencies": { + "emoji-regex": "^10.3.0", + "get-east-asian-width": "^1.0.0", + "strip-ansi": "^7.1.0" + }, + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/cliui/node_modules/strip-ansi": { + "version": "7.1.2", + "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-7.1.2.tgz", + "integrity": "sha512-gmBGslpoQJtgnMAvOVqGZpEz9dyoKTCzy2nfz/n8aIFhN/jCE/rCmcxabB6jOOHV+0WNnylOxaxBQPSvcWklhA==", + "license": "MIT", + "dependencies": { + "ansi-regex": "^6.0.1" + }, + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/chalk/strip-ansi?sponsor=1" + } + }, + "node_modules/cliui/node_modules/wrap-ansi": { + "version": "9.0.2", + "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-9.0.2.tgz", + "integrity": "sha512-42AtmgqjV+X1VpdOfyTGOYRi0/zsoLqtXQckTmqTeybT+BDIbM/Guxo7x3pE2vtpr1ok6xRqM9OpBe+Jyoqyww==", + "license": "MIT", + "dependencies": { + "ansi-styles": "^6.2.1", + "string-width": "^7.0.0", + "strip-ansi": "^7.1.0" + }, + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/chalk/wrap-ansi?sponsor=1" + } + }, "node_modules/clone": { "version": "2.1.2", "resolved": "https://registry.npmjs.org/clone/-/clone-2.1.2.tgz", @@ -7069,6 +7163,15 @@ "@esbuild/win32-x64": "0.25.1" } }, + "node_modules/escalade": { + "version": "3.2.0", + "resolved": "https://registry.npmjs.org/escalade/-/escalade-3.2.0.tgz", + "integrity": "sha512-WUj2qlxaQtO4g6Pq5c29GTcWGDyd8itL8zTlipgECz3JesAiiOKotd8JU6otB3PACgG6xkJUyVhboMS+bje/jA==", + "license": "MIT", + "engines": { + "node": ">=6" + } + }, "node_modules/escape-string-regexp": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-4.0.0.tgz", @@ -7781,6 +7884,27 @@ "resolved": "https://registry.npmjs.org/gc-hook/-/gc-hook-0.3.1.tgz", "integrity": "sha512-E5M+O/h2o7eZzGhzRZGex6hbB3k4NWqO0eA+OzLRLXxhdbYPajZnynPwAtphnh+cRHPwsj5Z80dqZlfI4eK55A==" }, + "node_modules/get-caller-file": { + "version": "2.0.5", + "resolved": "https://registry.npmjs.org/get-caller-file/-/get-caller-file-2.0.5.tgz", + "integrity": "sha512-DyFP3BM/3YHTQOCUL/w0OZHR0lpKeGrxotcHWcqNEdnltqFwXVfhEBQ94eIo34AfQpo0rGki4cyIiftY06h2Fg==", + "license": "ISC", + "engines": { + "node": "6.* || 8.* || >= 10.*" + } + }, + "node_modules/get-east-asian-width": { + "version": "1.4.0", + "resolved": "https://registry.npmjs.org/get-east-asian-width/-/get-east-asian-width-1.4.0.tgz", + "integrity": "sha512-QZjmEOC+IT1uk6Rx0sX22V6uHWVwbdbxf1faPqJ1QhLdGgsRGCZoyaQBm/piRdJy/D2um6hM1UP7ZEeQ4EkP+Q==", + "license": "MIT", + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, "node_modules/get-func-name": { "version": "2.0.2", "resolved": "https://registry.npmjs.org/get-func-name/-/get-func-name-2.0.2.tgz", @@ -8794,6 +8918,12 @@ "integrity": "sha512-Bdboy+l7tA3OGW6FjyFHWkP5LuByj1Tk33Ljyq0axyzdk9//JSi2u3fP1QSmd1KNwq6VOKYGlAu87CisVir6Pw==", "dev": true }, + "node_modules/json-stringify-pretty-compact": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/json-stringify-pretty-compact/-/json-stringify-pretty-compact-4.0.0.tgz", + "integrity": "sha512-3CNZ2DnrpByG9Nqj6Xo8vqbjT4F6N+tb4Gb28ESAZjYZ5yqvmc56J+/kuIwkaAMOyblTQhUW7PxMkUb8Q36N3Q==", + "license": "MIT" + }, "node_modules/json-stringify-safe": { "version": "5.0.1", "resolved": "https://registry.npmjs.org/json-stringify-safe/-/json-stringify-safe-5.0.1.tgz", @@ -13281,6 +13411,35 @@ "vega-util": "^2.1.0" } }, + "node_modules/vega-lite": { + "version": "6.4.1", + "resolved": "https://registry.npmjs.org/vega-lite/-/vega-lite-6.4.1.tgz", + "integrity": "sha512-KO3ybHNouRK4A0al/+2fN9UqgTEfxrd/ntGLY933Hg5UOYotDVQdshR3zn7OfXwQ7uj0W96Vfa5R+QxO8am3IQ==", + "license": "BSD-3-Clause", + "dependencies": { + "json-stringify-pretty-compact": "~4.0.0", + "tslib": "~2.8.1", + "vega-event-selector": "~4.0.0", + "vega-expression": "~6.1.0", + "vega-util": "~2.1.0", + "yargs": "~18.0.0" + }, + "bin": { + "vl2pdf": "bin/vl2pdf", + "vl2png": "bin/vl2png", + "vl2svg": "bin/vl2svg", + "vl2vg": "bin/vl2vg" + }, + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://app.hubspot.com/payments/GyPC972GD9Rt" + }, + "peerDependencies": { + "vega": "^6.0.0" + } + }, "node_modules/vega-loader": { "version": "5.1.0", "resolved": "https://registry.npmjs.org/vega-loader/-/vega-loader-5.1.0.tgz", @@ -14642,6 +14801,15 @@ "yjs": "^13.0.0" } }, + "node_modules/y18n": { + "version": "5.0.8", + "resolved": "https://registry.npmjs.org/y18n/-/y18n-5.0.8.tgz", + "integrity": "sha512-0pfFzegeDWJHJIAmTLRP2DwHjdF5s7jo9tuztdQxAhINCdvS+3nGINqPd00AphqJR/0LhANUS6/+7SCb98YOfA==", + "license": "ISC", + "engines": { + "node": ">=10" + } + }, "node_modules/yallist": { "version": "5.0.0", "resolved": "https://registry.npmjs.org/yallist/-/yallist-5.0.0.tgz", @@ -14663,6 +14831,82 @@ "node": ">= 14" } }, + "node_modules/yargs": { + "version": "18.0.0", + "resolved": "https://registry.npmjs.org/yargs/-/yargs-18.0.0.tgz", + "integrity": "sha512-4UEqdc2RYGHZc7Doyqkrqiln3p9X2DZVxaGbwhn2pi7MrRagKaOcIKe8L3OxYcbhXLgLFUS3zAYuQjKBQgmuNg==", + "license": "MIT", + "dependencies": { + "cliui": "^9.0.1", + "escalade": "^3.1.1", + "get-caller-file": "^2.0.5", + "string-width": "^7.2.0", + "y18n": "^5.0.5", + "yargs-parser": "^22.0.0" + }, + "engines": { + "node": "^20.19.0 || ^22.12.0 || >=23" + } + }, + "node_modules/yargs-parser": { + "version": "22.0.0", + "resolved": "https://registry.npmjs.org/yargs-parser/-/yargs-parser-22.0.0.tgz", + "integrity": "sha512-rwu/ClNdSMpkSrUb+d6BRsSkLUq1fmfsY6TOpYzTwvwkg1/NRG85KBy3kq++A8LKQwX6lsu+aWad+2khvuXrqw==", + "license": "ISC", + "engines": { + "node": "^20.19.0 || ^22.12.0 || >=23" + } + }, + "node_modules/yargs/node_modules/ansi-regex": { + "version": "6.2.2", + "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-6.2.2.tgz", + "integrity": "sha512-Bq3SmSpyFHaWjPk8If9yc6svM8c56dB5BAtW4Qbw5jHTwwXXcTLoRMkpDJp6VL0XzlWaCHTXrkFURMYmD0sLqg==", + "license": "MIT", + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/chalk/ansi-regex?sponsor=1" + } + }, + "node_modules/yargs/node_modules/emoji-regex": { + "version": "10.5.0", + "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-10.5.0.tgz", + "integrity": "sha512-lb49vf1Xzfx080OKA0o6l8DQQpV+6Vg95zyCJX9VB/BqKYlhG7N4wgROUUHRA+ZPUefLnteQOad7z1kT2bV7bg==", + "license": "MIT" + }, + "node_modules/yargs/node_modules/string-width": { + "version": "7.2.0", + "resolved": "https://registry.npmjs.org/string-width/-/string-width-7.2.0.tgz", + "integrity": "sha512-tsaTIkKW9b4N+AEj+SVA+WhJzV7/zMhcSu78mLKWSk7cXMOSHsBKFWUs0fWwq8QyK3MgJBQRX6Gbi4kYbdvGkQ==", + "license": "MIT", + "dependencies": { + "emoji-regex": "^10.3.0", + "get-east-asian-width": "^1.0.0", + "strip-ansi": "^7.1.0" + }, + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/yargs/node_modules/strip-ansi": { + "version": "7.1.2", + "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-7.1.2.tgz", + "integrity": "sha512-gmBGslpoQJtgnMAvOVqGZpEz9dyoKTCzy2nfz/n8aIFhN/jCE/rCmcxabB6jOOHV+0WNnylOxaxBQPSvcWklhA==", + "license": "MIT", + "dependencies": { + "ansi-regex": "^6.0.1" + }, + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/chalk/strip-ansi?sponsor=1" + } + }, "node_modules/yauzl": { "version": "2.10.0", "resolved": "https://registry.npmjs.org/yauzl/-/yauzl-2.10.0.tgz",