diff --git a/CHANGELOG.md b/CHANGELOG.md index 95a795d515..86ca8d6ed3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,28 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.6.12] - 2025-05-29 + +### Added + +- 🧩 **Custom Advanced Model Parameters**: You can now add your own tailor-made advanced parameters to any model, empowering you to fine-tune behavior and unlock greater flexibility beyond just the built-in options—accelerate your experimentation. +- 🪧 **Datalab Marker API Content Extraction Support**: Seamlessly extract content from files and documents using the Datalab Marker API directly in your workflows, enabling more robust structured data extraction for RAG and document processing with just a simple engine switch in the UI. +- ⚡ **Parallelized Base Model Fetching**: Experience noticeably faster startup and model refresh times—base model data now loads in parallel, drastically shortening delays in busy or large-scale deployments. +- 🧠 **Efficient Function Loading and Caching**: Functions are now only reloaded if their content changes, preventing unnecessary duplicate loads, saving bandwidth, and boosting performance. +- 🌍 **Localization & Translation Enhancements**: Improved and expanded Simplified, Traditional Chinese, and Russian translations, providing smoother, more accurate, and context-aware experiences for global users. + +### Fixed + +- 💬 **Stable Message Input Box**: Fixed an issue where the message input box would shift unexpectedly (especially on mobile or with screen reader support), ensuring a smooth and reliable typing experience for every user. 
+- 🔊 **Reliable Read Aloud (Text-to-Speech)**: Read aloud now works seamlessly across messages, so users depending on TTS for accessibility or multitasking will experience uninterrupted and clear voice playback. +- 🖼 **Image Preview and Download Restored**: Fixed problems with image preview and downloads, ensuring frictionless creation, previewing, and downloading of images in your chats—no more interruptions in creative or documentation workflows. +- 📱 **Improved Mobile Styling for Workspace Capabilities**: Capabilities management is now readable and easy-to-use even on mobile devices, empowering admins and users to manage access quickly on the go. +- 🔁 **/api/v1/retrieval/query/collection Endpoint Reliability**: Queries to retrieval collections now return the expected results, bolstering the reliability of your knowledge workflows and citation-ready responses. + +### Removed + +- 🧹 **Duplicate CSS Elements**: Streamlined the UI by removing redundant CSS, reducing clutter and improving load times for a smoother visual experience. + ## [0.6.11] - 2025-05-27 ### Added diff --git a/README.md b/README.md index 8445b5a392..7d58c768dc 100644 --- a/README.md +++ b/README.md @@ -73,7 +73,17 @@ Want to learn more about Open WebUI's features? Check out our [Open WebUI docume - Does your interface have a backend yet?
Try n8n + N8N • Does your interface have a backend yet?
Try n8n + + + + + + n8n + + + + Warp • The intelligent terminal for developers diff --git a/backend/open_webui/config.py b/backend/open_webui/config.py index 441c99efbf..950a379cde 100644 --- a/backend/open_webui/config.py +++ b/backend/open_webui/config.py @@ -1848,6 +1848,61 @@ CONTENT_EXTRACTION_ENGINE = PersistentConfig( os.environ.get("CONTENT_EXTRACTION_ENGINE", "").lower(), ) +DATALAB_MARKER_API_KEY = PersistentConfig( + "DATALAB_MARKER_API_KEY", + "rag.datalab_marker_api_key", + os.environ.get("DATALAB_MARKER_API_KEY", ""), +) + +DATALAB_MARKER_LANGS = PersistentConfig( + "DATALAB_MARKER_LANGS", + "rag.datalab_marker_langs", + os.environ.get("DATALAB_MARKER_LANGS", ""), +) + +DATALAB_MARKER_USE_LLM = PersistentConfig( + "DATALAB_MARKER_USE_LLM", + "rag.DATALAB_MARKER_USE_LLM", + os.environ.get("DATALAB_MARKER_USE_LLM", "false").lower() == "true", +) + +DATALAB_MARKER_SKIP_CACHE = PersistentConfig( + "DATALAB_MARKER_SKIP_CACHE", + "rag.datalab_marker_skip_cache", + os.environ.get("DATALAB_MARKER_SKIP_CACHE", "false").lower() == "true", +) + +DATALAB_MARKER_FORCE_OCR = PersistentConfig( + "DATALAB_MARKER_FORCE_OCR", + "rag.datalab_marker_force_ocr", + os.environ.get("DATALAB_MARKER_FORCE_OCR", "false").lower() == "true", +) + +DATALAB_MARKER_PAGINATE = PersistentConfig( + "DATALAB_MARKER_PAGINATE", + "rag.datalab_marker_paginate", + os.environ.get("DATALAB_MARKER_PAGINATE", "false").lower() == "true", +) + +DATALAB_MARKER_STRIP_EXISTING_OCR = PersistentConfig( + "DATALAB_MARKER_STRIP_EXISTING_OCR", + "rag.datalab_marker_strip_existing_ocr", + os.environ.get("DATALAB_MARKER_STRIP_EXISTING_OCR", "false").lower() == "true", +) + +DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION = PersistentConfig( + "DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION", + "rag.datalab_marker_disable_image_extraction", + os.environ.get("DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION", "false").lower() + == "true", +) + +DATALAB_MARKER_OUTPUT_FORMAT = PersistentConfig( + "DATALAB_MARKER_OUTPUT_FORMAT", + 
"rag.datalab_marker_output_format", + os.environ.get("DATALAB_MARKER_OUTPUT_FORMAT", "markdown"), +) + EXTERNAL_DOCUMENT_LOADER_URL = PersistentConfig( "EXTERNAL_DOCUMENT_LOADER_URL", "rag.external_document_loader_url", diff --git a/backend/open_webui/functions.py b/backend/open_webui/functions.py index aa7dbccf95..6d8203839a 100644 --- a/backend/open_webui/functions.py +++ b/backend/open_webui/functions.py @@ -28,7 +28,10 @@ from open_webui.socket.main import ( from open_webui.models.functions import Functions from open_webui.models.models import Models -from open_webui.utils.plugin import load_function_module_by_id +from open_webui.utils.plugin import ( + load_function_module_by_id, + get_function_module_from_cache, +) from open_webui.utils.tools import get_tools from open_webui.utils.access_control import has_access @@ -53,9 +56,7 @@ log.setLevel(SRC_LOG_LEVELS["MAIN"]) def get_function_module_by_id(request: Request, pipe_id: str): - # Check if function is already loaded - function_module, _, _ = load_function_module_by_id(pipe_id) - request.app.state.FUNCTIONS[pipe_id] = function_module + function_module, _, _ = get_function_module_from_cache(request, pipe_id) if hasattr(function_module, "valves") and hasattr(function_module, "Valves"): valves = Functions.get_function_valves_by_id(pipe_id) diff --git a/backend/open_webui/main.py b/backend/open_webui/main.py index 999993e84b..b57ed59f29 100644 --- a/backend/open_webui/main.py +++ b/backend/open_webui/main.py @@ -212,6 +212,15 @@ from open_webui.config import ( CHUNK_OVERLAP, CHUNK_SIZE, CONTENT_EXTRACTION_ENGINE, + DATALAB_MARKER_API_KEY, + DATALAB_MARKER_LANGS, + DATALAB_MARKER_SKIP_CACHE, + DATALAB_MARKER_FORCE_OCR, + DATALAB_MARKER_PAGINATE, + DATALAB_MARKER_STRIP_EXISTING_OCR, + DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION, + DATALAB_MARKER_OUTPUT_FORMAT, + DATALAB_MARKER_USE_LLM, EXTERNAL_DOCUMENT_LOADER_URL, EXTERNAL_DOCUMENT_LOADER_API_KEY, TIKA_SERVER_URL, @@ -637,8 +646,12 @@ 
app.state.WEBUI_AUTH_SIGNOUT_REDIRECT_URL = WEBUI_AUTH_SIGNOUT_REDIRECT_URL app.state.EXTERNAL_PWA_MANIFEST_URL = EXTERNAL_PWA_MANIFEST_URL app.state.USER_COUNT = None + app.state.TOOLS = {} +app.state.TOOL_CONTENTS = {} + app.state.FUNCTIONS = {} +app.state.FUNCTION_CONTENTS = {} ######################################## # @@ -662,6 +675,17 @@ app.state.config.ENABLE_RAG_HYBRID_SEARCH = ENABLE_RAG_HYBRID_SEARCH app.state.config.ENABLE_WEB_LOADER_SSL_VERIFICATION = ENABLE_WEB_LOADER_SSL_VERIFICATION app.state.config.CONTENT_EXTRACTION_ENGINE = CONTENT_EXTRACTION_ENGINE +app.state.config.DATALAB_MARKER_API_KEY = DATALAB_MARKER_API_KEY +app.state.config.DATALAB_MARKER_LANGS = DATALAB_MARKER_LANGS +app.state.config.DATALAB_MARKER_SKIP_CACHE = DATALAB_MARKER_SKIP_CACHE +app.state.config.DATALAB_MARKER_FORCE_OCR = DATALAB_MARKER_FORCE_OCR +app.state.config.DATALAB_MARKER_PAGINATE = DATALAB_MARKER_PAGINATE +app.state.config.DATALAB_MARKER_STRIP_EXISTING_OCR = DATALAB_MARKER_STRIP_EXISTING_OCR +app.state.config.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION = ( + DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION +) +app.state.config.DATALAB_MARKER_USE_LLM = DATALAB_MARKER_USE_LLM +app.state.config.DATALAB_MARKER_OUTPUT_FORMAT = DATALAB_MARKER_OUTPUT_FORMAT app.state.config.EXTERNAL_DOCUMENT_LOADER_URL = EXTERNAL_DOCUMENT_LOADER_URL app.state.config.EXTERNAL_DOCUMENT_LOADER_API_KEY = EXTERNAL_DOCUMENT_LOADER_API_KEY app.state.config.TIKA_SERVER_URL = TIKA_SERVER_URL diff --git a/backend/open_webui/retrieval/loaders/datalab_marker.py b/backend/open_webui/retrieval/loaders/datalab_marker.py new file mode 100644 index 0000000000..104c2830df --- /dev/null +++ b/backend/open_webui/retrieval/loaders/datalab_marker.py @@ -0,0 +1,251 @@ +import os +import time +import requests +import logging +import json +from typing import List, Optional +from langchain_core.documents import Document +from fastapi import HTTPException, status + +log = logging.getLogger(__name__) + + +class DatalabMarkerLoader: + 
def __init__( + self, + file_path: str, + api_key: str, + langs: Optional[str] = None, + use_llm: bool = False, + skip_cache: bool = False, + force_ocr: bool = False, + paginate: bool = False, + strip_existing_ocr: bool = False, + disable_image_extraction: bool = False, + output_format: str = None, + ): + self.file_path = file_path + self.api_key = api_key + self.langs = langs + self.use_llm = use_llm + self.skip_cache = skip_cache + self.force_ocr = force_ocr + self.paginate = paginate + self.strip_existing_ocr = strip_existing_ocr + self.disable_image_extraction = disable_image_extraction + self.output_format = output_format + + def _get_mime_type(self, filename: str) -> str: + ext = filename.rsplit(".", 1)[-1].lower() + mime_map = { + "pdf": "application/pdf", + "xls": "application/vnd.ms-excel", + "xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + "ods": "application/vnd.oasis.opendocument.spreadsheet", + "doc": "application/msword", + "docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "odt": "application/vnd.oasis.opendocument.text", + "ppt": "application/vnd.ms-powerpoint", + "pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation", + "odp": "application/vnd.oasis.opendocument.presentation", + "html": "text/html", + "epub": "application/epub+zip", + "png": "image/png", + "jpeg": "image/jpeg", + "jpg": "image/jpeg", + "webp": "image/webp", + "gif": "image/gif", + "tiff": "image/tiff", + } + return mime_map.get(ext, "application/octet-stream") + + def check_marker_request_status(self, request_id: str) -> dict: + url = f"https://www.datalab.to/api/v1/marker/{request_id}" + headers = {"X-Api-Key": self.api_key} + try: + response = requests.get(url, headers=headers) + response.raise_for_status() + result = response.json() + log.info(f"Marker API status check for request {request_id}: {result}") + return result + except requests.HTTPError as e: + log.error(f"Error checking 
Marker request status: {e}") + raise HTTPException( + status.HTTP_502_BAD_GATEWAY, + detail=f"Failed to check Marker request: {e}", + ) + except ValueError as e: + log.error(f"Invalid JSON checking Marker request: {e}") + raise HTTPException( + status.HTTP_502_BAD_GATEWAY, detail=f"Invalid JSON: {e}" + ) + + def load(self) -> List[Document]: + url = "https://www.datalab.to/api/v1/marker" + filename = os.path.basename(self.file_path) + mime_type = self._get_mime_type(filename) + headers = {"X-Api-Key": self.api_key} + + form_data = { + "langs": self.langs, + "use_llm": str(self.use_llm).lower(), + "skip_cache": str(self.skip_cache).lower(), + "force_ocr": str(self.force_ocr).lower(), + "paginate": str(self.paginate).lower(), + "strip_existing_ocr": str(self.strip_existing_ocr).lower(), + "disable_image_extraction": str(self.disable_image_extraction).lower(), + "output_format": self.output_format, + } + + log.info( + f"Datalab Marker POST request parameters: {{'filename': '{filename}', 'mime_type': '{mime_type}', **{form_data}}}" + ) + + try: + with open(self.file_path, "rb") as f: + files = {"file": (filename, f, mime_type)} + response = requests.post( + url, data=form_data, files=files, headers=headers + ) + response.raise_for_status() + result = response.json() + except FileNotFoundError: + raise HTTPException( + status.HTTP_404_NOT_FOUND, detail=f"File not found: {self.file_path}" + ) + except requests.HTTPError as e: + raise HTTPException( + status.HTTP_400_BAD_REQUEST, + detail=f"Datalab Marker request failed: {e}", + ) + except ValueError as e: + raise HTTPException( + status.HTTP_502_BAD_GATEWAY, detail=f"Invalid JSON response: {e}" + ) + except Exception as e: + raise HTTPException(status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e)) + + if not result.get("success"): + raise HTTPException( + status.HTTP_400_BAD_REQUEST, + detail=f"Datalab Marker request failed: {result.get('error', 'Unknown error')}", + ) + + check_url = result.get("request_check_url") + 
request_id = result.get("request_id") + if not check_url: + raise HTTPException( + status.HTTP_502_BAD_GATEWAY, detail="No request_check_url returned." + ) + + for _ in range(300): # Up to 10 minutes + time.sleep(2) + try: + poll_response = requests.get(check_url, headers=headers) + poll_response.raise_for_status() + poll_result = poll_response.json() + except (requests.HTTPError, ValueError) as e: + raw_body = poll_response.text + log.error(f"Polling error: {e}, response body: {raw_body}") + raise HTTPException( + status.HTTP_502_BAD_GATEWAY, detail=f"Polling failed: {e}" + ) + + status_val = poll_result.get("status") + success_val = poll_result.get("success") + + if status_val == "complete": + summary = { + k: poll_result.get(k) + for k in ( + "status", + "output_format", + "success", + "error", + "page_count", + "total_cost", + ) + } + log.info( + f"Marker processing completed successfully: {json.dumps(summary, indent=2)}" + ) + break + + if status_val == "failed" or success_val is False: + log.error( + f"Marker poll failed full response: {json.dumps(poll_result, indent=2)}" + ) + error_msg = ( + poll_result.get("error") + or "Marker returned failure without error message" + ) + raise HTTPException( + status.HTTP_400_BAD_REQUEST, + detail=f"Marker processing failed: {error_msg}", + ) + else: + raise HTTPException( + status.HTTP_504_GATEWAY_TIMEOUT, detail="Marker processing timed out" + ) + + if not poll_result.get("success", False): + error_msg = poll_result.get("error") or "Unknown processing error" + raise HTTPException( + status.HTTP_400_BAD_REQUEST, + detail=f"Final processing failed: {error_msg}", + ) + + content_key = self.output_format.lower() + raw_content = poll_result.get(content_key) + + if content_key == "json": + full_text = json.dumps(raw_content, indent=2) + elif content_key in {"markdown", "html"}: + full_text = str(raw_content).strip() + else: + raise HTTPException( + status.HTTP_400_BAD_REQUEST, + detail=f"Unsupported output format: 
{self.output_format}", + ) + + if not full_text: + raise HTTPException( + status.HTTP_400_BAD_REQUEST, + detail="Datalab Marker returned empty content", + ) + + marker_output_dir = os.path.join("/app/backend/data/uploads", "marker_output") + os.makedirs(marker_output_dir, exist_ok=True) + + file_ext_map = {"markdown": "md", "json": "json", "html": "html"} + file_ext = file_ext_map.get(content_key, "txt") + output_filename = f"{os.path.splitext(filename)[0]}.{file_ext}" + output_path = os.path.join(marker_output_dir, output_filename) + + try: + with open(output_path, "w", encoding="utf-8") as f: + f.write(full_text) + log.info(f"Saved Marker output to: {output_path}") + except Exception as e: + log.warning(f"Failed to write marker output to disk: {e}") + + metadata = { + "source": filename, + "output_format": poll_result.get("output_format", self.output_format), + "page_count": poll_result.get("page_count", 0), + "processed_with_llm": self.use_llm, + "request_id": request_id or "", + } + + images = poll_result.get("images", {}) + if images: + metadata["image_count"] = len(images) + metadata["images"] = json.dumps(list(images.keys())) + + for k, v in metadata.items(): + if isinstance(v, (dict, list)): + metadata[k] = json.dumps(v) + elif v is None: + metadata[k] = "" + + return [Document(page_content=full_text, metadata=metadata)] diff --git a/backend/open_webui/retrieval/loaders/main.py b/backend/open_webui/retrieval/loaders/main.py index 22397b3b4a..0d0ff851b7 100644 --- a/backend/open_webui/retrieval/loaders/main.py +++ b/backend/open_webui/retrieval/loaders/main.py @@ -21,9 +21,11 @@ from langchain_community.document_loaders import ( ) from langchain_core.documents import Document - from open_webui.retrieval.loaders.external_document import ExternalDocumentLoader + from open_webui.retrieval.loaders.mistral import MistralLoader +from open_webui.retrieval.loaders.datalab_marker import DatalabMarkerLoader + from open_webui.env import SRC_LOG_LEVELS, GLOBAL_LOG_LEVEL 
@@ -236,6 +238,49 @@ class Loader: mime_type=file_content_type, extract_images=self.kwargs.get("PDF_EXTRACT_IMAGES"), ) + elif ( + self.engine == "datalab_marker" + and self.kwargs.get("DATALAB_MARKER_API_KEY") + and file_ext + in [ + "pdf", + "xls", + "xlsx", + "ods", + "doc", + "docx", + "odt", + "ppt", + "pptx", + "odp", + "html", + "epub", + "png", + "jpeg", + "jpg", + "webp", + "gif", + "tiff", + ] + ): + loader = DatalabMarkerLoader( + file_path=file_path, + api_key=self.kwargs["DATALAB_MARKER_API_KEY"], + langs=self.kwargs.get("DATALAB_MARKER_LANGS"), + use_llm=self.kwargs.get("DATALAB_MARKER_USE_LLM", False), + skip_cache=self.kwargs.get("DATALAB_MARKER_SKIP_CACHE", False), + force_ocr=self.kwargs.get("DATALAB_MARKER_FORCE_OCR", False), + paginate=self.kwargs.get("DATALAB_MARKER_PAGINATE", False), + strip_existing_ocr=self.kwargs.get( + "DATALAB_MARKER_STRIP_EXISTING_OCR", False + ), + disable_image_extraction=self.kwargs.get( + "DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION", False + ), + output_format=self.kwargs.get( + "DATALAB_MARKER_OUTPUT_FORMAT", "markdown" + ), + ) elif self.engine == "docling" and self.kwargs.get("DOCLING_SERVER_URL"): if self._is_text_file(file_ext, file_content_type): loader = TextLoader(file_path, autodetect_encoding=True) diff --git a/backend/open_webui/routers/functions.py b/backend/open_webui/routers/functions.py index 2748fa95ce..355093335a 100644 --- a/backend/open_webui/routers/functions.py +++ b/backend/open_webui/routers/functions.py @@ -12,7 +12,11 @@ from open_webui.models.functions import ( FunctionResponse, Functions, ) -from open_webui.utils.plugin import load_function_module_by_id, replace_imports +from open_webui.utils.plugin import ( + load_function_module_by_id, + replace_imports, + get_function_module_from_cache, +) from open_webui.config import CACHE_DIR from open_webui.constants import ERROR_MESSAGES from fastapi import APIRouter, Depends, HTTPException, Request, status @@ -358,8 +362,9 @@ async def 
get_function_valves_spec_by_id( ): function = Functions.get_function_by_id(id) if function: - function_module, function_type, frontmatter = load_function_module_by_id(id) - request.app.state.FUNCTIONS[id] = function_module + function_module, function_type, frontmatter = get_function_module_from_cache( + request, id + ) if hasattr(function_module, "Valves"): Valves = function_module.Valves @@ -383,8 +388,9 @@ async def update_function_valves_by_id( ): function = Functions.get_function_by_id(id) if function: - function_module, function_type, frontmatter = load_function_module_by_id(id) - request.app.state.FUNCTIONS[id] = function_module + function_module, function_type, frontmatter = get_function_module_from_cache( + request, id + ) if hasattr(function_module, "Valves"): Valves = function_module.Valves @@ -443,8 +449,9 @@ async def get_function_user_valves_spec_by_id( ): function = Functions.get_function_by_id(id) if function: - function_module, function_type, frontmatter = load_function_module_by_id(id) - request.app.state.FUNCTIONS[id] = function_module + function_module, function_type, frontmatter = get_function_module_from_cache( + request, id + ) if hasattr(function_module, "UserValves"): UserValves = function_module.UserValves @@ -464,8 +471,9 @@ async def update_function_user_valves_by_id( function = Functions.get_function_by_id(id) if function: - function_module, function_type, frontmatter = load_function_module_by_id(id) - request.app.state.FUNCTIONS[id] = function_module + function_module, function_type, frontmatter = get_function_module_from_cache( + request, id + ) if hasattr(function_module, "UserValves"): UserValves = function_module.UserValves diff --git a/backend/open_webui/routers/retrieval.py b/backend/open_webui/routers/retrieval.py index 98f79c7fee..d652ff0255 100644 --- a/backend/open_webui/routers/retrieval.py +++ b/backend/open_webui/routers/retrieval.py @@ -353,6 +353,15 @@ async def get_rag_config(request: Request, 
user=Depends(get_admin_user)): # Content extraction settings "CONTENT_EXTRACTION_ENGINE": request.app.state.config.CONTENT_EXTRACTION_ENGINE, "PDF_EXTRACT_IMAGES": request.app.state.config.PDF_EXTRACT_IMAGES, + "DATALAB_MARKER_API_KEY": request.app.state.config.DATALAB_MARKER_API_KEY, + "DATALAB_MARKER_LANGS": request.app.state.config.DATALAB_MARKER_LANGS, + "DATALAB_MARKER_SKIP_CACHE": request.app.state.config.DATALAB_MARKER_SKIP_CACHE, + "DATALAB_MARKER_FORCE_OCR": request.app.state.config.DATALAB_MARKER_FORCE_OCR, + "DATALAB_MARKER_PAGINATE": request.app.state.config.DATALAB_MARKER_PAGINATE, + "DATALAB_MARKER_STRIP_EXISTING_OCR": request.app.state.config.DATALAB_MARKER_STRIP_EXISTING_OCR, + "DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION": request.app.state.config.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION, + "DATALAB_MARKER_USE_LLM": request.app.state.config.DATALAB_MARKER_USE_LLM, + "DATALAB_MARKER_OUTPUT_FORMAT": request.app.state.config.DATALAB_MARKER_OUTPUT_FORMAT, "EXTERNAL_DOCUMENT_LOADER_URL": request.app.state.config.EXTERNAL_DOCUMENT_LOADER_URL, "EXTERNAL_DOCUMENT_LOADER_API_KEY": request.app.state.config.EXTERNAL_DOCUMENT_LOADER_API_KEY, "TIKA_SERVER_URL": request.app.state.config.TIKA_SERVER_URL, @@ -500,6 +509,15 @@ class ConfigForm(BaseModel): # Content extraction settings CONTENT_EXTRACTION_ENGINE: Optional[str] = None PDF_EXTRACT_IMAGES: Optional[bool] = None + DATALAB_MARKER_API_KEY: Optional[str] = None + DATALAB_MARKER_LANGS: Optional[str] = None + DATALAB_MARKER_SKIP_CACHE: Optional[bool] = None + DATALAB_MARKER_FORCE_OCR: Optional[bool] = None + DATALAB_MARKER_PAGINATE: Optional[bool] = None + DATALAB_MARKER_STRIP_EXISTING_OCR: Optional[bool] = None + DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION: Optional[bool] = None + DATALAB_MARKER_USE_LLM: Optional[bool] = None + DATALAB_MARKER_OUTPUT_FORMAT: Optional[str] = None EXTERNAL_DOCUMENT_LOADER_URL: Optional[str] = None EXTERNAL_DOCUMENT_LOADER_API_KEY: Optional[str] = None @@ -599,6 +617,51 @@ async def 
update_rag_config( if form_data.PDF_EXTRACT_IMAGES is not None else request.app.state.config.PDF_EXTRACT_IMAGES ) + request.app.state.config.DATALAB_MARKER_API_KEY = ( + form_data.DATALAB_MARKER_API_KEY + if form_data.DATALAB_MARKER_API_KEY is not None + else request.app.state.config.DATALAB_MARKER_API_KEY + ) + request.app.state.config.DATALAB_MARKER_LANGS = ( + form_data.DATALAB_MARKER_LANGS + if form_data.DATALAB_MARKER_LANGS is not None + else request.app.state.config.DATALAB_MARKER_LANGS + ) + request.app.state.config.DATALAB_MARKER_SKIP_CACHE = ( + form_data.DATALAB_MARKER_SKIP_CACHE + if form_data.DATALAB_MARKER_SKIP_CACHE is not None + else request.app.state.config.DATALAB_MARKER_SKIP_CACHE + ) + request.app.state.config.DATALAB_MARKER_FORCE_OCR = ( + form_data.DATALAB_MARKER_FORCE_OCR + if form_data.DATALAB_MARKER_FORCE_OCR is not None + else request.app.state.config.DATALAB_MARKER_FORCE_OCR + ) + request.app.state.config.DATALAB_MARKER_PAGINATE = ( + form_data.DATALAB_MARKER_PAGINATE + if form_data.DATALAB_MARKER_PAGINATE is not None + else request.app.state.config.DATALAB_MARKER_PAGINATE + ) + request.app.state.config.DATALAB_MARKER_STRIP_EXISTING_OCR = ( + form_data.DATALAB_MARKER_STRIP_EXISTING_OCR + if form_data.DATALAB_MARKER_STRIP_EXISTING_OCR is not None + else request.app.state.config.DATALAB_MARKER_STRIP_EXISTING_OCR + ) + request.app.state.config.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION = ( + form_data.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION + if form_data.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION is not None + else request.app.state.config.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION + ) + request.app.state.config.DATALAB_MARKER_OUTPUT_FORMAT = ( + form_data.DATALAB_MARKER_OUTPUT_FORMAT + if form_data.DATALAB_MARKER_OUTPUT_FORMAT is not None + else request.app.state.config.DATALAB_MARKER_OUTPUT_FORMAT + ) + request.app.state.config.DATALAB_MARKER_USE_LLM = ( + form_data.DATALAB_MARKER_USE_LLM + if form_data.DATALAB_MARKER_USE_LLM is not None + else 
request.app.state.config.DATALAB_MARKER_USE_LLM + ) request.app.state.config.EXTERNAL_DOCUMENT_LOADER_URL = ( form_data.EXTERNAL_DOCUMENT_LOADER_URL if form_data.EXTERNAL_DOCUMENT_LOADER_URL is not None @@ -853,6 +916,15 @@ async def update_rag_config( # Content extraction settings "CONTENT_EXTRACTION_ENGINE": request.app.state.config.CONTENT_EXTRACTION_ENGINE, "PDF_EXTRACT_IMAGES": request.app.state.config.PDF_EXTRACT_IMAGES, + "DATALAB_MARKER_API_KEY": request.app.state.config.DATALAB_MARKER_API_KEY, + "DATALAB_MARKER_LANGS": request.app.state.config.DATALAB_MARKER_LANGS, + "DATALAB_MARKER_SKIP_CACHE": request.app.state.config.DATALAB_MARKER_SKIP_CACHE, + "DATALAB_MARKER_FORCE_OCR": request.app.state.config.DATALAB_MARKER_FORCE_OCR, + "DATALAB_MARKER_PAGINATE": request.app.state.config.DATALAB_MARKER_PAGINATE, + "DATALAB_MARKER_STRIP_EXISTING_OCR": request.app.state.config.DATALAB_MARKER_STRIP_EXISTING_OCR, + "DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION": request.app.state.config.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION, + "DATALAB_MARKER_USE_LLM": request.app.state.config.DATALAB_MARKER_USE_LLM, + "DATALAB_MARKER_OUTPUT_FORMAT": request.app.state.config.DATALAB_MARKER_OUTPUT_FORMAT, "EXTERNAL_DOCUMENT_LOADER_URL": request.app.state.config.EXTERNAL_DOCUMENT_LOADER_URL, "EXTERNAL_DOCUMENT_LOADER_API_KEY": request.app.state.config.EXTERNAL_DOCUMENT_LOADER_API_KEY, "TIKA_SERVER_URL": request.app.state.config.TIKA_SERVER_URL, @@ -1178,6 +1250,15 @@ def process_file( file_path = Storage.get_file(file_path) loader = Loader( engine=request.app.state.config.CONTENT_EXTRACTION_ENGINE, + DATALAB_MARKER_API_KEY=request.app.state.config.DATALAB_MARKER_API_KEY, + DATALAB_MARKER_LANGS=request.app.state.config.DATALAB_MARKER_LANGS, + DATALAB_MARKER_SKIP_CACHE=request.app.state.config.DATALAB_MARKER_SKIP_CACHE, + DATALAB_MARKER_FORCE_OCR=request.app.state.config.DATALAB_MARKER_FORCE_OCR, + DATALAB_MARKER_PAGINATE=request.app.state.config.DATALAB_MARKER_PAGINATE, + 
DATALAB_MARKER_STRIP_EXISTING_OCR=request.app.state.config.DATALAB_MARKER_STRIP_EXISTING_OCR, + DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION=request.app.state.config.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION, + DATALAB_MARKER_USE_LLM=request.app.state.config.DATALAB_MARKER_USE_LLM, + DATALAB_MARKER_OUTPUT_FORMAT=request.app.state.config.DATALAB_MARKER_OUTPUT_FORMAT, EXTERNAL_DOCUMENT_LOADER_URL=request.app.state.config.EXTERNAL_DOCUMENT_LOADER_URL, EXTERNAL_DOCUMENT_LOADER_API_KEY=request.app.state.config.EXTERNAL_DOCUMENT_LOADER_API_KEY, TIKA_SERVER_URL=request.app.state.config.TIKA_SERVER_URL, @@ -1835,6 +1916,7 @@ class QueryCollectionsForm(BaseModel): k_reranker: Optional[int] = None r: Optional[float] = None hybrid: Optional[bool] = None + hybrid_bm25_weight: Optional[float] = None @router.post("/query/collection") diff --git a/backend/open_webui/routers/tools.py b/backend/open_webui/routers/tools.py index bd1ce8f625..f726368eba 100644 --- a/backend/open_webui/routers/tools.py +++ b/backend/open_webui/routers/tools.py @@ -2,6 +2,9 @@ import logging from pathlib import Path from typing import Optional import time +import re +import aiohttp +from pydantic import BaseModel, HttpUrl from open_webui.models.tools import ( ToolForm, @@ -21,6 +24,7 @@ from open_webui.env import SRC_LOG_LEVELS from open_webui.utils.tools import get_tool_servers_data + log = logging.getLogger(__name__) log.setLevel(SRC_LOG_LEVELS["MAIN"]) @@ -95,6 +99,81 @@ async def get_tool_list(user=Depends(get_verified_user)): return tools +############################ +# LoadFunctionFromLink +############################ + + +class LoadUrlForm(BaseModel): + url: HttpUrl + + +def github_url_to_raw_url(url: str) -> str: + # Handle 'tree' (folder) URLs (add main.py at the end) + m1 = re.match(r"https://github\.com/([^/]+)/([^/]+)/tree/([^/]+)/(.*)", url) + if m1: + org, repo, branch, path = m1.groups() + return f"https://raw.githubusercontent.com/{org}/{repo}/refs/heads/{branch}/{path.rstrip('/')}/main.py" + 
+ # Handle 'blob' (file) URLs + m2 = re.match(r"https://github\.com/([^/]+)/([^/]+)/blob/([^/]+)/(.*)", url) + if m2: + org, repo, branch, path = m2.groups() + return ( + f"https://raw.githubusercontent.com/{org}/{repo}/refs/heads/{branch}/{path}" + ) + + # No match; return as-is + return url + + +@router.post("/load/url", response_model=Optional[dict]) +async def load_tool_from_url( + request: Request, form_data: LoadUrlForm, user=Depends(get_admin_user) +): + # NOTE: This is NOT a SSRF vulnerability: + # This endpoint is admin-only (see get_admin_user), meant for *trusted* internal use, + # and does NOT accept untrusted user input. Access is enforced by authentication. + + url = str(form_data.url) + if not url: + raise HTTPException(status_code=400, detail="Please enter a valid URL") + + url = github_url_to_raw_url(url) + url_parts = url.rstrip("/").split("/") + + file_name = url_parts[-1] + tool_name = ( + file_name[:-3] + if ( + file_name.endswith(".py") + and (not file_name.startswith(("main.py", "index.py", "__init__.py"))) + ) + else url_parts[-2] if len(url_parts) > 1 else "function" + ) + + try: + async with aiohttp.ClientSession() as session: + async with session.get( + url, headers={"Content-Type": "application/json"} + ) as resp: + if resp.status != 200: + raise HTTPException( + status_code=resp.status, detail="Failed to fetch the tool" + ) + data = await resp.text() + if not data: + raise HTTPException( + status_code=400, detail="No data received from the URL" + ) + return { + "name": tool_name, + "content": data, + } + except Exception as e: + raise HTTPException(status_code=500, detail=f"Error importing tool: {e}") + + ############################ # ExportTools ############################ diff --git a/backend/open_webui/static/assets/pdf-style.css b/backend/open_webui/static/assets/pdf-style.css index 7cb5b0cd24..8b4e8d2370 100644 --- a/backend/open_webui/static/assets/pdf-style.css +++ b/backend/open_webui/static/assets/pdf-style.css @@ -269,11 
+269,6 @@ tbody + tbody { margin-bottom: 0; } -/* Add a rule to reset margin-bottom for

not followed by