diff --git a/backend/open_webui/config.py b/backend/open_webui/config.py index 74e9f02f11..ad9b1430c2 100644 --- a/backend/open_webui/config.py +++ b/backend/open_webui/config.py @@ -2032,10 +2032,16 @@ DATALAB_MARKER_API_KEY = PersistentConfig( os.environ.get("DATALAB_MARKER_API_KEY", ""), ) -DATALAB_MARKER_LANGS = PersistentConfig( - "DATALAB_MARKER_LANGS", - "rag.datalab_marker_langs", - os.environ.get("DATALAB_MARKER_LANGS", ""), +DATALAB_MARKER_API_BASE_URL = PersistentConfig( + "DATALAB_MARKER_API_BASE_URL", + "rag.datalab_marker_api_base_url", + os.environ.get("DATALAB_MARKER_API_BASE_URL", ""), +) + +DATALAB_MARKER_ADDITIONAL_CONFIG = PersistentConfig( + "DATALAB_MARKER_ADDITIONAL_CONFIG", + "rag.datalab_marker_additional_config", + os.environ.get("DATALAB_MARKER_ADDITIONAL_CONFIG", ""), ) DATALAB_MARKER_USE_LLM = PersistentConfig( @@ -2075,6 +2081,12 @@ DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION = PersistentConfig( == "true", ) +DATALAB_MARKER_FORMAT_LINES = PersistentConfig( + "DATALAB_MARKER_FORMAT_LINES", + "rag.datalab_marker_format_lines", + os.environ.get("DATALAB_MARKER_FORMAT_LINES", "false").lower() == "true", +) + DATALAB_MARKER_OUTPUT_FORMAT = PersistentConfig( "DATALAB_MARKER_OUTPUT_FORMAT", "rag.datalab_marker_output_format", diff --git a/backend/open_webui/main.py b/backend/open_webui/main.py index 19cdd87e98..72e677103c 100644 --- a/backend/open_webui/main.py +++ b/backend/open_webui/main.py @@ -226,12 +226,14 @@ from open_webui.config import ( CHUNK_SIZE, CONTENT_EXTRACTION_ENGINE, DATALAB_MARKER_API_KEY, - DATALAB_MARKER_LANGS, + DATALAB_MARKER_API_BASE_URL, + DATALAB_MARKER_ADDITIONAL_CONFIG, DATALAB_MARKER_SKIP_CACHE, DATALAB_MARKER_FORCE_OCR, DATALAB_MARKER_PAGINATE, DATALAB_MARKER_STRIP_EXISTING_OCR, DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION, + DATALAB_MARKER_FORMAT_LINES, DATALAB_MARKER_OUTPUT_FORMAT, DATALAB_MARKER_USE_LLM, EXTERNAL_DOCUMENT_LOADER_URL, @@ -771,7 +773,8 @@ app.state.config.ENABLE_WEB_LOADER_SSL_VERIFICATION = ENABLE_WEB_LOADER_SSL_VERI app.state.config.CONTENT_EXTRACTION_ENGINE = CONTENT_EXTRACTION_ENGINE app.state.config.DATALAB_MARKER_API_KEY = DATALAB_MARKER_API_KEY -app.state.config.DATALAB_MARKER_LANGS = DATALAB_MARKER_LANGS +app.state.config.DATALAB_MARKER_API_BASE_URL = DATALAB_MARKER_API_BASE_URL +app.state.config.DATALAB_MARKER_ADDITIONAL_CONFIG = DATALAB_MARKER_ADDITIONAL_CONFIG app.state.config.DATALAB_MARKER_SKIP_CACHE = DATALAB_MARKER_SKIP_CACHE app.state.config.DATALAB_MARKER_FORCE_OCR = DATALAB_MARKER_FORCE_OCR app.state.config.DATALAB_MARKER_PAGINATE = DATALAB_MARKER_PAGINATE @@ -779,6 +782,7 @@ app.state.config.DATALAB_MARKER_STRIP_EXISTING_OCR = DATALAB_MARKER_STRIP_EXISTI app.state.config.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION = ( DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION ) +app.state.config.DATALAB_MARKER_FORMAT_LINES = DATALAB_MARKER_FORMAT_LINES app.state.config.DATALAB_MARKER_USE_LLM = DATALAB_MARKER_USE_LLM app.state.config.DATALAB_MARKER_OUTPUT_FORMAT = DATALAB_MARKER_OUTPUT_FORMAT app.state.config.EXTERNAL_DOCUMENT_LOADER_URL = EXTERNAL_DOCUMENT_LOADER_URL diff --git a/backend/open_webui/retrieval/loaders/datalab_marker.py b/backend/open_webui/retrieval/loaders/datalab_marker.py index 104c2830df..edcf502e7a 100644 --- a/backend/open_webui/retrieval/loaders/datalab_marker.py +++ b/backend/open_webui/retrieval/loaders/datalab_marker.py @@ -15,24 +15,28 @@ class DatalabMarkerLoader: self, file_path: str, api_key: str, - langs: Optional[str] = None, + api_base_url: str, + additional_config: Optional[str] = None, use_llm: bool = False, skip_cache: bool = False, force_ocr: bool = False, paginate: bool = False, strip_existing_ocr: bool = False, disable_image_extraction: bool = False, + format_lines: bool = False, output_format: str = None, ): self.file_path = file_path self.api_key = api_key - self.langs = langs + self.api_base_url = api_base_url + self.additional_config = additional_config self.use_llm = use_llm self.skip_cache = skip_cache self.force_ocr = force_ocr self.paginate = paginate self.strip_existing_ocr = strip_existing_ocr self.disable_image_extraction = disable_image_extraction + self.format_lines = format_lines self.output_format = output_format def _get_mime_type(self, filename: str) -> str: @@ -60,7 +64,7 @@ class DatalabMarkerLoader: return mime_map.get(ext, "application/octet-stream") def check_marker_request_status(self, request_id: str) -> dict: - url = f"https://www.datalab.to/api/v1/marker/{request_id}" + url = f"{self.api_base_url}/{request_id}" headers = {"X-Api-Key": self.api_key} try: response = requests.get(url, headers=headers) @@ -81,22 +85,25 @@ class DatalabMarkerLoader: ) def load(self) -> List[Document]: - url = "https://www.datalab.to/api/v1/marker" + url = self.api_base_url filename = os.path.basename(self.file_path) mime_type = self._get_mime_type(filename) headers = {"X-Api-Key": self.api_key} form_data = { - "langs": self.langs, "use_llm": str(self.use_llm).lower(), "skip_cache": str(self.skip_cache).lower(), "force_ocr": str(self.force_ocr).lower(), "paginate": str(self.paginate).lower(), "strip_existing_ocr": str(self.strip_existing_ocr).lower(), "disable_image_extraction": str(self.disable_image_extraction).lower(), + "format_lines": str(self.format_lines).lower(), "output_format": self.output_format, } + if self.additional_config and self.additional_config.strip(): + form_data["additional_config"] = self.additional_config + log.info( f"Datalab Marker POST request parameters: {{'filename': '{filename}', 'mime_type': '{mime_type}', **{form_data}}}" ) @@ -133,74 +140,92 @@ class DatalabMarkerLoader: check_url = result.get("request_check_url") request_id = result.get("request_id") - if not check_url: - raise HTTPException( - status.HTTP_502_BAD_GATEWAY, detail="No request_check_url returned." - ) - for _ in range(300): # Up to 10 minutes - time.sleep(2) - try: - poll_response = requests.get(check_url, headers=headers) - poll_response.raise_for_status() - poll_result = poll_response.json() - except (requests.HTTPError, ValueError) as e: - raw_body = poll_response.text - log.error(f"Polling error: {e}, response body: {raw_body}") - raise HTTPException( - status.HTTP_502_BAD_GATEWAY, detail=f"Polling failed: {e}" - ) - - status_val = poll_result.get("status") - success_val = poll_result.get("success") - - if status_val == "complete": - summary = { - k: poll_result.get(k) - for k in ( - "status", - "output_format", - "success", - "error", - "page_count", - "total_cost", + # Check if this is a direct response (self-hosted) or polling response (DataLab) + if check_url: + # DataLab polling pattern + for _ in range(300): # Up to 10 minutes + time.sleep(2) + try: + poll_response = requests.get(check_url, headers=headers) + poll_response.raise_for_status() + poll_result = poll_response.json() + except (requests.HTTPError, ValueError) as e: + raw_body = poll_response.text + log.error(f"Polling error: {e}, response body: {raw_body}") + raise HTTPException( + status.HTTP_502_BAD_GATEWAY, detail=f"Polling failed: {e}" ) - } - log.info( - f"Marker processing completed successfully: {json.dumps(summary, indent=2)}" - ) - break - if status_val == "failed" or success_val is False: - log.error( - f"Marker poll failed full response: {json.dumps(poll_result, indent=2)}" - ) - error_msg = ( - poll_result.get("error") - or "Marker returned failure without error message" + status_val = poll_result.get("status") + success_val = poll_result.get("success") + + if status_val == "complete": + summary = { + k: poll_result.get(k) + for k in ( + "status", + "output_format", + "success", + "error", + "page_count", + "total_cost", + ) + } + log.info( + f"Marker processing completed successfully: {json.dumps(summary, indent=2)}" + ) + break + + if status_val == "failed" or success_val is False: + log.error( + f"Marker poll failed full response: {json.dumps(poll_result, indent=2)}" + ) + error_msg = ( + poll_result.get("error") + or "Marker returned failure without error message" + ) + raise HTTPException( + status.HTTP_400_BAD_REQUEST, + detail=f"Marker processing failed: {error_msg}", + ) + else: + raise HTTPException( + status.HTTP_504_GATEWAY_TIMEOUT, + detail="Marker processing timed out", ) + + if not poll_result.get("success", False): + error_msg = poll_result.get("error") or "Unknown processing error" raise HTTPException( status.HTTP_400_BAD_REQUEST, - detail=f"Marker processing failed: {error_msg}", + detail=f"Final processing failed: {error_msg}", ) + + # DataLab format - content in format-specific fields + content_key = self.output_format.lower() + raw_content = poll_result.get(content_key) + final_result = poll_result else: - raise HTTPException( - status.HTTP_504_GATEWAY_TIMEOUT, detail="Marker processing timed out" - ) + # Self-hosted direct response - content in "output" field + if "output" in result: + log.info("Self-hosted Marker returned direct response without polling") + raw_content = result.get("output") + final_result = result + else: + available_fields = ( + list(result.keys()) + if isinstance(result, dict) + else "non-dict response" + ) + raise HTTPException( + status.HTTP_502_BAD_GATEWAY, + detail=f"Custom Marker endpoint returned success but no 'output' field found. Available fields: {available_fields}. Expected either 'request_check_url' for polling or 'output' field for direct response.", + ) - if not poll_result.get("success", False): - error_msg = poll_result.get("error") or "Unknown processing error" - raise HTTPException( - status.HTTP_400_BAD_REQUEST, - detail=f"Final processing failed: {error_msg}", - ) - - content_key = self.output_format.lower() - raw_content = poll_result.get(content_key) - - if content_key == "json": + if self.output_format.lower() == "json": full_text = json.dumps(raw_content, indent=2) - elif content_key in {"markdown", "html"}: + elif self.output_format.lower() in {"markdown", "html"}: full_text = str(raw_content).strip() else: raise HTTPException( @@ -211,14 +236,14 @@ class DatalabMarkerLoader: if not full_text: raise HTTPException( status.HTTP_400_BAD_REQUEST, - detail="Datalab Marker returned empty content", + detail="Marker returned empty content", ) marker_output_dir = os.path.join("/app/backend/data/uploads", "marker_output") os.makedirs(marker_output_dir, exist_ok=True) file_ext_map = {"markdown": "md", "json": "json", "html": "html"} - file_ext = file_ext_map.get(content_key, "txt") + file_ext = file_ext_map.get(self.output_format.lower(), "txt") output_filename = f"{os.path.splitext(filename)[0]}.{file_ext}" output_path = os.path.join(marker_output_dir, output_filename) @@ -231,13 +256,13 @@ class DatalabMarkerLoader: metadata = { "source": filename, - "output_format": poll_result.get("output_format", self.output_format), - "page_count": poll_result.get("page_count", 0), + "output_format": final_result.get("output_format", self.output_format), + "page_count": final_result.get("page_count", 0), "processed_with_llm": self.use_llm, "request_id": request_id or "", } - images = poll_result.get("images", {}) + images = final_result.get("images", {}) if images: metadata["image_count"] = len(images) metadata["images"] = json.dumps(list(images.keys())) diff --git a/backend/open_webui/retrieval/loaders/main.py b/backend/open_webui/retrieval/loaders/main.py index dfbe5a6a55..b502827adf 100644 --- a/backend/open_webui/retrieval/loaders/main.py +++ b/backend/open_webui/retrieval/loaders/main.py @@ -281,10 +281,15 @@ class Loader: "tiff", ] ): + api_base_url = self.kwargs.get("DATALAB_MARKER_API_BASE_URL", "") + if not api_base_url or api_base_url.strip() == "": + api_base_url = "https://www.datalab.to/api/v1/marker" + loader = DatalabMarkerLoader( file_path=file_path, api_key=self.kwargs["DATALAB_MARKER_API_KEY"], - langs=self.kwargs.get("DATALAB_MARKER_LANGS"), + api_base_url=api_base_url, + additional_config=self.kwargs.get("DATALAB_MARKER_ADDITIONAL_CONFIG"), use_llm=self.kwargs.get("DATALAB_MARKER_USE_LLM", False), skip_cache=self.kwargs.get("DATALAB_MARKER_SKIP_CACHE", False), force_ocr=self.kwargs.get("DATALAB_MARKER_FORCE_OCR", False), @@ -295,6 +300,7 @@ class Loader: disable_image_extraction=self.kwargs.get( "DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION", False ), + format_lines=self.kwargs.get("DATALAB_MARKER_FORMAT_LINES", False), output_format=self.kwargs.get( "DATALAB_MARKER_OUTPUT_FORMAT", "markdown" ), diff --git a/backend/open_webui/routers/retrieval.py b/backend/open_webui/routers/retrieval.py index 731e93c50f..09bbc5b193 100644 --- a/backend/open_webui/routers/retrieval.py +++ b/backend/open_webui/routers/retrieval.py @@ -401,12 +401,14 @@ async def get_rag_config(request: Request, user=Depends(get_admin_user)): "CONTENT_EXTRACTION_ENGINE": request.app.state.config.CONTENT_EXTRACTION_ENGINE, "PDF_EXTRACT_IMAGES": request.app.state.config.PDF_EXTRACT_IMAGES, "DATALAB_MARKER_API_KEY": request.app.state.config.DATALAB_MARKER_API_KEY, - "DATALAB_MARKER_LANGS": request.app.state.config.DATALAB_MARKER_LANGS, + "DATALAB_MARKER_API_BASE_URL": request.app.state.config.DATALAB_MARKER_API_BASE_URL, + "DATALAB_MARKER_ADDITIONAL_CONFIG": request.app.state.config.DATALAB_MARKER_ADDITIONAL_CONFIG, "DATALAB_MARKER_SKIP_CACHE": request.app.state.config.DATALAB_MARKER_SKIP_CACHE, "DATALAB_MARKER_FORCE_OCR": request.app.state.config.DATALAB_MARKER_FORCE_OCR, "DATALAB_MARKER_PAGINATE": request.app.state.config.DATALAB_MARKER_PAGINATE, "DATALAB_MARKER_STRIP_EXISTING_OCR": request.app.state.config.DATALAB_MARKER_STRIP_EXISTING_OCR, "DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION": request.app.state.config.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION, + "DATALAB_MARKER_FORMAT_LINES": request.app.state.config.DATALAB_MARKER_FORMAT_LINES, "DATALAB_MARKER_USE_LLM": request.app.state.config.DATALAB_MARKER_USE_LLM, "DATALAB_MARKER_OUTPUT_FORMAT": request.app.state.config.DATALAB_MARKER_OUTPUT_FORMAT, "EXTERNAL_DOCUMENT_LOADER_URL": request.app.state.config.EXTERNAL_DOCUMENT_LOADER_URL, @@ -566,12 +568,14 @@ class ConfigForm(BaseModel): CONTENT_EXTRACTION_ENGINE: Optional[str] = None PDF_EXTRACT_IMAGES: Optional[bool] = None DATALAB_MARKER_API_KEY: Optional[str] = None - DATALAB_MARKER_LANGS: Optional[str] = None + DATALAB_MARKER_API_BASE_URL: Optional[str] = None + DATALAB_MARKER_ADDITIONAL_CONFIG: Optional[str] = None DATALAB_MARKER_SKIP_CACHE: Optional[bool] = None DATALAB_MARKER_FORCE_OCR: Optional[bool] = None DATALAB_MARKER_PAGINATE: Optional[bool] = None DATALAB_MARKER_STRIP_EXISTING_OCR: Optional[bool] = None DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION: Optional[bool] = None + DATALAB_MARKER_FORMAT_LINES: Optional[bool] = None DATALAB_MARKER_USE_LLM: Optional[bool] = None DATALAB_MARKER_OUTPUT_FORMAT: Optional[str] = None EXTERNAL_DOCUMENT_LOADER_URL: Optional[str] = None @@ -683,10 +687,15 @@ async def update_rag_config( if form_data.DATALAB_MARKER_API_KEY is not None else request.app.state.config.DATALAB_MARKER_API_KEY ) - request.app.state.config.DATALAB_MARKER_LANGS = ( - form_data.DATALAB_MARKER_LANGS - if form_data.DATALAB_MARKER_LANGS is not None - else request.app.state.config.DATALAB_MARKER_LANGS + request.app.state.config.DATALAB_MARKER_API_BASE_URL = ( + form_data.DATALAB_MARKER_API_BASE_URL + if form_data.DATALAB_MARKER_API_BASE_URL is not None + else request.app.state.config.DATALAB_MARKER_API_BASE_URL + ) + request.app.state.config.DATALAB_MARKER_ADDITIONAL_CONFIG = ( + form_data.DATALAB_MARKER_ADDITIONAL_CONFIG + if form_data.DATALAB_MARKER_ADDITIONAL_CONFIG is not None + else request.app.state.config.DATALAB_MARKER_ADDITIONAL_CONFIG ) request.app.state.config.DATALAB_MARKER_SKIP_CACHE = ( form_data.DATALAB_MARKER_SKIP_CACHE @@ -713,6 +722,11 @@ async def update_rag_config( if form_data.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION is not None else request.app.state.config.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION ) + request.app.state.config.DATALAB_MARKER_FORMAT_LINES = ( + form_data.DATALAB_MARKER_FORMAT_LINES + if form_data.DATALAB_MARKER_FORMAT_LINES is not None + else request.app.state.config.DATALAB_MARKER_FORMAT_LINES + ) request.app.state.config.DATALAB_MARKER_OUTPUT_FORMAT = ( form_data.DATALAB_MARKER_OUTPUT_FORMAT if form_data.DATALAB_MARKER_OUTPUT_FORMAT is not None @@ -1006,7 +1020,8 @@ async def update_rag_config( "CONTENT_EXTRACTION_ENGINE": request.app.state.config.CONTENT_EXTRACTION_ENGINE, "PDF_EXTRACT_IMAGES": request.app.state.config.PDF_EXTRACT_IMAGES, "DATALAB_MARKER_API_KEY": request.app.state.config.DATALAB_MARKER_API_KEY, - "DATALAB_MARKER_LANGS": request.app.state.config.DATALAB_MARKER_LANGS, + "DATALAB_MARKER_API_BASE_URL": request.app.state.config.DATALAB_MARKER_API_BASE_URL, + "DATALAB_MARKER_ADDITIONAL_CONFIG": request.app.state.config.DATALAB_MARKER_ADDITIONAL_CONFIG, "DATALAB_MARKER_SKIP_CACHE": request.app.state.config.DATALAB_MARKER_SKIP_CACHE, "DATALAB_MARKER_FORCE_OCR": request.app.state.config.DATALAB_MARKER_FORCE_OCR, "DATALAB_MARKER_PAGINATE": request.app.state.config.DATALAB_MARKER_PAGINATE, @@ -1393,12 +1408,14 @@ def process_file( loader = Loader( engine=request.app.state.config.CONTENT_EXTRACTION_ENGINE, DATALAB_MARKER_API_KEY=request.app.state.config.DATALAB_MARKER_API_KEY, - DATALAB_MARKER_LANGS=request.app.state.config.DATALAB_MARKER_LANGS, + DATALAB_MARKER_API_BASE_URL=request.app.state.config.DATALAB_MARKER_API_BASE_URL, + DATALAB_MARKER_ADDITIONAL_CONFIG=request.app.state.config.DATALAB_MARKER_ADDITIONAL_CONFIG, DATALAB_MARKER_SKIP_CACHE=request.app.state.config.DATALAB_MARKER_SKIP_CACHE, DATALAB_MARKER_FORCE_OCR=request.app.state.config.DATALAB_MARKER_FORCE_OCR, DATALAB_MARKER_PAGINATE=request.app.state.config.DATALAB_MARKER_PAGINATE, DATALAB_MARKER_STRIP_EXISTING_OCR=request.app.state.config.DATALAB_MARKER_STRIP_EXISTING_OCR, DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION=request.app.state.config.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION, + DATALAB_MARKER_FORMAT_LINES=request.app.state.config.DATALAB_MARKER_FORMAT_LINES, DATALAB_MARKER_USE_LLM=request.app.state.config.DATALAB_MARKER_USE_LLM, DATALAB_MARKER_OUTPUT_FORMAT=request.app.state.config.DATALAB_MARKER_OUTPUT_FORMAT, EXTERNAL_DOCUMENT_LOADER_URL=request.app.state.config.EXTERNAL_DOCUMENT_LOADER_URL, diff --git a/src/lib/components/admin/Settings/Documents.svelte b/src/lib/components/admin/Settings/Documents.svelte index 993cc6553f..ac332d3e1f 100644 --- a/src/lib/components/admin/Settings/Documents.svelte +++ b/src/lib/components/admin/Settings/Documents.svelte @@ -170,6 +170,19 @@ return; } + if ( + RAGConfig.CONTENT_EXTRACTION_ENGINE === 'datalab_marker' && + RAGConfig.DATALAB_MARKER_ADDITIONAL_CONFIG && + RAGConfig.DATALAB_MARKER_ADDITIONAL_CONFIG.trim() !== '' + ) { + try { + JSON.parse(RAGConfig.DATALAB_MARKER_ADDITIONAL_CONFIG); + } catch (e) { + toast.error($i18n.t('Invalid JSON format in Additional Config')); + return; + } + } + if ( RAGConfig.CONTENT_EXTRACTION_ENGINE === 'document_intelligence' && (RAGConfig.DOCUMENT_INTELLIGENCE_ENDPOINT === '' || @@ -195,10 +208,6 @@ ALLOWED_FILE_EXTENSIONS: RAGConfig.ALLOWED_FILE_EXTENSIONS.split(',') .map((ext) => ext.trim()) .filter((ext) => ext !== ''), - DATALAB_MARKER_LANGS: RAGConfig.DATALAB_MARKER_LANGS.split(',') - .map((code) => code.trim()) - .filter((code) => code !== '') - .join(', '), DOCLING_PICTURE_DESCRIPTION_LOCAL: JSON.parse( RAGConfig.DOCLING_PICTURE_DESCRIPTION_LOCAL || '{}' ), @@ -243,6 +252,11 @@ 2 ); + // Set default API Base URL if empty + if (!config.DATALAB_MARKER_API_BASE_URL) { + config.DATALAB_MARKER_API_BASE_URL = 'https://www.datalab.to/api/v1/marker'; + } + RAGConfig = config; }); @@ -336,6 +350,21 @@ {:else if RAGConfig.CONTENT_EXTRACTION_ENGINE === 'datalab_marker'} +