From f31cc07a9d2c49267cb161cfdf73dab031fd8ff1 Mon Sep 17 00:00:00 2001 From: Hisma Date: Tue, 22 Jul 2025 20:49:28 -0400 Subject: [PATCH] feat: update marker api --- backend/open_webui/config.py | 14 +- backend/open_webui/main.py | 6 +- .../retrieval/loaders/datalab_marker.py | 155 ++++++++++-------- backend/open_webui/retrieval/loaders/main.py | 7 +- backend/open_webui/routers/retrieval.py | 25 ++- .../admin/Settings/Documents.svelte | 62 +++++-- 6 files changed, 174 insertions(+), 95 deletions(-) diff --git a/backend/open_webui/config.py b/backend/open_webui/config.py index 49ab1a9aad..d403f6acdd 100644 --- a/backend/open_webui/config.py +++ b/backend/open_webui/config.py @@ -2018,10 +2018,16 @@ DATALAB_MARKER_API_KEY = PersistentConfig( os.environ.get("DATALAB_MARKER_API_KEY", ""), ) -DATALAB_MARKER_LANGS = PersistentConfig( - "DATALAB_MARKER_LANGS", - "rag.datalab_marker_langs", - os.environ.get("DATALAB_MARKER_LANGS", ""), +DATALAB_MARKER_API_BASE_URL = PersistentConfig( + "DATALAB_MARKER_API_BASE_URL", + "rag.datalab_marker_api_base_url", + os.environ.get("DATALAB_MARKER_API_BASE_URL", ""), +) + +DATALAB_MARKER_ADDITIONAL_CONFIG = PersistentConfig( + "DATALAB_MARKER_ADDITIONAL_CONFIG", + "rag.datalab_marker_additional_config", + os.environ.get("DATALAB_MARKER_ADDITIONAL_CONFIG", ""), ) DATALAB_MARKER_USE_LLM = PersistentConfig( diff --git a/backend/open_webui/main.py b/backend/open_webui/main.py index f89db29d74..55e94dce35 100644 --- a/backend/open_webui/main.py +++ b/backend/open_webui/main.py @@ -227,7 +227,8 @@ from open_webui.config import ( CHUNK_SIZE, CONTENT_EXTRACTION_ENGINE, DATALAB_MARKER_API_KEY, - DATALAB_MARKER_LANGS, + DATALAB_MARKER_API_BASE_URL, + DATALAB_MARKER_ADDITIONAL_CONFIG, DATALAB_MARKER_SKIP_CACHE, DATALAB_MARKER_FORCE_OCR, DATALAB_MARKER_PAGINATE, @@ -767,7 +768,8 @@ app.state.config.ENABLE_WEB_LOADER_SSL_VERIFICATION = ENABLE_WEB_LOADER_SSL_VERI app.state.config.CONTENT_EXTRACTION_ENGINE = CONTENT_EXTRACTION_ENGINE app.state.config.DATALAB_MARKER_API_KEY = DATALAB_MARKER_API_KEY -app.state.config.DATALAB_MARKER_LANGS = DATALAB_MARKER_LANGS +app.state.config.DATALAB_MARKER_API_BASE_URL = DATALAB_MARKER_API_BASE_URL +app.state.config.DATALAB_MARKER_ADDITIONAL_CONFIG = DATALAB_MARKER_ADDITIONAL_CONFIG app.state.config.DATALAB_MARKER_SKIP_CACHE = DATALAB_MARKER_SKIP_CACHE app.state.config.DATALAB_MARKER_FORCE_OCR = DATALAB_MARKER_FORCE_OCR app.state.config.DATALAB_MARKER_PAGINATE = DATALAB_MARKER_PAGINATE diff --git a/backend/open_webui/retrieval/loaders/datalab_marker.py b/backend/open_webui/retrieval/loaders/datalab_marker.py index 104c2830df..25534bbb68 100644 --- a/backend/open_webui/retrieval/loaders/datalab_marker.py +++ b/backend/open_webui/retrieval/loaders/datalab_marker.py @@ -15,7 +15,8 @@ class DatalabMarkerLoader: self, file_path: str, api_key: str, - langs: Optional[str] = None, + api_base_url: str, + additional_config: Optional[str] = None, use_llm: bool = False, skip_cache: bool = False, force_ocr: bool = False, @@ -26,7 +27,8 @@ class DatalabMarkerLoader: ): self.file_path = file_path self.api_key = api_key - self.langs = langs + self.api_base_url = api_base_url + self.additional_config = additional_config self.use_llm = use_llm self.skip_cache = skip_cache self.force_ocr = force_ocr @@ -60,7 +62,7 @@ class DatalabMarkerLoader: return mime_map.get(ext, "application/octet-stream") def check_marker_request_status(self, request_id: str) -> dict: - url = f"https://www.datalab.to/api/v1/marker/{request_id}" + url = f"{self.api_base_url}/{request_id}" headers = {"X-Api-Key": self.api_key} try: response = requests.get(url, headers=headers) @@ -81,13 +83,12 @@ class DatalabMarkerLoader: ) def load(self) -> List[Document]: - url = "https://www.datalab.to/api/v1/marker" + url = self.api_base_url filename = os.path.basename(self.file_path) mime_type = self._get_mime_type(filename) headers = {"X-Api-Key": self.api_key} form_data = { - "langs": self.langs, "use_llm": str(self.use_llm).lower(), "skip_cache": str(self.skip_cache).lower(), "force_ocr": str(self.force_ocr).lower(), @@ -97,6 +98,9 @@ class DatalabMarkerLoader: "output_format": self.output_format, } + if self.additional_config and self.additional_config.strip(): + form_data["additional_config"] = self.additional_config + log.info( f"Datalab Marker POST request parameters: {{'filename': '{filename}', 'mime_type': '{mime_type}', **{form_data}}}" ) @@ -133,74 +137,87 @@ class DatalabMarkerLoader: check_url = result.get("request_check_url") request_id = result.get("request_id") - if not check_url: - raise HTTPException( - status.HTTP_502_BAD_GATEWAY, detail="No request_check_url returned." - ) - - for _ in range(300): # Up to 10 minutes - time.sleep(2) - try: - poll_response = requests.get(check_url, headers=headers) - poll_response.raise_for_status() - poll_result = poll_response.json() - except (requests.HTTPError, ValueError) as e: - raw_body = poll_response.text - log.error(f"Polling error: {e}, response body: {raw_body}") - raise HTTPException( - status.HTTP_502_BAD_GATEWAY, detail=f"Polling failed: {e}" - ) - - status_val = poll_result.get("status") - success_val = poll_result.get("success") - - if status_val == "complete": - summary = { - k: poll_result.get(k) - for k in ( - "status", - "output_format", - "success", - "error", - "page_count", - "total_cost", + + # Check if this is a direct response (self-hosted) or polling response (DataLab) + if check_url: + # DataLab polling pattern + for _ in range(300): # Up to 10 minutes + time.sleep(2) + try: + poll_response = requests.get(check_url, headers=headers) + poll_response.raise_for_status() + poll_result = poll_response.json() + except (requests.HTTPError, ValueError) as e: + raw_body = poll_response.text + log.error(f"Polling error: {e}, response body: {raw_body}") + raise HTTPException( + status.HTTP_502_BAD_GATEWAY, detail=f"Polling failed: {e}" ) - } - log.info( - f"Marker processing completed successfully: {json.dumps(summary, indent=2)}" - ) - break - if status_val == "failed" or success_val is False: - log.error( - f"Marker poll failed full response: {json.dumps(poll_result, indent=2)}" - ) - error_msg = ( - poll_result.get("error") - or "Marker returned failure without error message" + status_val = poll_result.get("status") + success_val = poll_result.get("success") + + if status_val == "complete": + summary = { + k: poll_result.get(k) + for k in ( + "status", + "output_format", + "success", + "error", + "page_count", + "total_cost", + ) + } + log.info( + f"Marker processing completed successfully: {json.dumps(summary, indent=2)}" + ) + break + + if status_val == "failed" or success_val is False: + log.error( + f"Marker poll failed full response: {json.dumps(poll_result, indent=2)}" + ) + error_msg = ( + poll_result.get("error") + or "Marker returned failure without error message" + ) + raise HTTPException( + status.HTTP_400_BAD_REQUEST, + detail=f"Marker processing failed: {error_msg}", + ) + else: + raise HTTPException( + status.HTTP_504_GATEWAY_TIMEOUT, detail="Marker processing timed out" ) + + if not poll_result.get("success", False): + error_msg = poll_result.get("error") or "Unknown processing error" raise HTTPException( status.HTTP_400_BAD_REQUEST, - detail=f"Marker processing failed: {error_msg}", + detail=f"Final processing failed: {error_msg}", ) + + # DataLab format - content in format-specific fields + content_key = self.output_format.lower() + raw_content = poll_result.get(content_key) + final_result = poll_result else: - raise HTTPException( - status.HTTP_504_GATEWAY_TIMEOUT, detail="Marker processing timed out" - ) + # Self-hosted direct response - content in "output" field + if "output" in result: + log.info("Self-hosted Marker returned direct response without polling") + raw_content = result.get("output") + final_result = result + else: + available_fields = list(result.keys()) if isinstance(result, dict) else "non-dict response" + raise HTTPException( + status.HTTP_502_BAD_GATEWAY, + detail=f"Custom Marker endpoint returned success but no 'output' field found. Available fields: {available_fields}. Expected either 'request_check_url' for polling or 'output' field for direct response." + ) - if not poll_result.get("success", False): - error_msg = poll_result.get("error") or "Unknown processing error" - raise HTTPException( - status.HTTP_400_BAD_REQUEST, - detail=f"Final processing failed: {error_msg}", - ) - - content_key = self.output_format.lower() - raw_content = poll_result.get(content_key) - - if content_key == "json": + if self.output_format.lower() == "json": full_text = json.dumps(raw_content, indent=2) - elif content_key in {"markdown", "html"}: + elif self.output_format.lower() in {"markdown", "html"}: full_text = str(raw_content).strip() else: raise HTTPException( @@ -211,14 +228,14 @@ class DatalabMarkerLoader: if not full_text: raise HTTPException( status.HTTP_400_BAD_REQUEST, - detail="Datalab Marker returned empty content", + detail="Marker returned empty content", ) marker_output_dir = os.path.join("/app/backend/data/uploads", "marker_output") os.makedirs(marker_output_dir, exist_ok=True) file_ext_map = {"markdown": "md", "json": "json", "html": "html"} - file_ext = file_ext_map.get(content_key, "txt") + file_ext = file_ext_map.get(self.output_format.lower(), "txt") output_filename = f"{os.path.splitext(filename)[0]}.{file_ext}" output_path = os.path.join(marker_output_dir, output_filename) @@ -231,13 +248,13 @@ class DatalabMarkerLoader: metadata = { "source": filename, - "output_format": poll_result.get("output_format", self.output_format), - "page_count": poll_result.get("page_count", 0), + "output_format": final_result.get("output_format", self.output_format), + "page_count": final_result.get("page_count", 0), "processed_with_llm": self.use_llm, "request_id": request_id or "", } - images = poll_result.get("images", {}) + images = final_result.get("images", {}) if images: metadata["image_count"] = len(images) metadata["images"] = json.dumps(list(images.keys())) diff --git a/backend/open_webui/retrieval/loaders/main.py b/backend/open_webui/retrieval/loaders/main.py index e57323e1eb..dd34a70669 100644 --- a/backend/open_webui/retrieval/loaders/main.py +++ b/backend/open_webui/retrieval/loaders/main.py @@ -281,10 +281,15 @@ class Loader: "tiff", ] ): + api_base_url = self.kwargs.get("DATALAB_MARKER_API_BASE_URL", "") + if not api_base_url or api_base_url.strip() == "": + api_base_url = "https://www.datalab.to/api/v1/marker" + loader = DatalabMarkerLoader( file_path=file_path, api_key=self.kwargs["DATALAB_MARKER_API_KEY"], - langs=self.kwargs.get("DATALAB_MARKER_LANGS"), + api_base_url=api_base_url, + additional_config=self.kwargs.get("DATALAB_MARKER_ADDITIONAL_CONFIG"), use_llm=self.kwargs.get("DATALAB_MARKER_USE_LLM", False), skip_cache=self.kwargs.get("DATALAB_MARKER_SKIP_CACHE", False), force_ocr=self.kwargs.get("DATALAB_MARKER_FORCE_OCR", False), diff --git a/backend/open_webui/routers/retrieval.py b/backend/open_webui/routers/retrieval.py index fac5706f03..62f1782864 100644 --- a/backend/open_webui/routers/retrieval.py +++ b/backend/open_webui/routers/retrieval.py @@ -401,7 +401,8 @@ async def get_rag_config(request: Request, user=Depends(get_admin_user)): "CONTENT_EXTRACTION_ENGINE": request.app.state.config.CONTENT_EXTRACTION_ENGINE, "PDF_EXTRACT_IMAGES": request.app.state.config.PDF_EXTRACT_IMAGES, "DATALAB_MARKER_API_KEY": request.app.state.config.DATALAB_MARKER_API_KEY, - "DATALAB_MARKER_LANGS": request.app.state.config.DATALAB_MARKER_LANGS, + "DATALAB_MARKER_API_BASE_URL": request.app.state.config.DATALAB_MARKER_API_BASE_URL, + "DATALAB_MARKER_ADDITIONAL_CONFIG": request.app.state.config.DATALAB_MARKER_ADDITIONAL_CONFIG, "DATALAB_MARKER_SKIP_CACHE": request.app.state.config.DATALAB_MARKER_SKIP_CACHE, "DATALAB_MARKER_FORCE_OCR": request.app.state.config.DATALAB_MARKER_FORCE_OCR, "DATALAB_MARKER_PAGINATE": request.app.state.config.DATALAB_MARKER_PAGINATE, @@ -566,7 +567,8 @@ class ConfigForm(BaseModel): CONTENT_EXTRACTION_ENGINE: Optional[str] = None PDF_EXTRACT_IMAGES: Optional[bool] = None DATALAB_MARKER_API_KEY: Optional[str] = None - DATALAB_MARKER_LANGS: Optional[str] = None + DATALAB_MARKER_API_BASE_URL: Optional[str] = None + DATALAB_MARKER_ADDITIONAL_CONFIG: Optional[str] = None DATALAB_MARKER_SKIP_CACHE: Optional[bool] = None DATALAB_MARKER_FORCE_OCR: Optional[bool] = None DATALAB_MARKER_PAGINATE: Optional[bool] = None @@ -683,10 +685,15 @@ async def update_rag_config( if form_data.DATALAB_MARKER_API_KEY is not None else request.app.state.config.DATALAB_MARKER_API_KEY ) - request.app.state.config.DATALAB_MARKER_LANGS = ( - form_data.DATALAB_MARKER_LANGS - if form_data.DATALAB_MARKER_LANGS is not None - else request.app.state.config.DATALAB_MARKER_LANGS + request.app.state.config.DATALAB_MARKER_API_BASE_URL = ( + form_data.DATALAB_MARKER_API_BASE_URL + if form_data.DATALAB_MARKER_API_BASE_URL is not None + else request.app.state.config.DATALAB_MARKER_API_BASE_URL + ) + request.app.state.config.DATALAB_MARKER_ADDITIONAL_CONFIG = ( + form_data.DATALAB_MARKER_ADDITIONAL_CONFIG + if form_data.DATALAB_MARKER_ADDITIONAL_CONFIG is not None + else request.app.state.config.DATALAB_MARKER_ADDITIONAL_CONFIG ) request.app.state.config.DATALAB_MARKER_SKIP_CACHE = ( form_data.DATALAB_MARKER_SKIP_CACHE @@ -1006,7 +1013,8 @@ async def update_rag_config( "CONTENT_EXTRACTION_ENGINE": request.app.state.config.CONTENT_EXTRACTION_ENGINE, "PDF_EXTRACT_IMAGES": request.app.state.config.PDF_EXTRACT_IMAGES, "DATALAB_MARKER_API_KEY": request.app.state.config.DATALAB_MARKER_API_KEY, - "DATALAB_MARKER_LANGS": request.app.state.config.DATALAB_MARKER_LANGS, + "DATALAB_MARKER_API_BASE_URL": request.app.state.config.DATALAB_MARKER_API_BASE_URL, + "DATALAB_MARKER_ADDITIONAL_CONFIG": request.app.state.config.DATALAB_MARKER_ADDITIONAL_CONFIG, "DATALAB_MARKER_SKIP_CACHE": request.app.state.config.DATALAB_MARKER_SKIP_CACHE, "DATALAB_MARKER_FORCE_OCR": request.app.state.config.DATALAB_MARKER_FORCE_OCR, "DATALAB_MARKER_PAGINATE": request.app.state.config.DATALAB_MARKER_PAGINATE, @@ -1406,7 +1414,8 @@ def process_file( loader = Loader( engine=request.app.state.config.CONTENT_EXTRACTION_ENGINE, DATALAB_MARKER_API_KEY=request.app.state.config.DATALAB_MARKER_API_KEY, - DATALAB_MARKER_LANGS=request.app.state.config.DATALAB_MARKER_LANGS, + DATALAB_MARKER_API_BASE_URL=request.app.state.config.DATALAB_MARKER_API_BASE_URL, + DATALAB_MARKER_ADDITIONAL_CONFIG=request.app.state.config.DATALAB_MARKER_ADDITIONAL_CONFIG, DATALAB_MARKER_SKIP_CACHE=request.app.state.config.DATALAB_MARKER_SKIP_CACHE, DATALAB_MARKER_FORCE_OCR=request.app.state.config.DATALAB_MARKER_FORCE_OCR, DATALAB_MARKER_PAGINATE=request.app.state.config.DATALAB_MARKER_PAGINATE, diff --git a/src/lib/components/admin/Settings/Documents.svelte b/src/lib/components/admin/Settings/Documents.svelte index 993cc6553f..6ef1f876bc 100644 --- a/src/lib/components/admin/Settings/Documents.svelte +++ b/src/lib/components/admin/Settings/Documents.svelte @@ -170,6 +170,19 @@ return; } + if ( + RAGConfig.CONTENT_EXTRACTION_ENGINE === 'datalab_marker' && + RAGConfig.DATALAB_MARKER_ADDITIONAL_CONFIG && + RAGConfig.DATALAB_MARKER_ADDITIONAL_CONFIG.trim() !== '' + ) { + try { + JSON.parse(RAGConfig.DATALAB_MARKER_ADDITIONAL_CONFIG); + } catch (e) { + toast.error($i18n.t('Invalid JSON format in Additional Config')); + return; + } + } + if ( RAGConfig.CONTENT_EXTRACTION_ENGINE === 'document_intelligence' && (RAGConfig.DOCUMENT_INTELLIGENCE_ENDPOINT === '' || @@ -243,6 +256,11 @@ 2 ); + // Set default API Base URL if empty + if (!config.DATALAB_MARKER_API_BASE_URL) { + config.DATALAB_MARKER_API_BASE_URL = 'https://www.datalab.to/api/v1/marker'; + } + RAGConfig = config; }); @@ -337,6 +355,19 @@ {:else if RAGConfig.CONTENT_EXTRACTION_ENGINE === 'datalab_marker'}
+ + +
-
-
- {$i18n.t('Languages')} +
+
+
+ {$i18n.t('Additional Config')} +
+
+ +