From a99e20cc3d9b7d17f79495a134589c0f182e1b7c Mon Sep 17 00:00:00 2001 From: Hisma Date: Tue, 22 Jul 2025 21:06:29 -0400 Subject: [PATCH] add format_lines --- backend/open_webui/config.py | 6 ++++++ backend/open_webui/main.py | 2 ++ .../retrieval/loaders/datalab_marker.py | 3 +++ backend/open_webui/retrieval/loaders/main.py | 1 + backend/open_webui/routers/retrieval.py | 8 ++++++++ .../components/admin/Settings/Documents.svelte | 15 +++++++++++++++ 6 files changed, 35 insertions(+) diff --git a/backend/open_webui/config.py b/backend/open_webui/config.py index d403f6acdd..de6abcda60 100644 --- a/backend/open_webui/config.py +++ b/backend/open_webui/config.py @@ -2067,6 +2067,12 @@ DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION = PersistentConfig( == "true", ) +DATALAB_MARKER_FORMAT_LINES = PersistentConfig( + "DATALAB_MARKER_FORMAT_LINES", + "rag.datalab_marker_format_lines", + os.environ.get("DATALAB_MARKER_FORMAT_LINES", "false").lower() == "true", +) + DATALAB_MARKER_OUTPUT_FORMAT = PersistentConfig( "DATALAB_MARKER_OUTPUT_FORMAT", "rag.datalab_marker_output_format", diff --git a/backend/open_webui/main.py b/backend/open_webui/main.py index 55e94dce35..68d5acc5a2 100644 --- a/backend/open_webui/main.py +++ b/backend/open_webui/main.py @@ -234,6 +234,7 @@ from open_webui.config import ( DATALAB_MARKER_PAGINATE, DATALAB_MARKER_STRIP_EXISTING_OCR, DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION, + DATALAB_MARKER_FORMAT_LINES, DATALAB_MARKER_OUTPUT_FORMAT, DATALAB_MARKER_USE_LLM, EXTERNAL_DOCUMENT_LOADER_URL, @@ -777,6 +778,7 @@ app.state.config.DATALAB_MARKER_STRIP_EXISTING_OCR = DATALAB_MARKER_STRIP_EXISTI app.state.config.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION = ( DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION ) +app.state.config.DATALAB_MARKER_FORMAT_LINES = DATALAB_MARKER_FORMAT_LINES app.state.config.DATALAB_MARKER_USE_LLM = DATALAB_MARKER_USE_LLM app.state.config.DATALAB_MARKER_OUTPUT_FORMAT = DATALAB_MARKER_OUTPUT_FORMAT app.state.config.EXTERNAL_DOCUMENT_LOADER_URL = EXTERNAL_DOCUMENT_LOADER_URL diff --git a/backend/open_webui/retrieval/loaders/datalab_marker.py b/backend/open_webui/retrieval/loaders/datalab_marker.py index 25534bbb68..87d7c226c7 100644 --- a/backend/open_webui/retrieval/loaders/datalab_marker.py +++ b/backend/open_webui/retrieval/loaders/datalab_marker.py @@ -23,6 +23,7 @@ class DatalabMarkerLoader: paginate: bool = False, strip_existing_ocr: bool = False, disable_image_extraction: bool = False, + format_lines: bool = False, output_format: str = None, ): self.file_path = file_path @@ -35,6 +36,7 @@ class DatalabMarkerLoader: self.paginate = paginate self.strip_existing_ocr = strip_existing_ocr self.disable_image_extraction = disable_image_extraction + self.format_lines = format_lines self.output_format = output_format def _get_mime_type(self, filename: str) -> str: @@ -95,6 +97,7 @@ class DatalabMarkerLoader: "paginate": str(self.paginate).lower(), "strip_existing_ocr": str(self.strip_existing_ocr).lower(), "disable_image_extraction": str(self.disable_image_extraction).lower(), + "format_lines": str(self.format_lines).lower(), "output_format": self.output_format, } diff --git a/backend/open_webui/retrieval/loaders/main.py b/backend/open_webui/retrieval/loaders/main.py index dd34a70669..763b6ba650 100644 --- a/backend/open_webui/retrieval/loaders/main.py +++ b/backend/open_webui/retrieval/loaders/main.py @@ -300,6 +300,7 @@ class Loader: disable_image_extraction=self.kwargs.get( "DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION", False ), + format_lines=self.kwargs.get("DATALAB_MARKER_FORMAT_LINES", False), output_format=self.kwargs.get( "DATALAB_MARKER_OUTPUT_FORMAT", "markdown" ), diff --git a/backend/open_webui/routers/retrieval.py b/backend/open_webui/routers/retrieval.py index 62f1782864..b920032acd 100644 --- a/backend/open_webui/routers/retrieval.py +++ b/backend/open_webui/routers/retrieval.py @@ -408,6 +408,7 @@ async def get_rag_config(request: Request, user=Depends(get_admin_user)): "DATALAB_MARKER_PAGINATE": request.app.state.config.DATALAB_MARKER_PAGINATE, "DATALAB_MARKER_STRIP_EXISTING_OCR": request.app.state.config.DATALAB_MARKER_STRIP_EXISTING_OCR, "DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION": request.app.state.config.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION, + "DATALAB_MARKER_FORMAT_LINES": request.app.state.config.DATALAB_MARKER_FORMAT_LINES, "DATALAB_MARKER_USE_LLM": request.app.state.config.DATALAB_MARKER_USE_LLM, "DATALAB_MARKER_OUTPUT_FORMAT": request.app.state.config.DATALAB_MARKER_OUTPUT_FORMAT, "EXTERNAL_DOCUMENT_LOADER_URL": request.app.state.config.EXTERNAL_DOCUMENT_LOADER_URL, @@ -574,6 +575,7 @@ class ConfigForm(BaseModel): DATALAB_MARKER_PAGINATE: Optional[bool] = None DATALAB_MARKER_STRIP_EXISTING_OCR: Optional[bool] = None DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION: Optional[bool] = None + DATALAB_MARKER_FORMAT_LINES: Optional[bool] = None DATALAB_MARKER_USE_LLM: Optional[bool] = None DATALAB_MARKER_OUTPUT_FORMAT: Optional[str] = None EXTERNAL_DOCUMENT_LOADER_URL: Optional[str] = None @@ -720,6 +722,11 @@ async def update_rag_config( if form_data.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION is not None else request.app.state.config.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION ) + request.app.state.config.DATALAB_MARKER_FORMAT_LINES = ( + form_data.DATALAB_MARKER_FORMAT_LINES + if form_data.DATALAB_MARKER_FORMAT_LINES is not None + else request.app.state.config.DATALAB_MARKER_FORMAT_LINES + ) request.app.state.config.DATALAB_MARKER_OUTPUT_FORMAT = ( form_data.DATALAB_MARKER_OUTPUT_FORMAT if form_data.DATALAB_MARKER_OUTPUT_FORMAT is not None @@ -1421,6 +1428,7 @@ def process_file( DATALAB_MARKER_PAGINATE=request.app.state.config.DATALAB_MARKER_PAGINATE, DATALAB_MARKER_STRIP_EXISTING_OCR=request.app.state.config.DATALAB_MARKER_STRIP_EXISTING_OCR, DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION=request.app.state.config.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION, + DATALAB_MARKER_FORMAT_LINES=request.app.state.config.DATALAB_MARKER_FORMAT_LINES, DATALAB_MARKER_USE_LLM=request.app.state.config.DATALAB_MARKER_USE_LLM, DATALAB_MARKER_OUTPUT_FORMAT=request.app.state.config.DATALAB_MARKER_OUTPUT_FORMAT, EXTERNAL_DOCUMENT_LOADER_URL=request.app.state.config.EXTERNAL_DOCUMENT_LOADER_URL, diff --git a/src/lib/components/admin/Settings/Documents.svelte b/src/lib/components/admin/Settings/Documents.svelte index 6ef1f876bc..6860899e1b 100644 --- a/src/lib/components/admin/Settings/Documents.svelte +++ b/src/lib/components/admin/Settings/Documents.svelte @@ -485,6 +485,21 @@ +
+
+ + {$i18n.t('Format Lines')} + +
+
+ +
+