add format_lines

This commit is contained in:
Hisma 2025-07-22 21:06:29 -04:00
parent f31cc07a9d
commit a99e20cc3d
6 changed files with 35 additions and 0 deletions

View file

@ -2067,6 +2067,12 @@ DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION = PersistentConfig(
== "true", == "true",
) )
# Boolean setting for the Datalab Marker "format lines" option.
# The initial value comes from the DATALAB_MARKER_FORMAT_LINES environment
# variable: it is True only when the variable equals "true" (compared
# case-insensitively); when the variable is unset the fallback "false" is
# used, so the option starts disabled.
# NOTE(review): the second argument looks like the key under which
# PersistentConfig stores the value — confirm against PersistentConfig.
DATALAB_MARKER_FORMAT_LINES = PersistentConfig(
"DATALAB_MARKER_FORMAT_LINES",
"rag.datalab_marker_format_lines",
os.environ.get("DATALAB_MARKER_FORMAT_LINES", "false").lower() == "true",
)
DATALAB_MARKER_OUTPUT_FORMAT = PersistentConfig( DATALAB_MARKER_OUTPUT_FORMAT = PersistentConfig(
"DATALAB_MARKER_OUTPUT_FORMAT", "DATALAB_MARKER_OUTPUT_FORMAT",
"rag.datalab_marker_output_format", "rag.datalab_marker_output_format",

View file

@ -234,6 +234,7 @@ from open_webui.config import (
DATALAB_MARKER_PAGINATE, DATALAB_MARKER_PAGINATE,
DATALAB_MARKER_STRIP_EXISTING_OCR, DATALAB_MARKER_STRIP_EXISTING_OCR,
DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION, DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION,
DATALAB_MARKER_FORMAT_LINES,
DATALAB_MARKER_OUTPUT_FORMAT, DATALAB_MARKER_OUTPUT_FORMAT,
DATALAB_MARKER_USE_LLM, DATALAB_MARKER_USE_LLM,
EXTERNAL_DOCUMENT_LOADER_URL, EXTERNAL_DOCUMENT_LOADER_URL,
@ -777,6 +778,7 @@ app.state.config.DATALAB_MARKER_STRIP_EXISTING_OCR = DATALAB_MARKER_STRIP_EXISTI
app.state.config.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION = ( app.state.config.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION = (
DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION
) )
app.state.config.DATALAB_MARKER_FORMAT_LINES = DATALAB_MARKER_FORMAT_LINES
app.state.config.DATALAB_MARKER_USE_LLM = DATALAB_MARKER_USE_LLM app.state.config.DATALAB_MARKER_USE_LLM = DATALAB_MARKER_USE_LLM
app.state.config.DATALAB_MARKER_OUTPUT_FORMAT = DATALAB_MARKER_OUTPUT_FORMAT app.state.config.DATALAB_MARKER_OUTPUT_FORMAT = DATALAB_MARKER_OUTPUT_FORMAT
app.state.config.EXTERNAL_DOCUMENT_LOADER_URL = EXTERNAL_DOCUMENT_LOADER_URL app.state.config.EXTERNAL_DOCUMENT_LOADER_URL = EXTERNAL_DOCUMENT_LOADER_URL

View file

@ -23,6 +23,7 @@ class DatalabMarkerLoader:
paginate: bool = False, paginate: bool = False,
strip_existing_ocr: bool = False, strip_existing_ocr: bool = False,
disable_image_extraction: bool = False, disable_image_extraction: bool = False,
format_lines: bool = False,
output_format: str = None, output_format: str = None,
): ):
self.file_path = file_path self.file_path = file_path
@ -35,6 +36,7 @@ class DatalabMarkerLoader:
self.paginate = paginate self.paginate = paginate
self.strip_existing_ocr = strip_existing_ocr self.strip_existing_ocr = strip_existing_ocr
self.disable_image_extraction = disable_image_extraction self.disable_image_extraction = disable_image_extraction
self.format_lines = format_lines
self.output_format = output_format self.output_format = output_format
def _get_mime_type(self, filename: str) -> str: def _get_mime_type(self, filename: str) -> str:
@ -95,6 +97,7 @@ class DatalabMarkerLoader:
"paginate": str(self.paginate).lower(), "paginate": str(self.paginate).lower(),
"strip_existing_ocr": str(self.strip_existing_ocr).lower(), "strip_existing_ocr": str(self.strip_existing_ocr).lower(),
"disable_image_extraction": str(self.disable_image_extraction).lower(), "disable_image_extraction": str(self.disable_image_extraction).lower(),
"format_lines": str(self.format_lines).lower(),
"output_format": self.output_format, "output_format": self.output_format,
} }

View file

@ -300,6 +300,7 @@ class Loader:
disable_image_extraction=self.kwargs.get( disable_image_extraction=self.kwargs.get(
"DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION", False "DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION", False
), ),
format_lines=self.kwargs.get("DATALAB_MARKER_FORMAT_LINES", False),
output_format=self.kwargs.get( output_format=self.kwargs.get(
"DATALAB_MARKER_OUTPUT_FORMAT", "markdown" "DATALAB_MARKER_OUTPUT_FORMAT", "markdown"
), ),

View file

@ -408,6 +408,7 @@ async def get_rag_config(request: Request, user=Depends(get_admin_user)):
"DATALAB_MARKER_PAGINATE": request.app.state.config.DATALAB_MARKER_PAGINATE, "DATALAB_MARKER_PAGINATE": request.app.state.config.DATALAB_MARKER_PAGINATE,
"DATALAB_MARKER_STRIP_EXISTING_OCR": request.app.state.config.DATALAB_MARKER_STRIP_EXISTING_OCR, "DATALAB_MARKER_STRIP_EXISTING_OCR": request.app.state.config.DATALAB_MARKER_STRIP_EXISTING_OCR,
"DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION": request.app.state.config.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION, "DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION": request.app.state.config.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION,
"DATALAB_MARKER_FORMAT_LINES": request.app.state.config.DATALAB_MARKER_FORMAT_LINES,
"DATALAB_MARKER_USE_LLM": request.app.state.config.DATALAB_MARKER_USE_LLM, "DATALAB_MARKER_USE_LLM": request.app.state.config.DATALAB_MARKER_USE_LLM,
"DATALAB_MARKER_OUTPUT_FORMAT": request.app.state.config.DATALAB_MARKER_OUTPUT_FORMAT, "DATALAB_MARKER_OUTPUT_FORMAT": request.app.state.config.DATALAB_MARKER_OUTPUT_FORMAT,
"EXTERNAL_DOCUMENT_LOADER_URL": request.app.state.config.EXTERNAL_DOCUMENT_LOADER_URL, "EXTERNAL_DOCUMENT_LOADER_URL": request.app.state.config.EXTERNAL_DOCUMENT_LOADER_URL,
@ -574,6 +575,7 @@ class ConfigForm(BaseModel):
DATALAB_MARKER_PAGINATE: Optional[bool] = None DATALAB_MARKER_PAGINATE: Optional[bool] = None
DATALAB_MARKER_STRIP_EXISTING_OCR: Optional[bool] = None DATALAB_MARKER_STRIP_EXISTING_OCR: Optional[bool] = None
DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION: Optional[bool] = None DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION: Optional[bool] = None
DATALAB_MARKER_FORMAT_LINES: Optional[bool] = None
DATALAB_MARKER_USE_LLM: Optional[bool] = None DATALAB_MARKER_USE_LLM: Optional[bool] = None
DATALAB_MARKER_OUTPUT_FORMAT: Optional[str] = None DATALAB_MARKER_OUTPUT_FORMAT: Optional[str] = None
EXTERNAL_DOCUMENT_LOADER_URL: Optional[str] = None EXTERNAL_DOCUMENT_LOADER_URL: Optional[str] = None
@ -720,6 +722,11 @@ async def update_rag_config(
if form_data.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION is not None if form_data.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION is not None
else request.app.state.config.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION else request.app.state.config.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION
) )
request.app.state.config.DATALAB_MARKER_FORMAT_LINES = (
form_data.DATALAB_MARKER_FORMAT_LINES
if form_data.DATALAB_MARKER_FORMAT_LINES is not None
else request.app.state.config.DATALAB_MARKER_FORMAT_LINES
)
request.app.state.config.DATALAB_MARKER_OUTPUT_FORMAT = ( request.app.state.config.DATALAB_MARKER_OUTPUT_FORMAT = (
form_data.DATALAB_MARKER_OUTPUT_FORMAT form_data.DATALAB_MARKER_OUTPUT_FORMAT
if form_data.DATALAB_MARKER_OUTPUT_FORMAT is not None if form_data.DATALAB_MARKER_OUTPUT_FORMAT is not None
@ -1421,6 +1428,7 @@ def process_file(
DATALAB_MARKER_PAGINATE=request.app.state.config.DATALAB_MARKER_PAGINATE, DATALAB_MARKER_PAGINATE=request.app.state.config.DATALAB_MARKER_PAGINATE,
DATALAB_MARKER_STRIP_EXISTING_OCR=request.app.state.config.DATALAB_MARKER_STRIP_EXISTING_OCR, DATALAB_MARKER_STRIP_EXISTING_OCR=request.app.state.config.DATALAB_MARKER_STRIP_EXISTING_OCR,
DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION=request.app.state.config.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION, DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION=request.app.state.config.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION,
DATALAB_MARKER_FORMAT_LINES=request.app.state.config.DATALAB_MARKER_FORMAT_LINES,
DATALAB_MARKER_USE_LLM=request.app.state.config.DATALAB_MARKER_USE_LLM, DATALAB_MARKER_USE_LLM=request.app.state.config.DATALAB_MARKER_USE_LLM,
DATALAB_MARKER_OUTPUT_FORMAT=request.app.state.config.DATALAB_MARKER_OUTPUT_FORMAT, DATALAB_MARKER_OUTPUT_FORMAT=request.app.state.config.DATALAB_MARKER_OUTPUT_FORMAT,
EXTERNAL_DOCUMENT_LOADER_URL=request.app.state.config.EXTERNAL_DOCUMENT_LOADER_URL, EXTERNAL_DOCUMENT_LOADER_URL=request.app.state.config.EXTERNAL_DOCUMENT_LOADER_URL,

View file

@ -485,6 +485,21 @@
<Switch bind:state={RAGConfig.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION} /> <Switch bind:state={RAGConfig.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION} />
</div> </div>
</div> </div>
<div class="flex justify-between w-full mt-2">
<div class="self-center text-xs font-medium">
<Tooltip
content={$i18n.t(
'Format the lines in the output. Defaults to False. If set to True, the lines will be formatted to detect inline math and styles.'
)}
placement="top-start"
>
{$i18n.t('Format Lines')}
</Tooltip>
</div>
<div class="flex items-center">
<Switch bind:state={RAGConfig.DATALAB_MARKER_FORMAT_LINES} />
</div>
</div>
<div class="flex justify-between w-full mt-2"> <div class="flex justify-between w-full mt-2">
<div class="self-center text-xs font-medium"> <div class="self-center text-xs font-medium">
<Tooltip <Tooltip